urls: fix Wikipedia URL detection with parentheses

Fixes: f0df4aa218 ("Strip common punctuations from URLs")
Fixes: https://github.com/damus-io/damus/issues/1027
Closes: https://github.com/damus-io/damus/pull/1063
Changelog-Fixed: Fix Wikipedia URL detection with parentheses
2024-09-19 19:46:51 +00:00 · 2023-08-06 13:47:33 -07:00 · 2023-08-06 13:47:33 -07:00 · 53e9269da6
commit 53e9269da6
parent 85930df8e3
2 changed files with 69 additions and 10 deletions
--- a/damus-c/cursor.h
+++ b/damus-c/cursor.h
@ -447,12 +447,8 @@ static inline int is_left_boundary(char c) {
    return is_right_boundary(c) || is_utf8_byte(c);
 }
 /*
  * Report whether `c` is one of the punctuation characters that should not
  * be kept as the last character of a URL: '!', '?', ')', '.', ',' or ';'.
  *
  * Returns 1 for those characters and 0 for anything else.
  */
 static inline int is_invalid_url_ending(char c) {
     switch (c) {
     case '!':
     case '?':
     case ')':
     case '.':
     case ',':
     case ';':
         return 1;
     default:
         return 0;
     }
 }
 /*
  * Returns non-zero when `c` is an ASCII letter or an ASCII digit.
  * The removed line accepted only 'a'-'z' and '0'-'9'; the added line
  * also accepts the uppercase range 'A'-'Z'.
  */
 static inline int is_alphanumeric(char c) {
-    return (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
+    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
 }
 static inline int consume_until_boundary(struct cursor *cur) {
--- a/damus-c/damus.c
+++ b/damus-c/damus.c
@ -104,6 +104,69 @@ static int add_text_block(struct note_blocks *blocks, const u8 *start, const u8
    return add_block(blocks, b);
 }
 /*
  * Consume the trailing part of a URL introduced by '#' or '?'.
  *
  * If the next character (as reported by peek_char) is negative or is
  * neither '#' nor '?', the cursor is left untouched and 1 is returned.
  * Otherwise the marker character is skipped and the result of
  * consume_until_whitespace(cur, 1) is returned.
  */
 static int consume_url_fragment(struct cursor *cur)
 {
     int next = peek_char(cur, 0);

     /* no marker available: succeed without consuming anything */
     if (next < 0 || (next != '#' && next != '?'))
         return 1;

     /* step over the '#' / '?' marker itself */
     cur->p++;

     return consume_until_whitespace(cur, 1);
 }
 /*
  * Consume the path part of a URL.
  *
  * A path is only consumed when the next character is '/'. The cursor then
  * advances until it reaches '?', '#', a whitespace character, or the end
  * of the buffer; the stopping character itself is not consumed.
  *
  * Always returns 1, whether or not anything was consumed.
  */
 static int consume_url_path(struct cursor *cur)
 {
     int ch = peek_char(cur, 0);

     /* peek failed or the path does not start here: nothing to consume */
     if (ch < 0 || ch != '/')
         return 1;

     for (; cur->p < cur->end; cur->p++) {
         ch = *cur->p;
         if (ch == '?' || ch == '#' || is_whitespace(ch))
             break;
     }

     return 1;
 }
 /*
  * Consume the host part of a URL, plus an optional ":port" suffix.
  *
  * The host is the longest run of ASCII letters, digits, '.' and '-'
  * starting at the cursor. If the host is non-empty and is immediately
  * followed by ':' and at least one decimal digit, the colon and the digits
  * are consumed as well, so that "host:8080/path" is not cut off at the
  * colon. A ':' that is not followed by a digit is left unconsumed.
  *
  * Returns 1 if at least one host character was consumed, 0 otherwise
  * (in which case the cursor is not advanced).
  */
 static int consume_url_host(struct cursor *cur)
 {
     int count = 0;
     int port_digits = 0;

     while (cur->p < cur->end) {
         char c = *cur->p;

         // TODO: handle IDNs
         if (!(is_alphanumeric(c) || c == '.' || c == '-'))
             break;

         count++;
         cur->p++;
     }

     if (count == 0)
         return 0;

     /* optional port: look ahead past ':' and only commit if digits follow */
     if (cur->p < cur->end && *cur->p == ':') {
         while (cur->p + 1 + port_digits < cur->end &&
                cur->p[1 + port_digits] >= '0' &&
                cur->p[1 + port_digits] <= '9')
             port_digits++;

         if (port_digits != 0)
             cur->p += 1 + port_digits;
     }

     return 1;
 }
 static int parse_url(struct cursor *cur, struct note_block *block) {
    u8 *start = cur->p;
@ -122,14 +185,14 @@ static int parse_url(struct cursor *cur, struct note_block *block) {
        }
    }
-    if (!consume_until_whitespace(cur, 1)) {
+    if (!(consume_url_host(cur) &&
          consume_url_path(cur) &&
          consume_url_fragment(cur)))
    {
        cur->p = start;
        return 0;
    }
    // strip any unwanted characters
    while(is_invalid_url_ending(peek_char(cur, -1))) cur->p--;
    block->type = BLOCK_URL;
    block->block.str.start = (const char *)start;
    block->block.str.end = (const char *)cur->p;