From e547e26d99536e1f48c346ba8ead87a5770f97e5 Mon Sep 17 00:00:00 2001 From: kernelkind Date: Thu, 21 Dec 2023 14:40:06 -0500 Subject: [PATCH] Handle period at end of URL Fix parsing URL when encountering a period at the end of the url by setting it as disallowed from being present at the end of a URL. Some characters are disallowed to be present at the end of URLs. Presently, the period character is the only disallowed character. A character is the last character in the URL if it is followed by is_whitespace() or if it's the last character in the string. Closes: https://github.com/damus-io/damus/issues/1638 LNURL1DP68GURN8GHJ7EM9W3SKCCNE9E3K7MF0D3H82UNVWQHKWUN9V4HXGCTHDC6RZVGR8SW3G Signed-off-by: kernelkind Reviewed-by: William Casarin Signed-off-by: William Casarin --- damus-c/cursor.h | 43 ++++++++++++++++++++++++++++ damus-c/damus.c | 6 ++-- damusTests/UrlTests.swift | 60 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 105 insertions(+), 4 deletions(-) diff --git a/damus-c/cursor.h b/damus-c/cursor.h index 77e54144..78aca7fb 100644 --- a/damus-c/cursor.h +++ b/damus-c/cursor.h @@ -489,6 +489,32 @@ static inline int is_whitespace(int c) { return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; } + +static inline int next_char_is_whitespace(unsigned char *curChar, unsigned char *endChar) { + unsigned char * next = curChar + 1; + if(next > endChar) return 0; + else if(next == endChar) return 1; + return is_whitespace(*next); +} + +static int char_disallowed_at_end_url(char c){ + return c == '.'; +} + +static inline int is_final_url_char(unsigned char *curChar, unsigned char *endChar){ + if(is_whitespace(*curChar)){ + return 1; + } + else if(next_char_is_whitespace(curChar, endChar)) { + // next char is whitespace so this char could be the final char in the url + return char_disallowed_at_end_url(*curChar); + } + else{ + // next char isn't whitespace so it can't be a final char + return 0; + } +} + static inline int is_underscore(int c) { return c == '_'; } @@ -670,6 +696,23 @@ static inline int consume_until_whitespace(struct cursor *cur, int or_end) { return or_end; } +static inline int consume_until_end_url(struct cursor *cur, int or_end) { + char c; + int consumedAtLeastOne = 0; + + while (cur->p < cur->end) { + c = *cur->p; + + if (is_final_url_char(cur->p, cur->end)) + return consumedAtLeastOne; + + cur->p++; + consumedAtLeastOne = 1; + } + + return or_end; +} + static inline int consume_until_non_alphanumeric(struct cursor *cur, int or_end) { char c; int consumedAtLeastOne = 0; diff --git a/damus-c/damus.c b/damus-c/damus.c index 6121e768..47eac06e 100644 --- a/damus-c/damus.c +++ b/damus-c/damus.c @@ -117,7 +117,7 @@ static int consume_url_fragment(struct cursor *cur) cur->p++; - return consume_until_whitespace(cur, 1); + return consume_until_end_url(cur, 1); } static int consume_url_path(struct cursor *cur) @@ -134,7 +134,7 @@ static int consume_url_path(struct cursor *cur) while (cur->p < cur->end) { c = *cur->p; - if (c == '?' || c == '#' || is_whitespace(c)) { + if (c == '?' || c == '#' || is_final_url_char(cur->p, cur->end)) { return 1; } @@ -152,7 +152,7 @@ static int consume_url_host(struct cursor *cur) while (cur->p < cur->end) { c = *cur->p; // TODO: handle IDNs - if (is_alphanumeric(c) || c == '.' || c == '-') + if ((is_alphanumeric(c) || c == '.' || c == '-') && !is_final_url_char(cur->p, cur->end)) { count++; cur->p++; diff --git a/damusTests/UrlTests.swift b/damusTests/UrlTests.swift index 7ebcc513..f9fa09ac 100644 --- a/damusTests/UrlTests.swift +++ b/damusTests/UrlTests.swift @@ -100,5 +100,63 @@ final class UrlTests: XCTestCase { XCTAssertEqual(blocks[1].asURL, testURL) XCTAssertEqual(blocks[2].asText, " this is not a hashtag!") } - + + func testParseURL_OneURLEndPeriodSimple_RemovesPeriod(){ + testParseURL(inputURLString: "http://example.com.", expectedURLs: "http://example.com") + } + + func testParseURL_OneURL_RemovesPeriod(){ + testParseURL(inputURLString: "http://example.com/.test", expectedURLs: "http://example.com/.test") + } + + func testParseURL_OneURLEndPeriodAndSpaceSimple_RemovesPeriod(){ + testParseURL(inputURLString: "http://example.com. ", expectedURLs: "http://example.com") + } + + func testParseURL_OneURLEndPeriodComplex_RemovesPeriod(){ + testParseURL(inputURLString: "http://example.com/test.", expectedURLs: "http://example.com/test") + } + + func testParseURL_TwoURLEndPeriodSimple_RemovesPeriods(){ + testParseURL(inputURLString: "http://example.com. http://example.com.", expectedURLs: "http://example.com", "http://example.com") + } + + func testParseURL_ThreeURLEndPeriodSimple_RemovesPeriods(){ + testParseURL(inputURLString: "http://example.com. http://example.com. http://example.com.", expectedURLs: "http://example.com", "http://example.com", "http://example.com") + } + + func testParseURL_TwoURLEndPeriodFirstComplexSecondSimple_RemovesPeriods(){ + testParseURL(inputURLString: "http://example.com/test. http://example.com.", expectedURLs: "http://example.com/test", "http://example.com") + } + + func testParseURL_TwoURLEndPeriodFirstSimpleSecondComplex_RemovesPeriods(){ + testParseURL(inputURLString: "http://example.com. http://example.com/test.", expectedURLs: "http://example.com", "http://example.com/test") + } + + func testParseURL_TwoURLEndPeriodFirstComplexSecondComplex_RemovesPeriods(){ + testParseURL(inputURLString: "http://example.com/test. http://example.com/test.", expectedURLs: "http://example.com/test", "http://example.com/test") + } + + func testParseURL_OneURLEndPeriodSerachQuery_RemovesPeriod(){ + testParseURL(inputURLString: "https://www.example.com/search?q=test+query.", expectedURLs: "https://www.example.com/search?q=test+query") + } +} + +func testParseURL(inputURLString: String, expectedURLs: String...) { + let parsedURL: [Block] = parse_note_content(content: .content(inputURLString, nil)).blocks.filter { + $0.isURL + } + + if(expectedURLs.count != parsedURL.count) { + XCTFail() + } + + for i in 0..