fix URL parsing edge cases (#360)

This should fix the following:
- URLs in parentheses
- URLs at the end of a sentence
This commit is contained in:
Sam Samskies 2023-02-27 09:21:38 -10:00 committed by GitHub
parent f934dcd092
commit 2782f24690
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 50 additions and 10 deletions

View File

@ -84,13 +84,6 @@ export const EmailRegex =
// eslint-disable-next-line no-useless-escape
/^(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;
/**
* Generic URL regex
*
* Case-insensitive pattern with a single capturing group around the whole
* match: an `http`, `ftp` or `https` scheme followed by `://`, then one or
* more characters from a class of word characters, `+`, `?` and `.`, then an
* optional greedy run of letters, digits and the listed punctuation.
*
* Note: `[\w+?\.\w+]` is a character class, not the sequence `\w+?\.\w+`.
* The trailing class also contains `.`, `,`, `!`, `;`, `:`, `'` and `)`, so
* punctuation written directly after a URL is matched as part of the URL.
*/
export const UrlRegex =
// eslint-disable-next-line no-useless-escape
/((?:http|ftp|https):\/\/(?:[\w+?\.\w+])+(?:[a-zA-Z0-9\~\!\@\#\$\%\^\&\*\(\)_\-\=\+\\\/\?\.\:\;\'\,]*)?)/i;
/**
* Extract file extensions regex
*/

View File

@ -4,8 +4,8 @@ import { Link } from "react-router-dom";
import ReactMarkdown from "react-markdown";
import { visit, SKIP } from "unist-util-visit";
import { UrlRegex, MentionRegex, InvoiceRegex, HashtagRegex } from "Const";
import { eventLink, hexToBech32, unwrap } from "Util";
import { MentionRegex, InvoiceRegex, HashtagRegex } from "Const";
import { eventLink, hexToBech32, splitByUrl, unwrap } from "Util";
import Invoice from "Element/Invoice";
import Hashtag from "Element/Hashtag";
@ -36,7 +36,7 @@ export default function Text({ content, tags, creator, users }: TextProps) {
return fragments
.map(f => {
if (typeof f === "string") {
return f.split(UrlRegex).map(a => {
return splitByUrl(f).map(a => {
if (a.startsWith("http")) {
return <HyperText key={a} link={a} creator={creator} />;
}

View File

@ -0,0 +1,40 @@
import { splitByUrl } from "./Util";
// Unit tests for splitByUrl: the function is expected to return the input
// broken into an array of plain-text and URL segments, in input order.
describe("splitByUrl", () => {
// One long message containing URLs that are directly followed by "-", "!",
// ")", "," and ".", plus URLs with a path, a sub-domain, a query string and
// a file extension.
it("should split a string by URLs", () => {
const inputStr =
"@npub1q6mcr8t not https://example.com- sure what your stack is, https://example.com but I made a https://example.com! simple example (https://example.com) of how https://example.com/yo-yo https://example.example.com to do this https://example.com, https://example.com?q=asdf for Next.js apps hosted on Vercel https://example.com. Scarcity in money provides the incentive to create abundance in other things as there is a mechanism to reliably store value. https://i.imgur.com/rkqhjeq.png Every form of money that could be inflated by way of force or technological advancement has been.";
// Expected segments. Trailing "!", ")", "," and "." stay in the text that
// follows the URL, while a trailing "-" is kept as part of the URL.
const expectedOutput = [
"@npub1q6mcr8t not ",
"https://example.com-",
" sure what your stack is, ",
"https://example.com",
" but I made a ",
"https://example.com",
"! simple example (",
"https://example.com",
") of how ",
"https://example.com/yo-yo",
" ",
"https://example.example.com",
" to do this ",
"https://example.com",
", ",
"https://example.com?q=asdf",
" for Next.js apps hosted on Vercel ",
"https://example.com",
". Scarcity in money provides the incentive to create abundance in other things as there is a mechanism to reliably store value. ",
"https://i.imgur.com/rkqhjeq.png",
" Every form of money that could be inflated by way of force or technological advancement has been.",
];
expect(splitByUrl(inputStr)).toEqual(expectedOutput);
});
// Text without any URL must come back unchanged as the only array element.
it("should return an array with a single string if no URLs are found", () => {
const inputStr = "This is a regular string with no URLs";
const expectedOutput = ["This is a regular string with no URLs"];
expect(splitByUrl(inputStr)).toEqual(expectedOutput);
});
});

View File

@ -222,3 +222,10 @@ export function tagFilterOfTextRepost(note: TaggedRawEvent, id?: u256): (tag: st
/**
 * Reducer step that returns a copy of `acc` with `user` stored under its
 * own `pubkey`; an existing entry with the same key is replaced.
 *
 * @param acc - Entries collected so far (not mutated).
 * @param user - Entry to add.
 * @returns A new object containing all entries of `acc` plus `user`.
 */
export function groupByPubkey(acc: Record<HexKey, MetadataCache>, user: MetadataCache) {
  const { pubkey } = user;
  return { ...acc, [pubkey]: user };
}
/**
 * Splits `str` into alternating plain-text and URL segments.
 *
 * The pattern has a single capturing group, so `String.prototype.split`
 * keeps the matched URLs in the result: even indices hold the surrounding
 * text (possibly empty), odd indices hold the URLs. When no URL is found the
 * result is a one-element array containing `str`.
 *
 * A URL must end in a character that is not sentence punctuation, so a
 * trailing `.`, `,`, `!` or `)` stays in the following text segment. The one
 * exception is a `)` that closes a `(` opened inside the URL itself (for
 * example `https://en.wikipedia.org/wiki/Bitcoin_(disambiguation)`): it is
 * moved back onto the URL so the link is not truncated.
 *
 * @param str - Text that may contain http, https or ftp URLs.
 * @returns The text and URL segments in input order.
 */
export function splitByUrl(str: string): string[] {
  const urlRegex =
    /((?:http|ftp|https):\/\/(?:[\w+?.\w+])+(?:[a-zA-Z0-9~!@#$%^&*()_\-=+\\/?.:;',]*)?(?:[-A-Za-z0-9+&@#/%=~_|]))/i;
  const countOf = (text: string, ch: string): number => text.split(ch).length - 1;

  const parts = str.split(urlRegex);
  // Odd indices are URLs; each one is always followed by a text segment.
  for (let i = 1; i + 1 < parts.length; i += 2) {
    let url = parts[i];
    let rest = parts[i + 1];
    // The regex never lets a URL end in ")", which cuts off URLs whose path
    // ends in a balanced "(...)". Re-attach only as many ")" as are needed to
    // close parentheses opened inside the URL, so "(https://example.com)"
    // still leaves its ")" in the text.
    let unclosed = countOf(url, "(") - countOf(url, ")");
    while (unclosed > 0 && rest.startsWith(")")) {
      url += ")";
      rest = rest.slice(1);
      unclosed--;
    }
    parts[i] = url;
    parts[i + 1] = rest;
  }
  return parts;
}