Improves the speed of the text parser.

2024-09-19 19:46:35 +00:00 · 2024-02-21 19:18:30 -05:00 · 2024-02-21 19:18:30 -05:00 · 5b77e39c8b
commit 5b77e39c8b
parent 5886c866d3
2 changed files with 299 additions and 63 deletions
--- a/benchmark/src/androidTest/java/com/vitorpamplona/amethyst/benchmark/RichTextParserBenchmark.kt
+++ b/benchmark/src/androidTest/java/com/vitorpamplona/amethyst/benchmark/RichTextParserBenchmark.kt
@ -0,0 +1,194 @@
+/**
+ * Copyright (c) 2024 Vitor Pamplona
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
+ * Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+package com.vitorpamplona.amethyst.benchmark
+
+import androidx.benchmark.junit4.BenchmarkRule
+import androidx.benchmark.junit4.measureRepeated
+import androidx.test.ext.junit.runners.AndroidJUnit4
+import com.linkedin.urls.detection.UrlDetector
+import com.linkedin.urls.detection.UrlDetectorOptions
+import com.vitorpamplona.amethyst.commons.HashTagSegment
+import com.vitorpamplona.amethyst.commons.ImageSegment
+import com.vitorpamplona.amethyst.commons.LinkSegment
+import com.vitorpamplona.amethyst.commons.RichTextParser
+import com.vitorpamplona.quartz.events.EmptyTagList
+import junit.framework.TestCase.assertNull
+import junit.framework.TestCase.assertTrue
+import org.junit.Rule
+import org.junit.Test
+import org.junit.runner.RunWith
+
+/**
+ * Instrumented micro-benchmarks for [RichTextParser].
+ *
+ * Every test builds a new [RichTextParser] inside the measured block, so parser construction is
+ * part of each measured iteration.
+ */
+@RunWith(AndroidJUnit4::class)
+class RichTextParserBenchmark {
+    @get:Rule
+    val benchmarkRule = BenchmarkRule()
+
+    // Measures parseMediaUrl on an .apk download link and asserts that it yields no media (null).
+    @Test
+    fun parseApkUrl() {
+        benchmarkRule.measureRepeated {
+            assertNull(
+                RichTextParser().parseMediaUrl(
+                    "https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk",
+                    EmptyTagList,
+                ),
+            )
+        }
+    }
+
+    // Measures parseText on a three-word text; the middle word (a .jpg link) must be an ImageSegment.
+    @Test
+    fun parseImageUrl() {
+        benchmarkRule.measureRepeated {
+            assertTrue(
+                RichTextParser().parseText(
+                    "first https://m.primal.net/HeKw.jpg second",
+                    EmptyTagList,
+                ).paragraphs[0].words[1] is ImageSegment,
+            )
+        }
+    }
+
+    // Measures parseText on a three-word text; the middle word (a URL without scheme) must be a LinkSegment.
+    @Test
+    fun parseNoSchemeUrl() {
+        benchmarkRule.measureRepeated {
+            assertTrue(
+                RichTextParser().parseText(
+                    "first amethyst.social second",
+                    EmptyTagList,
+                ).paragraphs[0].words[1] is LinkSegment,
+            )
+        }
+    }
+
+    // Measures parseText on a three-word text; the middle word (#amethyst) must be a HashTagSegment.
+    @Test
+    fun parseHashtag() {
+        benchmarkRule.measureRepeated {
+            assertTrue(
+                RichTextParser().parseText(
+                    "first #amethyst second",
+                    EmptyTagList,
+                ).paragraphs[0].words[1] is HashTagSegment,
+            )
+        }
+    }
+
+    // Measures a full parseText pass over testCase1; the result is not asserted.
+    @Test
+    fun computeTestCase1All() {
+        benchmarkRule.measureRepeated {
+            RichTextParser().parseText(testCase1, EmptyTagList)
+        }
+    }
+
+    // Measures a full parseText pass over testCase2; the result is not asserted.
+    @Test
+    fun computeTestCase2All() {
+        benchmarkRule.measureRepeated {
+            RichTextParser().parseText(testCase2, EmptyTagList)
+        }
+    }
+
+    // Measures the raw UrlDetector alone on testCase2 (no RichTextParser involved);
+    // presumably the baseline for computeTestCase2ParseUrls.
+    @Test
+    fun computeTestCase2UrlDetector() {
+        benchmarkRule.measureRepeated {
+            UrlDetector(testCase2, UrlDetectorOptions.Default).detect()
+        }
+    }
+
+    // Measures RichTextParser.parseValidUrls on testCase2.
+    @Test
+    fun computeTestCase2ParseUrls() {
+        benchmarkRule.measureRepeated {
+            RichTextParser().parseValidUrls(testCase2)
+        }
+    }
+
+    // Measures a full parseText pass over testCase3; the result is not asserted.
+    @Test
+    fun computeTestCase3All() {
+        benchmarkRule.measureRepeated {
+            RichTextParser().parseText(testCase3, EmptyTagList)
+        }
+    }
+
+    // Fixture: release notes written mostly in Arabic, starting with a hashtag and ending with a
+    // nostr:nevent1 reference.
+    val testCase1 = """
+#Amethyst v0.83.10
+
+تحديث جديد لـ Amethyst بإصدار 0.83.10 مع تعديلات وإضافات جديدة
+
+: NIP-92 إصلاحات الأخطاء
+
+ الإضافات الجديدة:
+ - يتضمن رابط المنتج في الرسالة الأولى من المشتري في السوق
+ - يضيف دعمًا لـ NIP-92 في الرسائل العامة والرسائل المباشرة الجديدة (NIP-17).  يبقى NIP-54 في NIP-04 DMs
+ - إضافة التمرير الأفقي إلى أزرار الإجراءات في شاشة النشر الجديد لإصلاح الأزرار المخفية جزئيًا في الشاشات الصغيرة/الرفيعة.
+
+ اصلاحات الشوائب:
+ - إصلاحات التعطل مع مبلغ Zap مخصص غير صالح
+ - يعمل على إصلاح مشكلات إعادة اتصال التتابع عندما يقوم المرحل بإغلاق الاتصال
+ - إصلاح الحشو العلوي للملاحظة المقتبسة في المنشور
+ - تحسين استخدام الذاكرة للمستخدم المرئي وعلامة URL في المشاركات الجديدة
+
+ الترجمات المحدثة:
+ - الفارسية بواسطة
+ - الفرنسية والإنجليزية، المملكة المتحدة بواسطة
+ - الأوكرانية
+ - الإسبانية والإسبانية والمكسيك والإسبانية والولايات المتحدة بواسطة
+ - العربية
+
+ تحسينات جودة الكود:
+ - تحديثات لنظام Android Studio 2023.1.1 Patch 2
+
+
+
+
+nostr:nevent1qqszq7kl888sw0c5rpvepn8w373zt0jrw8864x8lkauxxw335s66rzgppemhxue69uhkummn9ekx7mp0qgsyvrp9u6p0mfur9dfdru3d853tx9mdjuhkphxuxgfwmryja7zsvhqrqsqqqqqpaax7m2
+"""
+
+    // Fixture: English release notes containing a hashtag, nostr:npub1 mentions, a URL without
+    // scheme, and two markdown-style download links.
+    val testCase2 = """
+#Amethyst v0.83.10: NIP-92 and Bug Fixes
+
+New Additions:
+- Includes a link to the product in the first message from the buyer in the marketplace
+- Adds support for NIP-92 in public messages and new DMs (NIP-17). NIP-54 stays in NIP-04 DMs
+- Adds Horizontal Scroll to the action buttons in the New Post screen to partially fix hidden buttons in small/thin screens.
+
+Bugfixes:
+- Fixes crash with an invalid custom Zap Amount
+- Fixes relay re-connection issues when the relay closes a connection
+- Fixes the top padding of the quoted note in a post
+- Optimizes memory use of the visual user and url tagger in new posts
+
+Updated translations:
+- Persian by nostr:npub1cpazafytvafazxkjn43zjfwtfzatfz508r54f6z6a3rf2ws8223qc3xxpk
+- French and English, United Kingdom by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
+- Ukrainian by crowdin.com/profile/liizzzz
+- Spanish, Spanish, Mexico and Spanish, United States by nostr:npub1luhyzgce7qtcs6r6v00ryjxza8av8u4dzh3avg0zks38tjktnmxspxq903
+- Arabic by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
+
+Code Quality Improvements:
+- Updates to Android Studio 2023.1.1 Patch 2
+
+Download:
+- [Play Edition](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk )
+- [FOSS Edition - No translations](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-fdroid-universal-v0.83.10.apk )
+"""
+
+    // Fixture: a short multi-line note with a leading hashtag and an emoji, and no URLs.
+    val testCase3 = """#100aDayUntil100k
+Day 5 ✔️
+
+Seems like they may be getting easier"""
+}
--- a/commons/src/main/java/com/vitorpamplona/amethyst/commons/RichTextParser.kt
+++ b/commons/src/main/java/com/vitorpamplona/amethyst/commons/RichTextParser.kt
@ -74,29 +74,35 @@ class RichTextParser() {
        }
    }

+    /**
+     * Extracts the distinct URLs detected in [content] that pass this parser's validity checks.
+     *
+     * Candidates come from [UrlDetector]. A candidate is dropped when it is an e-mail address,
+     * is a plain number (e.g. `123.22`), contains the ideographic full stop `。`, or does not
+     * match [HTTPRegex]. Every surviving candidate has therefore matched [HTTPRegex], including
+     * candidates that contain an `@` but are not e-mail addresses.
+     *
+     * @param content the raw text to scan.
+     * @return an insertion-ordered set holding the original text of each accepted URL.
+     */
+    fun parseValidUrls(content: String): LinkedHashSet<String> {
+        val detected = UrlDetector(content, UrlDetectorOptions.Default).detect()
+
+        return detected.mapNotNullTo(LinkedHashSet(detected.size)) { candidate ->
+            val url = candidate.originalUrl
+            when {
+                // The e-mail regex is only run when an '@' is present. A non-e-mail candidate
+                // with an '@' falls through to the same checks as every other candidate.
+                url.contains("@") && Patterns.EMAIL_ADDRESS.matcher(url).matches() -> null
+                isNumber(url) -> null // avoids urls that look like 123.22
+                url.contains("。") -> null // avoids Japanese characters as fake urls
+                HTTPRegex.matches(url) -> url
+                else -> null
+            }
+        }
+    }
+
    fun parseText(
        content: String,
        tags: ImmutableListOfLists<String>,
    ): RichTextViewerState {
-        val urls = UrlDetector(content, UrlDetectorOptions.Default).detect()
-
-        val urlSet =
-            urls.mapNotNullTo(LinkedHashSet(urls.size)) {
-                // removes e-mails
-                if (Patterns.EMAIL_ADDRESS.matcher(it.originalUrl).matches()) {
-                    null
-                } else if (isNumber(it.originalUrl)) {
-                    null
-                } else if (it.originalUrl.contains("。")) {
-                    null
-                } else {
-                    if (HTTPRegex.matches(it.originalUrl)) {
-                        it.originalUrl
-                    } else {
-                        null
-                    }
-                }
-            }
+        val urlSet = parseValidUrls(content)

        val imagesForPager =
            urlSet.mapNotNull { fullUrl -> parseMediaUrl(fullUrl, tags) }.associateBy { it.url }
@ -153,8 +159,29 @@ class RichTextParser() {
        return paragraphSegments.toImmutableList()
    }

-    fun isNumber(word: String): Boolean {
-        return numberPattern.matcher(word).matches()
+    /** Reports whether the whole of [word] matches [numberPattern]. */
+    private fun isNumber(word: String): Boolean {
+        return numberPattern.matcher(word).matches()
+    }
+
+    /**
+     * Cheap per-character pre-filter used by [isPotentialPhoneNumber].
+     *
+     * Accepts ASCII digits and the separators `-`, space and `.`. It also accepts `+`, `(` and
+     * `)` so that international ("+15551234567") and area-code ("(555)1234567") forms are not
+     * discarded before the full phone regex gets a chance to match them.
+     *
+     * @param c the character to classify.
+     * @return true when [c] may appear in a phone number.
+     */
+    private fun isPhoneNumberChar(c: Char): Boolean =
+        when (c) {
+            in '0'..'9' -> true
+            '-', ' ', '.', '+', '(', ')' -> true
+            else -> false
+        }
+
+    /**
+     * Quick screen that decides whether [word] is worth testing as a phone number.
+     *
+     * @param word the candidate word.
+     * @return true only when [word] is 7 to 14 characters long and every character passes
+     * [isPhoneNumberChar].
+     */
+    fun isPotentialPhoneNumber(word: String): Boolean {
+        val plausibleLength = word.length in 7..14
+        return plausibleLength && word.all { isPhoneNumberChar(it) }
    }

    fun isDate(word: String): Boolean {
@ -172,46 +199,48 @@ class RichTextParser() {
        emojis: Map<String, String>,
        tags: ImmutableListOfLists<String>,
    ): Segment {
-        val emailMatcher = Patterns.EMAIL_ADDRESS.matcher(word)
-        val phoneMatcher = Patterns.PHONE.matcher(word)
-        val schemelessMatcher = noProtocolUrlValidator.matcher(word)
+        if (word.isEmpty()) return RegularTextSegment(word)

-        return if (word.isEmpty()) {
-            RegularTextSegment(word)
-        } else if (images.contains(word)) {
-            ImageSegment(word)
-        } else if (urls.contains(word)) {
-            LinkSegment(word)
-        } else if (emojis.any { word.contains(it.key) }) {
-            EmojiSegment(word)
-        } else if (word.startsWith("lnbc", true)) {
-            InvoiceSegment(word)
-        } else if (word.startsWith("lnurl", true)) {
-            WithdrawSegment(word)
-        } else if (word.startsWith("cashuA", true)) {
-            CashuSegment(word)
-        } else if (emailMatcher.matches()) {
-            EmailSegment(word)
-        } else if (word.length in 7..14 && !isDate(word) && phoneMatcher.matches()) {
-            PhoneSegment(word)
-        } else if (startsWithNIP19Scheme(word)) {
-            BechSegment(word)
-        } else if (word.startsWith("#")) {
-            parseHash(word, tags)
-        } else if (word.contains(".") && schemelessMatcher.find()) {
-            val url = schemelessMatcher.group(1) // url
-            val additionalChars = schemelessMatcher.group(4).ifEmpty { null } // additional chars
-            val pattern =
-                """^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?"""
-                    .toRegex(RegexOption.IGNORE_CASE)
-            if (pattern.find(word) != null) {
-                SchemelessUrlSegment(word, url, additionalChars)
-            } else {
-                RegularTextSegment(word)
-            }
-        } else {
-            RegularTextSegment(word)
+        if (images.contains(word)) return ImageSegment(word)
+
+        if (urls.contains(word)) return LinkSegment(word)
+
+        if (word.startsWith(":") && emojis.any { word.contains(it.key) }) return EmojiSegment(word)
+
+        if (word.startsWith("lnbc", true)) return InvoiceSegment(word)
+
+        if (word.startsWith("lnurl", true)) return WithdrawSegment(word)
+
+        if (word.startsWith("cashuA", true)) return CashuSegment(word)
+
+        if (startsWithNIP19Scheme(word)) return BechSegment(word)
+
+        if (word.startsWith("#")) return parseHash(word, tags)
+
+        if (word.contains("@")) {
+            if (Patterns.EMAIL_ADDRESS.matcher(word).matches()) return EmailSegment(word)
        }
+
+        if (isPotentialPhoneNumber(word) && !isDate(word)) {
+            if (Patterns.PHONE.matcher(word).matches()) return PhoneSegment(word)
+        }
+
+        val indexOfPeriod = word.indexOf(".")
+        if (indexOfPeriod > 0 && indexOfPeriod < word.length - 1) { // periods cannot be the last one
+            val schemelessMatcher = noProtocolUrlValidator.matcher(word)
+            if (schemelessMatcher.find()) {
+                val url = schemelessMatcher.group(1) // url
+                val additionalChars = schemelessMatcher.group(4).ifEmpty { null } // additional chars
+                val pattern =
+                    """^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?"""
+                        .toRegex(RegexOption.IGNORE_CASE)
+                if (pattern.find(word) != null && url != null) {
+                    return SchemelessUrlSegment(word, url, additionalChars)
+                }
+            }
+        }
+
+        return RegularTextSegment(word)
    }

    private fun parseHash(
@ -289,7 +318,11 @@ class RichTextParser() {
        // Matches a '#' followed by at least one character that is neither whitespace nor one of
        // the listed punctuation characters. Group 1 captures that run of characters; group 2
        // captures whatever follows it.
        val hashTagsPattern: Pattern =
            Pattern.compile("#([^\\s!@#\$%^&*()=+./,\\[{\\]};:'\"?><]+)(.*)", Pattern.CASE_INSENSITIVE)

-        val acceptedNIP19schemes = listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1")
+        // The accepted prefixes in lowercase, followed by the same prefixes in uppercase.
+        val acceptedNIP19schemes =
+            listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1").let { lowercaseSchemes ->
+                lowercaseSchemes + lowercaseSchemes.map { scheme -> scheme.uppercase() }
+            }

        private fun removeQueryParamsForExtensionComparison(fullUrl: String): String {
            return if (fullUrl.contains("?")) {
@ -344,9 +377,18 @@ class RichTextParser() {
        }

        fun startsWithNIP19Scheme(word: String): Boolean {
-            val cleaned = word.lowercase().removePrefix("@").removePrefix("nostr:").removePrefix("@")
-
-            return acceptedNIP19schemes.any { cleaned.startsWith(it) }
+            // Reports whether [word] starts with one of [acceptedNIP19schemes], ignoring case and
+            // an optional "@", "nostr:", "@" lead-in (each at most once, in that order). This
+            // accepts the same inputs as lowercasing the word and stripping those prefixes, but
+            // compares in place at an offset so no copy of the word is allocated.
+            if (word.isEmpty()) return false
+
+            // Fast reject: every accepted form begins with '@' or with the 'n' that starts both
+            // "nostr:" and each accepted scheme.
+            val first = word[0]
+            if (first != '@' && first != 'n' && first != 'N') return false
+
+            var start = 0
+            if (first == '@') start++
+            if (word.startsWith("nostr:", start, ignoreCase = true)) start += 6
+            if (word.startsWith("@", start)) start++
+
+            return acceptedNIP19schemes.any { word.startsWith(it, start, ignoreCase = true) }
        }

        /** Reports whether the whole of [url] matches [noProtocolUrlValidator]. */
        fun isUrlWithoutScheme(url: String): Boolean {
            return noProtocolUrlValidator.matcher(url).matches()
        }