Improves the speed of the text parser.

This commit is contained in:
Vitor Pamplona 2024-02-21 19:18:30 -05:00
parent 5886c866d3
commit 5b77e39c8b
2 changed files with 299 additions and 63 deletions

View File

@ -0,0 +1,194 @@
/**
* Copyright (c) 2024 Vitor Pamplona
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package com.vitorpamplona.amethyst.benchmark
import androidx.benchmark.junit4.BenchmarkRule
import androidx.benchmark.junit4.measureRepeated
import androidx.test.ext.junit.runners.AndroidJUnit4
import com.linkedin.urls.detection.UrlDetector
import com.linkedin.urls.detection.UrlDetectorOptions
import com.vitorpamplona.amethyst.commons.HashTagSegment
import com.vitorpamplona.amethyst.commons.ImageSegment
import com.vitorpamplona.amethyst.commons.LinkSegment
import com.vitorpamplona.amethyst.commons.RichTextParser
import com.vitorpamplona.quartz.events.EmptyTagList
import junit.framework.TestCase.assertNull
import junit.framework.TestCase.assertTrue
import org.junit.Rule
import org.junit.Test
import org.junit.runner.RunWith
/**
 * Microbenchmarks for [RichTextParser], driven by the AndroidX [BenchmarkRule].
 *
 * The short tests parse one small input per iteration and assert on the result;
 * the `computeTestCase*` tests only time the call over the larger fixtures
 * declared at the bottom of the class.
 */
@RunWith(AndroidJUnit4::class)
class RichTextParserBenchmark {
    @get:Rule
    val benchmarkRule = BenchmarkRule()

    /** Times `parseMediaUrl` on an `.apk` download link and asserts that it returns null. */
    @Test
    fun parseApkUrl() {
        benchmarkRule.measureRepeated {
            val media =
                RichTextParser().parseMediaUrl(
                    "https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk",
                    EmptyTagList,
                )
            assertNull(media)
        }
    }

    /** Times `parseText` on a sentence with a `.jpg` link and asserts the second word is an [ImageSegment]. */
    @Test
    fun parseImageUrl() {
        benchmarkRule.measureRepeated {
            val state = RichTextParser().parseText("first https://m.primal.net/HeKw.jpg second", EmptyTagList)
            val secondWord = state.paragraphs[0].words[1]
            assertTrue(secondWord is ImageSegment)
        }
    }

    /** Times `parseText` on a sentence with a scheme-less host and asserts the second word is a [LinkSegment]. */
    @Test
    fun parseNoSchemeUrl() {
        benchmarkRule.measureRepeated {
            val state = RichTextParser().parseText("first amethyst.social second", EmptyTagList)
            val secondWord = state.paragraphs[0].words[1]
            assertTrue(secondWord is LinkSegment)
        }
    }

    /** Times `parseText` on a sentence with a hashtag and asserts the second word is a [HashTagSegment]. */
    @Test
    fun parseHashtag() {
        benchmarkRule.measureRepeated {
            val state = RichTextParser().parseText("first #amethyst second", EmptyTagList)
            val secondWord = state.paragraphs[0].words[1]
            assertTrue(secondWord is HashTagSegment)
        }
    }

    /** Times a full `parseText` pass over [testCase1]. */
    @Test
    fun computeTestCase1All() {
        benchmarkRule.measureRepeated {
            val parser = RichTextParser()
            parser.parseText(testCase1, EmptyTagList)
        }
    }

    /** Times a full `parseText` pass over [testCase2]. */
    @Test
    fun computeTestCase2All() {
        benchmarkRule.measureRepeated {
            val parser = RichTextParser()
            parser.parseText(testCase2, EmptyTagList)
        }
    }

    /** Times the raw [UrlDetector] run over [testCase2], without any parser filtering. */
    @Test
    fun computeTestCase2UrlDetector() {
        benchmarkRule.measureRepeated {
            val detector = UrlDetector(testCase2, UrlDetectorOptions.Default)
            detector.detect()
        }
    }

    /** Times `parseValidUrls` over [testCase2]. */
    @Test
    fun computeTestCase2ParseUrls() {
        benchmarkRule.measureRepeated {
            val parser = RichTextParser()
            parser.parseValidUrls(testCase2)
        }
    }

    /** Times a full `parseText` pass over [testCase3]. */
    @Test
    fun computeTestCase3All() {
        benchmarkRule.measureRepeated {
            val parser = RichTextParser()
            parser.parseText(testCase3, EmptyTagList)
        }
    }

    // Fixture: release notes written mostly in Arabic, ending with a nostr:nevent reference.
    val testCase1 = """
#Amethyst v0.83.10
تحديث جديد لـ Amethyst بإصدار 0.83.10 مع تعديلات وإضافات جديدة
: NIP-92 إصلاحات الأخطاء
الإضافات الجديدة:
- يتضمن رابط المنتج في الرسالة الأولى من المشتري في السوق
- يضيف دعمًا لـ NIP-92 في الرسائل العامة والرسائل المباشرة الجديدة (NIP-17). يبقى NIP-54 في NIP-04 DMs
- إضافة التمرير الأفقي إلى أزرار الإجراءات في شاشة النشر الجديد لإصلاح الأزرار المخفية جزئيًا في الشاشات الصغيرة/الرفيعة.
اصلاحات الشوائب:
- إصلاحات التعطل مع مبلغ Zap مخصص غير صالح
- يعمل على إصلاح مشكلات إعادة اتصال التتابع عندما يقوم المرحل بإغلاق الاتصال
- إصلاح الحشو العلوي للملاحظة المقتبسة في المنشور
- تحسين استخدام الذاكرة للمستخدم المرئي وعلامة URL في المشاركات الجديدة
الترجمات المحدثة:
- الفارسية بواسطة
- الفرنسية والإنجليزية، المملكة المتحدة بواسطة
- الأوكرانية
- الإسبانية والإسبانية والمكسيك والإسبانية والولايات المتحدة بواسطة
- العربية
تحسينات جودة الكود:
- تحديثات لنظام Android Studio 2023.1.1 Patch 2
nostr:nevent1qqszq7kl888sw0c5rpvepn8w373zt0jrw8864x8lkauxxw335s66rzgppemhxue69uhkummn9ekx7mp0qgsyvrp9u6p0mfur9dfdru3d853tx9mdjuhkphxuxgfwmryja7zsvhqrqsqqqqqpaax7m2
"""

    // Fixture: English release notes with nostr:npub mentions, a scheme-less link and markdown download links.
    val testCase2 = """
#Amethyst v0.83.10: NIP-92 and Bug Fixes
New Additions:
- Includes a link to the product in the first message from the buyer in the marketplace
- Adds support for NIP-92 in public messages and new DMs (NIP-17). NIP-54 stays in NIP-04 DMs
- Adds Horizontal Scroll to the action buttons in the New Post screen to partially fix hidden buttons in small/thin screens.
Bugfixes:
- Fixes crash with an invalid custom Zap Amount
- Fixes relay re-connection issues when the relay closes a connection
- Fixes the top padding of the quoted note in a post
- Optimizes memory use of the visual user and url tagger in new posts
Updated translations:
- Persian by nostr:npub1cpazafytvafazxkjn43zjfwtfzatfz508r54f6z6a3rf2ws8223qc3xxpk
- French and English, United Kingdom by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
- Ukrainian by crowdin.com/profile/liizzzz
- Spanish, Spanish, Mexico and Spanish, United States by nostr:npub1luhyzgce7qtcs6r6v00ryjxza8av8u4dzh3avg0zks38tjktnmxspxq903
- Arabic by nostr:npub13qtw3yu0uc9r4yj5x0rhgy8nj5q0uyeq0pavkgt9ly69uuzxgkfqwvx23t
Code Quality Improvements:
- Updates to Android Studio 2023.1.1 Patch 2
Download:
- [Play Edition](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-googleplay-universal-v0.83.10.apk )
- [FOSS Edition - No translations](https://github.com/vitorpamplona/amethyst/releases/download/v0.83.10/amethyst-fdroid-universal-v0.83.10.apk )
"""

    // Fixture: a short three-line note that starts with a hashtag.
    val testCase3 = """#100aDayUntil100k
Day 5
Seems like they may be getting easier"""
}

View File

@ -74,29 +74,35 @@ class RichTextParser() {
}
}
/**
 * Extracts from [content] the URL candidates that should be treated as links.
 *
 * Candidates come from [UrlDetector]; each candidate's `originalUrl` is then filtered:
 * - a candidate containing `@` is dropped when it fully matches `Patterns.EMAIL_ADDRESS`
 *   and kept as-is otherwise;
 * - a candidate accepted by `isNumber` is dropped (e.g. `123.22`);
 * - a candidate containing the ideographic full stop is dropped;
 * - any other candidate is kept only when `HTTPRegex` matches it.
 *
 * @param content raw text to scan.
 * @return the accepted URLs in detection order, without duplicates.
 */
fun parseValidUrls(content: String): LinkedHashSet<String> {
    val detected = UrlDetector(content, UrlDetectorOptions.Default).detect()
    return detected.mapNotNullTo(LinkedHashSet(detected.size)) { candidate ->
        val url = candidate.originalUrl
        when {
            // Only run the (expensive) e-mail regex when an '@' is present at all.
            url.contains("@") -> url.takeUnless { Patterns.EMAIL_ADDRESS.matcher(it).matches() }
            // Avoids urls that look like 123.22.
            isNumber(url) -> null
            // Avoids Japanese sentences being detected as fake urls. The separator is the
            // ideographic full stop (U+3002); it is written as an escape because an empty
            // literal here would match every string and discard all remaining candidates.
            url.contains("\u3002") -> null
            HTTPRegex.matches(url) -> url
            else -> null
        }
    }
}
fun parseText(
content: String,
tags: ImmutableListOfLists<String>,
): RichTextViewerState {
val urls = UrlDetector(content, UrlDetectorOptions.Default).detect()
val urlSet =
urls.mapNotNullTo(LinkedHashSet(urls.size)) {
// removes e-mails
if (Patterns.EMAIL_ADDRESS.matcher(it.originalUrl).matches()) {
null
} else if (isNumber(it.originalUrl)) {
null
} else if (it.originalUrl.contains("")) {
null
} else {
if (HTTPRegex.matches(it.originalUrl)) {
it.originalUrl
} else {
null
}
}
}
val urlSet = parseValidUrls(content)
val imagesForPager =
urlSet.mapNotNull { fullUrl -> parseMediaUrl(fullUrl, tags) }.associateBy { it.url }
@ -153,8 +159,29 @@ class RichTextParser() {
return paragraphSegments.toImmutableList()
}
fun isNumber(word: String): Boolean {
return numberPattern.matcher(word).matches()
/** Returns true when the whole of [word] matches `numberPattern`. */
private fun isNumber(word: String): Boolean {
    return numberPattern.matcher(word).matches()
}
/**
 * Tells whether [c] may appear in a phone-number candidate.
 *
 * Accepted characters are the ASCII digits, the separators `-`, space and `.`, and the
 * `+`, `(` and `)` characters that Android's `Patterns.PHONE` allows for a country-code
 * or area-code prefix. This is only a cheap character filter; the final decision is
 * still made by the phone regex.
 *
 * @param c character to classify.
 * @return true when [c] is one of the accepted characters.
 */
private fun isPhoneNumberChar(c: Char): Boolean =
    when (c) {
        in '0'..'9' -> true
        '-', ' ', '.' -> true
        // Prefix characters accepted by Patterns.PHONE, e.g. "+1 555 1234" or "(555) 1234".
        '+', '(', ')' -> true
        else -> false
    }
/**
 * Cheap pre-check run before the phone regex.
 *
 * @param word token to inspect.
 * @return true when [word] is 7 to 14 characters long and every character is
 * accepted by `isPhoneNumberChar`.
 */
fun isPotentialPhoneNumber(word: String): Boolean {
    // The length test comes first so out-of-range words are rejected without scanning them.
    return word.length in 7..14 && word.all { isPhoneNumberChar(it) }
}
fun isDate(word: String): Boolean {
@ -172,46 +199,48 @@ class RichTextParser() {
emojis: Map<String, String>,
tags: ImmutableListOfLists<String>,
): Segment {
val emailMatcher = Patterns.EMAIL_ADDRESS.matcher(word)
val phoneMatcher = Patterns.PHONE.matcher(word)
val schemelessMatcher = noProtocolUrlValidator.matcher(word)
if (word.isEmpty()) return RegularTextSegment(word)
return if (word.isEmpty()) {
RegularTextSegment(word)
} else if (images.contains(word)) {
ImageSegment(word)
} else if (urls.contains(word)) {
LinkSegment(word)
} else if (emojis.any { word.contains(it.key) }) {
EmojiSegment(word)
} else if (word.startsWith("lnbc", true)) {
InvoiceSegment(word)
} else if (word.startsWith("lnurl", true)) {
WithdrawSegment(word)
} else if (word.startsWith("cashuA", true)) {
CashuSegment(word)
} else if (emailMatcher.matches()) {
EmailSegment(word)
} else if (word.length in 7..14 && !isDate(word) && phoneMatcher.matches()) {
PhoneSegment(word)
} else if (startsWithNIP19Scheme(word)) {
BechSegment(word)
} else if (word.startsWith("#")) {
parseHash(word, tags)
} else if (word.contains(".") && schemelessMatcher.find()) {
val url = schemelessMatcher.group(1) // url
val additionalChars = schemelessMatcher.group(4).ifEmpty { null } // additional chars
val pattern =
"""^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?"""
.toRegex(RegexOption.IGNORE_CASE)
if (pattern.find(word) != null) {
SchemelessUrlSegment(word, url, additionalChars)
} else {
RegularTextSegment(word)
}
} else {
RegularTextSegment(word)
if (images.contains(word)) return ImageSegment(word)
if (urls.contains(word)) return LinkSegment(word)
if (word.startsWith(":") && emojis.any { word.contains(it.key) }) return EmojiSegment(word)
if (word.startsWith("lnbc", true)) return InvoiceSegment(word)
if (word.startsWith("lnurl", true)) return WithdrawSegment(word)
if (word.startsWith("cashuA", true)) return CashuSegment(word)
if (startsWithNIP19Scheme(word)) return BechSegment(word)
if (word.startsWith("#")) return parseHash(word, tags)
if (word.contains("@")) {
if (Patterns.EMAIL_ADDRESS.matcher(word).matches()) return EmailSegment(word)
}
if (isPotentialPhoneNumber(word) && !isDate(word)) {
if (Patterns.PHONE.matcher(word).matches()) return PhoneSegment(word)
}
val indexOfPeriod = word.indexOf(".")
if (indexOfPeriod > 0 && indexOfPeriod < word.length - 1) { // periods cannot be the last one
val schemelessMatcher = noProtocolUrlValidator.matcher(word)
if (schemelessMatcher.find()) {
val url = schemelessMatcher.group(1) // url
val additionalChars = schemelessMatcher.group(4).ifEmpty { null } // additional chars
val pattern =
"""^([A-Za-z0-9-_]+(\.[A-Za-z0-9-_]+)+)(:[0-9]+)?(/[^?#]*)?(\?[^#]*)?(#.*)?"""
.toRegex(RegexOption.IGNORE_CASE)
if (pattern.find(word) != null && url != null) {
return SchemelessUrlSegment(word, url, additionalChars)
}
}
}
return RegularTextSegment(word)
}
private fun parseHash(
@ -289,7 +318,11 @@ class RichTextParser() {
// Matches a '#' followed by the tag text. Group 1 is the run of one or more characters
// that are neither whitespace nor any of the punctuation listed in the character class;
// group 2 is whatever follows on the same line. Compiled with CASE_INSENSITIVE.
val hashTagsPattern: Pattern =
Pattern.compile("#([^\\s!@#\$%^&*()=+./,\\[{\\]};:'\"?><]+)(.*)", Pattern.CASE_INSENSITIVE)
val acceptedNIP19schemes = listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1")
// Prefixes recognised by startsWithNIP19Scheme: the five lowercase forms followed by
// their uppercase counterparts, in the same order.
val acceptedNIP19schemes =
    listOf("npub1", "naddr1", "note1", "nprofile1", "nevent1").let { lowercaseSchemes ->
        lowercaseSchemes + lowercaseSchemes.map { scheme -> scheme.uppercase() }
    }
private fun removeQueryParamsForExtensionComparison(fullUrl: String): String {
return if (fullUrl.contains("?")) {
@ -344,9 +377,18 @@ class RichTextParser() {
}
/**
 * Tells whether [word] starts with one of the prefixes in `acceptedNIP19schemes`,
 * optionally preceded by `@`, then `nostr:`, then `@` (each at most once, in that order).
 *
 * The comparison ignores case, so `Nostr:npub1…` and `nostr:NPUB1…` are both accepted.
 * It works on offsets into [word] instead of building lowercased or prefix-stripped
 * copies, so no intermediate string is allocated.
 *
 * @param word token to inspect; an empty string yields false.
 * @return true when an accepted prefix is found right after the optional decorations.
 */
fun startsWithNIP19Scheme(word: String): Boolean {
    if (word.isEmpty()) return false

    val nostrPrefix = "nostr:"

    // Skip an optional leading '@'.
    val afterFirstAt = if (word[0] == '@') 1 else 0

    // Skip an optional "nostr:" scheme, in any letter case.
    val afterScheme =
        if (word.startsWith(nostrPrefix, afterFirstAt, ignoreCase = true)) {
            afterFirstAt + nostrPrefix.length
        } else {
            afterFirstAt
        }

    // Skip an optional '@' placed after the scheme (e.g. "nostr:@npub1...").
    val start =
        if (afterScheme < word.length && word[afterScheme] == '@') afterScheme + 1 else afterScheme

    // startsWith with an offset past the end of the word simply returns false.
    return acceptedNIP19schemes.any { scheme -> word.startsWith(scheme, start, ignoreCase = true) }
}
/** Returns true when the whole of [url] matches `noProtocolUrlValidator`. */
fun isUrlWithoutScheme(url: String): Boolean {
    return noProtocolUrlValidator.matcher(url).matches()
}