From 3434c31487e503f1fd4111d3166fc4407fc8cb07 Mon Sep 17 00:00:00 2001 From: jiftechnify Date: Sun, 24 Mar 2024 23:42:54 +0900 Subject: [PATCH 1/7] fix garbled URL preview for non-UTF-8 HTML --- .../service/previews/UrlPreviewUtils.kt | 197 +++++++++++++----- 1 file changed, 143 insertions(+), 54 deletions(-) diff --git a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt index ab0e1f9b4..392082b14 100644 --- a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt +++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt @@ -27,10 +27,19 @@ import kotlinx.coroutines.withContext import okhttp3.MediaType import okhttp3.MediaType.Companion.toMediaType import okhttp3.Request +import okhttp3.ResponseBody +import okio.BufferedSource +import okio.ByteString.Companion.decodeHex +import okio.Options import org.jsoup.Jsoup import org.jsoup.nodes.Document +import java.io.ByteArrayInputStream +import java.io.IOException +import java.nio.charset.Charset private const val ELEMENT_TAG_META = "meta" +private const val ATTRIBUTE_VALUE_CHARSET = "charset" +private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv" private const val ATTRIBUTE_VALUE_PROPERTY = "property" private const val ATTRIBUTE_VALUE_NAME = "name" private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop" @@ -99,10 +108,8 @@ suspend fun getDocument( ?: throw IllegalArgumentException( "Website returned unknown mimetype: ${it.headers.get("Content-Type")}", ) - if (mimeType.type == "text" && mimeType.subtype == "html") { - val document = Jsoup.parse(it.body.string()) - parseHtml(url, document, mimeType) + parseHtml(url, it.body, mimeType) } else if (mimeType.type == "image") { UrlInfoItem(url, image = url, mimeType = mimeType) } else if (mimeType.type == "video") { @@ -120,65 +127,147 @@ suspend fun getDocument( suspend fun parseHtml( url: String, - document: Document, + body: ResponseBody, type: MediaType, ): UrlInfoItem = withContext(Dispatchers.IO) { - val metaTags = document.getElementsByTag(ELEMENT_TAG_META) + val source = body.source() - var title: String = "" - var description: String = "" - var image: String = "" + // sniff charset from Content-Type header or BOM + val sniffedCharset = type.charset() ?: source.readBomAsCharset() + if (sniffedCharset != null) { + val doc = Jsoup.parse(source.inputStream(), sniffedCharset.name(), url) + return@withContext parseUrlInfo(url, doc, type) + } - metaTags.forEach { - when (it.attr(ATTRIBUTE_VALUE_PROPERTY)) { - in META_X_TITLE -> - if (title.isEmpty()) { - title = it.attr(CONTENT) - } - in META_X_DESCRIPTION -> - if (description.isEmpty()) { - description = it.attr(CONTENT) - } - in META_X_IMAGE -> - if (image.isEmpty()) { - image = it.attr(CONTENT) - } - } + // if sniffing was failed, detect charset from content + val bodyBytes = source.readByteArray() + val charset = detectCharset(bodyBytes, url) + val doc = Jsoup.parse(ByteArrayInputStream(bodyBytes), charset.name(), url) + return@withContext parseUrlInfo(url, doc, type) + } - when (it.attr(ATTRIBUTE_VALUE_NAME)) { - in META_X_TITLE -> - if (title.isEmpty()) { - title = it.attr(CONTENT) - } - in META_X_DESCRIPTION -> - if (description.isEmpty()) { - description = it.attr(CONTENT) - } - in META_X_IMAGE -> - if (image.isEmpty()) { - image = it.attr(CONTENT) - } - } +private val UNICODE_BOMS = + Options.of( + // UTF-8 + "efbbbf".decodeHex(), + // UTF-16BE + "feff".decodeHex(), + // UTF-16LE + "fffe".decodeHex(), + // UTF-32BE + "0000ffff".decodeHex(), + // UTF-32LE + "ffff0000".decodeHex(), + ) - when (it.attr(ATTRIBUTE_VALUE_ITEMPROP)) { - in META_X_TITLE -> - if (title.isEmpty()) { - title = it.attr(CONTENT) - } - in META_X_DESCRIPTION -> - if (description.isEmpty()) { - description = it.attr(CONTENT) - } - in META_X_IMAGE -> - if (image.isEmpty()) { - image = it.attr(CONTENT) - } - } +@Throws(IOException::class) +private fun BufferedSource.readBomAsCharset(): Charset? { + return when (select(UNICODE_BOMS)) { + 0 -> Charsets.UTF_8 + 1 -> Charsets.UTF_16BE + 2 -> Charsets.UTF_16LE + 3 -> Charsets.UTF_32BE + 4 -> Charsets.UTF_32LE + -1 -> null + else -> throw AssertionError() + } +} - if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) { - return@withContext UrlInfoItem(url, title, description, image, type) +private val RE_CONTENT_TYPE_CHARSET = Regex("""charset\s*=\s*([^;]+)""") + +private fun detectCharset( + bodyBytes: ByteArray, + url: String, +): Charset { + // tentatively decode response body as UTF-8 + val tentativeDoc = Jsoup.parse(ByteArrayInputStream(bodyBytes), "utf-8", url) + + tentativeDoc.getElementsByTag(ELEMENT_TAG_META).forEach { meta -> + val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET) + if (charsetAttr.isNotEmpty()) { + runCatching { Charset.forName(charsetAttr) }.getOrNull()?.let { + return it } } - return@withContext UrlInfoItem(url, title, description, image, type) + if (meta.attr(ATTRIBUTE_VALUE_HTTP_EQUIV).lowercase() == "content-type") { + RE_CONTENT_TYPE_CHARSET.find(meta.attr(CONTENT)) + ?.let { + runCatching { Charset.forName(it.groupValues[1]) }.getOrNull() + }?.let { + return it + } + } } + return Charset.forName("utf-8") +} + +private fun parseUrlInfo( + url: String, + document: Document, + type: MediaType, +): UrlInfoItem { + val metaTags = document.getElementsByTag(ELEMENT_TAG_META) + + var title: String = "" + var description: String = "" + var image: String = "" + + metaTags.forEach { + when (it.attr(ATTRIBUTE_VALUE_PROPERTY)) { + in META_X_TITLE -> + if (title.isEmpty()) { + title = it.attr(CONTENT) + } + + in META_X_DESCRIPTION -> + if (description.isEmpty()) { + description = it.attr(CONTENT) + } + + in META_X_IMAGE -> + if (image.isEmpty()) { + image = it.attr(CONTENT) + } + } + + when (it.attr(ATTRIBUTE_VALUE_NAME)) { + in META_X_TITLE -> + if (title.isEmpty()) { + title = it.attr(CONTENT) + } + + in META_X_DESCRIPTION -> + if (description.isEmpty()) { + description = it.attr(CONTENT) + } + + in META_X_IMAGE -> + if (image.isEmpty()) { + image = it.attr(CONTENT) + } + } + + when (it.attr(ATTRIBUTE_VALUE_ITEMPROP)) { + in META_X_TITLE -> + if (title.isEmpty()) { + title = it.attr(CONTENT) + } + + in META_X_DESCRIPTION -> + if (description.isEmpty()) { + description = it.attr(CONTENT) + } + + in META_X_IMAGE -> + if (image.isEmpty()) { + image = it.attr(CONTENT) + } + } + + if (title.isNotEmpty() && description.isNotEmpty() && image.isNotEmpty()) { + return UrlInfoItem(url, title, description, image, type) + } + } + return UrlInfoItem(url, title, description, image, type) +} From d0aa7430ca56d411b36ec890df47c50d0dff7883 Mon Sep 17 00:00:00 2001 From: jiftechnify Date: Mon, 25 Mar 2024 23:09:31 +0900 Subject: [PATCH 2/7] optimize HTML charset detection --- .../service/previews/UrlPreviewUtils.kt | 183 ++++++++++++++++-- 1 file changed, 167 insertions(+), 16 deletions(-) diff --git a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt index 392082b14..e01054201 100644 --- a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt +++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt @@ -22,6 +22,7 @@ package com.vitorpamplona.amethyst.service.previews import com.vitorpamplona.amethyst.service.HttpClientManager import com.vitorpamplona.amethyst.service.checkNotInMainThread +import kotlinx.collections.immutable.toImmutableMap import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.withContext import okhttp3.MediaType @@ -34,15 +35,14 @@ import okio.Options import org.jsoup.Jsoup import org.jsoup.nodes.Document import java.io.ByteArrayInputStream -import java.io.IOException import java.nio.charset.Charset private const val ELEMENT_TAG_META = "meta" -private const val ATTRIBUTE_VALUE_CHARSET = "charset" -private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv" private const val ATTRIBUTE_VALUE_PROPERTY = "property" private const val ATTRIBUTE_VALUE_NAME = "name" private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop" +private const val ATTRIBUTE_VALUE_CHARSET = "charset" +private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv" // for Charsets.UTF_8 @@ -174,16 +174,13 @@ private fun BufferedSource.readBomAsCharset(): Charset? { } } -private val RE_CONTENT_TYPE_CHARSET = Regex("""charset\s*=\s*([^;]+)""") +private val RE_CONTENT_TYPE_CHARSET = Regex("""charset=([^;]+)""") -private fun detectCharset( - bodyBytes: ByteArray, - url: String, -): Charset { - // tentatively decode response body as UTF-8 - val tentativeDoc = Jsoup.parse(ByteArrayInputStream(bodyBytes), "utf-8", url) - - tentativeDoc.getElementsByTag(ELEMENT_TAG_META).forEach { meta -> +private fun detectCharset(bodyBytes: ByteArray): Charset { + // try to detect charset from meta tags parsed from first 1024 bytes of body + val firstPart = String(bodyBytes, 0, 1024, Charset.forName("utf-8")) + val metaTags = runCatching { MetaTagsParser.parse(firstPart) }.getOrDefault(emptySequence()) + metaTags.forEach { meta -> val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET) if (charsetAttr.isNotEmpty()) { runCatching { Charset.forName(charsetAttr) }.getOrNull()?.let { @@ -199,6 +196,7 @@ private fun detectCharset( } } } + // defaults to UTF-8 return Charset.forName("utf-8") } @@ -271,3 +269,156 @@ private fun parseUrlInfo( } return UrlInfoItem(url, title, description, image, type) } + +private class MetaTag(private val attrs: Map) { + fun attr(name: String): String = attrs[name.lowercase()] ?: "" +} + +// map of HTML element attribute name to its value, with some guarantees: +// - attribute names are compared in a case-insensitive manner +// - attribute names never duplicate +private class Attrs { + private val attrs = mutableMapOf() + + fun add(attr: Pair) { + val name = attr.first.lowercase() + if (attrs.containsKey(name)) { + throw IllegalArgumentException("duplicated attribute name: $name") + } + attrs += Pair(name, attr.second) + } + + fun freeze(): Map = attrs.toImmutableMap() +} + +// parser for parsing a partial HTML document into meta tags +private object MetaTagsParser { + private val RE_META = Regex("""""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL)) + + private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/') + private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`') + + fun parse(input: String): Sequence = + RE_META.findAll(input).map { + MetaTag(parseAttrs(it.groupValues[1])) + } + + private enum class State { + NAME, + BEFORE_EQ, + AFTER_EQ, + VALUE, + SPACE, + } + + private fun parseAttrs(input: String): Map { + val attrs = Attrs() + + var state = State.NAME + var nameBegin = 0 + var nameEnd = 0 + var valueBegin = 0 + var valueQuote: Char? = null + + input.forEachIndexed { i, c -> + when (state) { + State.NAME -> { + when { + c == '=' -> { + nameEnd = i + state = State.AFTER_EQ + } + + c.isWhitespace() -> { + nameEnd = i + state = State.BEFORE_EQ + } + + NON_ATTR_NAME_CHARS.contains(c) || c.isISOControl() || !c.isDefined() -> { + throw IllegalArgumentException("meta has invalid attributes part") + } + } + } + + State.BEFORE_EQ -> { + when { + c == '=' -> { + state = State.AFTER_EQ + } + + c.isWhitespace() -> {} + else -> throw IllegalArgumentException("meta has invalid attributes part") + } + } + + State.AFTER_EQ -> { + when { + c.isWhitespace() -> {} + c == '\'' || c == '"' -> { + valueBegin = i + 1 + valueQuote = c + state = State.VALUE + } + + else -> { + valueBegin = i + valueQuote = null + state = State.VALUE + } + } + } + + State.VALUE -> { + var attr: Pair? = null + when { + valueQuote != null -> { + if (c == valueQuote) { + attr = + Pair( + input.slice(nameBegin until nameEnd), + input.slice(valueBegin until i), + ) + } + } + + valueQuote == null -> { + when { + c.isWhitespace() -> { + attr = + Pair( + input.slice(nameBegin until nameEnd), + input.slice(valueBegin until i), + ) + } + + i == input.length - 1 -> { + attr = + Pair( + input.slice(nameBegin until nameEnd), + input.slice(valueBegin..i), + ) + } + + NON_UNQUOTED_ATTR_VALUE_CHARS.contains(c) -> { + throw IllegalArgumentException("meta has invalid attributes part") + } + } + } + } + if (attr != null) { + attrs.add(attr) + state = State.SPACE + } + } + + State.SPACE -> { + if (!c.isWhitespace()) { + nameBegin = i + state = State.NAME + } + } + } + } + return attrs.freeze() + } +} From 042579ddfbbec53611a5261ecca7ca79350df737 Mon Sep 17 00:00:00 2001 From: jiftechnify Date: Mon, 25 Mar 2024 23:53:00 +0900 Subject: [PATCH 3/7] remove unnecessary items from mata-tag canditates for getting URL info --- .../service/previews/UrlPreviewUtils.kt | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt index e01054201..0ad798166 100644 --- a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt +++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt @@ -48,48 +48,24 @@ private const val ATTRIBUTE_VALUE_HTTP_EQUIV = "http-equiv" private val META_X_TITLE = arrayOf( "og:title", - "\"og:title\"", - "'og:title'", - "name", - "\"name\"", - "'name'", "twitter:title", - "\"twitter:title\"", - "'twitter:title'", "title", - "\"title\"", - "'title'", ) // for Date: Tue, 26 Mar 2024 02:31:06 +0900 Subject: [PATCH 4/7] parse HTML as little as possible --- .../service/previews/UrlPreviewUtils.kt | 70 ++++++++++++++----- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt index 0ad798166..6fee16f92 100644 --- a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt +++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt @@ -32,12 +32,8 @@ import okhttp3.ResponseBody import okio.BufferedSource import okio.ByteString.Companion.decodeHex import okio.Options -import org.jsoup.Jsoup -import org.jsoup.nodes.Document -import java.io.ByteArrayInputStream import java.nio.charset.Charset -private const val ELEMENT_TAG_META = "meta" private const val ATTRIBUTE_VALUE_PROPERTY = "property" private const val ATTRIBUTE_VALUE_NAME = "name" private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop" @@ -112,15 +108,15 @@ suspend fun parseHtml( // sniff charset from Content-Type header or BOM val sniffedCharset = type.charset() ?: source.readBomAsCharset() if (sniffedCharset != null) { - val doc = Jsoup.parse(source.inputStream(), sniffedCharset.name(), url) - return@withContext parseUrlInfo(url, doc, type) + val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset).headTagContents()) + return@withContext parseUrlInfo(url, metaTags, type) } // if sniffing was failed, detect charset from content val bodyBytes = source.readByteArray() val charset = detectCharset(bodyBytes) - val doc = Jsoup.parse(ByteArrayInputStream(bodyBytes), charset.name(), url) - return@withContext parseUrlInfo(url, doc, type) + val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset).headTagContents()) + return@withContext parseUrlInfo(url, metaTags, type) } // taken from okhttp @@ -178,11 +174,9 @@ private fun detectCharset(bodyBytes: ByteArray): Charset { private fun parseUrlInfo( url: String, - document: Document, + metaTags: Sequence, type: MediaType, ): UrlInfoItem { - val metaTags = document.getElementsByTag(ELEMENT_TAG_META) - var title: String = "" var description: String = "" var image: String = "" @@ -246,14 +240,57 @@ private fun parseUrlInfo( return UrlInfoItem(url, title, description, image, type) } +// HTML parsing stuff +private val RE_HEAD = Regex("""(.*?)""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL)) + +private fun String.headTagContents(): String = RE_HEAD.find(this)?.groupValues?.get(1) ?: "" + private class MetaTag(private val attrs: Map) { fun attr(name: String): String = attrs[name.lowercase()] ?: "" } -// map of HTML element attribute name to its value, with some guarantees: -// - attribute names are compared in a case-insensitive manner +// map of HTML element attribute name to its value, with additional logics: +// - attribute names are matched in a case-insensitive manner // - attribute names never duplicate +// - commonly used character references in attribute values are resolved private class Attrs { + companion object { + val RE_CHAR_REF = Regex("""&(\w+)(;?)""") + val BASE_CHAR_REFS = + mapOf( + "amp" to "&", + "AMP" to "&", + "quot" to "\"", + "QUOT" to "\"", + "lt" to "<", + "LT" to "<", + "gt" to ">", + "GT" to ">", + ) + val CHAR_REFS = + mapOf( + "apos" to "'", + "equals" to "=", + "grave" to "`", + "DiacriticalGrave" to "`", + ) + + fun replaceCharRefs(match: MatchResult): String { + val bcr = BASE_CHAR_REFS[match.groupValues[1]] + if (bcr != null) { + return bcr + } + // non-base char refs must be terminated by ';' + if (match.groupValues[2].isNotEmpty()) { + val cr = CHAR_REFS[match.groupValues[1]] + if (cr != null) { + return cr + } + } + return match.value + } + } + private val attrs = mutableMapOf() fun add(attr: Pair) { @@ -261,7 +298,8 @@ private class Attrs { if (attrs.containsKey(name)) { throw IllegalArgumentException("duplicated attribute name: $name") } - attrs += Pair(name, attr.second) + val value = attr.second.replace(RE_CHAR_REF, Attrs::replaceCharRefs) + attrs += Pair(name, value) } fun freeze(): Map = attrs.toImmutableMap() @@ -275,8 +313,8 @@ private object MetaTagsParser { private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`') fun parse(input: String): Sequence = - RE_META.findAll(input).map { - MetaTag(parseAttrs(it.groupValues[1])) + RE_META.findAll(input).mapNotNull { + runCatching { MetaTag(parseAttrs(it.groupValues[1])) }.getOrNull() } private enum class State { From bffb9f3778e9301ebf63d8c72a7184cb3276555a Mon Sep 17 00:00:00 2001 From: jiftechnify Date: Tue, 26 Mar 2024 02:42:15 +0900 Subject: [PATCH 5/7] remove jsoup from dependencies --- app/build.gradle | 3 --- gradle/libs.versions.toml | 1 - 2 files changed, 4 deletions(-) diff --git a/app/build.gradle b/app/build.gradle index cad079522..06b822a7e 100644 --- a/app/build.gradle +++ b/app/build.gradle @@ -205,9 +205,6 @@ dependencies { // Websockets API implementation libs.okhttp - // HTML Parsing for Link Preview - implementation libs.jsoup - // Encrypted Key Storage implementation libs.androidx.security.crypto.ktx diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 43a6e6388..163f1ea39 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -93,7 +93,6 @@ google-mlkit-language-id = { group = "com.google.mlkit", name = "language-id", v google-mlkit-translate = { group = "com.google.mlkit", name = "translate", version.ref = "translate" } jackson-module-kotlin = { group = "com.fasterxml.jackson.module", name = "jackson-module-kotlin", version.ref = "jacksonModuleKotlin" } jna = { group = "net.java.dev.jna", name = "jna", version.ref = "jna" } -jsoup = { group = "org.jsoup", name = "jsoup", version.ref = "jsoup" } junit = { group = "junit", name = "junit", version.ref = "junit" } kotlinx-collections-immutable = { group = "org.jetbrains.kotlinx", name = "kotlinx-collections-immutable", version.ref = "kotlinxCollectionsImmutable" } lazysodium-android = { group = "com.goterl", name = "lazysodium-android", version.ref = "lazysodiumAndroid" } From a71ce69cab66bd00565c060a367bb497b459743b Mon Sep 17 00:00:00 2001 From: jiftechnify Date: Wed, 27 Mar 2024 00:47:35 +0900 Subject: [PATCH 6/7] support tags in quoted attribute value --- .../service/previews/MetaTagsParser.kt | 311 ++++++++++++++++++ .../service/previews/UrlPreviewUtils.kt | 217 +----------- .../service/previews/MetaTagsParserTest.kt | 81 +++++ 3 files changed, 400 insertions(+), 209 deletions(-) create mode 100644 app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt create mode 100644 app/src/test/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParserTest.kt diff --git a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt new file mode 100644 index 000000000..5245e92c0 --- /dev/null +++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParser.kt @@ -0,0 +1,311 @@ +/** + * Copyright (c) 2024 Vitor Pamplona + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +package com.vitorpamplona.amethyst.service.previews + +import kotlinx.collections.immutable.toImmutableMap +import java.lang.StringBuilder + +internal data class MetaTag(private val attrs: Map) { + fun attr(name: String): String = attrs[name.lowercase()] ?: "" +} + +// parse a partial HTML document and extract meta tags +internal object MetaTagsParser { + private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/') + private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`') + + fun parse(input: String): Sequence = + sequence { + val s = TagScanner(input) + while (!s.exhausted()) { + val t = s.nextTag() ?: continue + if (t.name == "/head") { + break + } + if (t.name == "meta") { + val attrs = parseAttrs(t.attrPart) ?: continue + yield(MetaTag(attrs)) + } + } + } + + private data class RawTag(val name: String, val attrPart: String) + + private class TagScanner(private val input: String) { + var p = 0 + + fun exhausted(): Boolean = p >= input.length + + private fun peek(): Char = input[p] + + private fun consume(): Char { + return input[p++] + } + + private fun consumeChar(c: Char): Boolean { + if (this.peek() == c) { + this.consume() + return true + } + return false + } + + private fun skipSpaces() { + while (!this.exhausted() && this.peek().isWhitespace()) { + this.consume() + } + } + + private fun skipUntil(c: Char) { + while (!this.exhausted() && this.peek() != c) { + this.consume() + } + } + + private fun readWhile(pred: (Char) -> Boolean): String { + val sb = StringBuilder() + while (!this.exhausted() && pred(this.peek())) { + sb.append(this.consume()) + } + return sb.toString() + } + + fun nextTag(): RawTag? { + skipUntil('<') + consume() + + // read tag name + val name = StringBuilder() + if (consumeChar('/')) { + name.append('/') + } + val n = readWhile { !it.isWhitespace() && it != '>' } + skipSpaces() + + // read until end of tag + val attrsPart = StringBuilder() + var quote: Char? = null + while (!exhausted()) { + val c = consume() + when { + // `/>` out of quote -> end of tag + quote == null && c == '/' && peek() == '>' -> { + consume() + break + } + // `>` out of quote -> end of tag + quote == null && c == '>' -> { + break + } + // entering quote + quote == null && (c == '\'' || c == '"') -> { + quote = c + } + // leaving quote + quote != null && c == quote -> { + quote = null + } + } + attrsPart.append(c) + } + + if (!n.matches(Regex("""[0-9a-zA-Z]+"""))) { + return null + } + return RawTag(name.append(n).toString().lowercase(), attrsPart.toString()) + } + } + + // map of HTML element attribute name to its value, with additional logics: + // - attribute names are matched in a case-insensitive manner + // - attribute names never duplicate + // - commonly used character references in attribute values are resolved + private class Attrs { + companion object { + val RE_CHAR_REF = Regex("""&(\w+)(;?)""") + val BASE_CHAR_REFS = + mapOf( + "amp" to "&", + "AMP" to "&", + "quot" to "\"", + "QUOT" to "\"", + "lt" to "<", + "LT" to "<", + "gt" to ">", + "GT" to ">", + ) + val CHAR_REFS = + mapOf( + "apos" to "'", + "equals" to "=", + "grave" to "`", + "DiacriticalGrave" to "`", + ) + + fun replaceCharRefs(match: MatchResult): String { + val bcr = BASE_CHAR_REFS[match.groupValues[1]] + if (bcr != null) { + return bcr + } + // non-base char refs must be terminated by ';' + if (match.groupValues[2].isNotEmpty()) { + val cr = CHAR_REFS[match.groupValues[1]] + if (cr != null) { + return cr + } + } + return match.value + } + } + + private val attrs = mutableMapOf() + + fun add(attr: Pair) { + val name = attr.first.lowercase() + if (attrs.containsKey(name)) { + throw IllegalArgumentException("duplicated attribute name: $name") + } + val value = attr.second.replace(RE_CHAR_REF, Companion::replaceCharRefs) + attrs += Pair(name, value) + } + + fun freeze(): Map = attrs.toImmutableMap() + } + + private enum class State { + NAME, + BEFORE_EQ, + AFTER_EQ, + VALUE, + SPACE, + } + + private fun parseAttrs(input: String): Map? { + val attrs = Attrs() + + var state = State.NAME + var nameBegin = 0 + var nameEnd = 0 + var valueBegin = 0 + var valueQuote: Char? = null + + input.forEachIndexed { i, c -> + when (state) { + State.NAME -> { + when { + c == '=' -> { + nameEnd = i + state = State.AFTER_EQ + } + + c.isWhitespace() -> { + nameEnd = i + state = State.BEFORE_EQ + } + + NON_ATTR_NAME_CHARS.contains(c) || c.isISOControl() || !c.isDefined() -> { + return null + } + } + } + + State.BEFORE_EQ -> { + when { + c == '=' -> { + state = State.AFTER_EQ + } + + c.isWhitespace() -> {} + else -> return null + } + } + + State.AFTER_EQ -> { + when { + c.isWhitespace() -> {} + c == '\'' || c == '"' -> { + valueBegin = i + 1 + valueQuote = c + state = State.VALUE + } + + else -> { + valueBegin = i + valueQuote = null + state = State.VALUE + } + } + } + + State.VALUE -> { + var attr: Pair? = null + when { + valueQuote != null -> { + if (c == valueQuote) { + attr = + Pair( + input.slice(nameBegin.. { + when { + c.isWhitespace() -> { + attr = + Pair( + input.slice(nameBegin.. { + attr = + Pair( + input.slice(nameBegin.. { + return null + } + } + } + } + if (attr != null) { + runCatching { attrs.add(attr) }.getOrNull() ?: return null + state = State.SPACE + } + } + + State.SPACE -> { + if (!c.isWhitespace()) { + nameBegin = i + state = State.NAME + } + } + } + } + return attrs.freeze() + } +} diff --git a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt index 6fee16f92..71553b77b 100644 --- a/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt +++ b/app/src/main/java/com/vitorpamplona/amethyst/service/previews/UrlPreviewUtils.kt @@ -22,13 +22,11 @@ package com.vitorpamplona.amethyst.service.previews import com.vitorpamplona.amethyst.service.HttpClientManager import com.vitorpamplona.amethyst.service.checkNotInMainThread -import kotlinx.collections.immutable.toImmutableMap import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.withContext import okhttp3.MediaType import okhttp3.MediaType.Companion.toMediaType import okhttp3.Request -import okhttp3.ResponseBody import okio.BufferedSource import okio.ByteString.Companion.decodeHex import okio.Options @@ -81,7 +79,7 @@ suspend fun getDocument( "Website returned unknown mimetype: ${it.headers["Content-Type"]}", ) if (mimeType.type == "text" && mimeType.subtype == "html") { - parseHtml(url, it.body, mimeType) + parseHtml(url, it.body.source(), mimeType) } else if (mimeType.type == "image") { UrlInfoItem(url, image = url, mimeType = mimeType) } else if (mimeType.type == "video") { @@ -99,24 +97,22 @@ suspend fun getDocument( suspend fun parseHtml( url: String, - body: ResponseBody, + source: BufferedSource, type: MediaType, ): UrlInfoItem = withContext(Dispatchers.IO) { - val source = body.source() - // sniff charset from Content-Type header or BOM val sniffedCharset = type.charset() ?: source.readBomAsCharset() if (sniffedCharset != null) { - val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset).headTagContents()) - return@withContext parseUrlInfo(url, metaTags, type) + val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset)) + return@withContext extractUrlInfo(url, metaTags, type) } // if sniffing was failed, detect charset from content val bodyBytes = source.readByteArray() val charset = detectCharset(bodyBytes) - val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset).headTagContents()) - return@withContext parseUrlInfo(url, metaTags, type) + val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset)) + return@withContext extractUrlInfo(url, metaTags, type) } // taken from okhttp @@ -151,7 +147,7 @@ private val RE_CONTENT_TYPE_CHARSET = Regex("""charset=([^;]+)""") private fun detectCharset(bodyBytes: ByteArray): Charset { // try to detect charset from meta tags parsed from first 1024 bytes of body val firstPart = String(bodyBytes, 0, 1024, Charset.forName("utf-8")) - val metaTags = runCatching { MetaTagsParser.parse(firstPart) }.getOrDefault(emptySequence()) + val metaTags = MetaTagsParser.parse(firstPart) metaTags.forEach { meta -> val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET) if (charsetAttr.isNotEmpty()) { @@ -172,7 +168,7 @@ private fun detectCharset(bodyBytes: ByteArray): Charset { return Charset.forName("utf-8") } -private fun parseUrlInfo( +private fun extractUrlInfo( url: String, metaTags: Sequence, type: MediaType, @@ -239,200 +235,3 @@ private fun parseUrlInfo( } return UrlInfoItem(url, title, description, image, type) } - -// HTML parsing stuff -private val RE_HEAD = Regex("""(.*?)""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL)) - -private fun String.headTagContents(): String = RE_HEAD.find(this)?.groupValues?.get(1) ?: "" - -private class MetaTag(private val attrs: Map) { - fun attr(name: String): String = attrs[name.lowercase()] ?: "" -} - -// map of HTML element attribute name to its value, with additional logics: -// - attribute names are matched in a case-insensitive manner -// - attribute names never duplicate -// - commonly used character references in attribute values are resolved -private class Attrs { - companion object { - val RE_CHAR_REF = Regex("""&(\w+)(;?)""") - val BASE_CHAR_REFS = - mapOf( - "amp" to "&", - "AMP" to "&", - "quot" to "\"", - "QUOT" to "\"", - "lt" to "<", - "LT" to "<", - "gt" to ">", - "GT" to ">", - ) - val CHAR_REFS = - mapOf( - "apos" to "'", - "equals" to "=", - "grave" to "`", - "DiacriticalGrave" to "`", - ) - - fun replaceCharRefs(match: MatchResult): String { - val bcr = BASE_CHAR_REFS[match.groupValues[1]] - if (bcr != null) { - return bcr - } - // non-base char refs must be terminated by ';' - if (match.groupValues[2].isNotEmpty()) { - val cr = CHAR_REFS[match.groupValues[1]] - if (cr != null) { - return cr - } - } - return match.value - } - } - - private val attrs = mutableMapOf() - - fun add(attr: Pair) { - val name = attr.first.lowercase() - if (attrs.containsKey(name)) { - throw IllegalArgumentException("duplicated attribute name: $name") - } - val value = attr.second.replace(RE_CHAR_REF, Attrs::replaceCharRefs) - attrs += Pair(name, value) - } - - fun freeze(): Map = attrs.toImmutableMap() -} - -// parser for parsing a partial HTML document into meta tags -private object MetaTagsParser { - private val RE_META = Regex("""""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL)) - - private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/') - private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`') - - fun parse(input: String): Sequence = - RE_META.findAll(input).mapNotNull { - runCatching { MetaTag(parseAttrs(it.groupValues[1])) }.getOrNull() - } - - private enum class State { - NAME, - BEFORE_EQ, - AFTER_EQ, - VALUE, - SPACE, - } - - private fun parseAttrs(input: String): Map { - val attrs = Attrs() - - var state = State.NAME - var nameBegin = 0 - var nameEnd = 0 - var valueBegin = 0 - var valueQuote: Char? = null - - input.forEachIndexed { i, c -> - when (state) { - State.NAME -> { - when { - c == '=' -> { - nameEnd = i - state = State.AFTER_EQ - } - - c.isWhitespace() -> { - nameEnd = i - state = State.BEFORE_EQ - } - - NON_ATTR_NAME_CHARS.contains(c) || c.isISOControl() || !c.isDefined() -> { - throw IllegalArgumentException("meta has invalid attributes part") - } - } - } - - State.BEFORE_EQ -> { - when { - c == '=' -> { - state = State.AFTER_EQ - } - - c.isWhitespace() -> {} - else -> throw IllegalArgumentException("meta has invalid attributes part") - } - } - - State.AFTER_EQ -> { - when { - c.isWhitespace() -> {} - c == '\'' || c == '"' -> { - valueBegin = i + 1 - valueQuote = c - state = State.VALUE - } - - else -> { - valueBegin = i - valueQuote = null - state = State.VALUE - } - } - } - - State.VALUE -> { - var attr: Pair? = null - when { - valueQuote != null -> { - if (c == valueQuote) { - attr = - Pair( - input.slice(nameBegin until nameEnd), - input.slice(valueBegin until i), - ) - } - } - - valueQuote == null -> { - when { - c.isWhitespace() -> { - attr = - Pair( - input.slice(nameBegin until nameEnd), - input.slice(valueBegin until i), - ) - } - - i == input.length - 1 -> { - attr = - Pair( - input.slice(nameBegin until nameEnd), - input.slice(valueBegin..i), - ) - } - - NON_UNQUOTED_ATTR_VALUE_CHARS.contains(c) -> { - throw IllegalArgumentException("meta has invalid attributes part") - } - } - } - } - if (attr != null) { - attrs.add(attr) - state = State.SPACE - } - } - - State.SPACE -> { - if (!c.isWhitespace()) { - nameBegin = i - state = State.NAME - } - } - } - } - return attrs.freeze() - } -} diff --git a/app/src/test/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParserTest.kt b/app/src/test/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParserTest.kt new file mode 100644 index 000000000..ef5cc8311 --- /dev/null +++ b/app/src/test/java/com/vitorpamplona/amethyst/service/previews/MetaTagsParserTest.kt @@ -0,0 +1,81 @@ +/** + * Copyright (c) 2024 Vitor Pamplona + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +package com.vitorpamplona.amethyst.service.previews + +import org.junit.Assert.assertEquals +import org.junit.Test + +class MetaTagsParserTest { + @Test + fun testParse() { + val input = + """ + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + """.trimMargin() + + val exp = + listOf( + listOf("charset" to "utf-8"), + listOf("http-equiv" to "content-type", "content" to "text/html; charset=utf-8"), + listOf("property" to "og:title", "content" to "title"), + listOf("property" to "og:description", "content" to "description"), + listOf("property" to "og:image", "content" to "https://example.com/img/foo.png"), + listOf("name" to "newline", "content" to "newline"), + listOf("name" to "space before gt"), + listOf("name" to "space before ="), + listOf("name" to "space after ="), + listOf("name" to "CAPITAL"), + listOf("name" to "character reference", "content" to ""), + listOf("name" to "attr value with end of head doesn't harm", "content" to "bang!"), + ) + + val metaTags = MetaTagsParser.parse(input).toList() + println(metaTags) + assertEquals(exp.size, metaTags.size) + metaTags.zip(exp).forEach { (meta, expAttrs) -> + expAttrs.forEach { (name, expValue) -> + assertEquals(expValue, meta.attr(name)) + } + } + } +} From 4f84fad0cdba718c68651d5767f1edee9e5965a1 Mon Sep 17 00:00:00 2001 From: jiftechnify Date: Wed, 27 Mar 2024 01:03:29 +0900 Subject: [PATCH 7/7] remove jsoup version --- gradle/libs.versions.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 163f1ea39..9125241ef 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -20,7 +20,6 @@ fragmentKtx = "1.6.2" gms = "4.4.1" jacksonModuleKotlin = "2.17.0" jna = "5.14.0" -jsoup = "1.17.2" junit = "4.13.2" kotlin = "1.9.22" kotlinxCollectionsImmutable = "0.3.7"