mirror of
https://github.com/vitorpamplona/amethyst.git
synced 2024-10-01 17:30:50 +00:00
parse HTML as little as possible
This commit is contained in:
parent
042579ddfb
commit
e11961695f
@ -32,12 +32,8 @@ import okhttp3.ResponseBody
|
|||||||
import okio.BufferedSource
|
import okio.BufferedSource
|
||||||
import okio.ByteString.Companion.decodeHex
|
import okio.ByteString.Companion.decodeHex
|
||||||
import okio.Options
|
import okio.Options
|
||||||
import org.jsoup.Jsoup
|
|
||||||
import org.jsoup.nodes.Document
|
|
||||||
import java.io.ByteArrayInputStream
|
|
||||||
import java.nio.charset.Charset
|
import java.nio.charset.Charset
|
||||||
|
|
||||||
private const val ELEMENT_TAG_META = "meta"
|
|
||||||
private const val ATTRIBUTE_VALUE_PROPERTY = "property"
|
private const val ATTRIBUTE_VALUE_PROPERTY = "property"
|
||||||
private const val ATTRIBUTE_VALUE_NAME = "name"
|
private const val ATTRIBUTE_VALUE_NAME = "name"
|
||||||
private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop"
|
private const val ATTRIBUTE_VALUE_ITEMPROP = "itemprop"
|
||||||
@ -112,15 +108,15 @@ suspend fun parseHtml(
|
|||||||
// sniff charset from Content-Type header or BOM
|
// sniff charset from Content-Type header or BOM
|
||||||
val sniffedCharset = type.charset() ?: source.readBomAsCharset()
|
val sniffedCharset = type.charset() ?: source.readBomAsCharset()
|
||||||
if (sniffedCharset != null) {
|
if (sniffedCharset != null) {
|
||||||
val doc = Jsoup.parse(source.inputStream(), sniffedCharset.name(), url)
|
val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset).headTagContents())
|
||||||
return@withContext parseUrlInfo(url, doc, type)
|
return@withContext parseUrlInfo(url, metaTags, type)
|
||||||
}
|
}
|
||||||
|
|
||||||
// if sniffing failed, detect the charset from the content
|
// if sniffing failed, detect the charset from the content
|
||||||
val bodyBytes = source.readByteArray()
|
val bodyBytes = source.readByteArray()
|
||||||
val charset = detectCharset(bodyBytes)
|
val charset = detectCharset(bodyBytes)
|
||||||
val doc = Jsoup.parse(ByteArrayInputStream(bodyBytes), charset.name(), url)
|
val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset).headTagContents())
|
||||||
return@withContext parseUrlInfo(url, doc, type)
|
return@withContext parseUrlInfo(url, metaTags, type)
|
||||||
}
|
}
|
||||||
|
|
||||||
// taken from okhttp
|
// taken from okhttp
|
||||||
@ -178,11 +174,9 @@ private fun detectCharset(bodyBytes: ByteArray): Charset {
|
|||||||
|
|
||||||
private fun parseUrlInfo(
|
private fun parseUrlInfo(
|
||||||
url: String,
|
url: String,
|
||||||
document: Document,
|
metaTags: Sequence<MetaTag>,
|
||||||
type: MediaType,
|
type: MediaType,
|
||||||
): UrlInfoItem {
|
): UrlInfoItem {
|
||||||
val metaTags = document.getElementsByTag(ELEMENT_TAG_META)
|
|
||||||
|
|
||||||
var title: String = ""
|
var title: String = ""
|
||||||
var description: String = ""
|
var description: String = ""
|
||||||
var image: String = ""
|
var image: String = ""
|
||||||
@ -246,14 +240,57 @@ private fun parseUrlInfo(
|
|||||||
return UrlInfoItem(url, title, description, image, type)
|
return UrlInfoItem(url, title, description, image, type)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HTML parsing stuff
|
||||||
|
// Captures everything between the opening and closing <head> tags as group 1.
// The opening tag may carry attributes (e.g. the RDFa `prefix` attribute that some
// Open Graph pages put on <head>), so anything up to the first '>' is accepted.
// Requiring whitespace right after "head" keeps tags such as <header> from matching.
// Limitation: a literal '>' inside a quoted attribute value of <head> is not handled.
private val RE_HEAD =
    Regex(
        """<head(?:\s[^>]*)?>(.*?)</head\s*>""",
        setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL),
    )
|
||||||
|
|
||||||
|
// Returns the text captured by group 1 of RE_HEAD for the first match in this string,
// or an empty string when RE_HEAD does not match at all.
private fun String.headTagContents(): String {
    val headMatch = RE_HEAD.find(this) ?: return ""
    return headMatch.groupValues[1]
}
|
||||||
|
|
||||||
private class MetaTag(private val attrs: Map<String, String>) {
|
private class MetaTag(private val attrs: Map<String, String>) {
|
||||||
fun attr(name: String): String = attrs[name.lowercase()] ?: ""
|
fun attr(name: String): String = attrs[name.lowercase()] ?: ""
|
||||||
}
|
}
|
||||||
|
|
||||||
// map of HTML element attribute name to its value, with some guarantees:
|
// map of HTML element attribute name to its value, with additional logic:
|
||||||
// - attribute names are compared in a case-insensitive manner
|
// - attribute names are matched in a case-insensitive manner
|
||||||
// - attribute names never duplicate
|
// - attribute names never duplicate
|
||||||
|
// - commonly used character references in attribute values are resolved
|
||||||
private class Attrs {
|
private class Attrs {
|
||||||
|
companion object {
|
||||||
|
// Matches a named character reference: group 1 is the name, group 2 is the
// terminating ';' (an empty string when the reference is unterminated).
// Following the HTML rule for references inside attribute values, an unterminated
// reference that is directly followed by '=' is not matched, so a raw query string
// such as "?a=1&lt=2" is left untouched instead of being turned into "?a=1<=2".
// The possessive \w++ stops the engine from backtracking to a shorter name, which
// would otherwise let "&lt=" be re-matched as "&l".
val RE_CHAR_REF = Regex("""&(\w++)(?:(;)|(?!=))""")
|
||||||
|
val BASE_CHAR_REFS =
|
||||||
|
mapOf(
|
||||||
|
"amp" to "&",
|
||||||
|
"AMP" to "&",
|
||||||
|
"quot" to "\"",
|
||||||
|
"QUOT" to "\"",
|
||||||
|
"lt" to "<",
|
||||||
|
"LT" to "<",
|
||||||
|
"gt" to ">",
|
||||||
|
"GT" to ">",
|
||||||
|
)
|
||||||
|
val CHAR_REFS =
|
||||||
|
mapOf(
|
||||||
|
"apos" to "'",
|
||||||
|
"equals" to "=",
|
||||||
|
"grave" to "`",
|
||||||
|
"DiacriticalGrave" to "`",
|
||||||
|
)
|
||||||
|
|
||||||
|
fun replaceCharRefs(match: MatchResult): String {
|
||||||
|
val bcr = BASE_CHAR_REFS[match.groupValues[1]]
|
||||||
|
if (bcr != null) {
|
||||||
|
return bcr
|
||||||
|
}
|
||||||
|
// non-base char refs must be terminated by ';'
|
||||||
|
if (match.groupValues[2].isNotEmpty()) {
|
||||||
|
val cr = CHAR_REFS[match.groupValues[1]]
|
||||||
|
if (cr != null) {
|
||||||
|
return cr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return match.value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private val attrs = mutableMapOf<String, String>()
|
private val attrs = mutableMapOf<String, String>()
|
||||||
|
|
||||||
fun add(attr: Pair<String, String>) {
|
fun add(attr: Pair<String, String>) {
|
||||||
@ -261,7 +298,8 @@ private class Attrs {
|
|||||||
if (attrs.containsKey(name)) {
|
if (attrs.containsKey(name)) {
|
||||||
throw IllegalArgumentException("duplicated attribute name: $name")
|
throw IllegalArgumentException("duplicated attribute name: $name")
|
||||||
}
|
}
|
||||||
attrs += Pair(name, attr.second)
|
val value = attr.second.replace(RE_CHAR_REF, Attrs::replaceCharRefs)
|
||||||
|
attrs += Pair(name, value)
|
||||||
}
|
}
|
||||||
|
|
||||||
fun freeze(): Map<String, String> = attrs.toImmutableMap()
|
fun freeze(): Map<String, String> = attrs.toImmutableMap()
|
||||||
@ -275,8 +313,8 @@ private object MetaTagsParser {
|
|||||||
private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`')
|
private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`')
|
||||||
|
|
||||||
fun parse(input: String): Sequence<MetaTag> =
|
fun parse(input: String): Sequence<MetaTag> =
|
||||||
RE_META.findAll(input).map {
|
RE_META.findAll(input).mapNotNull {
|
||||||
MetaTag(parseAttrs(it.groupValues[1]))
|
runCatching { MetaTag(parseAttrs(it.groupValues[1])) }.getOrNull()
|
||||||
}
|
}
|
||||||
|
|
||||||
private enum class State {
|
private enum class State {
|
||||||
|
Loading…
Reference in New Issue
Block a user