mirror of
https://github.com/vitorpamplona/amethyst.git
synced 2024-10-01 17:30:50 +00:00
support tags in quoted attribute value
This commit is contained in:
parent
bffb9f3778
commit
a71ce69cab
@ -0,0 +1,311 @@
|
|||||||
|
/**
|
||||||
|
* Copyright (c) 2024 Vitor Pamplona
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
|
* this software and associated documentation files (the "Software"), to deal in
|
||||||
|
* the Software without restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
|
||||||
|
* Software, and to permit persons to whom the Software is furnished to do so,
|
||||||
|
* subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in all
|
||||||
|
* copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||||
|
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
|
||||||
|
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||||
|
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
package com.vitorpamplona.amethyst.service.previews
|
||||||
|
|
||||||
|
import kotlinx.collections.immutable.toImmutableMap
|
||||||
|
import java.lang.StringBuilder
|
||||||
|
|
||||||
|
/**
 * A single `<meta>` element, reduced to its attributes.
 *
 * @property attrs attribute values keyed by lower-cased attribute name.
 */
internal data class MetaTag(private val attrs: Map<String, String>) {
    /**
     * Returns the value of the attribute called [name] (matched case-insensitively),
     * or an empty string when the tag has no such attribute.
     */
    fun attr(name: String): String = attrs[name.lowercase()] ?: ""
}

/**
 * Parses a (possibly partial or truncated) HTML document and extracts its `<meta>` tags.
 *
 * Scanning stops at the first `</head>` closing tag, so meta tags that appear after it are ignored.
 */
internal object MetaTagsParser {
    private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/')
    private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`')

    /**
     * Lazily yields one [MetaTag] per well-formed `<meta ...>` tag found in [input].
     *
     * Tags whose attribute part cannot be parsed (invalid characters, duplicated attribute
     * names) are skipped. Input without any tag, or with trailing text after the last tag,
     * simply ends the sequence.
     */
    fun parse(input: String): Sequence<MetaTag> =
        sequence {
            val scanner = TagScanner(input)
            while (!scanner.exhausted()) {
                val tag = scanner.nextTag() ?: continue
                if (tag.name == "/head") {
                    break
                }
                if (tag.name == "meta") {
                    val attrs = parseAttrs(tag.attrPart) ?: continue
                    yield(MetaTag(attrs))
                }
            }
        }

    /** A tag as found in the input: lower-cased [name] (prefixed with `/` for closing tags) and the raw text of its attributes. */
    private data class RawTag(val name: String, val attrPart: String)

    /** Cursor over [input] that hands out tags one by one; every read is bounds-checked. */
    private class TagScanner(private val input: String) {
        // index of the next unread character
        private var p = 0

        fun exhausted(): Boolean = p >= input.length

        private fun peekOrNull(): Char? = input.getOrNull(p)

        private fun skipSpaces() {
            while (!exhausted() && input[p].isWhitespace()) {
                p++
            }
        }

        /** Moves the cursor onto the next occurrence of [c], or to the end of input when there is none. */
        private fun skipUntil(c: Char) {
            val idx = input.indexOf(c, p)
            p = if (idx < 0) input.length else idx
        }

        private fun readWhile(pred: (Char) -> Boolean): String {
            val begin = p
            while (!exhausted() && pred(input[p])) {
                p++
            }
            return input.substring(begin, p)
        }

        /**
         * Advances past the next tag and returns it.
         *
         * Returns `null` when there is no further `<` in the input, when the tag is a comment,
         * or when the tag name is not purely alphanumeric. The cursor always moves forward
         * (or reaches the end), so callers may loop on [exhausted].
         */
        fun nextTag(): RawTag? {
            skipUntil('<')
            if (exhausted()) {
                // only text was left: nothing to consume
                return null
            }
            p++ // consume '<'

            // Skip comments as a whole; otherwise a quote character inside a comment would be
            // taken for the start of a quoted attribute value and swallow the tags that follow.
            // Searching from p + 1 also closes the degenerate forms `<!-->` and `<!--->`.
            if (input.startsWith("!--", p)) {
                val end = input.indexOf("-->", p + 1)
                p = if (end < 0) input.length else end + 3
                return null
            }

            // read tag name
            val closing = peekOrNull() == '/'
            if (closing) {
                p++
            }
            val n = readWhile { !it.isWhitespace() && it != '>' }
            skipSpaces()

            // read until end of tag
            val attrsPart = StringBuilder()
            var quote: Char? = null
            while (!exhausted()) {
                val c = input[p++]
                when {
                    // `/>` out of quote -> end of tag
                    quote == null && c == '/' && peekOrNull() == '>' -> {
                        p++
                        break
                    }
                    // `>` out of quote -> end of tag
                    quote == null && c == '>' -> {
                        break
                    }
                    // entering quote
                    quote == null && (c == '\'' || c == '"') -> {
                        quote = c
                    }
                    // leaving quote
                    quote != null && c == quote -> {
                        quote = null
                    }
                }
                attrsPart.append(c)
            }

            if (!RE_TAG_NAME.matches(n)) {
                return null
            }
            val prefix = if (closing) "/" else ""
            return RawTag((prefix + n).lowercase(), attrsPart.toString())
        }

        private companion object {
            // compiled once instead of on every nextTag() call
            val RE_TAG_NAME = Regex("""[0-9a-zA-Z]+""")
        }
    }

    // map of HTML element attribute name to its value, with additional logic:
    // - attribute names are matched in a case-insensitive manner
    // - attribute names never duplicate
    // - commonly used character references in attribute values are resolved
    private class Attrs {
        private val attrs = mutableMapOf<String, String>()

        /**
         * Stores [value] (with character references resolved) under the lower-cased [name].
         *
         * @return `false`, leaving the map untouched, when an attribute with that name already exists.
         */
        fun add(
            name: String,
            value: String,
        ): Boolean {
            val key = name.lowercase()
            if (key in attrs) {
                return false
            }
            attrs[key] = value.replace(RE_CHAR_REF) { replaceCharRefs(it) }
            return true
        }

        /** Returns a read-only snapshot of the attributes collected so far. */
        fun freeze(): Map<String, String> = attrs.toMap()

        companion object {
            val RE_CHAR_REF = Regex("""&(\w+)(;?)""")
            val BASE_CHAR_REFS =
                mapOf(
                    "amp" to "&",
                    "AMP" to "&",
                    "quot" to "\"",
                    "QUOT" to "\"",
                    "lt" to "<",
                    "LT" to "<",
                    "gt" to ">",
                    "GT" to ">",
                )
            val CHAR_REFS =
                mapOf(
                    "apos" to "'",
                    "equals" to "=",
                    "grave" to "`",
                    "DiacriticalGrave" to "`",
                )

            /**
             * Replacement for one [RE_CHAR_REF] match: base references resolve with or without
             * the trailing `;`, the other known references only when terminated by `;`, and
             * anything else is left exactly as written.
             */
            fun replaceCharRefs(match: MatchResult): String {
                val (ref, terminator) = match.destructured
                return BASE_CHAR_REFS[ref]
                    ?: CHAR_REFS[ref]?.takeIf { terminator.isNotEmpty() }
                    ?: match.value
            }
        }
    }

    private enum class State {
        NAME,
        BEFORE_EQ,
        AFTER_EQ,
        VALUE,
        SPACE,
    }

    /**
     * Parses the attribute part of a tag (the text between the tag name and the closing `>`)
     * with a small state machine.
     *
     * @return the attributes keyed by lower-cased name, or `null` when the text contains a
     * character that is not allowed at that position or when an attribute name is duplicated.
     */
    private fun parseAttrs(input: String): Map<String, String>? {
        val attrs = Attrs()

        var state = State.NAME
        var nameBegin = 0
        var nameEnd = 0
        var valueBegin = 0
        var valueQuote: Char? = null

        for (i in input.indices) {
            val c = input[i]
            when (state) {
                State.NAME -> {
                    when {
                        c == '=' -> {
                            nameEnd = i
                            state = State.AFTER_EQ
                        }

                        c.isWhitespace() -> {
                            nameEnd = i
                            state = State.BEFORE_EQ
                        }

                        c in NON_ATTR_NAME_CHARS || c.isISOControl() || !c.isDefined() -> {
                            return null
                        }
                    }
                }

                State.BEFORE_EQ -> {
                    when {
                        c == '=' -> {
                            state = State.AFTER_EQ
                        }

                        c.isWhitespace() -> {}

                        else -> return null
                    }
                }

                State.AFTER_EQ -> {
                    when {
                        c.isWhitespace() -> {}

                        c == '\'' || c == '"' -> {
                            valueBegin = i + 1
                            valueQuote = c
                            state = State.VALUE
                        }

                        else -> {
                            valueBegin = i
                            valueQuote = null
                            state = State.VALUE
                        }
                    }
                }

                State.VALUE -> {
                    // exclusive end index of the value once it is complete, -1 while still inside it
                    val valueEnd =
                        when {
                            valueQuote != null -> if (c == valueQuote) i else -1
                            c.isWhitespace() -> i
                            i == input.lastIndex -> i + 1
                            c in NON_UNQUOTED_ATTR_VALUE_CHARS -> return null
                            else -> -1
                        }
                    if (valueEnd >= 0) {
                        val added =
                            attrs.add(
                                input.substring(nameBegin, nameEnd),
                                input.substring(valueBegin, valueEnd),
                            )
                        if (!added) {
                            return null
                        }
                        state = State.SPACE
                    }
                }

                State.SPACE -> {
                    if (!c.isWhitespace()) {
                        nameBegin = i
                        state = State.NAME
                    }
                }
            }
        }

        // An unquoted value that starts on the very last character enters VALUE but is never
        // visited in that state by the loop above, so it has to be flushed here.
        if (state == State.VALUE && valueQuote == null) {
            val added =
                attrs.add(
                    input.substring(nameBegin, nameEnd),
                    input.substring(valueBegin),
                )
            if (!added) {
                return null
            }
        }

        return attrs.freeze()
    }
}
|
@ -22,13 +22,11 @@ package com.vitorpamplona.amethyst.service.previews
|
|||||||
|
|
||||||
import com.vitorpamplona.amethyst.service.HttpClientManager
|
import com.vitorpamplona.amethyst.service.HttpClientManager
|
||||||
import com.vitorpamplona.amethyst.service.checkNotInMainThread
|
import com.vitorpamplona.amethyst.service.checkNotInMainThread
|
||||||
import kotlinx.collections.immutable.toImmutableMap
|
|
||||||
import kotlinx.coroutines.Dispatchers
|
import kotlinx.coroutines.Dispatchers
|
||||||
import kotlinx.coroutines.withContext
|
import kotlinx.coroutines.withContext
|
||||||
import okhttp3.MediaType
|
import okhttp3.MediaType
|
||||||
import okhttp3.MediaType.Companion.toMediaType
|
import okhttp3.MediaType.Companion.toMediaType
|
||||||
import okhttp3.Request
|
import okhttp3.Request
|
||||||
import okhttp3.ResponseBody
|
|
||||||
import okio.BufferedSource
|
import okio.BufferedSource
|
||||||
import okio.ByteString.Companion.decodeHex
|
import okio.ByteString.Companion.decodeHex
|
||||||
import okio.Options
|
import okio.Options
|
||||||
@ -81,7 +79,7 @@ suspend fun getDocument(
|
|||||||
"Website returned unknown mimetype: ${it.headers["Content-Type"]}",
|
"Website returned unknown mimetype: ${it.headers["Content-Type"]}",
|
||||||
)
|
)
|
||||||
if (mimeType.type == "text" && mimeType.subtype == "html") {
|
if (mimeType.type == "text" && mimeType.subtype == "html") {
|
||||||
parseHtml(url, it.body, mimeType)
|
parseHtml(url, it.body.source(), mimeType)
|
||||||
} else if (mimeType.type == "image") {
|
} else if (mimeType.type == "image") {
|
||||||
UrlInfoItem(url, image = url, mimeType = mimeType)
|
UrlInfoItem(url, image = url, mimeType = mimeType)
|
||||||
} else if (mimeType.type == "video") {
|
} else if (mimeType.type == "video") {
|
||||||
@ -99,24 +97,22 @@ suspend fun getDocument(
|
|||||||
|
|
||||||
suspend fun parseHtml(
|
suspend fun parseHtml(
|
||||||
url: String,
|
url: String,
|
||||||
body: ResponseBody,
|
source: BufferedSource,
|
||||||
type: MediaType,
|
type: MediaType,
|
||||||
): UrlInfoItem =
|
): UrlInfoItem =
|
||||||
withContext(Dispatchers.IO) {
|
withContext(Dispatchers.IO) {
|
||||||
val source = body.source()
|
|
||||||
|
|
||||||
// sniff charset from Content-Type header or BOM
|
// sniff charset from Content-Type header or BOM
|
||||||
val sniffedCharset = type.charset() ?: source.readBomAsCharset()
|
val sniffedCharset = type.charset() ?: source.readBomAsCharset()
|
||||||
if (sniffedCharset != null) {
|
if (sniffedCharset != null) {
|
||||||
val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset).headTagContents())
|
val metaTags = MetaTagsParser.parse(source.readByteArray().toString(sniffedCharset))
|
||||||
return@withContext parseUrlInfo(url, metaTags, type)
|
return@withContext extractUrlInfo(url, metaTags, type)
|
||||||
}
|
}
|
||||||
|
|
||||||
// if sniffing was failed, detect charset from content
|
// if sniffing was failed, detect charset from content
|
||||||
val bodyBytes = source.readByteArray()
|
val bodyBytes = source.readByteArray()
|
||||||
val charset = detectCharset(bodyBytes)
|
val charset = detectCharset(bodyBytes)
|
||||||
val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset).headTagContents())
|
val metaTags = MetaTagsParser.parse(bodyBytes.toString(charset))
|
||||||
return@withContext parseUrlInfo(url, metaTags, type)
|
return@withContext extractUrlInfo(url, metaTags, type)
|
||||||
}
|
}
|
||||||
|
|
||||||
// taken from okhttp
|
// taken from okhttp
|
||||||
@ -151,7 +147,7 @@ private val RE_CONTENT_TYPE_CHARSET = Regex("""charset=([^;]+)""")
|
|||||||
private fun detectCharset(bodyBytes: ByteArray): Charset {
|
private fun detectCharset(bodyBytes: ByteArray): Charset {
|
||||||
// try to detect charset from meta tags parsed from first 1024 bytes of body
|
// try to detect charset from meta tags parsed from first 1024 bytes of body
|
||||||
val firstPart = String(bodyBytes, 0, 1024, Charset.forName("utf-8"))
|
val firstPart = String(bodyBytes, 0, 1024, Charset.forName("utf-8"))
|
||||||
val metaTags = runCatching { MetaTagsParser.parse(firstPart) }.getOrDefault(emptySequence())
|
val metaTags = MetaTagsParser.parse(firstPart)
|
||||||
metaTags.forEach { meta ->
|
metaTags.forEach { meta ->
|
||||||
val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET)
|
val charsetAttr = meta.attr(ATTRIBUTE_VALUE_CHARSET)
|
||||||
if (charsetAttr.isNotEmpty()) {
|
if (charsetAttr.isNotEmpty()) {
|
||||||
@ -172,7 +168,7 @@ private fun detectCharset(bodyBytes: ByteArray): Charset {
|
|||||||
return Charset.forName("utf-8")
|
return Charset.forName("utf-8")
|
||||||
}
|
}
|
||||||
|
|
||||||
private fun parseUrlInfo(
|
private fun extractUrlInfo(
|
||||||
url: String,
|
url: String,
|
||||||
metaTags: Sequence<MetaTag>,
|
metaTags: Sequence<MetaTag>,
|
||||||
type: MediaType,
|
type: MediaType,
|
||||||
@ -239,200 +235,3 @@ private fun parseUrlInfo(
|
|||||||
}
|
}
|
||||||
return UrlInfoItem(url, title, description, image, type)
|
return UrlInfoItem(url, title, description, image, type)
|
||||||
}
|
}
|
||||||
|
|
||||||
// HTML parsing stuff
|
|
||||||
private val RE_HEAD = Regex("""<head\s*>(.*?)</head\s*>""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL))
|
|
||||||
|
|
||||||
private fun String.headTagContents(): String = RE_HEAD.find(this)?.groupValues?.get(1) ?: ""
|
|
||||||
|
|
||||||
private class MetaTag(private val attrs: Map<String, String>) {
|
|
||||||
fun attr(name: String): String = attrs[name.lowercase()] ?: ""
|
|
||||||
}
|
|
||||||
|
|
||||||
// map of HTML element attribute name to its value, with additional logics:
|
|
||||||
// - attribute names are matched in a case-insensitive manner
|
|
||||||
// - attribute names never duplicate
|
|
||||||
// - commonly used character references in attribute values are resolved
|
|
||||||
private class Attrs {
|
|
||||||
companion object {
|
|
||||||
val RE_CHAR_REF = Regex("""&(\w+)(;?)""")
|
|
||||||
val BASE_CHAR_REFS =
|
|
||||||
mapOf(
|
|
||||||
"amp" to "&",
|
|
||||||
"AMP" to "&",
|
|
||||||
"quot" to "\"",
|
|
||||||
"QUOT" to "\"",
|
|
||||||
"lt" to "<",
|
|
||||||
"LT" to "<",
|
|
||||||
"gt" to ">",
|
|
||||||
"GT" to ">",
|
|
||||||
)
|
|
||||||
val CHAR_REFS =
|
|
||||||
mapOf(
|
|
||||||
"apos" to "'",
|
|
||||||
"equals" to "=",
|
|
||||||
"grave" to "`",
|
|
||||||
"DiacriticalGrave" to "`",
|
|
||||||
)
|
|
||||||
|
|
||||||
fun replaceCharRefs(match: MatchResult): String {
|
|
||||||
val bcr = BASE_CHAR_REFS[match.groupValues[1]]
|
|
||||||
if (bcr != null) {
|
|
||||||
return bcr
|
|
||||||
}
|
|
||||||
// non-base char refs must be terminated by ';'
|
|
||||||
if (match.groupValues[2].isNotEmpty()) {
|
|
||||||
val cr = CHAR_REFS[match.groupValues[1]]
|
|
||||||
if (cr != null) {
|
|
||||||
return cr
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return match.value
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private val attrs = mutableMapOf<String, String>()
|
|
||||||
|
|
||||||
fun add(attr: Pair<String, String>) {
|
|
||||||
val name = attr.first.lowercase()
|
|
||||||
if (attrs.containsKey(name)) {
|
|
||||||
throw IllegalArgumentException("duplicated attribute name: $name")
|
|
||||||
}
|
|
||||||
val value = attr.second.replace(RE_CHAR_REF, Attrs::replaceCharRefs)
|
|
||||||
attrs += Pair(name, value)
|
|
||||||
}
|
|
||||||
|
|
||||||
fun freeze(): Map<String, String> = attrs.toImmutableMap()
|
|
||||||
}
|
|
||||||
|
|
||||||
// parser for parsing a partial HTML document into meta tags
|
|
||||||
private object MetaTagsParser {
|
|
||||||
private val RE_META = Regex("""<meta\s+(.+?)\s*>""", setOf(RegexOption.IGNORE_CASE, RegexOption.DOT_MATCHES_ALL))
|
|
||||||
|
|
||||||
private val NON_ATTR_NAME_CHARS = setOf(Char(0x0), '"', '\'', '>', '/')
|
|
||||||
private val NON_UNQUOTED_ATTR_VALUE_CHARS = setOf('"', '\'', '=', '>', '<', '`')
|
|
||||||
|
|
||||||
fun parse(input: String): Sequence<MetaTag> =
|
|
||||||
RE_META.findAll(input).mapNotNull {
|
|
||||||
runCatching { MetaTag(parseAttrs(it.groupValues[1])) }.getOrNull()
|
|
||||||
}
|
|
||||||
|
|
||||||
private enum class State {
|
|
||||||
NAME,
|
|
||||||
BEFORE_EQ,
|
|
||||||
AFTER_EQ,
|
|
||||||
VALUE,
|
|
||||||
SPACE,
|
|
||||||
}
|
|
||||||
|
|
||||||
private fun parseAttrs(input: String): Map<String, String> {
|
|
||||||
val attrs = Attrs()
|
|
||||||
|
|
||||||
var state = State.NAME
|
|
||||||
var nameBegin = 0
|
|
||||||
var nameEnd = 0
|
|
||||||
var valueBegin = 0
|
|
||||||
var valueQuote: Char? = null
|
|
||||||
|
|
||||||
input.forEachIndexed { i, c ->
|
|
||||||
when (state) {
|
|
||||||
State.NAME -> {
|
|
||||||
when {
|
|
||||||
c == '=' -> {
|
|
||||||
nameEnd = i
|
|
||||||
state = State.AFTER_EQ
|
|
||||||
}
|
|
||||||
|
|
||||||
c.isWhitespace() -> {
|
|
||||||
nameEnd = i
|
|
||||||
state = State.BEFORE_EQ
|
|
||||||
}
|
|
||||||
|
|
||||||
NON_ATTR_NAME_CHARS.contains(c) || c.isISOControl() || !c.isDefined() -> {
|
|
||||||
throw IllegalArgumentException("meta has invalid attributes part")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
State.BEFORE_EQ -> {
|
|
||||||
when {
|
|
||||||
c == '=' -> {
|
|
||||||
state = State.AFTER_EQ
|
|
||||||
}
|
|
||||||
|
|
||||||
c.isWhitespace() -> {}
|
|
||||||
else -> throw IllegalArgumentException("meta has invalid attributes part")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
State.AFTER_EQ -> {
|
|
||||||
when {
|
|
||||||
c.isWhitespace() -> {}
|
|
||||||
c == '\'' || c == '"' -> {
|
|
||||||
valueBegin = i + 1
|
|
||||||
valueQuote = c
|
|
||||||
state = State.VALUE
|
|
||||||
}
|
|
||||||
|
|
||||||
else -> {
|
|
||||||
valueBegin = i
|
|
||||||
valueQuote = null
|
|
||||||
state = State.VALUE
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
State.VALUE -> {
|
|
||||||
var attr: Pair<String, String>? = null
|
|
||||||
when {
|
|
||||||
valueQuote != null -> {
|
|
||||||
if (c == valueQuote) {
|
|
||||||
attr =
|
|
||||||
Pair(
|
|
||||||
input.slice(nameBegin until nameEnd),
|
|
||||||
input.slice(valueBegin until i),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
valueQuote == null -> {
|
|
||||||
when {
|
|
||||||
c.isWhitespace() -> {
|
|
||||||
attr =
|
|
||||||
Pair(
|
|
||||||
input.slice(nameBegin until nameEnd),
|
|
||||||
input.slice(valueBegin until i),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
i == input.length - 1 -> {
|
|
||||||
attr =
|
|
||||||
Pair(
|
|
||||||
input.slice(nameBegin until nameEnd),
|
|
||||||
input.slice(valueBegin..i),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
NON_UNQUOTED_ATTR_VALUE_CHARS.contains(c) -> {
|
|
||||||
throw IllegalArgumentException("meta has invalid attributes part")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (attr != null) {
|
|
||||||
attrs.add(attr)
|
|
||||||
state = State.SPACE
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
State.SPACE -> {
|
|
||||||
if (!c.isWhitespace()) {
|
|
||||||
nameBegin = i
|
|
||||||
state = State.NAME
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return attrs.freeze()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
@ -0,0 +1,81 @@
|
|||||||
|
/**
|
||||||
|
* Copyright (c) 2024 Vitor Pamplona
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
|
* this software and associated documentation files (the "Software"), to deal in
|
||||||
|
* the Software without restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
|
||||||
|
* Software, and to permit persons to whom the Software is furnished to do so,
|
||||||
|
* subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in all
|
||||||
|
* copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||||
|
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||||
|
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
|
||||||
|
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||||
|
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
package com.vitorpamplona.amethyst.service.previews
|
||||||
|
|
||||||
|
import org.junit.Assert.assertEquals
|
||||||
|
import org.junit.Test
|
||||||
|
|
||||||
|
/** Unit tests for [MetaTagsParser]. */
class MetaTagsParserTest {
    /**
     * Parses a small HTML document covering the common attribute forms (double-quoted,
     * single-quoted, unquoted) plus a set of edge cases, and checks that exactly the expected
     * meta tags are returned, in document order, with the expected attribute values.
     */
    @Test
    fun testParse() {
        val input =
            """<html>
            |  <head>
            |    <meta charset="utf-8">
            |    <meta http-equiv="content-type" content="text/html; charset=utf-8">
            |    <meta property="og:title" content=title>
            |    <meta property="og:description" content='description'>
            |    <meta property="og:image" content="https://example.com/img/foo.png">
            |    <!-- edge cases -->
            |    <meta
            |      name="newline"
            |      content="newline"
            |    >
            |    <meta name="space before gt" >
            |    <meta name ="space before =">
            |    <meta name= "space after =">
            |    <META NAME="CAPITAL">
            |    <meta name="character reference" content="&lt;meta&gt;">
            |    <meta name="attr value with end of head doesn't harm" content="<head>bang!</head>">
            |    <meta name="ignore tags with duplicated attr" name="dup">
            |  </head>
            |  <body>
            |    <meta name="ignore meta tags in body">
            |  </body>
            |</html>
            """.trimMargin()

        // one entry per expected meta tag; the tag with the duplicated attribute and the tag
        // inside <body> must not show up
        val exp =
            listOf(
                listOf("charset" to "utf-8"),
                listOf("http-equiv" to "content-type", "content" to "text/html; charset=utf-8"),
                listOf("property" to "og:title", "content" to "title"),
                listOf("property" to "og:description", "content" to "description"),
                listOf("property" to "og:image", "content" to "https://example.com/img/foo.png"),
                listOf("name" to "newline", "content" to "newline"),
                listOf("name" to "space before gt"),
                listOf("name" to "space before ="),
                listOf("name" to "space after ="),
                listOf("name" to "CAPITAL"),
                listOf("name" to "character reference", "content" to "<meta>"),
                listOf("name" to "attr value with end of head doesn't harm", "content" to "<head>bang!</head>"),
            )

        val metaTags = MetaTagsParser.parse(input).toList()
        assertEquals(exp.size, metaTags.size)
        metaTags.zip(exp).forEach { (meta, expAttrs) ->
            expAttrs.forEach { (name, expValue) ->
                assertEquals(expValue, meta.attr(name))
            }
        }
    }
}
|
Loading…
Reference in New Issue
Block a user