feat: detect dead media links via HTTP HEAD requests and report them per host

This commit is contained in:
kieran 2024-10-08 10:58:54 +01:00
parent 491f2d3482
commit 25b694594b
No known key found for this signature in database
GPG Key ID: DE71CEB3925BE941
3 changed files with 172 additions and 12 deletions

132
Cargo.lock generated
View File

@ -134,6 +134,12 @@ dependencies = [
"windows-targets",
]
[[package]]
name = "base64"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
name = "bytes"
version = "1.7.2"
@ -277,6 +283,17 @@ version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
[[package]]
name = "getrandom"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "gimli"
version = "0.31.0"
@ -360,6 +377,12 @@ version = "0.2.159"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5"
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "memchr"
version = "2.7.4"
@ -390,6 +413,7 @@ dependencies = [
"serde_json",
"tokio",
"tokio-stream",
"ureq",
"url",
]
@ -408,6 +432,12 @@ dependencies = [
"memchr",
]
[[package]]
name = "once_cell"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "percent-encoding"
version = "2.3.1"
@ -479,12 +509,59 @@ version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "ring"
version = "0.17.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d"
dependencies = [
"cc",
"cfg-if",
"getrandom",
"libc",
"spin",
"untrusted",
"windows-sys",
]
[[package]]
name = "rustc-demangle"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "rustls"
version = "0.23.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8"
dependencies = [
"log",
"once_cell",
"ring",
"rustls-pki-types",
"rustls-webpki",
"subtle",
"zeroize",
]
[[package]]
name = "rustls-pki-types"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55"
[[package]]
name = "rustls-webpki"
version = "0.102.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
dependencies = [
"ring",
"rustls-pki-types",
"untrusted",
]
[[package]]
name = "ryu"
version = "1.0.18"
@ -529,12 +606,24 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "spin"
version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "subtle"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "syn"
version = "2.0.79"
@ -622,6 +711,28 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
[[package]]
name = "untrusted"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "ureq"
version = "2.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a"
dependencies = [
"base64",
"flate2",
"log",
"once_cell",
"rustls",
"rustls-pki-types",
"url",
"webpki-roots",
]
[[package]]
name = "url"
version = "2.5.2"
@ -639,6 +750,21 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "webpki-roots"
version = "0.26.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841c67bff177718f1d4dfefde8d8f0e78f9b6589319ba88312f567fc5841a958"
dependencies = [
"rustls-pki-types",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
@ -712,6 +838,12 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "zeroize"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
[[package]]
name = "zstd"
version = "0.13.2"

View File

@ -23,3 +23,4 @@ tokio-stream = "0.1.16"
async-stream = "0.3.5"
regex = "1.11.0"
url = "2.5.2"
ureq = "2.10.1"

View File

@ -5,6 +5,7 @@ use regex::Regex;
use serde::Serialize;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use tokio_stream::StreamExt;
@ -81,12 +82,11 @@ async fn media_report(dir: PathBuf) -> Result<(), anyhow::Error> {
let mut binding = NostrCursor::new(dir.clone());
let mut cursor = Box::pin(binding.walk());
let mut link_heads: HashMap<Url, bool> = HashMap::new();
let media_regex = Regex::new(
r"/((?:http|ftp|https|nostr|web\+nostr|magnet|lnurl[p|w]?):/?/?[\w+?.]+(?:[\p{L}\p{N}~!@#$%^&*()_\-=+\\/?.:;',]*)?[-a-z0-9+&@#/%=~()_|])/iu",
r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9\(\)]{1,6}\b(?:[-a-zA-Z0-9\(\)!@:%_\+.~#?&\/\/=]*)",
)?;
let file_exts = vec![
".webp", ".jpg", ".jpeg", ".bmp", ".png", ".gif", ".webm", ".mp4", ".mov", ".mkv",
];
let file_ext = Regex::new(r"\.[a-zA-Z]{1,5}$")?;
let mut notes = 0u64;
while let Some(Ok(e)) = cursor.next().await {
if e.kind != 1 {
@ -94,13 +94,12 @@ async fn media_report(dir: PathBuf) -> Result<(), anyhow::Error> {
}
notes += 1;
for text in media_regex.split(e.content.as_str()) {
for text in media_regex.find_iter(e.content.as_str()) {
let text = text.as_str().trim();
if let Ok(u) = Url::parse(text) {
let ext = match file_exts
.iter()
.find(|e| text.to_ascii_lowercase().ends_with(*e))
{
Some(ext) => ext,
let ext = match file_ext.find(u.path()) {
Some(ext) => ext.as_str(),
None => continue,
};
let host = match u.host_str() {
@ -112,13 +111,40 @@ async fn media_report(dir: PathBuf) -> Result<(), anyhow::Error> {
if let Some(imeta) = e.tags.iter().find(|e| e[0] == "imeta") {
if let Some(size) = imeta.iter().find(|a| a.starts_with("size")) {
let size_n = size.split(" ").last().unwrap().parse::<u64>()?;
if let Ok(size_n) = size.split(" ").last().unwrap().parse::<u64>() {
inc_map(&mut report.hosts_size, host, size_n);
}
}
inc_map(&mut report.hosts_imeta, host, 1);
} else {
inc_map(&mut report.hosts_no_imeta, host, 1);
}
if let Some(hr) = link_heads.get(&u) {
if *hr {
inc_map(&mut report.hosts_dead, host, 1);
}
} else {
print!("Testing link {text} = ");
match ureq::head(text)
.timeout(Duration::from_secs(5))
.call() {
Ok(rsp) => {
println!("{}", rsp.status());
if rsp.status() > 300 {
inc_map(&mut report.hosts_dead, host, 1);
link_heads.insert(u, true);
} else {
link_heads.insert(u, false);
}
}
Err(_) => {
println!("500");
inc_map(&mut report.hosts_dead, host, 1);
link_heads.insert(u, true);
}
}
}
}
}
}
@ -156,6 +182,7 @@ where
#[derive(Serialize, Default)]
struct MediaReport {
pub hosts_count: HashMap<String, u64>,
pub hosts_dead: HashMap<String, u64>,
pub hosts_size: HashMap<String, u64>,
pub hosts_imeta: HashMap<String, u64>,
pub hosts_no_imeta: HashMap<String, u64>,