From 25b694594b8a2cb6631e9e45d2c2b07357cfac6c Mon Sep 17 00:00:00 2001 From: kieran Date: Tue, 8 Oct 2024 10:58:54 +0100 Subject: [PATCH] refactor: check for dead links --- Cargo.lock | 132 ++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/bin/main.rs | 51 ++++++++++++++----- 3 files changed, 172 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8de7312..2418c0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -134,6 +134,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "bytes" version = "1.7.2" @@ -277,6 +283,17 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "gimli" version = "0.31.0" @@ -360,6 +377,12 @@ version = "0.2.159" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + [[package]] name = "memchr" version = "2.7.4" @@ -390,6 +413,7 @@ dependencies = [ "serde_json", "tokio", "tokio-stream", + "ureq", "url", ] @@ -408,6 +432,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + [[package]] name = "percent-encoding" version = "2.3.1" @@ -479,12 +509,59 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "ring" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +dependencies = [ + "cc", + "cfg-if", + "getrandom", + "libc", + "spin", + "untrusted", + "windows-sys", +] + [[package]] name = "rustc-demangle" version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustls" +version = "0.23.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8" +dependencies = [ + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55" + +[[package]] +name = "rustls-webpki" +version = "0.102.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "ryu" version = "1.0.18" @@ -529,12 +606,24 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.79" @@ -622,6 +711,28 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a" +dependencies = [ + "base64", + "flate2", + "log", + "once_cell", + "rustls", + "rustls-pki-types", + "url", + "webpki-roots", +] + [[package]] name = "url" version = "2.5.2" @@ -639,6 +750,21 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "webpki-roots" +version = "0.26.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841c67bff177718f1d4dfefde8d8f0e78f9b6589319ba88312f567fc5841a958" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -712,6 +838,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + [[package]] name = "zstd" version = "0.13.2" diff --git a/Cargo.toml b/Cargo.toml index 832615d..79f6a5f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,3 +23,4 @@ tokio-stream = "0.1.16" async-stream = "0.3.5" regex = "1.11.0" url = "2.5.2" +ureq = "2.10.1" diff --git a/src/bin/main.rs b/src/bin/main.rs index 8eeb7f5..b1cd7cd 100644 --- a/src/bin/main.rs +++ b/src/bin/main.rs @@ -5,6 +5,7 @@ use regex::Regex; use serde::Serialize; use std::collections::HashMap; use std::path::PathBuf; +use std::time::Duration; use tokio::fs::File; use tokio::io::AsyncWriteExt; use tokio_stream::StreamExt; @@ -81,12 +82,11 @@ async fn media_report(dir: PathBuf) -> Result<(), anyhow::Error> { let mut binding = NostrCursor::new(dir.clone()); let mut cursor = Box::pin(binding.walk()); + let mut link_heads: HashMap = HashMap::new(); let media_regex = Regex::new( - r"/((?:http|ftp|https|nostr|web\+nostr|magnet|lnurl[p|w]?):/?/?[\w+?.]+(?:[\p{L}\p{N}~!@#$%^&*()_\-=+\\/?.:;',]*)?[-a-z0-9+&@#/%=~()_|])/iu", + r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9\(\)]{1,6}\b(?:[-a-zA-Z0-9\(\)!@:%_\+.~#?&\/\/=]*)", )?; - let file_exts = vec![ - ".webp", ".jpg", ".jpeg", ".bmp", ".png", ".gif", ".webm", ".mp4", ".mov", ".mkv", - ]; + let file_ext = Regex::new(r"\.[a-zA-Z]{1,5}$")?; let mut notes = 0u64; while let Some(Ok(e)) = cursor.next().await { if e.kind != 1 { @@ -94,13 +94,12 @@ async fn media_report(dir: PathBuf) -> Result<(), anyhow::Error> { } notes += 1; - for text in media_regex.split(e.content.as_str()) { + for text in media_regex.find_iter(e.content.as_str()) { + let text = text.as_str().trim(); + if let Ok(u) = Url::parse(text) { - let ext = match file_exts - .iter() - .find(|e| text.to_ascii_lowercase().ends_with(*e)) - { - Some(ext) => ext, + let ext = match file_ext.find(u.path()) { + Some(ext) => ext.as_str(), None => continue, }; let host = match u.host_str() { @@ -112,13 +111,40 @@ async fn media_report(dir: PathBuf) -> Result<(), anyhow::Error> { if let Some(imeta) = e.tags.iter().find(|e| e[0] == "imeta") { if let Some(size) = imeta.iter().find(|a| a.starts_with("size")) { - let size_n = size.split(" ").last().unwrap().parse::()?; - inc_map(&mut report.hosts_size, host, size_n); + if let Ok(size_n) = size.split(" ").last().unwrap().parse::() { + inc_map(&mut report.hosts_size, host, size_n); + } } inc_map(&mut report.hosts_imeta, host, 1); } else { inc_map(&mut report.hosts_no_imeta, host, 1); } + + if let Some(hr) = link_heads.get(&u) { + if *hr { + inc_map(&mut report.hosts_dead, host, 1); + } + } else { + print!("Testing link {text} = "); + match ureq::head(text) + .timeout(Duration::from_secs(5)) + .call() { + Ok(rsp) => { + println!("{}", rsp.status()); + if rsp.status() > 300 { + inc_map(&mut report.hosts_dead, host, 1); + link_heads.insert(u, true); + } else { + link_heads.insert(u, false); + } + } + Err(_) => { + println!("500"); + inc_map(&mut report.hosts_dead, host, 1); + link_heads.insert(u, true); + } + } + } } } } @@ -156,6 +182,7 @@ where #[derive(Serialize, Default)] struct MediaReport { pub hosts_count: HashMap, + pub hosts_dead: HashMap, pub hosts_size: HashMap, pub hosts_imeta: HashMap, pub hosts_no_imeta: HashMap,