diff --git a/.gitignore b/.gitignore
index 400ff9bf..fa561541 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,4 @@
 flamegraph.svg
 candle-wasm-example/*.wav
 candle-wasm-example/*.safetensors
+candle-wasm-example/package-lock.json
diff --git a/candle-wasm-example/Cargo.toml b/candle-wasm-example/Cargo.toml
index e4a2319c..a76ce940 100644
--- a/candle-wasm-example/Cargo.toml
+++ b/candle-wasm-example/Cargo.toml
@@ -16,17 +16,25 @@ crate-type = ["cdylib"]
 [dependencies]
 candle = { path = "../candle-core" }
 candle-nn = { path = "../candle-nn" }
-wasm-bindgen = "0.2.87"
-getrandom = { version = "0.2", features = ["js"] }
+num-traits = "0.2.15"
 tokenizers = { version = "0.13.3", default-features=false, features=["unstable_wasm"] }
+
+# App crates.
+anyhow = "1.0.71"
+log = "0.4"
+rand = "0.8.5"
 serde = { version = "1.0.166", features = ["derive"] }
 serde_json = "1.0.99"
 wav = "1.0.0"
-rand = "0.8.5"
-num-traits = "0.2.15"
-anyhow = "1.0.71"
+
+# Wasm specific crates.
+getrandom = { version = "0.2", features = ["js"] }
+gloo = "0.8"
 js-sys = "0.3.64"
+wasm-bindgen = "0.2.87"
 wasm-bindgen-futures = "0.4.37"
+wasm-logger = "0.2"
+yew = { version = "0.20.0", features = ["csr"] }
 
 [dependencies.web-sys]
 version = "0.3.64"
diff --git a/candle-wasm-example/index.html b/candle-wasm-example/index.html
index 2ba74929..a878197c 100644
--- a/candle-wasm-example/index.html
+++ b/candle-wasm-example/index.html
@@ -1,9 +1,16 @@
 [HTML markup lost in extraction; the recoverable change is the page title: "Hello Candle - Rust" becomes "Welcome to Candle!"]
diff --git a/candle-wasm-example/index.js b/candle-wasm-example/index.js
deleted file mode 100644
index 74bb5e99..00000000
--- a/candle-wasm-example/index.js
+++ /dev/null
@@ -1,8 +0,0 @@
-import init from "./pkg/candle_wasm.js";
-
-const runWasm = async () => {
-  const candleWasm = await init("./pkg/candle_wasm_bg.wasm");
-  candleWasm.test_fn();
-  await candleWasm.run_fn();
-};
-runWasm();
diff --git a/candle-wasm-example/main.js b/candle-wasm-example/main.js
new file mode 100644
index 00000000..d2d7de05
--- /dev/null
+++ b/candle-wasm-example/main.js
@@ -0,0 +1,6 @@
+import init, { run_app } from './pkg/candle_wasm_example.js';
+async function main() {
+  await init('/pkg/candle_wasm_example_bg.wasm');
+  run_app();
+}
+main()
diff --git a/candle-wasm-example/src/app.rs b/candle-wasm-example/src/app.rs
new file mode 100644
index 00000000..617c838d
--- /dev/null
+++ b/candle-wasm-example/src/app.rs
@@ -0,0 +1,450 @@
+use crate::model::{Config, Whisper};
+use anyhow::Error as E;
+use candle::{DType, Device, Tensor};
+use candle_nn::VarBuilder;
+use js_sys::Date;
+use rand::distributions::Distribution;
+use tokenizers::Tokenizer;
+use wasm_bindgen::prelude::*;
+use wasm_bindgen_futures::JsFuture;
+use yew::{html, Component, Context, Html};
+
+const SAMPLE_NAMES: [&str; 6] = [
+    "jfk.wav", "a13.wav", "gb0.wav", "gb1.wav", "hp0.wav", "mm0.wav",
+];
+
+pub const DTYPE: DType = DType::F32;
+
+// Audio parameters.
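+// For reference: at 16 kHz a 160-sample hop means 100 frames per second
+// (10ms each), so a 30-second chunk yields 480_000 / 160 = 3_000 frames,
+// and each audio token spans two frames (20ms) because of the stride-2
+// convolutions in the encoder.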
+pub const SAMPLE_RATE: usize = 16000;
+pub const N_FFT: usize = 400;
+pub const N_MELS: usize = 80;
+pub const HOP_LENGTH: usize = 160;
+pub const CHUNK_LENGTH: usize = 30;
+pub const N_SAMPLES: usize = CHUNK_LENGTH * SAMPLE_RATE; // 480000 samples in a 30-second chunk
+pub const N_FRAMES: usize = N_SAMPLES / HOP_LENGTH; // 3000 frames in a mel spectrogram input
+pub const N_SAMPLES_PER_TOKEN: usize = HOP_LENGTH * 2; // the initial convolutions have stride 2
+pub const FRAMES_PER_SECOND: usize = SAMPLE_RATE / HOP_LENGTH; // 10ms per audio frame
+pub const TOKENS_PER_SECOND: usize = SAMPLE_RATE / N_SAMPLES_PER_TOKEN; // 20ms per audio token
+
+pub const NO_SPEECH_THRESHOLD: f64 = 0.6;
+pub const LOGPROB_THRESHOLD: f64 = -1.0;
+pub const TEMPERATURES: [f64; 6] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0];
+pub const COMPRESSION_RATIO_THRESHOLD: f64 = 2.4;
+
+// Tokenizer dependent bits.
+pub const SOT_TOKEN: u32 = 50257;
+pub const EOT_TOKEN: u32 = 50256;
+pub const NO_SPEECH_TOKEN: u32 = 50361;
+pub const NO_TIMESTAMP_TOKEN: u32 = 50362;
+// From the _get_suppress_tokens function + 50362 (no timestamp)
+// https://github.com/openai/whisper/blob/f572f2161ba831bae131364c3bffdead7af6d210/whisper/decoding.py#L605
+pub const SUPPRESS_TOKENS: [u32; 91] = [
+    1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357,
+    366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782,
+    1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959,
+    10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992,
+    19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549,
+    47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361, 50362,
+];
+
+#[wasm_bindgen]
+extern "C" {
+    // Use `js_namespace` here to bind `console.log(..)` instead of just
+    // `log(..)`
+    #[wasm_bindgen(js_namespace = console)]
+    fn log(s: &str);
+}
+
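+// A println!-style macro: formats its arguments and forwards the resulting
+// string to the browser console through the `log` binding above.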
console_log { + // Note that this is using the `log` function imported above during + // `bare_bones` + ($($t:tt)*) => (log(&format_args!($($t)*).to_string())) +} + +#[derive(Debug, Clone)] +struct DecodingResult { + tokens: Vec, + text: String, + avg_logprob: f64, + no_speech_prob: f64, + temperature: f64, + compression_ratio: f64, +} + +#[derive(Debug, Clone)] +struct Segment { + start: f64, + duration: f64, + dr: DecodingResult, +} + +pub struct Decoder { + model: Whisper, + mel_filters: Vec, + tokenizer: Tokenizer, + suppress_tokens: Tensor, +} + +impl Decoder { + fn new( + model: Whisper, + tokenizer: Tokenizer, + mel_filters: Vec, + device: &Device, + ) -> anyhow::Result { + let suppress_tokens: Vec = (0..model.config.vocab_size as u32) + .map(|i| { + if SUPPRESS_TOKENS.contains(&i) { + f32::NEG_INFINITY + } else { + 0f32 + } + }) + .collect(); + let suppress_tokens = Tensor::new(suppress_tokens.as_slice(), device)?; + Ok(Self { + model, + mel_filters, + tokenizer, + suppress_tokens, + }) + } + + fn decode(&self, mel: &Tensor, t: f64) -> anyhow::Result { + let model = &self.model; + let audio_features = model.encoder.forward(mel)?; + console_log!("audio features: {:?}", audio_features.dims()); + let sample_len = model.config.max_target_positions / 2; + let mut sum_logprob = 0f64; + let mut no_speech_prob = f64::NAN; + let mut tokens = vec![SOT_TOKEN]; + for i in 0..sample_len { + let tokens_t = Tensor::new(tokens.as_slice(), mel.device())?; + + // The model expects a batch dim but this inference loop does not handle + // it so we add it at this point. + let tokens_t = tokens_t.unsqueeze(0)?; + let logits = model.decoder.forward(&tokens_t, &audio_features)?; + let logits = logits.squeeze(0)?; + + // Extract the no speech probability on the first iteration by looking at the first + // token logits and the probability for the according token. + if i == 0 { + no_speech_prob = logits + .get(0)? + .softmax(0)? + .get(NO_SPEECH_TOKEN as usize)? + .to_scalar::()? as f64; + } + + let (seq_len, _) = logits.shape().r2()?; + let logits = logits + .get(seq_len - 1)? + .broadcast_add(&self.suppress_tokens)?; + let next_token = if t > 0f64 { + let prs = (&logits / t)?.softmax(0)?; + let logits_v: Vec = prs.to_vec1()?; + let distr = rand::distributions::WeightedIndex::new(&logits_v)?; + let mut rng = rand::thread_rng(); + distr.sample(&mut rng) as u32 + } else { + let logits_v: Vec = logits.to_vec1()?; + logits_v + .iter() + .enumerate() + .max_by(|(_, u), (_, v)| u.total_cmp(v)) + .map(|(i, _)| i as u32) + .unwrap() + }; + tokens.push(next_token); + let prob = logits + .softmax(candle::D::Minus1)? + .get(next_token as usize)? + .to_scalar::()? as f64; + if next_token == EOT_TOKEN || tokens.len() > model.config.max_target_positions { + break; + } + sum_logprob += prob.ln(); + } + let text = self + .tokenizer + .decode(tokens.clone(), true) + .map_err(E::msg)?; + let avg_logprob = sum_logprob / tokens.len() as f64; + + Ok(DecodingResult { + tokens, + text, + avg_logprob, + no_speech_prob, + temperature: t, + compression_ratio: f64::NAN, + }) + } + + fn decode_with_fallback(&self, segment: &Tensor) -> anyhow::Result { + for (i, &t) in TEMPERATURES.iter().enumerate() { + let dr: Result = self.decode(segment, t); + if i == TEMPERATURES.len() - 1 { + return dr; + } + // On errors, we try again with a different temperature. 
+            match dr {
+                Ok(dr) => {
+                    let needs_fallback = dr.compression_ratio > COMPRESSION_RATIO_THRESHOLD
+                        || dr.avg_logprob < LOGPROB_THRESHOLD;
+                    if !needs_fallback || dr.no_speech_prob > NO_SPEECH_THRESHOLD {
+                        return Ok(dr);
+                    }
+                }
+                Err(err) => {
+                    console_log!("Error running at {t}: {err}")
+                }
+            }
+        }
+        unreachable!()
+    }
+
+    fn run(&self, mel: &Tensor) -> anyhow::Result<Vec<Segment>> {
+        let (_, _, content_frames) = mel.shape().r3()?;
+        let mut seek = 0;
+        let mut segments = vec![];
+        while seek < content_frames {
+            let time_offset = (seek * HOP_LENGTH) as f64 / SAMPLE_RATE as f64;
+            let segment_size = usize::min(content_frames - seek, N_FRAMES);
+            let mel_segment = mel.narrow(2, seek, segment_size)?;
+            let segment_duration = (segment_size * HOP_LENGTH) as f64 / SAMPLE_RATE as f64;
+            let dr = self.decode_with_fallback(&mel_segment)?;
+            seek += segment_size;
+            if dr.no_speech_prob > NO_SPEECH_THRESHOLD && dr.avg_logprob < LOGPROB_THRESHOLD {
+                console_log!("no speech detected, skipping {seek} {dr:?}");
+                continue;
+            }
+            let segment = Segment {
+                start: time_offset,
+                duration: segment_duration,
+                dr,
+            };
+            console_log!("{seek}: {segment:?}");
+            segments.push(segment)
+        }
+        Ok(segments)
+    }
+
+    async fn load() -> Result<Decoder, JsValue> {
+        let device = Device::Cpu;
+        let tokenizer_config = fetch_url("tokenizer.en.json").await?;
+        let tokenizer = Tokenizer::from_bytes(tokenizer_config).map_err(w)?;
+
+        let mel_filters = fetch_url("mel_filters.safetensors").await?;
+        let mel_filters = candle::safetensors::SafeTensors::from_buffer(&mel_filters).map_err(w)?;
+        let mel_filters = mel_filters.tensor("mel_80", &device).map_err(w)?;
+        console_log!("loaded mel filters {:?}", mel_filters.shape());
+        let mel_filters = mel_filters
+            .flatten_all()
+            .map_err(w)?
+            .to_vec1::<f32>()
+            .map_err(w)?;
+        let weights = fetch_url("tiny.en.safetensors").await?;
+        let weights = candle::safetensors::SafeTensors::from_buffer(&weights).map_err(w)?;
+        let vb = VarBuilder::from_safetensors(vec![weights], DTYPE, &device);
+        let config = Config::tiny_en();
+        let whisper = Whisper::load(&vb, config).map_err(w)?;
+        console_log!("done loading model");
+        let model = Decoder::new(whisper, tokenizer, mel_filters, &device).map_err(w)?;
+        Ok(model)
+    }
+
+    async fn load_and_run(&self, name: &str) -> Result<Vec<Segment>, JsValue> {
+        let device = Device::Cpu;
+        let wav_input = fetch_url(name).await?;
+        let mut wav_input = std::io::Cursor::new(wav_input);
+        let (header, data) = wav::read(&mut wav_input).map_err(w)?;
+        console_log!("loaded wav data: {header:?}");
+        if header.sampling_rate != SAMPLE_RATE as u32 {
+            Err(format!(
+                "wav file must have a {} sampling rate",
+                SAMPLE_RATE
+            ))?
+        }
+        let data = data.as_sixteen().expect("expected 16 bit wav file");
+        let pcm_data: Vec<_> = data[..data.len() / header.channel_count as usize]
+            .iter()
+            .map(|v| *v as f32 / 32768.)
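+            // The signed 16-bit samples are scaled by 1/32768 (2^15) to
+            // floats in [-1.0, 1.0).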
+            .collect();
+        console_log!("pcm data loaded {}", pcm_data.len());
+        let mel = crate::audio::pcm_to_mel(&pcm_data, &self.mel_filters).map_err(w)?;
+        let mel_len = mel.len();
+        let mel = Tensor::from_vec(mel, (1, N_MELS, mel_len / N_MELS), &device).map_err(w)?;
+        console_log!("loaded mel: {:?}", mel.dims());
+
+        let segments = self.run(&mel).map_err(w)?;
+        Ok(segments)
+    }
+}
+
+async fn fetch_url(url: &str) -> Result<Vec<u8>, JsValue> {
+    use web_sys::{Request, RequestCache, RequestInit, RequestMode, Response};
+    let window = web_sys::window().ok_or("window")?;
+    let mut opts = RequestInit::new();
+    let opts = opts
+        .method("GET")
+        .mode(RequestMode::Cors)
+        .cache(RequestCache::NoCache);
+
+    let request = Request::new_with_str_and_init(url, opts)?;
+
+    let resp_value = JsFuture::from(window.fetch_with_request(&request)).await?;
+
+    // `resp_value` is a `Response` object.
+    assert!(resp_value.is_instance_of::<Response>());
+    let resp: Response = resp_value.dyn_into()?;
+    let data = JsFuture::from(resp.blob()?).await?;
+    let blob = web_sys::Blob::from(data);
+    let array_buffer = JsFuture::from(blob.array_buffer()).await?;
+    let data = js_sys::Uint8Array::new(&array_buffer).to_vec();
+    Ok(data)
+}
+
+fn w<T: ToString>(x: T) -> String {
+    x.to_string()
+}
+
+pub enum Msg {
+    Run(usize),
+    UpdateStatus(String),
+    RunFinished(String),
+    SetDecoder(Decoder),
+}
+
+pub struct App {
+    status: String,
+    content: String,
+    decode_in_flight: bool,
+    decoder: Option<std::sync::Arc<Decoder>>,
+}
+
+impl Component for App {
+    type Message = Msg;
+    type Properties = ();
+
+    fn create(_ctx: &Context<Self>) -> Self {
+        let status = "loading weights".to_string();
+        Self {
+            status,
+            content: String::new(),
+            decode_in_flight: false,
+            decoder: None,
+        }
+    }
+
+    fn rendered(&mut self, ctx: &Context<Self>, first_render: bool) {
+        if first_render {
+            ctx.link().send_future(async {
+                match Decoder::load().await {
+                    Err(err) => {
+                        let status = format!("{err:?}");
+                        Msg::UpdateStatus(status)
+                    }
+                    Ok(decoder) => Msg::SetDecoder(decoder),
+                }
+            });
+        }
+    }
+
+    fn update(&mut self, ctx: &Context<Self>, msg: Self::Message) -> bool {
+        match msg {
+            Msg::SetDecoder(decoder) => {
+                self.status = "weights loaded successfully!".to_string();
+                self.decoder = Some(std::sync::Arc::new(decoder));
+                true
+            }
+            Msg::Run(sample_index) => {
+                let sample = SAMPLE_NAMES[sample_index];
+                match &self.decoder {
+                    None => self.content = "waiting for weights to load".to_string(),
+                    Some(decoder) => {
+                        if self.decode_in_flight {
+                            self.content = "already decoding some sample at the moment".to_string()
+                        } else {
+                            let decoder = decoder.clone();
+                            self.decode_in_flight = true;
+                            self.status = format!("decoding {sample}");
+                            self.content = String::new();
+                            ctx.link().send_future(async move {
+                                let content = decoder.load_and_run(sample).await;
+                                let content = match content {
+                                    Err(err) => format!("decoding error: {err:?}"),
+                                    Ok(segments) => format!("decoded successfully: {segments:?}"),
+                                };
+                                Msg::RunFinished(content)
+                            })
+                        }
+                    }
+                }
+                true
+            }
+            Msg::RunFinished(content) => {
+                self.status = "Run finished!".to_string();
+                self.content = content;
+                self.decode_in_flight = false;
+                true
+            }
+            Msg::UpdateStatus(status) => {
+                self.status = status;
+                true
+            }
+        }
+    }
+
+    fn view(&self, ctx: &Context<Self>) -> Html {
+        html! {
+            <div>
+                <table>
+                <thead>
+                <tr>
+                    <th>{"Sample"}</th>
+                    <th></th>
+                    <th></th>
+                </tr>
+                </thead>
+                <tbody>
+                {
+                    SAMPLE_NAMES.iter().enumerate().map(|(i, name)| { html! {
+                        <tr>
+                            <th>{name}</th>
+                            <th><audio controls=true src={format!("./{name}")}></audio></th>
+                            <th><button onclick={ctx.link().callback(move |_| Msg::Run(i))}>{"run"}</button></th>
+                        </tr>
+                    } }).collect::<Html>()
+                }
+                </tbody>
+                </table>
+                <h2>
+                    {&self.status}
+                </h2>
+                {
+                    if self.decode_in_flight {
+                        html! { <progress></progress> }
+                    } else { html!{
+                        <blockquote>
+                            <p> {&self.content} </p>
+                        </blockquote>
+                    }
+                    }
+                }
+
+                // Display the current date and time the page was rendered
+                <p class="footer">
+                    { "Rendered: " }
+                    { String::from(Date::new_0().to_string()) }
+                </p>
+            </div>
+        }
+    }
+}
diff --git a/candle-wasm-example/src/audio.rs b/candle-wasm-example/src/audio.rs
index d095e239..d73c3142 100644
--- a/candle-wasm-example/src/audio.rs
+++ b/candle-wasm-example/src/audio.rs
@@ -1,5 +1,6 @@
 // Audio processing code, adapted from whisper.cpp
 // https://github.com/ggerganov/whisper.cpp
+use super::app;
 
 pub trait Float: num_traits::Float + num_traits::FloatConst + num_traits::NumAssign {}
 
@@ -169,7 +170,7 @@ fn log_mel_spectrogram_(
     let n_len = samples.len() / fft_step;
 
     // pad audio with at least one extra chunk of zeros
-    let pad = 100 * super::CHUNK_LENGTH / 2;
+    let pad = 100 * app::CHUNK_LENGTH / 2;
     let n_len = if n_len % pad != 0 {
         (n_len / pad + 1) * pad
     } else {
@@ -207,9 +208,9 @@ pub fn pcm_to_mel(
     let mel = log_mel_spectrogram_(
         samples,
         filters,
-        super::N_FFT,
-        super::HOP_LENGTH,
-        super::N_MELS,
+        app::N_FFT,
+        app::HOP_LENGTH,
+        app::N_MELS,
         false,
     );
     Ok(mel)
diff --git a/candle-wasm-example/src/lib.rs b/candle-wasm-example/src/lib.rs
index 8e51f872..c4c0a3cf 100644
--- a/candle-wasm-example/src/lib.rs
+++ b/candle-wasm-example/src/lib.rs
@@ -1,335 +1,14 @@
 #![allow(dead_code)]
-use anyhow::Error as E;
-use candle::{DType, Device, Tensor};
-use candle_nn::VarBuilder;
-use rand::{distributions::Distribution, SeedableRng};
-use tokenizers::Tokenizer;
 use wasm_bindgen::prelude::*;
-use wasm_bindgen_futures::JsFuture;
+mod app;
 mod audio;
 mod model;
-use model::{Config, Whisper};
-
-const DTYPE: DType = DType::F32;
-
-// Audio parameters.
-const SAMPLE_RATE: usize = 16000;
-const N_FFT: usize = 400;
-const N_MELS: usize = 80;
-const HOP_LENGTH: usize = 160;
-const CHUNK_LENGTH: usize = 30;
-const N_SAMPLES: usize = CHUNK_LENGTH * SAMPLE_RATE; // 480000 samples in a 30-second chunk
-const N_FRAMES: usize = N_SAMPLES / HOP_LENGTH; // 3000 frames in a mel spectrogram input
-const N_SAMPLES_PER_TOKEN: usize = HOP_LENGTH * 2; // the initial convolutions has stride 2
-const FRAMES_PER_SECOND: usize = SAMPLE_RATE / HOP_LENGTH; // 10ms per audio frame
-const TOKENS_PER_SECOND: usize = SAMPLE_RATE / N_SAMPLES_PER_TOKEN; // 20ms per audio token
-
-const NO_SPEECH_THRESHOLD: f64 = 0.6;
-const LOGPROB_THRESHOLD: f64 = -1.0;
-const TEMPERATURES: [f64; 6] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0];
-const COMPRESSION_RATIO_THRESHOLD: f64 = 2.4;
-
-// Tokenizer dependent bits.
-const SOT_TOKEN: u32 = 50257;
-const EOT_TOKEN: u32 = 50256;
-const NO_SPEECH_TOKEN: u32 = 50361;
-const NO_TIMESTAMP_TOKEN: u32 = 50362;
-// From the _get_suppress_tokens function + 50362 (no timestamp)
-// https://github.com/openai/whisper/blob/f572f2161ba831bae131364c3bffdead7af6d210/whisper/decoding.py#L605
-const SUPPRESS_TOKENS: [u32; 91] = [
-    1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357,
-    366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782,
-    1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959,
-    10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992,
-    19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549,
-    47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361, 50362,
-];
 
 #[wasm_bindgen]
-extern "C" {
-    // Use `js_namespace` here to bind `console.log(..)` instead of just
-    // `log(..)`
-    #[wasm_bindgen(js_namespace = console)]
-    fn log(s: &str);
-}
+pub fn run_app() -> Result<(), JsValue> {
+    wasm_logger::init(wasm_logger::Config::new(log::Level::Trace));
+    yew::Renderer::<app::App>::new().render();
 
-macro_rules! console_log {
-    // Note that this is using the `log` function imported above during
-    // `bare_bones`
-    ($($t:tt)*) => (log(&format_args!($($t)*).to_string()))
-}
-
-#[derive(Debug, Clone)]
-struct DecodingResult {
-    tokens: Vec<u32>,
-    text: String,
-    avg_logprob: f64,
-    no_speech_prob: f64,
-    temperature: f64,
-    compression_ratio: f64,
-}
-
-#[derive(Debug, Clone)]
-struct Segment {
-    start: f64,
-    duration: f64,
-    dr: DecodingResult,
-}
-
-struct Decoder {
-    model: Whisper,
-    rng: rand::rngs::StdRng,
-    tokenizer: Tokenizer,
-    suppress_tokens: Tensor,
-}
-
-impl Decoder {
-    fn new(
-        model: Whisper,
-        tokenizer: Tokenizer,
-        seed: u64,
-        device: &Device,
-    ) -> anyhow::Result<Self> {
-        let suppress_tokens: Vec<f32> = (0..model.config.vocab_size as u32)
-            .map(|i| {
-                if SUPPRESS_TOKENS.contains(&i) {
-                    f32::NEG_INFINITY
-                } else {
-                    0f32
-                }
-            })
-            .collect();
-        let suppress_tokens = Tensor::new(suppress_tokens.as_slice(), device)?;
-        Ok(Self {
-            model,
-            rng: rand::rngs::StdRng::seed_from_u64(seed),
-            tokenizer,
-            suppress_tokens,
-        })
-    }
-
-    fn decode(&mut self, mel: &Tensor, t: f64) -> anyhow::Result<DecodingResult> {
-        let model = &self.model;
-        let audio_features = model.encoder.forward(mel)?;
-        console_log!("audio features: {:?}", audio_features.dims());
-        let sample_len = model.config.max_target_positions / 2;
-        let mut sum_logprob = 0f64;
-        let mut no_speech_prob = f64::NAN;
-        let mut tokens = vec![SOT_TOKEN];
-        for i in 0..sample_len {
-            let tokens_t = Tensor::new(tokens.as_slice(), mel.device())?;
-
-            // The model expects a batch dim but this inference loop does not handle
-            // it so we add it at this point.
-            let tokens_t = tokens_t.unsqueeze(0)?;
-            let logits = model.decoder.forward(&tokens_t, &audio_features)?;
-            let logits = logits.squeeze(0)?;
-
-            // Extract the no speech probability on the first iteration by looking at the first
-            // token logits and the probability for the according token.
-            if i == 0 {
-                no_speech_prob = logits
-                    .get(0)?
-                    .softmax(0)?
-                    .get(NO_SPEECH_TOKEN as usize)?
-                    .to_scalar::<f32>()? as f64;
-            }
-
-            let (seq_len, _) = logits.shape().r2()?;
-            let logits = logits
-                .get(seq_len - 1)?
-                .broadcast_add(&self.suppress_tokens)?;
-            let next_token = if t > 0f64 {
-                let prs = (&logits / t)?.softmax(0)?;
-                let logits_v: Vec<f32> = prs.to_vec1()?;
-                let distr = rand::distributions::WeightedIndex::new(&logits_v)?;
-                distr.sample(&mut self.rng) as u32
-            } else {
-                let logits_v: Vec<f32> = logits.to_vec1()?;
-                logits_v
-                    .iter()
-                    .enumerate()
-                    .max_by(|(_, u), (_, v)| u.total_cmp(v))
-                    .map(|(i, _)| i as u32)
-                    .unwrap()
-            };
-            tokens.push(next_token);
-            let prob = logits
-                .softmax(candle::D::Minus1)?
-                .get(next_token as usize)?
-                .to_scalar::<f32>()? as f64;
-            if next_token == EOT_TOKEN || tokens.len() > model.config.max_target_positions {
-                break;
-            }
-            sum_logprob += prob.ln();
-        }
-        let text = self
-            .tokenizer
-            .decode(tokens.clone(), true)
-            .map_err(E::msg)?;
-        let avg_logprob = sum_logprob / tokens.len() as f64;
-
-        Ok(DecodingResult {
-            tokens,
-            text,
-            avg_logprob,
-            no_speech_prob,
-            temperature: t,
-            compression_ratio: f64::NAN,
-        })
-    }
-
-    fn decode_with_fallback(&mut self, segment: &Tensor) -> anyhow::Result<DecodingResult> {
-        for (i, &t) in TEMPERATURES.iter().enumerate() {
-            let dr: anyhow::Result<DecodingResult> = self.decode(segment, t);
-            if i == TEMPERATURES.len() - 1 {
-                return dr;
-            }
-            // On errors, we try again with a different temperature.
-            match dr {
-                Ok(dr) => {
-                    let needs_fallback = dr.compression_ratio > COMPRESSION_RATIO_THRESHOLD
-                        || dr.avg_logprob < LOGPROB_THRESHOLD;
-                    if !needs_fallback || dr.no_speech_prob > NO_SPEECH_THRESHOLD {
-                        return Ok(dr);
-                    }
-                }
-                Err(err) => {
-                    console_log!("Error running at {t}: {err}")
-                }
-            }
-        }
-        unreachable!()
-    }
-
-    fn run(&mut self, mel: &Tensor) -> anyhow::Result<Vec<Segment>> {
-        let (_, _, content_frames) = mel.shape().r3()?;
-        let mut seek = 0;
-        let mut segments = vec![];
-        while seek < content_frames {
-            let time_offset = (seek * HOP_LENGTH) as f64 / SAMPLE_RATE as f64;
-            let segment_size = usize::min(content_frames - seek, N_FRAMES);
-            let mel_segment = mel.narrow(2, seek, segment_size)?;
-            let segment_duration = (segment_size * HOP_LENGTH) as f64 / SAMPLE_RATE as f64;
-            let dr = self.decode_with_fallback(&mel_segment)?;
-            seek += segment_size;
-            if dr.no_speech_prob > NO_SPEECH_THRESHOLD && dr.avg_logprob < LOGPROB_THRESHOLD {
-                console_log!("no speech detected, skipping {seek} {dr:?}");
-                continue;
-            }
-            let segment = Segment {
-                start: time_offset,
-                duration: segment_duration,
-                dr,
-            };
-            console_log!("{seek}: {segment:?}");
-            segments.push(segment)
-        }
-        Ok(segments)
-    }
-}
-
-async fn fetch_url(url: &str) -> Result<Vec<u8>, JsValue> {
-    use web_sys::{Request, RequestCache, RequestInit, RequestMode, Response};
-    let window = web_sys::window().ok_or("window")?;
-    let mut opts = RequestInit::new();
-    let opts = opts
-        .method("GET")
-        .mode(RequestMode::Cors)
-        .cache(RequestCache::NoCache);
-
-    let request = Request::new_with_str_and_init(url, opts)?;
-
-    let resp_value = JsFuture::from(window.fetch_with_request(&request)).await?;
-
-    // `resp_value` is a `Response` object.
-    assert!(resp_value.is_instance_of::<Response>());
-    let resp: Response = resp_value.dyn_into()?;
-    let data = JsFuture::from(resp.blob()?).await?;
-    let blob = web_sys::Blob::from(data);
-    let array_buffer = JsFuture::from(blob.array_buffer()).await?;
-    let data = js_sys::Uint8Array::new(&array_buffer).to_vec();
-    Ok(data)
-}
-
-fn w<T: ToString>(x: T) -> String {
-    x.to_string()
-}
-
-async fn run_impl() -> Result<(), JsValue> {
-    let device = Device::Cpu;
-    let tokenizer_config = fetch_url("tokenizer.en.json").await?;
-    let tokenizer = Tokenizer::from_bytes(tokenizer_config).map_err(w)?;
-
-    let mel_filters = fetch_url("mel_filters.safetensors").await?;
-    let mel_filters = candle::safetensors::SafeTensors::from_buffer(&mel_filters).map_err(w)?;
-    let mel_filters = mel_filters.tensor("mel_80", &device).map_err(w)?;
-    console_log!("loaded mel filters {:?}", mel_filters.shape());
-    let mel_filters = mel_filters
-        .flatten_all()
-        .map_err(w)?
-        .to_vec1::<f32>()
-        .map_err(w)?;
-
-    let wav_input = fetch_url("jfk.wav").await?;
-    let mut wav_input = std::io::Cursor::new(wav_input);
-    let (header, data) = wav::read(&mut wav_input).map_err(w)?;
-    console_log!("loaded wav data: {header:?}");
-    if header.sampling_rate != SAMPLE_RATE as u32 {
-        Err(format!(
-            "wav file must have a {} sampling rate",
-            SAMPLE_RATE
-        ))?
-    }
-    let data = data.as_sixteen().expect("expected 16 bit wav file");
-    let pcm_data: Vec<_> = data[..data.len() / header.channel_count as usize]
-        .iter()
-        .map(|v| *v as f32 / 32768.)
-        .collect();
-    console_log!("pcm data loaded {}", pcm_data.len());
-    let mel = audio::pcm_to_mel(&pcm_data, &mel_filters).map_err(w)?;
-    let mel_len = mel.len();
-    let mel = Tensor::from_vec(mel, (1, N_MELS, mel_len / N_MELS), &device).map_err(w)?;
-    console_log!("loaded mel: {:?}", mel.dims());
-
-    let weights = fetch_url("tiny.en.safetensors").await?;
-    let weights = candle::safetensors::SafeTensors::from_buffer(&weights).map_err(w)?;
-    let vb = VarBuilder::from_safetensors(vec![weights], DTYPE, &device);
-    let config = Config::tiny_en();
-    let model = Whisper::load(&vb, config).map_err(w)?;
-    let mut dc = Decoder::new(model, tokenizer, 299792458, &device).map_err(w)?;
-    dc.run(&mel).map_err(w)?;
-    Ok(())
-}
-
-fn test_fn_impl() -> anyhow::Result<String> {
-    let t1 = Tensor::randn((3, 4), DType::F32, &Device::Cpu, 0., 1.)?;
-    let t2 = Tensor::randn((4, 2), DType::F32, &Device::Cpu, 0., 1.)?;
-    let t = t1.matmul(&t2)?;
-    console_log!("matmul result: {t}");
-    let res = format!("Hello Candle!\n\nt1:\n{t1}\n\nt2:\n{t2}\n\nt1@t2:\n{t}\n");
-    Ok(res)
-}
-
-#[wasm_bindgen]
-pub fn test_fn() -> std::result::Result<(), JsValue> {
-    let result = match test_fn_impl() {
-        Ok(v) => v,
-        Err(err) => format!("error: {err:?}"),
-    };
-    let window = web_sys::window().expect("no global `window` exists");
-    let document = window.document().expect("should have a document on window");
-    let p_element = document.create_element("p")?;
-    p_element.set_text_content(Some(&result));
-    let body = document.body().expect("document should have a body");
-    body.append_child(&p_element)?;
-    Ok(())
-}
-
-#[wasm_bindgen]
-pub async fn run_fn() -> std::result::Result<(), JsValue> {
-    console_log!("run_fn starting...");
-    run_impl().await?;
     Ok(())
 }