diff --git a/.gitignore b/.gitignore
index 400ff9bf..fa561541 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,4 @@ flamegraph.svg
candle-wasm-example/*.wav
candle-wasm-example/*.safetensors
+candle-wasm-example/package-lock.json
diff --git a/candle-wasm-example/Cargo.toml b/candle-wasm-example/Cargo.toml
index e4a2319c..a76ce940 100644
--- a/candle-wasm-example/Cargo.toml
+++ b/candle-wasm-example/Cargo.toml
@@ -16,17 +16,25 @@ crate-type = ["cdylib"]
[dependencies]
candle = { path = "../candle-core" }
candle-nn = { path = "../candle-nn" }
-wasm-bindgen = "0.2.87"
-getrandom = { version = "0.2", features = ["js"] }
+num-traits = "0.2.15"
tokenizers = { version = "0.13.3", default-features=false, features=["unstable_wasm"] }
+
+# App crates.
+anyhow = "1.0.71"
+log = "0.4"
+rand = "0.8.5"
serde = { version = "1.0.166", features = ["derive"] }
serde_json = "1.0.99"
wav = "1.0.0"
-rand = "0.8.5"
-num-traits = "0.2.15"
-anyhow = "1.0.71"
+
+# Wasm specific crates.
+getrandom = { version = "0.2", features = ["js"] }
+gloo = "0.8"
js-sys = "0.3.64"
+wasm-bindgen = "0.2.87"
wasm-bindgen-futures = "0.4.37"
+wasm-logger = "0.2"
+yew = { version = "0.20.0", features = ["csr"] }
[dependencies.web-sys]
version = "0.3.64"
diff --git a/candle-wasm-example/index.html b/candle-wasm-example/index.html
index 2ba74929..a878197c 100644
--- a/candle-wasm-example/index.html
+++ b/candle-wasm-example/index.html
@@ -1,9 +1,16 @@
-<html>
+<!DOCTYPE html>
+<html>
   <head>
-    <meta charset="utf-8"/>
-    <title>Hello Candle - Rust</title>
+    <meta charset="utf-8" />
+    <title>Welcome to Candle!</title>
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <link rel="modulepreload" href="./pkg/candle_wasm_example.js" />
+    <link rel="preload" href="./pkg/candle_wasm_example_bg.wasm" as="fetch" crossorigin />
+    <link rel="preload" href="tokenizer.en.json" as="fetch" crossorigin />
+    <link rel="preload" href="mel_filters.safetensors" as="fetch" crossorigin />
+    <link rel="preload" href="tiny.en.safetensors" as="fetch" crossorigin />
   </head>
   <body>
-    <script type="module" src="index.js"></script>
+    <script type="module" src="main.js"></script>
   </body>
 </html>
diff --git a/candle-wasm-example/index.js b/candle-wasm-example/index.js
deleted file mode 100644
index 74bb5e99..00000000
--- a/candle-wasm-example/index.js
+++ /dev/null
@@ -1,8 +0,0 @@
-import init from "./pkg/candle_wasm.js";
-
-const runWasm = async () => {
- const candleWasm = await init("./pkg/candle_wasm_bg.wasm");
- candleWasm.test_fn();
- await candleWasm.run_fn();
-};
-runWasm();
diff --git a/candle-wasm-example/main.js b/candle-wasm-example/main.js
new file mode 100644
index 00000000..d2d7de05
--- /dev/null
+++ b/candle-wasm-example/main.js
@@ -0,0 +1,6 @@
+import init, { run_app } from './pkg/candle_wasm_example.js';
+async function main() {
+ await init('/pkg/candle_wasm_example_bg.wasm');
+ run_app();
+}
+main()
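
Note: `main.js` assumes the crate has been built for the web with wasm-bindgen, so that `./pkg/candle_wasm_example.js` and the `_bg.wasm` binary exist (the build invocation itself is not part of this diff), and that the module exports a `run_app` entry point. The Rust side of that contract is the wasm-bindgen export added to `src/lib.rs` at the end of this diff; as a reading aid, it reduces to:

```rust
// Sketch of the entry point that main.js calls into; the actual definition
// is in the src/lib.rs hunk at the end of this diff.
use wasm_bindgen::prelude::*;

#[wasm_bindgen]
pub fn run_app() -> Result<(), JsValue> {
    // Route the `log` crate's output to the browser console...
    wasm_logger::init(wasm_logger::Config::new(log::Level::Trace));
    // ...and mount the Yew component defined in src/app.rs onto the page.
    yew::Renderer::<app::App>::new().render();
    Ok(())
}
```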
diff --git a/candle-wasm-example/src/app.rs b/candle-wasm-example/src/app.rs
new file mode 100644
index 00000000..617c838d
--- /dev/null
+++ b/candle-wasm-example/src/app.rs
@@ -0,0 +1,450 @@
+use crate::model::{Config, Whisper};
+use anyhow::Error as E;
+use candle::{DType, Device, Tensor};
+use candle_nn::VarBuilder;
+use js_sys::Date;
+use rand::distributions::Distribution;
+use tokenizers::Tokenizer;
+use wasm_bindgen::prelude::*;
+use wasm_bindgen_futures::JsFuture;
+use yew::{html, Component, Context, Html};
+
+const SAMPLE_NAMES: [&str; 6] = [
+ "jfk.wav", "a13.wav", "gb0.wav", "gb1.wav", "hp0.wav", "mm0.wav",
+];
+
+pub const DTYPE: DType = DType::F32;
+
+// Audio parameters.
+pub const SAMPLE_RATE: usize = 16000;
+pub const N_FFT: usize = 400;
+pub const N_MELS: usize = 80;
+pub const HOP_LENGTH: usize = 160;
+pub const CHUNK_LENGTH: usize = 30;
+pub const N_SAMPLES: usize = CHUNK_LENGTH * SAMPLE_RATE; // 480000 samples in a 30-second chunk
+pub const N_FRAMES: usize = N_SAMPLES / HOP_LENGTH; // 3000 frames in a mel spectrogram input
+pub const N_SAMPLES_PER_TOKEN: usize = HOP_LENGTH * 2; // the initial convolutions have stride 2
+pub const FRAMES_PER_SECOND: usize = SAMPLE_RATE / HOP_LENGTH; // 10ms per audio frame
+pub const TOKENS_PER_SECOND: usize = SAMPLE_RATE / N_SAMPLES_PER_TOKEN; // 20ms per audio token
+
+pub const NO_SPEECH_THRESHOLD: f64 = 0.6;
+pub const LOGPROB_THRESHOLD: f64 = -1.0;
+pub const TEMPERATURES: [f64; 6] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0];
+pub const COMPRESSION_RATIO_THRESHOLD: f64 = 2.4;
+
+// Tokenizer dependent bits.
+pub const SOT_TOKEN: u32 = 50257;
+pub const EOT_TOKEN: u32 = 50256;
+pub const NO_SPEECH_TOKEN: u32 = 50361;
+pub const NO_TIMESTAMP_TOKEN: u32 = 50362;
+// From the _get_suppress_tokens function + 50362 (no timestamp)
+// https://github.com/openai/whisper/blob/f572f2161ba831bae131364c3bffdead7af6d210/whisper/decoding.py#L605
+pub const SUPPRESS_TOKENS: [u32; 91] = [
+ 1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357,
+ 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782,
+ 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959,
+ 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992,
+ 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549,
+ 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361, 50362,
+];
+
+#[wasm_bindgen]
+extern "C" {
+ // Use `js_namespace` here to bind `console.log(..)` instead of just
+ // `log(..)`
+ #[wasm_bindgen(js_namespace = console)]
+ fn log(s: &str);
+}
+
+macro_rules! console_log {
+ // Note that this is using the `log` function imported above during
+ // `bare_bones`
+ ($($t:tt)*) => (log(&format_args!($($t)*).to_string()))
+}
+
+#[derive(Debug, Clone)]
+struct DecodingResult {
+ tokens: Vec<u32>,
+ text: String,
+ avg_logprob: f64,
+ no_speech_prob: f64,
+ temperature: f64,
+ compression_ratio: f64,
+}
+
+#[derive(Debug, Clone)]
+struct Segment {
+ start: f64,
+ duration: f64,
+ dr: DecodingResult,
+}
+
+pub struct Decoder {
+ model: Whisper,
+ mel_filters: Vec<f32>,
+ tokenizer: Tokenizer,
+ suppress_tokens: Tensor,
+}
+
+impl Decoder {
+ fn new(
+ model: Whisper,
+ tokenizer: Tokenizer,
+ mel_filters: Vec<f32>,
+ device: &Device,
+ ) -> anyhow::Result<Self> {
+ let suppress_tokens: Vec<f32> = (0..model.config.vocab_size as u32)
+ .map(|i| {
+ if SUPPRESS_TOKENS.contains(&i) {
+ f32::NEG_INFINITY
+ } else {
+ 0f32
+ }
+ })
+ .collect();
+ let suppress_tokens = Tensor::new(suppress_tokens.as_slice(), device)?;
+ Ok(Self {
+ model,
+ mel_filters,
+ tokenizer,
+ suppress_tokens,
+ })
+ }
+
+ fn decode(&self, mel: &Tensor, t: f64) -> anyhow::Result<DecodingResult> {
+ let model = &self.model;
+ let audio_features = model.encoder.forward(mel)?;
+ console_log!("audio features: {:?}", audio_features.dims());
+ let sample_len = model.config.max_target_positions / 2;
+ let mut sum_logprob = 0f64;
+ let mut no_speech_prob = f64::NAN;
+ let mut tokens = vec![SOT_TOKEN];
+ for i in 0..sample_len {
+ let tokens_t = Tensor::new(tokens.as_slice(), mel.device())?;
+
+ // The model expects a batch dim but this inference loop does not handle
+ // it so we add it at this point.
+ let tokens_t = tokens_t.unsqueeze(0)?;
+ let logits = model.decoder.forward(&tokens_t, &audio_features)?;
+ let logits = logits.squeeze(0)?;
+
+ // Extract the no speech probability on the first iteration by looking at the first
+ // token logits and the probability for the according token.
+ if i == 0 {
+ no_speech_prob = logits
+ .get(0)?
+ .softmax(0)?
+ .get(NO_SPEECH_TOKEN as usize)?
+ .to_scalar::<f32>()? as f64;
+ }
+
+ let (seq_len, _) = logits.shape().r2()?;
+ let logits = logits
+ .get(seq_len - 1)?
+ .broadcast_add(&self.suppress_tokens)?;
+ let next_token = if t > 0f64 {
+ let prs = (&logits / t)?.softmax(0)?;
+ let logits_v: Vec<f32> = prs.to_vec1()?;
+ let distr = rand::distributions::WeightedIndex::new(&logits_v)?;
+ let mut rng = rand::thread_rng();
+ distr.sample(&mut rng) as u32
+ } else {
+ let logits_v: Vec<f32> = logits.to_vec1()?;
+ logits_v
+ .iter()
+ .enumerate()
+ .max_by(|(_, u), (_, v)| u.total_cmp(v))
+ .map(|(i, _)| i as u32)
+ .unwrap()
+ };
+ tokens.push(next_token);
+ let prob = logits
+ .softmax(candle::D::Minus1)?
+ .get(next_token as usize)?
+ .to_scalar::<f32>()? as f64;
+ if next_token == EOT_TOKEN || tokens.len() > model.config.max_target_positions {
+ break;
+ }
+ sum_logprob += prob.ln();
+ }
+ let text = self
+ .tokenizer
+ .decode(tokens.clone(), true)
+ .map_err(E::msg)?;
+ let avg_logprob = sum_logprob / tokens.len() as f64;
+
+ Ok(DecodingResult {
+ tokens,
+ text,
+ avg_logprob,
+ no_speech_prob,
+ temperature: t,
+ compression_ratio: f64::NAN,
+ })
+ }
+
+ fn decode_with_fallback(&self, segment: &Tensor) -> anyhow::Result<DecodingResult> {
+ for (i, &t) in TEMPERATURES.iter().enumerate() {
+ let dr: Result<DecodingResult, E> = self.decode(segment, t);
+ if i == TEMPERATURES.len() - 1 {
+ return dr;
+ }
+ // On errors, we try again with a different temperature.
+ match dr {
+ Ok(dr) => {
+ let needs_fallback = dr.compression_ratio > COMPRESSION_RATIO_THRESHOLD
+ || dr.avg_logprob < LOGPROB_THRESHOLD;
+ if !needs_fallback || dr.no_speech_prob > NO_SPEECH_THRESHOLD {
+ return Ok(dr);
+ }
+ }
+ Err(err) => {
+ console_log!("Error running at {t}: {err}")
+ }
+ }
+ }
+ unreachable!()
+ }
+
+ fn run(&self, mel: &Tensor) -> anyhow::Result<Vec<Segment>> {
+ let (_, _, content_frames) = mel.shape().r3()?;
+ let mut seek = 0;
+ let mut segments = vec![];
+ while seek < content_frames {
+ let time_offset = (seek * HOP_LENGTH) as f64 / SAMPLE_RATE as f64;
+ let segment_size = usize::min(content_frames - seek, N_FRAMES);
+ let mel_segment = mel.narrow(2, seek, segment_size)?;
+ let segment_duration = (segment_size * HOP_LENGTH) as f64 / SAMPLE_RATE as f64;
+ let dr = self.decode_with_fallback(&mel_segment)?;
+ seek += segment_size;
+ if dr.no_speech_prob > NO_SPEECH_THRESHOLD && dr.avg_logprob < LOGPROB_THRESHOLD {
+ console_log!("no speech detected, skipping {seek} {dr:?}");
+ continue;
+ }
+ let segment = Segment {
+ start: time_offset,
+ duration: segment_duration,
+ dr,
+ };
+ console_log!("{seek}: {segment:?}");
+ segments.push(segment)
+ }
+ Ok(segments)
+ }
+
+ async fn load() -> Result<Decoder, JsValue> {
+ let device = Device::Cpu;
+ let tokenizer_config = fetch_url("tokenizer.en.json").await?;
+ let tokenizer = Tokenizer::from_bytes(tokenizer_config).map_err(w)?;
+
+ let mel_filters = fetch_url("mel_filters.safetensors").await?;
+ let mel_filters = candle::safetensors::SafeTensors::from_buffer(&mel_filters).map_err(w)?;
+ let mel_filters = mel_filters.tensor("mel_80", &device).map_err(w)?;
+ console_log!("loaded mel filters {:?}", mel_filters.shape());
+ let mel_filters = mel_filters
+ .flatten_all()
+ .map_err(w)?
+ .to_vec1::<f32>()
+ .map_err(w)?;
+ let weights = fetch_url("tiny.en.safetensors").await?;
+ let weights = candle::safetensors::SafeTensors::from_buffer(&weights).map_err(w)?;
+ let vb = VarBuilder::from_safetensors(vec![weights], DTYPE, &device);
+ let config = Config::tiny_en();
+ let whisper = Whisper::load(&vb, config).map_err(w)?;
+ console_log!("done loading model");
+ let model = Decoder::new(whisper, tokenizer, mel_filters, &device).map_err(w)?;
+ Ok(model)
+ }
+
+ async fn load_and_run(&self, name: &str) -> Result<Vec<Segment>, JsValue> {
+ let device = Device::Cpu;
+ let wav_input = fetch_url(name).await?;
+ let mut wav_input = std::io::Cursor::new(wav_input);
+ let (header, data) = wav::read(&mut wav_input).map_err(w)?;
+ console_log!("loaded wav data: {header:?}");
+ if header.sampling_rate != SAMPLE_RATE as u32 {
+ Err(format!(
+ "wav file must have a {} sampling rate",
+ SAMPLE_RATE
+ ))?
+ }
+ let data = data.as_sixteen().expect("expected 16 bit wav file");
+ let pcm_data: Vec<_> = data[..data.len() / header.channel_count as usize]
+ .iter()
+ .map(|v| *v as f32 / 32768.)
+ .collect();
+ console_log!("pcm data loaded {}", pcm_data.len());
+ let mel = crate::audio::pcm_to_mel(&pcm_data, &self.mel_filters).map_err(w)?;
+ let mel_len = mel.len();
+ let mel = Tensor::from_vec(mel, (1, N_MELS, mel_len / N_MELS), &device).map_err(w)?;
+ console_log!("loaded mel: {:?}", mel.dims());
+
+ let segments = self.run(&mel).map_err(w)?;
+ Ok(segments)
+ }
+}
+
+async fn fetch_url(url: &str) -> Result<Vec<u8>, JsValue> {
+ use web_sys::{Request, RequestCache, RequestInit, RequestMode, Response};
+ let window = web_sys::window().ok_or("window")?;
+ let mut opts = RequestInit::new();
+ let opts = opts
+ .method("GET")
+ .mode(RequestMode::Cors)
+ .cache(RequestCache::NoCache);
+
+ let request = Request::new_with_str_and_init(url, opts)?;
+
+ let resp_value = JsFuture::from(window.fetch_with_request(&request)).await?;
+
+ // `resp_value` is a `Response` object.
+ assert!(resp_value.is_instance_of::<Response>());
+ let resp: Response = resp_value.dyn_into()?;
+ let data = JsFuture::from(resp.blob()?).await?;
+ let blob = web_sys::Blob::from(data);
+ let array_buffer = JsFuture::from(blob.array_buffer()).await?;
+ let data = js_sys::Uint8Array::new(&array_buffer).to_vec();
+ Ok(data)
+}
+
+fn w<T: ToString>(x: T) -> String {
+ x.to_string()
+}
+
+pub enum Msg {
+ Run(usize),
+ UpdateStatus(String),
+ RunFinished(String),
+ SetDecoder(Decoder),
+}
+
+pub struct App {
+ status: String,
+ content: String,
+ decode_in_flight: bool,
+ decoder: Option<std::sync::Arc<Decoder>>,
+}
+
+impl Component for App {
+ type Message = Msg;
+ type Properties = ();
+
+ fn create(_ctx: &Context<Self>) -> Self {
+ let status = "loading weights".to_string();
+ Self {
+ status,
+ content: String::new(),
+ decode_in_flight: false,
+ decoder: None,
+ }
+ }
+
+ fn rendered(&mut self, ctx: &Context<Self>, first_render: bool) {
+ if first_render {
+ ctx.link().send_future(async {
+ match Decoder::load().await {
+ Err(err) => {
+ let status = format!("{err:?}");
+ Msg::UpdateStatus(status)
+ }
+ Ok(decoder) => Msg::SetDecoder(decoder),
+ }
+ });
+ }
+ }
+
+ fn update(&mut self, ctx: &Context<Self>, msg: Self::Message) -> bool {
+ match msg {
+ Msg::SetDecoder(decoder) => {
+ self.status = "weights loaded succesfully!".to_string();
+ self.decoder = Some(std::sync::Arc::new(decoder));
+ true
+ }
+ Msg::Run(sample_index) => {
+ let sample = SAMPLE_NAMES[sample_index];
+ match &self.decoder {
+ None => self.content = "waiting for weights to load".to_string(),
+ Some(decoder) => {
+ if self.decode_in_flight {
+ self.content = "already decoding some sample at the moment".to_string()
+ } else {
+ let decoder = decoder.clone();
+ self.decode_in_flight = true;
+ self.status = format!("decoding {sample}");
+ self.content = String::new();
+ ctx.link().send_future(async move {
+ let content = decoder.load_and_run(sample).await;
+ let content = match content {
+ Err(err) => format!("decoding error: {err:?}"),
+ Ok(segments) => format!("decoded successfully: {segments:?}"),
+ };
+ Msg::RunFinished(content)
+ })
+ }
+ //
+ }
+ }
+ true
+ }
+ Msg::RunFinished(content) => {
+ self.status = "Run finished!".to_string();
+ self.content = content;
+ self.decode_in_flight = false;
+ true
+ }
+ Msg::UpdateStatus(status) => {
+ self.status = status;
+ true
+ }
+ }
+ }
+
+ fn view(&self, ctx: &Context<Self>) -> Html {
+ html! {
+ <div>
+ <table>
+ <thead>
+ <tr>
+ <th>{"Sample"}</th>
+ <th></th>
+ <th></th>
+ </tr>
+ </thead>
+ <tbody>
+ {
+ SAMPLE_NAMES.iter().enumerate().map(|(i, name)| { html! {
+ <tr>
+ <th>{name}</th>
+ <th><audio controls=true src={format!("./{name}")}></audio></th>
+ <th><button class="button" onclick={ctx.link().callback(move |_| Msg::Run(i))}>{"run"}</button></th>
+ </tr>
+ }
+ }).collect::<Html>()
+ }
+ </tbody>
+ </table>
+ <h2>
+ {&self.status}
+ </h2>
+ {
+ if self.decode_in_flight {
+ html! { <progress id="progress-bar" aria-label="decoding..."></progress> }
+ } else { html! {
+ <blockquote>
+ <p>
+ {&self.content}
+ </p>
+ </blockquote>
+ }
+ }
+ }
+
+ // Display the current date and time the page was rendered
+ <p class="footer">
+ {"Rendered: "}
+ {String::from(Date::new_0().to_string())}
+ </p>
+ </div>
+ }
+ }
+}
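
The `Decoder` above is deliberately UI-agnostic: `load()` fetches and deserializes the tokenizer, mel filterbank, and weights, while `load_and_run(name)` fetches one wav sample and returns the decoded segments, so the Yew component only shuttles messages. As a minimal sketch of driving it without the UI (a hypothetical `transcribe` export that is not part of this diff; it would have to live inside `src/app.rs`, since `Decoder::load` and `Segment` are private to that module):

```rust
// Hypothetical helper inside src/app.rs: run one sample end-to-end without
// the Yew front-end. Assumes the same statically served files as load():
// tokenizer.en.json, mel_filters.safetensors, tiny.en.safetensors, *.wav.
#[wasm_bindgen]
pub async fn transcribe(sample: String) -> Result<(), JsValue> {
    let decoder = Decoder::load().await?;
    let segments = decoder.load_and_run(&sample).await?;
    // Segment derives Debug, so the decoded text and timestamps print as-is.
    log::info!("{segments:?}");
    Ok(())
}
```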
diff --git a/candle-wasm-example/src/audio.rs b/candle-wasm-example/src/audio.rs
index d095e239..d73c3142 100644
--- a/candle-wasm-example/src/audio.rs
+++ b/candle-wasm-example/src/audio.rs
@@ -1,5 +1,6 @@
// Audio processing code, adapted from whisper.cpp
// https://github.com/ggerganov/whisper.cpp
+use super::app;
pub trait Float: num_traits::Float + num_traits::FloatConst + num_traits::NumAssign {}
@@ -169,7 +170,7 @@ fn log_mel_spectrogram_(
let n_len = samples.len() / fft_step;
// pad audio with at least one extra chunk of zeros
- let pad = 100 * super::CHUNK_LENGTH / 2;
+ let pad = 100 * app::CHUNK_LENGTH / 2;
let n_len = if n_len % pad != 0 {
(n_len / pad + 1) * pad
} else {
@@ -207,9 +208,9 @@ pub fn pcm_to_mel(
let mel = log_mel_spectrogram_(
samples,
filters,
- super::N_FFT,
- super::HOP_LENGTH,
- super::N_MELS,
+ app::N_FFT,
+ app::HOP_LENGTH,
+ app::N_MELS,
false,
);
Ok(mel)
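
As a quick sanity check on the constants that `log_mel_spectrogram_` and `pcm_to_mel` now pull from `app` (a standalone snippet, not part of the diff): a 30-second chunk at 16 kHz is 480,000 samples, and a 160-sample hop yields 3,000 mel frames, matching the `N_SAMPLES` and `N_FRAMES` comments in `src/app.rs`.

```rust
// Standalone check of the audio geometry shared by src/app.rs and src/audio.rs.
fn main() {
    const SAMPLE_RATE: usize = 16_000; // Hz, required of the input wav files
    const HOP_LENGTH: usize = 160; // samples between successive FFT windows
    const CHUNK_LENGTH: usize = 30; // seconds of audio per model input

    let n_samples = CHUNK_LENGTH * SAMPLE_RATE;
    let n_frames = n_samples / HOP_LENGTH;
    assert_eq!(n_samples, 480_000); // N_SAMPLES
    assert_eq!(n_frames, 3_000); // N_FRAMES
    // Each frame therefore spans HOP_LENGTH / SAMPLE_RATE = 10 ms of audio.
    println!("{n_samples} samples -> {n_frames} mel frames per 30 s chunk");
}
```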
diff --git a/candle-wasm-example/src/lib.rs b/candle-wasm-example/src/lib.rs
index 8e51f872..c4c0a3cf 100644
--- a/candle-wasm-example/src/lib.rs
+++ b/candle-wasm-example/src/lib.rs
@@ -1,335 +1,14 @@
#![allow(dead_code)]
-use anyhow::Error as E;
-use candle::{DType, Device, Tensor};
-use candle_nn::VarBuilder;
-use rand::{distributions::Distribution, SeedableRng};
-use tokenizers::Tokenizer;
use wasm_bindgen::prelude::*;
-use wasm_bindgen_futures::JsFuture;
+mod app;
mod audio;
mod model;
-use model::{Config, Whisper};
-
-const DTYPE: DType = DType::F32;
-
-// Audio parameters.
-const SAMPLE_RATE: usize = 16000;
-const N_FFT: usize = 400;
-const N_MELS: usize = 80;
-const HOP_LENGTH: usize = 160;
-const CHUNK_LENGTH: usize = 30;
-const N_SAMPLES: usize = CHUNK_LENGTH * SAMPLE_RATE; // 480000 samples in a 30-second chunk
-const N_FRAMES: usize = N_SAMPLES / HOP_LENGTH; // 3000 frames in a mel spectrogram input
-const N_SAMPLES_PER_TOKEN: usize = HOP_LENGTH * 2; // the initial convolutions has stride 2
-const FRAMES_PER_SECOND: usize = SAMPLE_RATE / HOP_LENGTH; // 10ms per audio frame
-const TOKENS_PER_SECOND: usize = SAMPLE_RATE / N_SAMPLES_PER_TOKEN; // 20ms per audio token
-
-const NO_SPEECH_THRESHOLD: f64 = 0.6;
-const LOGPROB_THRESHOLD: f64 = -1.0;
-const TEMPERATURES: [f64; 6] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0];
-const COMPRESSION_RATIO_THRESHOLD: f64 = 2.4;
-
-// Tokenizer dependent bits.
-const SOT_TOKEN: u32 = 50257;
-const EOT_TOKEN: u32 = 50256;
-const NO_SPEECH_TOKEN: u32 = 50361;
-const NO_TIMESTAMP_TOKEN: u32 = 50362;
-// From the _get_suppress_tokens function + 50362 (no timestamp)
-// https://github.com/openai/whisper/blob/f572f2161ba831bae131364c3bffdead7af6d210/whisper/decoding.py#L605
-const SUPPRESS_TOKENS: [u32; 91] = [
- 1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 357,
- 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391, 1635, 1782,
- 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959,
- 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791, 17992,
- 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409, 34949, 40283, 40493, 40549,
- 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361, 50362,
-];
#[wasm_bindgen]
-extern "C" {
- // Use `js_namespace` here to bind `console.log(..)` instead of just
- // `log(..)`
- #[wasm_bindgen(js_namespace = console)]
- fn log(s: &str);
-}
+pub fn run_app() -> Result<(), JsValue> {
+ wasm_logger::init(wasm_logger::Config::new(log::Level::Trace));
+ yew::Renderer::<app::App>::new().render();
-macro_rules! console_log {
- // Note that this is using the `log` function imported above during
- // `bare_bones`
- ($($t:tt)*) => (log(&format_args!($($t)*).to_string()))
-}
-
-#[derive(Debug, Clone)]
-struct DecodingResult {
- tokens: Vec<u32>,
- text: String,
- avg_logprob: f64,
- no_speech_prob: f64,
- temperature: f64,
- compression_ratio: f64,
-}
-
-#[derive(Debug, Clone)]
-struct Segment {
- start: f64,
- duration: f64,
- dr: DecodingResult,
-}
-
-struct Decoder {
- model: Whisper,
- rng: rand::rngs::StdRng,
- tokenizer: Tokenizer,
- suppress_tokens: Tensor,
-}
-
-impl Decoder {
- fn new(
- model: Whisper,
- tokenizer: Tokenizer,
- seed: u64,
- device: &Device,
- ) -> anyhow::Result<Self> {
- let suppress_tokens: Vec<f32> = (0..model.config.vocab_size as u32)
- .map(|i| {
- if SUPPRESS_TOKENS.contains(&i) {
- f32::NEG_INFINITY
- } else {
- 0f32
- }
- })
- .collect();
- let suppress_tokens = Tensor::new(suppress_tokens.as_slice(), device)?;
- Ok(Self {
- model,
- rng: rand::rngs::StdRng::seed_from_u64(seed),
- tokenizer,
- suppress_tokens,
- })
- }
-
- fn decode(&mut self, mel: &Tensor, t: f64) -> anyhow::Result<DecodingResult> {
- let model = &self.model;
- let audio_features = model.encoder.forward(mel)?;
- console_log!("audio features: {:?}", audio_features.dims());
- let sample_len = model.config.max_target_positions / 2;
- let mut sum_logprob = 0f64;
- let mut no_speech_prob = f64::NAN;
- let mut tokens = vec![SOT_TOKEN];
- for i in 0..sample_len {
- let tokens_t = Tensor::new(tokens.as_slice(), mel.device())?;
-
- // The model expects a batch dim but this inference loop does not handle
- // it so we add it at this point.
- let tokens_t = tokens_t.unsqueeze(0)?;
- let logits = model.decoder.forward(&tokens_t, &audio_features)?;
- let logits = logits.squeeze(0)?;
-
- // Extract the no speech probability on the first iteration by looking at the first
- // token logits and the probability for the according token.
- if i == 0 {
- no_speech_prob = logits
- .get(0)?
- .softmax(0)?
- .get(NO_SPEECH_TOKEN as usize)?
- .to_scalar::<f32>()? as f64;
- }
-
- let (seq_len, _) = logits.shape().r2()?;
- let logits = logits
- .get(seq_len - 1)?
- .broadcast_add(&self.suppress_tokens)?;
- let next_token = if t > 0f64 {
- let prs = (&logits / t)?.softmax(0)?;
- let logits_v: Vec<f32> = prs.to_vec1()?;
- let distr = rand::distributions::WeightedIndex::new(&logits_v)?;
- distr.sample(&mut self.rng) as u32
- } else {
- let logits_v: Vec<f32> = logits.to_vec1()?;
- logits_v
- .iter()
- .enumerate()
- .max_by(|(_, u), (_, v)| u.total_cmp(v))
- .map(|(i, _)| i as u32)
- .unwrap()
- };
- tokens.push(next_token);
- let prob = logits
- .softmax(candle::D::Minus1)?
- .get(next_token as usize)?
- .to_scalar::<f32>()? as f64;
- if next_token == EOT_TOKEN || tokens.len() > model.config.max_target_positions {
- break;
- }
- sum_logprob += prob.ln();
- }
- let text = self
- .tokenizer
- .decode(tokens.clone(), true)
- .map_err(E::msg)?;
- let avg_logprob = sum_logprob / tokens.len() as f64;
-
- Ok(DecodingResult {
- tokens,
- text,
- avg_logprob,
- no_speech_prob,
- temperature: t,
- compression_ratio: f64::NAN,
- })
- }
-
- fn decode_with_fallback(&mut self, segment: &Tensor) -> anyhow::Result<DecodingResult> {
- for (i, &t) in TEMPERATURES.iter().enumerate() {
- let dr: Result<DecodingResult, E> = self.decode(segment, t);
- if i == TEMPERATURES.len() - 1 {
- return dr;
- }
- // On errors, we try again with a different temperature.
- match dr {
- Ok(dr) => {
- let needs_fallback = dr.compression_ratio > COMPRESSION_RATIO_THRESHOLD
- || dr.avg_logprob < LOGPROB_THRESHOLD;
- if !needs_fallback || dr.no_speech_prob > NO_SPEECH_THRESHOLD {
- return Ok(dr);
- }
- }
- Err(err) => {
- console_log!("Error running at {t}: {err}")
- }
- }
- }
- unreachable!()
- }
-
- fn run(&mut self, mel: &Tensor) -> anyhow::Result<Vec<Segment>> {
- let (_, _, content_frames) = mel.shape().r3()?;
- let mut seek = 0;
- let mut segments = vec![];
- while seek < content_frames {
- let time_offset = (seek * HOP_LENGTH) as f64 / SAMPLE_RATE as f64;
- let segment_size = usize::min(content_frames - seek, N_FRAMES);
- let mel_segment = mel.narrow(2, seek, segment_size)?;
- let segment_duration = (segment_size * HOP_LENGTH) as f64 / SAMPLE_RATE as f64;
- let dr = self.decode_with_fallback(&mel_segment)?;
- seek += segment_size;
- if dr.no_speech_prob > NO_SPEECH_THRESHOLD && dr.avg_logprob < LOGPROB_THRESHOLD {
- console_log!("no speech detected, skipping {seek} {dr:?}");
- continue;
- }
- let segment = Segment {
- start: time_offset,
- duration: segment_duration,
- dr,
- };
- console_log!("{seek}: {segment:?}");
- segments.push(segment)
- }
- Ok(segments)
- }
-}
-
-async fn fetch_url(url: &str) -> Result<Vec<u8>, JsValue> {
- use web_sys::{Request, RequestCache, RequestInit, RequestMode, Response};
- let window = web_sys::window().ok_or("window")?;
- let mut opts = RequestInit::new();
- let opts = opts
- .method("GET")
- .mode(RequestMode::Cors)
- .cache(RequestCache::NoCache);
-
- let request = Request::new_with_str_and_init(url, opts)?;
-
- let resp_value = JsFuture::from(window.fetch_with_request(&request)).await?;
-
- // `resp_value` is a `Response` object.
- assert!(resp_value.is_instance_of::<Response>());
- let resp: Response = resp_value.dyn_into()?;
- let data = JsFuture::from(resp.blob()?).await?;
- let blob = web_sys::Blob::from(data);
- let array_buffer = JsFuture::from(blob.array_buffer()).await?;
- let data = js_sys::Uint8Array::new(&array_buffer).to_vec();
- Ok(data)
-}
-
-fn w<T: ToString>(x: T) -> String {
- x.to_string()
-}
-
-async fn run_impl() -> Result<(), JsValue> {
- let device = Device::Cpu;
- let tokenizer_config = fetch_url("tokenizer.en.json").await?;
- let tokenizer = Tokenizer::from_bytes(tokenizer_config).map_err(w)?;
-
- let mel_filters = fetch_url("mel_filters.safetensors").await?;
- let mel_filters = candle::safetensors::SafeTensors::from_buffer(&mel_filters).map_err(w)?;
- let mel_filters = mel_filters.tensor("mel_80", &device).map_err(w)?;
- console_log!("loaded mel filters {:?}", mel_filters.shape());
- let mel_filters = mel_filters
- .flatten_all()
- .map_err(w)?
- .to_vec1::<f32>()
- .map_err(w)?;
-
- let wav_input = fetch_url("jfk.wav").await?;
- let mut wav_input = std::io::Cursor::new(wav_input);
- let (header, data) = wav::read(&mut wav_input).map_err(w)?;
- console_log!("loaded wav data: {header:?}");
- if header.sampling_rate != SAMPLE_RATE as u32 {
- Err(format!(
- "wav file must have a {} sampling rate",
- SAMPLE_RATE
- ))?
- }
- let data = data.as_sixteen().expect("expected 16 bit wav file");
- let pcm_data: Vec<_> = data[..data.len() / header.channel_count as usize]
- .iter()
- .map(|v| *v as f32 / 32768.)
- .collect();
- console_log!("pcm data loaded {}", pcm_data.len());
- let mel = audio::pcm_to_mel(&pcm_data, &mel_filters).map_err(w)?;
- let mel_len = mel.len();
- let mel = Tensor::from_vec(mel, (1, N_MELS, mel_len / N_MELS), &device).map_err(w)?;
- console_log!("loaded mel: {:?}", mel.dims());
-
- let weights = fetch_url("tiny.en.safetensors").await?;
- let weights = candle::safetensors::SafeTensors::from_buffer(&weights).map_err(w)?;
- let vb = VarBuilder::from_safetensors(vec![weights], DTYPE, &device);
- let config = Config::tiny_en();
- let model = Whisper::load(&vb, config).map_err(w)?;
- let mut dc = Decoder::new(model, tokenizer, 299792458, &device).map_err(w)?;
- dc.run(&mel).map_err(w)?;
- Ok(())
-}
-
-fn test_fn_impl() -> anyhow::Result<String> {
- let t1 = Tensor::randn((3, 4), DType::F32, &Device::Cpu, 0., 1.)?;
- let t2 = Tensor::randn((4, 2), DType::F32, &Device::Cpu, 0., 1.)?;
- let t = t1.matmul(&t2)?;
- console_log!("matmul result: {t}");
- let res = format!("Hello Candle!\n\nt1:\n{t1}\n\nt2:\n{t2}\n\nt1@t2:\n{t}\n");
- Ok(res)
-}
-
-#[wasm_bindgen]
-pub fn test_fn() -> std::result::Result<(), JsValue> {
- let result = match test_fn_impl() {
- Ok(v) => v,
- Err(err) => format!("error: {err:?}"),
- };
- let window = web_sys::window().expect("no global `window` exists");
- let document = window.document().expect("should have a document on window");
- let p_element = document.create_element("p")?;
- p_element.set_text_content(Some(&result));
- let body = document.body().expect("document should have a body");
- body.append_child(&p_element)?;
- Ok(())
-}
-
-#[wasm_bindgen]
-pub async fn run_fn() -> std::result::Result<(), JsValue> {
- console_log!("run_fn starting...");
- run_impl().await?;
Ok(())
}