// https://github.com/openai/whisper/blob/main/whisper/model.py/rgs // TODO: // - kv-cache support? // - Batch size greater than 1. // - More token filters (SuppressBlanks, ApplyTimestampRules). #[cfg(feature = "mkl")] extern crate intel_mkl_src; use anyhow::{Error as E, Result}; use candle::{DType, Device, Tensor}; use candle_nn::{ops::softmax, VarBuilder}; use clap::{Parser, ValueEnum}; use hf_hub::{api::sync::Api, Repo, RepoType}; use rand::{distributions::Distribution, SeedableRng}; use tokenizers::Tokenizer; mod audio; mod model; use model::{Config, Whisper}; mod multilingual; const DTYPE: DType = DType::F32; // Audio parameters. const SAMPLE_RATE: usize = 16000; const N_FFT: usize = 400; const N_MELS: usize = 80; const HOP_LENGTH: usize = 160; const CHUNK_LENGTH: usize = 30; const N_SAMPLES: usize = CHUNK_LENGTH * SAMPLE_RATE; // 480000 samples in a 30-second chunk const N_FRAMES: usize = N_SAMPLES / HOP_LENGTH; // 3000 frames in a mel spectrogram input const NO_SPEECH_THRESHOLD: f64 = 0.6; const LOGPROB_THRESHOLD: f64 = -1.0; const TEMPERATURES: [f64; 6] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]; const COMPRESSION_RATIO_THRESHOLD: f64 = 2.4; // Tokenizer dependent bits. const SOT_TOKEN: &str = "<|startoftranscript|>"; const TRANSCRIBE_TOKEN: &str = "<|transcribe|>"; const EOT_TOKEN: &str = "<|endoftext|>"; const NO_SPEECH_TOKEN: &str = "<|nocaptions|>"; #[allow(dead_code)] #[derive(Debug, Clone)] struct DecodingResult { tokens: Vec, text: String, avg_logprob: f64, no_speech_prob: f64, temperature: f64, compression_ratio: f64, } #[allow(dead_code)] #[derive(Debug, Clone)] struct Segment { start: f64, duration: f64, dr: DecodingResult, } struct Decoder { model: Whisper, rng: rand::rngs::StdRng, tokenizer: Tokenizer, suppress_tokens: Tensor, sot_token: u32, transcribe_token: u32, eot_token: u32, no_speech_token: u32, language_token: Option, } impl Decoder { fn new( model: Whisper, tokenizer: Tokenizer, seed: u64, device: &Device, language_token: Option, ) -> Result { let suppress_tokens: Vec = (0..model.config.vocab_size as u32) .map(|i| { if model.config.suppress_tokens.contains(&i) { f32::NEG_INFINITY } else { 0f32 } }) .collect(); let suppress_tokens = Tensor::new(suppress_tokens.as_slice(), device)?; let sot_token = token_id(&tokenizer, SOT_TOKEN)?; let transcribe_token = token_id(&tokenizer, TRANSCRIBE_TOKEN)?; let eot_token = token_id(&tokenizer, EOT_TOKEN)?; let no_speech_token = token_id(&tokenizer, NO_SPEECH_TOKEN)?; Ok(Self { model, rng: rand::rngs::StdRng::seed_from_u64(seed), tokenizer, suppress_tokens, sot_token, transcribe_token, eot_token, no_speech_token, language_token, }) } fn decode(&mut self, mel: &Tensor, t: f64) -> Result { let model = &mut self.model; let audio_features = model.encoder.forward(mel, true)?; println!("audio features: {:?}", audio_features.dims()); let sample_len = model.config.max_target_positions / 2; let mut sum_logprob = 0f64; let mut no_speech_prob = f64::NAN; let mut tokens = vec![self.sot_token]; if let Some(language_token) = self.language_token { tokens.push(language_token) } tokens.push(self.transcribe_token); for i in 0..sample_len { let tokens_t = Tensor::new(tokens.as_slice(), mel.device())?; // The model expects a batch dim but this inference loop does not handle // it so we add it at this point. let tokens_t = tokens_t.unsqueeze(0)?; let logits = model.decoder.forward(&tokens_t, &audio_features, i == 0)?; let logits = logits.squeeze(0)?; // Extract the no speech probability on the first iteration by looking at the first // token logits and the probability for the according token. if i == 0 { no_speech_prob = softmax(&logits.get(0)?, 0)? .get(self.no_speech_token as usize)? .to_scalar::()? as f64; } let (seq_len, _) = logits.dims2()?; let logits = logits .get(seq_len - 1)? .broadcast_add(&self.suppress_tokens)?; let next_token = if t > 0f64 { let prs = softmax(&(&logits / t)?, 0)?; let logits_v: Vec = prs.to_vec1()?; let distr = rand::distributions::WeightedIndex::new(&logits_v)?; distr.sample(&mut self.rng) as u32 } else { let logits_v: Vec = logits.to_vec1()?; logits_v .iter() .enumerate() .max_by(|(_, u), (_, v)| u.total_cmp(v)) .map(|(i, _)| i as u32) .unwrap() }; tokens.push(next_token); let prob = softmax(&logits, candle::D::Minus1)? .get(next_token as usize)? .to_scalar::()? as f64; if next_token == self.eot_token || tokens.len() > model.config.max_target_positions { break; } sum_logprob += prob.ln(); } let text = self .tokenizer .decode(tokens.clone(), true) .map_err(E::msg)?; let avg_logprob = sum_logprob / tokens.len() as f64; Ok(DecodingResult { tokens, text, avg_logprob, no_speech_prob, temperature: t, compression_ratio: f64::NAN, }) } fn decode_with_fallback(&mut self, segment: &Tensor) -> Result { for (i, &t) in TEMPERATURES.iter().enumerate() { let dr: Result = self.decode(segment, t); if i == TEMPERATURES.len() - 1 { return dr; } // On errors, we try again with a different temperature. match dr { Ok(dr) => { let needs_fallback = dr.compression_ratio > COMPRESSION_RATIO_THRESHOLD || dr.avg_logprob < LOGPROB_THRESHOLD; if !needs_fallback || dr.no_speech_prob > NO_SPEECH_THRESHOLD { return Ok(dr); } } Err(err) => { println!("Error running at {t}: {err}") } } } unreachable!() } fn run(&mut self, mel: &Tensor) -> Result> { let (_, _, content_frames) = mel.dims3()?; let mut seek = 0; let mut segments = vec![]; while seek < content_frames { let start = std::time::Instant::now(); let time_offset = (seek * HOP_LENGTH) as f64 / SAMPLE_RATE as f64; let segment_size = usize::min(content_frames - seek, N_FRAMES); let mel_segment = mel.narrow(2, seek, segment_size)?; let segment_duration = (segment_size * HOP_LENGTH) as f64 / SAMPLE_RATE as f64; let dr = self.decode_with_fallback(&mel_segment)?; seek += segment_size; if dr.no_speech_prob > NO_SPEECH_THRESHOLD && dr.avg_logprob < LOGPROB_THRESHOLD { println!("no speech detected, skipping {seek} {dr:?}"); continue; } let segment = Segment { start: time_offset, duration: segment_duration, dr, }; println!("{seek}: {segment:?}, in {:?}", start.elapsed()); segments.push(segment) } Ok(segments) } } pub fn token_id(tokenizer: &Tokenizer, token: &str) -> candle::Result { match tokenizer.token_to_id(token) { None => candle::bail!("no token-id for {token}"), Some(id) => Ok(id), } } #[derive(Clone, Copy, Debug, ValueEnum)] enum WhichModel { Tiny, TinyEn, Base, BaseEn, SmallEn, MediumEn, LargeV2, } impl WhichModel { fn is_multilingual(&self) -> bool { match self { Self::Tiny | Self::Base | Self::LargeV2 => true, Self::TinyEn | Self::BaseEn | Self::SmallEn | Self::MediumEn => false, } } fn model_and_revision(&self) -> (&'static str, &'static str) { match self { Self::Tiny => ("openai/whisper-tiny", "main"), Self::TinyEn => ("openai/whisper-tiny.en", "refs/pr/15"), Self::Base => ("openai/whisper-base", "refs/pr/22"), Self::BaseEn => ("openai/whisper-base.en", "refs/pr/13"), Self::SmallEn => ("openai/whisper-small.en", "refs/pr/10"), Self::MediumEn => ("openai/whisper-medium.en", "refs/pr/11"), Self::LargeV2 => ("openai/whisper-large-v2", "refs/pr/57"), } } } #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] struct Args { /// Run on CPU rather than on GPU. #[arg(long)] cpu: bool, #[arg(long)] model_id: Option, /// The model to use, check out available models: /// https://huggingface.co/models?search=whisper #[arg(long)] revision: Option, /// The model to be used, can be tiny, small, medium. #[arg(long, default_value = "tiny-en")] model: WhichModel, /// The input to be processed, in wav format, will default to `jfk.wav`. Alternatively /// this can be set to sample:jfk, sample:gb1, ... to fetch a sample from the following /// repo: https://huggingface.co/datasets/Narsil/candle_demo/ #[arg(long)] input: Option, /// The seed to use when generating random samples. #[arg(long, default_value_t = 299792458)] seed: u64, /// Enable tracing (generates a trace-timestamp.json file). #[arg(long)] tracing: bool, /// Language. #[arg(long)] language: Option, } fn main() -> Result<()> { use tracing_chrome::ChromeLayerBuilder; use tracing_subscriber::prelude::*; let args = Args::parse(); let _guard = if args.tracing { println!("tracing..."); let (chrome_layer, guard) = ChromeLayerBuilder::new().build(); tracing_subscriber::registry().with(chrome_layer).init(); Some(guard) } else { None }; let device = candle_examples::device(args.cpu)?; let (default_model, default_revision) = args.model.model_and_revision(); let default_model = default_model.to_string(); let default_revision = default_revision.to_string(); let path = std::path::PathBuf::from(default_model.clone()); let (model_id, revision) = match (args.model_id, args.revision) { (Some(model_id), Some(revision)) => (model_id, revision), (Some(model_id), None) => (model_id, "main".to_string()), (None, Some(revision)) => (default_model, revision), (None, None) => (default_model, default_revision), }; let (config_filename, tokenizer_filename, weights_filename, input) = if path.exists() { let mut config_filename = path.clone(); config_filename.push("config.json"); let mut tokenizer_filename = path.clone(); tokenizer_filename.push("tokenizer.json"); let mut model_filename = path; model_filename.push("model.safetensors"); ( config_filename, tokenizer_filename, model_filename, std::path::PathBuf::from(args.input.expect("You didn't specify a file to read from yet, are using a local model, please add `--input example.wav` to read some audio file")), ) } else { let api = Api::new()?; let dataset = api.dataset("Narsil/candle-examples".to_string()); let repo = api.repo(Repo::with_revision(model_id, RepoType::Model, revision)); let sample = if let Some(input) = args.input { if let Some(sample) = input.strip_prefix("sample:") { dataset.get(&format!("samples_{sample}.wav"))? } else { std::path::PathBuf::from(input) } } else { println!("No audio file submitted: Downloading https://huggingface.co/datasets/Narsil/candle_demo/blob/main/samples_jfk.wav"); dataset.get("samples_jfk.wav")? }; ( repo.get("config.json")?, repo.get("tokenizer.json")?, repo.get("model.safetensors")?, sample, ) }; let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?; let mel_bytes = include_bytes!("melfilters.bytes"); let mut mel_filters = vec![0f32; mel_bytes.len() / 4]; ::read_f32_into(mel_bytes, &mut mel_filters); let mut input = std::fs::File::open(input)?; let (header, data) = wav::read(&mut input)?; println!("loaded wav data: {header:?}"); if header.sampling_rate != SAMPLE_RATE as u32 { anyhow::bail!("wav file must have a {} sampling rate", SAMPLE_RATE) } let data = data.as_sixteen().expect("expected 16 bit wav file"); let pcm_data: Vec<_> = data[..data.len() / header.channel_count as usize] .iter() .map(|v| *v as f32 / 32768.) .collect(); println!("pcm data loaded {}", pcm_data.len()); let mel = audio::pcm_to_mel(&pcm_data, &mel_filters)?; let mel_len = mel.len(); let mel = Tensor::from_vec(mel, (1, N_MELS, mel_len / N_MELS), &device)?; println!("loaded mel: {:?}", mel.dims()); let weights = unsafe { candle::safetensors::MmapedFile::new(weights_filename)? }; let weights = weights.deserialize()?; let vb = VarBuilder::from_safetensors(vec![weights], DTYPE, &device); let config: Config = serde_json::from_str(&std::fs::read_to_string(config_filename)?)?; let mut model = Whisper::load(&vb, config)?; let language_token = match (args.model.is_multilingual(), args.language) { (true, None) => Some(multilingual::detect_language(&mut model, &tokenizer, &mel)?), (false, None) => None, (true, Some(language)) => match token_id(&tokenizer, &format!("<|{language}|>")) { Ok(token_id) => Some(token_id), Err(_) => anyhow::bail!("language {language} is not supported"), }, (false, Some(_)) => { anyhow::bail!("a language cannot be set for non-multilingual models") } }; let mut dc = Decoder::new(model, tokenizer, args.seed, &device, language_token)?; dc.run(&mel)?; Ok(()) }