Preliminary support for whisper v3. (#1294)

* Preliminary support for whisper v3.
* Add the missing files.
2025-06-20 04:00:28 +00:00 · 2023-11-08 06:42:52 +01:00
parent f3a4f3db76
commit 2d28497197
5 changed files with 31 additions and 15 deletions
--- a/candle-transformers/src/models/whisper/audio.rs
+++ b/candle-transformers/src/models/whisper/audio.rs
@@ -198,13 +198,17 @@ fn log_mel_spectrogram_<T: Float + std::fmt::Display>(
    mel
 }

-pub fn pcm_to_mel<T: Float + std::fmt::Display>(samples: &[T], filters: &[T]) -> Vec<T> {
+pub fn pcm_to_mel<T: Float + std::fmt::Display>(
+    cfg: &super::Config,
+    samples: &[T],
+    filters: &[T],
+) -> Vec<T> {
    log_mel_spectrogram_(
        samples,
        filters,
        super::N_FFT,
        super::HOP_LENGTH,
-        super::N_MELS,
+        cfg.num_mel_bins,
        false,
    )
 }
--- a/candle-transformers/src/models/whisper/mod.rs
+++ b/candle-transformers/src/models/whisper/mod.rs
@@ -18,6 +18,7 @@ pub struct Config {
    // pub n_text_state: usize,
    pub decoder_attention_heads: usize, // n_text_head
    pub decoder_layers: usize,          // n_text_layer
+    #[serde(default)]
    pub suppress_tokens: Vec<u32>,
 }

@@ -26,7 +27,6 @@ pub const DTYPE: candle::DType = candle::DType::F32;
 // Audio parameters.
 pub const SAMPLE_RATE: usize = 16000;
 pub const N_FFT: usize = 400;
-pub const N_MELS: usize = 80;
 pub const HOP_LENGTH: usize = 160;
 pub const CHUNK_LENGTH: usize = 30;
 pub const N_SAMPLES: usize = CHUNK_LENGTH * SAMPLE_RATE; // 480000 samples in a 30-second chunk