More multilingual support for whisper. (#419)

* More multilingual support for whisper.

* Use the language token appropriately.
This commit is contained in:
Laurent Mazare
2023-08-12 16:32:52 +02:00
committed by GitHub
parent 0c3f109faa
commit 0741ebbd51
3 changed files with 47 additions and 23 deletions

View File

@ -16,11 +16,21 @@ pub struct Config {
// pub n_text_state: usize,
pub decoder_attention_heads: usize, // n_text_head
pub decoder_layers: usize, // n_text_layer
pub suppress_tokens: Vec<u32>,
}
impl Config {
/// Builds the `Config` preset named `tiny_en`.
///
/// The fields visible here are filled with fixed values: 80 mel bins, a
/// vocabulary size of 51864, 6 decoder attention heads, 4 decoder layers, and a
/// hard-coded `suppress_tokens` list. The function takes no arguments and
/// allocates a fresh `Vec<u32>` for the token list on every call.
// NOTE(review): presumably this mirrors the whisper "tiny.en" checkpoint — confirm
// against the upstream model configuration.
// NOTE(review): the reason for allowing dead code is not visible here — confirm
// whether this preset is still unused and the attribute is still required.
#[allow(dead_code)]
pub fn tiny_en() -> Self {
// Token ids stored verbatim in the `suppress_tokens` field.
// NOTE(review): presumably ids the decoder must never emit — verify against the
// decoding code and the upstream tokenizer before editing this list.
let suppress_tokens = vec![
1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93,
357, 366, 438, 532, 685, 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377, 1391,
1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211, 4600, 4808, 5299, 5855, 6329,
7203, 9609, 9959, 10563, 10786, 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306,
16410, 16791, 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
34949, 40283, 40493, 40549, 47282, 49146, 50257, 50357, 50358, 50359, 50360, 50361,
50362,
];
Self {
num_mel_bins: 80,
vocab_size: 51864,
@ -32,6 +42,7 @@ impl Config {
// n_text_state: 384,
decoder_attention_heads: 6,
decoder_layers: 4,
// Moves the list built above into the struct (field-init shorthand).
suppress_tokens,
}
}
}