Quantized version of mistral. (#1009)

* Quantized version of mistral. * Integrate the quantized mistral variant. * Use the quantized weight files. * Tweak the quantization command. * Fix the dtype when computing the rotary embeddings. * Update the readme with the quantized version. * Fix the decoding of the remaining tokens.
2025-06-19 11:56:45 +00:00 · 2023-09-30 19:25:47 +02:00
parent 06207332bc
commit deee7612da
7 changed files with 507 additions and 37 deletions
--- a/candle-transformers/src/models/mistral.rs
+++ b/candle-transformers/src/models/mistral.rs
@ -6,18 +6,18 @@ use std::sync::Arc;

 #[derive(Debug, Clone, PartialEq)]
 pub struct Config {
-    vocab_size: usize,
-    hidden_size: usize,
-    intermediate_size: usize,
-    num_hidden_layers: usize,
-    num_attention_heads: usize,
-    num_key_value_heads: usize,
-    hidden_act: Activation,
-    max_position_embeddings: usize,
-    rms_norm_eps: f64,
-    rope_theta: f64,
-    sliding_window: usize,
-    use_flash_attn: bool,
+    pub(crate) vocab_size: usize,
+    pub(crate) hidden_size: usize,
+    pub(crate) intermediate_size: usize,
+    pub(crate) num_hidden_layers: usize,
+    pub(crate) num_attention_heads: usize,
+    pub(crate) num_key_value_heads: usize,
+    pub(crate) hidden_act: Activation,
+    pub(crate) max_position_embeddings: usize,
+    pub(crate) rms_norm_eps: f64,
+    pub(crate) rope_theta: f64,
+    pub(crate) sliding_window: usize,
+    pub(crate) use_flash_attn: bool,
 }

 impl Config {