Quantized version of mistral. (#1009)

* Quantized version of mistral.
* Integrate the quantized mistral variant.
* Use the quantized weight files.
* Tweak the quantization command.
* Fix the dtype when computing the rotary embeddings.
* Update the readme with the quantized version.
* Fix the decoding of the remaining tokens.
2025-06-16 10:38:54 +00:00 · 2023-09-30 19:25:47 +02:00
parent 06207332bc
commit deee7612da
7 changed files with 507 additions and 37 deletions
--- a/candle-examples/examples/mistral/README.md
+++ b/candle-examples/examples/mistral/README.md
@@ -6,6 +6,9 @@ as of 2023-09-28. Weights (and the original Python model code) are released unde
 - [Blog post](https://mistral.ai/news/announcing-mistral-7b/) from Mistral announcing the model release.
 - [Model card](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the
  HuggingFace Hub.
+This example supports the initial model as well as a quantized variant.
+
+## Running the example

 ```bash
 $ cargo run --example mistral --release --features cuda -- --prompt 'Write helloworld code in Rust' --sample-len 150
@@ -38,3 +41,50 @@ fn main() {

 This example is released under the terms
 ```
+
+## Running the quantized version of the model
+
+```bash
+$ cargo run --example mistral --features accelerate --release -- \
+    --prompt "Here is a sample quick sort implementation in rust " --quantized -n 400
+avx: false, neon: true, simd128: false, f16c: false
+temp: 0.00 repeat-penalty: 1.10 repeat-last-n: 64
+retrieved the files in 562.292µs
+loaded the model in 1.100323667s
+Here is a sample quick sort implementation in rust
+
+``rust
+fn quick_sort(arr: &mut [i32]) {
+    if arr.len() <= 1 {
+        return;
+    }
+
+    let pivot = arr[0];
+    let mut left = vec![];
+    let mut right = vec![];
+
+    for i in 1..arr.len() {
+        if arr[i] < pivot {
+            left.push(arr[i]);
+        } else {
+            right.push(arr[i]);
+        }
+    }
+
+    quick_sort(&mut left);
+    quick_sort(&mut right);
+
+    let mut i = 0;
+    for _ in &left {
+        arr[i] = left.pop().unwrap();
+        i += 1;
+    }
+
+    for _ in &right {
+        arr[i] = right.pop().unwrap();
+        i += 1;
+    }
+}
+``
+226 tokens generated (10.91 token/s)
+```
--- a/candle-examples/examples/mistral/main.rs
+++ b/candle-examples/examples/mistral/main.rs
@@ -7,7 +7,8 @@ extern crate accelerate_src;
 use anyhow::{Error as E, Result};
 use clap::Parser;

-use candle_transformers::models::mistral::{Config, Model};
+use candle_transformers::models::mistral::{Config, Model as Mistral};
+use candle_transformers::models::quantized_mistral::Model as QMistral;

 use candle::{DType, Device, Tensor};
 use candle_examples::token_output_stream::TokenOutputStream;
@@ -16,6 +17,11 @@ use candle_transformers::generation::LogitsProcessor;
 use hf_hub::{api::sync::Api, Repo, RepoType};
 use tokenizers::Tokenizer;

+enum Model {
+    Mistral(Mistral),
+    Quantized(QMistral),
+}
+
 struct TextGeneration {
    model: Model,
    device: Device,
@@ -76,7 +82,10 @@ impl TextGeneration {
            let start_pos = tokens.len().saturating_sub(context_size);
            let ctxt = &tokens[start_pos..];
            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
-            let logits = self.model.forward(&input, start_pos)?;
+            let logits = match &mut self.model {
+                Model::Mistral(m) => m.forward(&input, start_pos)?,
+                Model::Quantized(m) => m.forward(&input, start_pos)?,
+            };
            let logits = logits.squeeze(0)?.squeeze(0)?.to_dtype(DType::F32)?;
            let logits = if self.repeat_penalty == 1. {
                logits
@@ -101,8 +110,9 @@ impl TextGeneration {
            }
        }
        let dt = start_gen.elapsed();
-        let rest = self.tokenizer.decode_rest().map_err(E::msg)?;
-        print!("{rest}");
+        if let Some(rest) = self.tokenizer.decode_rest().map_err(E::msg)? {
+            print!("{rest}");
+        }
        std::io::stdout().flush()?;
        println!(
            "\n{generated_tokens} tokens generated ({:.2} token/s)",
@@ -211,24 +221,39 @@ fn main() -> Result<()> {
            .split(',')
            .map(std::path::PathBuf::from)
            .collect::<Vec<_>>(),
-        None => vec![
-            repo.get("pytorch_model-00001-of-00002.safetensors")?,
-            repo.get("pytorch_model-00002-of-00002.safetensors")?,
-        ],
+        None => {
+            if args.quantized {
+                vec![repo.get("model-q4k.gguf")?]
+            } else {
+                vec![
+                    repo.get("pytorch_model-00001-of-00002.safetensors")?,
+                    repo.get("pytorch_model-00002-of-00002.safetensors")?,
+                ]
+            }
+        }
    };
    println!("retrieved the files in {:?}", start.elapsed());
    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;

    let start = std::time::Instant::now();
    let config = Config::config_7b_v0_1(args.use_flash_attn);
-    let device = candle_examples::device(args.cpu)?;
-    let dtype = if device.is_cuda() {
-        DType::BF16
+    let (model, device) = if args.quantized {
+        let filename = &filenames[0];
+        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(filename)?;
+        let model = QMistral::new(&config, vb)?;
+        (Model::Quantized(model), Device::Cpu)
    } else {
-        DType::F32
+        let device = candle_examples::device(args.cpu)?;
+        let dtype = if device.is_cuda() {
+            DType::BF16
+        } else {
+            DType::F32
+        };
+        let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
+        let model = Mistral::new(&config, vb)?;
+        (Model::Mistral(model), device)
    };
-    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
-    let model = Model::new(&config, vb)?;
+
    println!("loaded the model in {:?}", start.elapsed());

    let mut pipeline = TextGeneration::new(