Use the proper block size for quantizing models. (#933)

* Use the proper block size for quantizing models.

* Use the proper dimension.
Laurent Mazare, 2023-09-22 21:36:56 +01:00, committed by GitHub
commit 912a3d63b0 (parent 3ef328c53d)
2 changed files with 18 additions and 3 deletions


@@ -243,12 +243,27 @@ fn run_quantize_safetensors(
         Quantization::F16 => QTensor::quantize::<half::f16>,
         Quantization::F32 => QTensor::quantize::<f32>,
     };
+    let block_size = match q {
+        Quantization::Q4_0 => k_quants::QK4_0,
+        Quantization::Q4_1 => k_quants::QK4_1,
+        Quantization::Q5_0 => k_quants::QK5_0,
+        Quantization::Q5_1 => k_quants::QK5_1,
+        Quantization::Q8_0 => k_quants::QK8_0,
+        Quantization::Q8_1 => k_quants::QK8_1,
+        Quantization::Q2k
+        | Quantization::Q3k
+        | Quantization::Q4k
+        | Quantization::Q5k
+        | Quantization::Q6k
+        | Quantization::Q8k => k_quants::QK_K,
+        Quantization::F16 | Quantization::F32 => 1,
+    };
     let qtensors = tensors
         .into_par_iter()
         .map(|(name, tensor)| {
-            println!("  quantizing {name} {tensor:?}");
-            let should_quantize = tensor.rank() == 2 && tensor.dim(0)? % 256 == 0;
+            let should_quantize = tensor.rank() == 2 && tensor.dim(1)? % block_size == 0;
+            println!("  quantizing {name} {tensor:?} {should_quantize}");
             let tensor = if should_quantize {
                 quantize_fn(&tensor)?
             } else {
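
The fix changes two things: the divisibility check now uses the tensor's last dimension (dim 1, the one that is split into quantization blocks) instead of dim 0, and it divides by the block size of the selected format instead of a hard-coded 256; F16/F32 get a block size of 1 so they are never rejected. A minimal standalone sketch of that rule, assuming the usual ggml block sizes (32 for the Q*_0/Q*_1 formats, 256 for the k-quants); the should_quantize helper here is illustrative, not candle's actual API:

    // Illustrative helper, not candle's API: a 2-D tensor is quantizable
    // only if its last dimension splits into whole quantization blocks.
    fn should_quantize(shape: &[usize], block_size: usize) -> bool {
        shape.len() == 2 && shape[1] % block_size == 0
    }

    fn main() {
        let qk4_0 = 32; // k_quants::QK4_0
        let qk_k = 256; // k_quants::QK_K, shared by Q2K..Q8K

        // A 4096x4096 weight matrix works with either block size.
        assert!(should_quantize(&[4096, 4096], qk4_0));
        assert!(should_quantize(&[4096, 4096], qk_k));

        // 1600 = 50 * 32 but 1600 % 256 != 0: fine for Q4_0, not for k-quants.
        // The old hard-coded `% 256` check would have rejected this tensor
        // even when quantizing to Q4_0.
        assert!(should_quantize(&[4096, 1600], qk4_0));
        assert!(!should_quantize(&[4096, 1600], qk_k));
    }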


@@ -1,4 +1,4 @@
-# candle-starcoder: code generation model
+# candle-phi: 1.3b LLM with state of the art performance for <10b models.
 
 [phi-1.5](https://huggingface.co/microsoft/phi-1_5).