Use the proper block size for quantizing models. (#933)

* Use the proper block size for quantizing models.

* Use the proper dimension.
Laurent Mazare, 2023-09-22 21:36:56 +01:00, committed by GitHub
commit 912a3d63b0 (parent 3ef328c53d)
2 changed files with 18 additions and 3 deletions


@@ -243,12 +243,27 @@ fn run_quantize_safetensors(
         Quantization::F16 => QTensor::quantize::<half::f16>,
         Quantization::F32 => QTensor::quantize::<f32>,
     };
+    let block_size = match q {
+        Quantization::Q4_0 => k_quants::QK4_0,
+        Quantization::Q4_1 => k_quants::QK4_1,
+        Quantization::Q5_0 => k_quants::QK5_0,
+        Quantization::Q5_1 => k_quants::QK5_1,
+        Quantization::Q8_0 => k_quants::QK8_0,
+        Quantization::Q8_1 => k_quants::QK8_1,
+        Quantization::Q2k
+        | Quantization::Q3k
+        | Quantization::Q4k
+        | Quantization::Q5k
+        | Quantization::Q6k
+        | Quantization::Q8k => k_quants::QK_K,
+        Quantization::F16 | Quantization::F32 => 1,
+    };
     let qtensors = tensors
         .into_par_iter()
         .map(|(name, tensor)| {
-            println!("  quantizing {name} {tensor:?}");
-            let should_quantize = tensor.rank() == 2 && tensor.dim(0)? % 256 == 0;
+            let should_quantize = tensor.rank() == 2 && tensor.dim(1)? % block_size == 0;
+            println!("  quantizing {name} {tensor:?} {should_quantize}");
             let tensor = if should_quantize {
                 quantize_fn(&tensor)?
             } else {
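
The fix changes two things: the divisibility check now uses the tensor's last dimension (dim 1, the one that is split into quantization blocks) instead of dim 0, and it divides by the block size of the selected format instead of a hard-coded 256; F16/F32 get a block size of 1 so they are never rejected. A minimal standalone sketch of that rule, assuming the usual ggml block sizes (32 for the Q*_0/Q*_1 formats, 256 for the k-quants); the should_quantize helper here is illustrative, not candle's actual API:

    // Illustrative helper, not candle's API: a 2-D tensor is quantizable
    // only if its last dimension splits into whole quantization blocks.
    fn should_quantize(shape: &[usize], block_size: usize) -> bool {
        shape.len() == 2 && shape[1] % block_size == 0
    }

    fn main() {
        let qk4_0 = 32; // k_quants::QK4_0
        let qk_k = 256; // k_quants::QK_K, shared by Q2K..Q8K

        // A 4096x4096 weight matrix works with either block size.
        assert!(should_quantize(&[4096, 4096], qk4_0));
        assert!(should_quantize(&[4096, 4096], qk_k));

        // 1600 = 50 * 32 but 1600 % 256 != 0: fine for Q4_0, not for k-quants.
        // The old hard-coded `% 256` check would have rejected this tensor
        // even when quantizing to Q4_0.
        assert!(should_quantize(&[4096, 1600], qk4_0));
        assert!(!should_quantize(&[4096, 1600], qk_k));
    }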


@@ -1,4 +1,4 @@
-# candle-starcoder: code generation model
+# candle-phi: 1.3b LLM with state of the art performance for <10b models.
 
 [phi-1.5](https://huggingface.co/microsoft/phi-1_5).