From 98172d46fa866ecaad2afd52777e9ac9a5f15b52 Mon Sep 17 00:00:00 2001
From: zmlcc
Date: Fri, 8 Sep 2023 20:29:40 +0800
Subject: [PATCH] Fix some errors about BlockQ8_1 (#776)

* use int8 type instead of uint8 for BlockQ8_1.qs

The uint8 type of BlockQ8_1.qs causes great loss for negative weights
Ref: https://github.com/ggerganov/llama.cpp/blob/ebc96086af49fe70108cafcea6ab4bebd658a41a/ggml.c#L904

Signed-off-by: Zhang Miaolei

* fix sum error in vec_dot of BlockQ4_1

Ref: https://github.com/ggerganov/llama.cpp/blob/ebc96086af49fe70108cafcea6ab4bebd658a41a/ggml.c#L2840

Signed-off-by: Zhang Miaolei

* fix sum error in vec_dot of BlockQ5_1

Ref: https://github.com/ggerganov/llama.cpp/blob/ebc96086af49fe70108cafcea6ab4bebd658a41a/ggml.c#L3490

Signed-off-by: Zhang Miaolei

---------

Signed-off-by: Zhang Miaolei
---
 candle-core/src/quantized/k_quants.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/candle-core/src/quantized/k_quants.rs b/candle-core/src/quantized/k_quants.rs
index 65fd6a6e..a0fe455c 100644
--- a/candle-core/src/quantized/k_quants.rs
+++ b/candle-core/src/quantized/k_quants.rs
@@ -85,7 +85,7 @@ const _: () = assert!(std::mem::size_of::<BlockQ8_0>() == 34);
 pub struct BlockQ8_1 {
     pub(crate) d: f16,
     pub(crate) s: f16,
-    pub(crate) qs: [u8; QK8_1],
+    pub(crate) qs: [i8; QK8_1],
 }
 const _: () = assert!(std::mem::size_of::<BlockQ8_1>() == 36);
 
@@ -278,6 +278,7 @@ impl GgmlType for BlockQ4_1 {
             }
 
             sumf += sumi as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
+                + f16::to_f32(xs.m) * f16::to_f32(ys.s)
         }
         Ok(sumf)
     }
@@ -471,6 +472,7 @@ impl GgmlType for BlockQ5_1 {
             }
 
             sumf += sumi as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
+                + f16::to_f32(xs.m) * f16::to_f32(ys.s)
         }
         Ok(sumf)
     }
@@ -652,8 +654,8 @@ impl GgmlType for BlockQ8_1 {
             for j in 0..Self::BLCK_SIZE / 2 {
                 let v0 = xs[j] * id;
                 let v1 = xs[j + Self::BLCK_SIZE / 2] * id;
-                ys.qs[j] = f32::round(v0) as u8;
-                ys.qs[j + Self::BLCK_SIZE / 2] = f32::round(v1) as u8;
+                ys.qs[j] = f32::round(v0) as i8;
+                ys.qs[j + Self::BLCK_SIZE / 2] = f32::round(v1) as i8;
                 sum += ys.qs[j] as i32 + ys.qs[j + Self::BLCK_SIZE / 2] as i32;
             }
             ys.s = f16::from_f32(sum as f32) * ys.d;