mirror of
https://github.com/huggingface/candle.git
synced 2025-06-16 10:38:54 +00:00
Fix some errors about BlockQ8_1 (#776)
* Use the int8 type instead of uint8 for BlockQ8_1.qs. Storing BlockQ8_1.qs as uint8 causes a large loss of precision for negative weights.
  Ref: ebc96086af/ggml.c (L904)
  Signed-off-by: Zhang Miaolei <zmlcc@outlook.com>
* Fix the sum error in vec_dot of BlockQ4_1.
  Ref: ebc96086af/ggml.c (L2840)
  Signed-off-by: Zhang Miaolei <zmlcc@outlook.com>
* Fix the sum error in vec_dot of BlockQ5_1.
  Ref: ebc96086af/ggml.c (L3490)
  Signed-off-by: Zhang Miaolei <zmlcc@outlook.com>
---------
Signed-off-by: Zhang Miaolei <zmlcc@outlook.com>
This commit is contained in:
@@ -85,7 +85,7 @@ const _: () = assert!(std::mem::size_of::<BlockQ8_0>() == 34);
|
||||
pub struct BlockQ8_1 {
|
||||
pub(crate) d: f16,
|
||||
pub(crate) s: f16,
|
||||
pub(crate) qs: [u8; QK8_1],
|
||||
pub(crate) qs: [i8; QK8_1],
|
||||
}
|
||||
const _: () = assert!(std::mem::size_of::<BlockQ8_1>() == 36);
|
||||
|
||||
@@ -278,6 +278,7 @@ impl GgmlType for BlockQ4_1 {
|
||||
}
|
||||
|
||||
sumf += sumi as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
|
||||
+ f16::to_f32(xs.m) * f16::to_f32(ys.s)
|
||||
}
|
||||
Ok(sumf)
|
||||
}
|
||||
@@ -471,6 +472,7 @@ impl GgmlType for BlockQ5_1 {
|
||||
}
|
||||
|
||||
sumf += sumi as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
|
||||
+ f16::to_f32(xs.m) * f16::to_f32(ys.s)
|
||||
}
|
||||
Ok(sumf)
|
||||
}
|
||||
@@ -652,8 +654,8 @@ impl GgmlType for BlockQ8_1 {
|
||||
for j in 0..Self::BLCK_SIZE / 2 {
|
||||
let v0 = xs[j] * id;
|
||||
let v1 = xs[j + Self::BLCK_SIZE / 2] * id;
|
||||
ys.qs[j] = f32::round(v0) as u8;
|
||||
ys.qs[j + Self::BLCK_SIZE / 2] = f32::round(v1) as u8;
|
||||
ys.qs[j] = f32::round(v0) as i8;
|
||||
ys.qs[j + Self::BLCK_SIZE / 2] = f32::round(v1) as i8;
|
||||
sum += ys.qs[j] as i32 + ys.qs[j + Self::BLCK_SIZE / 2] as i32;
|
||||
}
|
||||
ys.s = f16::from_f32(sum as f32) * ys.d;
|
||||
|
Reference in New Issue
Block a user