Add a kv-cache to the quantized llama example. (#466)

* Add a kv-cache to the quantized llama example.

* Also print the prompt.

* Bugfix in q6k dequantizing.

* Another bugfix.
Author: Laurent Mazare
Date: 2023-08-16 14:28:42 +01:00
Committer: GitHub
Parent: 3071134788
Commit: a9101700b6
2 changed files with 20 additions and 9 deletions
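The kv-cache itself is added in the example file, whose hunk is not reproduced below; the hunks shown all touch the quantization code. As a rough, hypothetical illustration of the idea, here is a minimal, dependency-free Rust sketch: keep the key/value projections from earlier decoding steps per layer and append to them, so each new token only has to compute its own projections. The KvCache type and its methods are invented for this sketch; the real example would cache candle tensors rather than plain vectors.

// Hypothetical sketch of a per-layer kv-cache; plain Vec<f32> is used here
// to stay dependency-free, not the example's actual tensor types.
struct KvCache {
    // One (keys, values) buffer per transformer layer, flattened as
    // seq_len * head_dim f32 values.
    layers: Vec<(Vec<f32>, Vec<f32>)>,
}

impl KvCache {
    fn new(n_layers: usize) -> Self {
        Self { layers: vec![(Vec::new(), Vec::new()); n_layers] }
    }

    // Append the current token's keys/values and return everything cached
    // so far, so attention can span the whole sequence without recomputing
    // the earlier projections.
    fn append(&mut self, layer: usize, k: &[f32], v: &[f32]) -> (&[f32], &[f32]) {
        let (ks, vs) = &mut self.layers[layer];
        ks.extend_from_slice(k);
        vs.extend_from_slice(v);
        (ks.as_slice(), vs.as_slice())
    }
}

fn main() {
    let mut cache = KvCache::new(1);
    cache.append(0, &[0.1; 4], &[0.2; 4]); // first decoding step
    let (k_all, _v_all) = cache.append(0, &[0.3; 4], &[0.4; 4]); // second step
    assert_eq!(k_all.len(), 8); // both steps are now visible to attention
}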


@@ -70,7 +70,7 @@ const _: () = assert!(std::mem::size_of::<BlockQ5_1>() == 24);
 #[repr(C)]
 pub struct BlockQ8_0 {
     d: f16,
-    qs: [u8; QK8_0],
+    qs: [i8; QK8_0],
 }
 const _: () = assert!(std::mem::size_of::<BlockQ8_0>() == 34);
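Context for the type change above: Q8_0 scales each block by its maximum absolute value, so the quantized codes are zero-centered and span roughly [-127, 127], which only fits in a signed byte. A minimal sketch of that argument, using toy data and an f32 scale in place of the block's f16 field:

// Why BlockQ8_0::qs must be i8: the codes are zero-centered (illustrative sketch).
fn main() {
    let xs = [-4.0f32, -1.0, 0.0, 2.5, 4.0];
    let amax = xs.iter().fold(0f32, |m, x| m.max(x.abs()));
    let id = 127.0 / amax; // inverse of the block scale d = amax / 127
    let qs: Vec<i8> = xs.iter().map(|&x| (x * id).round() as i8).collect();
    assert_eq!(qs, vec![-127, -32, 0, 79, 127]);
    // With `as u8`, the negative codes would saturate to 0 (Rust float-to-int
    // casts saturate), so half of the value range would be lost.
}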
@@ -476,14 +476,14 @@ impl GgmlType for BlockQ6K {
         if k % QK_K != 0 {
             crate::bail!("dequantize_row_q6k: {k} is not divisible by {QK_K}")
         }
-        for x in xs.iter() {
+        for (idx_x, x) in xs.iter().enumerate() {
             let d = x.d.to_f32();
             let ql = &x.ql;
             let qh = &x.qh;
             let sc = &x.scales;
             for n in (0..QK_K).step_by(128) {
                 let idx = n / 128;
-                let ys = &mut ys[n..];
+                let ys = &mut ys[idx_x * QK_K + n..];
                 let sc = &sc[8 * idx..];
                 let ql = &ql[64 * idx..];
                 let qh = &qh[32 * idx..];
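The q6k bug fixed above is a missing per-block output offset: slicing with ys[n..] made every block write over the first QK_K outputs instead of advancing through the output buffer. A small sketch of the corrected pattern, with a toy block size of 4 instead of QK_K = 256 and names invented for illustration:

// Per-block output offsetting, as in the fix above (illustrative names only).
const BLOCK: usize = 4;

fn dequantize_blocks(blocks: &[[i8; BLOCK]], scale: f32, ys: &mut [f32]) {
    for (idx_x, block) in blocks.iter().enumerate() {
        // Without the `idx_x * BLOCK` offset every block would overwrite
        // the first BLOCK outputs, which was the q6k bug.
        let ys = &mut ys[idx_x * BLOCK..];
        for (y, &q) in ys.iter_mut().zip(block.iter()) {
            *y = q as f32 * scale;
        }
    }
}

fn main() {
    let blocks = [[1i8, 2, 3, 4], [5, 6, 7, 8]];
    let mut ys = [0f32; 8];
    dequantize_blocks(&blocks, 0.5, &mut ys);
    assert_eq!(ys, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0]);
}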
@@ -663,7 +663,7 @@ impl GgmlType for BlockQ8_0 {
             let id = if d != 0f32 { 1. / d } else { 0. };
             ys.d = f16::from_f32(d);
             for (y, &x) in ys.qs.iter_mut().zip(xs.iter()) {
-                *y = f32::round(x * id) as u8
+                *y = f32::round(x * id) as i8
            }
        }
        Ok(())
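With both the storage type and the cast signed, a Q8_0 block round-trips each value to within half a quantization step. A hedged sketch of that check, using an f32 scale in place of the f16 d field and function names invented for this sketch:

// Round-trip check for the Q8_0 scheme fixed above (illustrative only).
fn quantize(xs: &[f32]) -> (f32, Vec<i8>) {
    let amax = xs.iter().fold(0f32, |m, x| m.max(x.abs()));
    let d = amax / 127.0; // block scale
    let id = if d != 0.0 { 1.0 / d } else { 0.0 };
    (d, xs.iter().map(|&x| (x * id).round() as i8).collect())
}

fn dequantize(d: f32, qs: &[i8]) -> Vec<f32> {
    qs.iter().map(|&q| q as f32 * d).collect()
}

fn main() {
    let xs: Vec<f32> = (0..32).map(|i| (i as f32 - 16.0) * 0.3).collect();
    let (d, qs) = quantize(&xs);
    let ys = dequantize(d, &qs);
    // Every value comes back within half a quantization step of the input.
    for (x, y) in xs.iter().zip(ys.iter()) {
        assert!((x - y).abs() <= 0.5 * d + 1e-6);
    }
}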