From c72eb3d75ba9a9b2e5e9d6d7e277dc42aed087f4 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 26 Aug 2023 13:07:54 +0200 Subject: [PATCH] Add reference implementation for `q4k` and `q5k` (#586) * add `q2k` vec-dot * `q3k` vec-dot + quantization bugfix * `q4k` vec-dot * `q5k` vec-dot * Validate against GGML unit test results. * Remove some more `transmutes` --- candle-core/src/quantized/k_quants.rs | 181 +++++++++++++++++++++++++- candle-core/tests/quantized_tests.rs | 94 ++++++++++++- 2 files changed, 270 insertions(+), 5 deletions(-) diff --git a/candle-core/src/quantized/k_quants.rs b/candle-core/src/quantized/k_quants.rs index cdea2434..216e9b5d 100644 --- a/candle-core/src/quantized/k_quants.rs +++ b/candle-core/src/quantized/k_quants.rs @@ -906,8 +906,91 @@ impl GgmlType for BlockQ4K { const BLCK_SIZE: usize = QK_K; type VecDotType = BlockQ8K; - fn vec_dot(_n: usize, _xs: &[Self], _ys: &[Self::VecDotType]) -> Result<f32> { - todo!() + fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> { + if n % QK_K != 0 { + crate::bail!("vec_dot_q4k_q8k: {n} is not divisible by {QK_K}") + } + + const KMASK1: u32 = 0x3f3f3f3f; + const KMASK2: u32 = 0x0f0f0f0f; + const KMASK3: u32 = 0x03030303; + + let mut utmp: [u32; 4] = [0; 4]; + let mut scales: [u8; 8]; + let mut mins: [u8; 8]; + + let mut aux8: [i8; QK_K] = [0; QK_K]; + let mut aux16: [i16; 8] = [0; 8]; + let mut sums: [f32; 8] = [0.0; 8]; + let mut aux32: [i32; 8] = [0; 8]; + + let mut sumf = 0.0; + for (y, x) in ys.iter().zip(xs.iter()) { + let q4 = &x.qs; + let q8 = &y.qs; + aux32.fill(0); + + let mut a = &mut aux8[..]; + let mut q4 = &q4[..]; + for _ in 0..QK_K / 64 { + for l in 0..32 { + a[l] = (q4[l] & 0xF) as i8; + } + a = &mut a[32..]; + for l in 0..32 { + a[l] = (q4[l] >> 4) as i8; + } + a = &mut a[32..]; + q4 = &q4[32..]; + } + + LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]); + + utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & 
KMASK3) << 4); + let uaux = utmp[1] & KMASK1; + utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4); + utmp[2] = uaux; + utmp[0] &= KMASK1; + + //extract scales and mins + let mut utemp_scales = &mut [0u32; 2]; + let mut utemp_mins = &mut [0u32; 2]; + utemp_scales.copy_from_slice(&utmp[0..2]); + utemp_mins.copy_from_slice(&utmp[2..4]); + + scales = + unsafe { *std::mem::transmute::<&mut [u32; 2], &mut [u8; 8]>(&mut utemp_scales) }; + mins = unsafe { *std::mem::transmute::<&mut [u32; 2], &mut [u8; 8]>(&mut utemp_mins) }; + + let mut sumi = 0; + for j in 0..QK_K / 16 { + sumi += y.bsums[j] as i32 * mins[j / 2] as i32; + } + + let mut a = &mut aux8[..]; + let mut q8 = &q8[..]; + + for scale in scales { + let scale = scale as i32; + for _ in 0..4 { + for l in 0..8 { + aux16[l] = q8[l] as i16 * a[l] as i16; + } + for l in 0..8 { + aux32[l] += scale * aux16[l] as i32; + } + q8 = &q8[8..]; + a = &mut a[8..]; + } + } + let d = x.d.to_f32() * y.d; + for l in 0..8 { + sums[l] += d * aux32[l] as f32; + } + let dmin = x.dmin.to_f32() * y.d; + sumf -= dmin * sumi as f32; + } + Ok(sumf + sums.iter().sum::<f32>()) } fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> { @@ -1008,8 +1091,98 @@ impl GgmlType for BlockQ5K { const BLCK_SIZE: usize = QK_K; type VecDotType = BlockQ8K; - fn vec_dot(_n: usize, _xs: &[Self], _ys: &[Self::VecDotType]) -> Result<f32> { - todo!() + fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> { + if n % QK_K != 0 { + crate::bail!("vec_dot_q5k_q8k: {n} is not divisible by {QK_K}") + } + + const KMASK1: u32 = 0x3f3f3f3f; + const KMASK2: u32 = 0x0f0f0f0f; + const KMASK3: u32 = 0x03030303; + + let mut utmp: [u32; 4] = [0; 4]; + let mut scales: [u8; 8]; + let mut mins: [u8; 8]; + + let mut aux8: [i8; QK_K] = [0; QK_K]; + let mut aux16: [i16; 8] = [0; 8]; + let mut sums: [f32; 8] = [0.0; 8]; + let mut aux32: [i32; 8] = [0; 8]; + + let mut sumf = 0.0; + for (y, x) in ys.iter().zip(xs.iter()) { + let q5 = &x.qs; + let hm = &x.qh; + let 
q8 = &y.qs; + aux32.fill(0); + + let mut a = &mut aux8[..]; + let mut q5 = &q5[..]; + let mut m = 1u8; + + for _ in 0..QK_K / 64 { + for l in 0..32 { + a[l] = (q5[l] & 0xF) as i8; + a[l] += if hm[l] & m != 0 { 16 } else { 0 }; + } + a = &mut a[32..]; + m <<= 1; + for l in 0..32 { + a[l] = (q5[l] >> 4) as i8; + a[l] += if hm[l] & m != 0 { 16 } else { 0 }; + } + a = &mut a[32..]; + m <<= 1; + q5 = &q5[32..]; + } + + LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]); + + utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4); + let uaux = utmp[1] & KMASK1; + utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4); + utmp[2] = uaux; + utmp[0] &= KMASK1; + + //extract scales and mins + let mut utemp_scales = &mut [0u32; 2]; + let mut utemp_mins = &mut [0u32; 2]; + utemp_scales.copy_from_slice(&utmp[0..2]); + utemp_mins.copy_from_slice(&utmp[2..4]); + + scales = + unsafe { *std::mem::transmute::<&mut [u32; 2], &mut [u8; 8]>(&mut utemp_scales) }; + mins = unsafe { *std::mem::transmute::<&mut [u32; 2], &mut [u8; 8]>(&mut utemp_mins) }; + + let mut sumi = 0; + for j in 0..QK_K / 16 { + sumi += y.bsums[j] as i32 * mins[j / 2] as i32; + } + + let mut a = &mut aux8[..]; + let mut q8 = &q8[..]; + + for scale in scales { + let scale = scale as i32; + for _ in 0..4 { + for l in 0..8 { + aux16[l] = q8[l] as i16 * a[l] as i16; + } + for l in 0..8 { + aux32[l] += scale * aux16[l] as i32; + } + q8 = &q8[8..]; + a = &mut a[8..]; + } + } + let d = x.d.to_f32() * y.d; + for l in 0..8 { + sums[l] += d * aux32[l] as f32; + } + let dmin = x.dmin.to_f32() * y.d; + sumf -= dmin * sumi as f32; + } + Ok(sumf + sums.iter().sum::<f32>()) } // https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L793 diff --git a/candle-core/tests/quantized_tests.rs b/candle-core/tests/quantized_tests.rs index 436c8ebe..a679e7b5 100644 --- a/candle-core/tests/quantized_tests.rs +++ b/candle-core/tests/quantized_tests.rs @@ -1,4 +1,7 @@ -use 
candle_core::{quantized, Device, Result, Tensor}; +use candle_core::{ + quantized::{self, GgmlDType}, + Device, Result, Tensor, +}; use quantized::{k_quants, GgmlType}; mod test_utils; use rand::prelude::*; @@ -395,6 +398,27 @@ fn vec_dot_referenze(a: &[f32], b: &[f32]) -> f32 { a.iter().zip(b).map(|(a, b)| a * b).sum() } +/// Returns the error achieved by the GGML matmul unit test. +fn ggml_reference_matmul_error(quantiztation_tpye: GgmlDType) -> Result<f32> { + match quantiztation_tpye { + GgmlDType::F16 => Ok(0.000010), + GgmlDType::Q2K => Ok(0.004086), + GgmlDType::Q3K => Ok(0.016148), + GgmlDType::Q4K => Ok(0.002425), + GgmlDType::Q5K => Ok(0.000740), + GgmlDType::Q6K => Ok(0.000952), + GgmlDType::Q4_0 => Ok(0.001143), + GgmlDType::Q4_1 => Ok(0.007784), + GgmlDType::Q5_0 => Ok(0.001353), + GgmlDType::Q5_1 => Ok(0.001363), + GgmlDType::Q8_0 => Ok(0.000092), + _ => candle_core::bail!( + "No GGML results for quantization type {:?}", + quantiztation_tpye + ), + } +} + /// Mirrores the GGML matmul unit test: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L76-L91 fn ggml_matmul_error_test<T: GgmlType>() -> Result<()> { let a = create_ggml_like_vector(0.0); @@ -411,6 +435,8 @@ fn ggml_matmul_error_test<T: GgmlType>() -> Result<()> { let error = (result - reference_result).abs() / length as f32; + let ggml_error = ggml_reference_matmul_error(T::DTYPE)?; + if error > GGML_MAX_DOT_PRODUCT_ERROR { candle_core::bail!( "Dot product error {} exceeds max error {}", GGML_MAX_DOT_PRODUCT_ERROR ); } + + // We diverge slightly due to different rounding behavior / f16 to f32 conversions in GGML + // => we use a slightly higher error threshold + const ERROR_LENIENCY: f32 = 0.00001; + if error - ERROR_LENIENCY > ggml_error { + candle_core::bail!( + "Dot product error {} exceeds ggml reference error {}", + error, + ggml_error + ); + } Ok(()) } @@ -498,6 +535,61 @@ fn quantized_matmul_q3k() -> Result<()> { Ok(()) } 
+#[test] +fn quantized_matmul_q4k() -> Result<()> { + use k_quants::BlockQ4K; + + let cpu = &Device::Cpu; + let (m, k, n) = (11, 512, 21); + let (lhs, rhs, mm) = get_random_tensors(m, k, n, cpu)?; + assert_eq!(mm.dims(), [m, n]); + let dst = mm.flatten_all()?.to_vec1::<f32>()?; + let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]); + assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]); + + let rhs = quantized::QTensor::quantize::<BlockQ4K>(&rhs)?; + let rhs = quantized::QMatMul::from_qtensor(rhs); + let mm = rhs.forward(&lhs)?; + + assert_eq!(mm.dims(), [m, n]); + let dst = mm.flatten_all()?.to_vec1::<f32>()?; + let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]); + assert_eq!(dst, [1.125, 1.435, -0.201, 1.589]); + + //mirrored GGML unit test + ggml_matmul_error_test::<BlockQ4K>()?; + + Ok(()) +} + +#[test] +fn quantized_matmul_q5k() -> Result<()> { + use k_quants::BlockQ5K; + + let cpu = &Device::Cpu; + let (m, k, n) = (11, 512, 21); + let (lhs, rhs, mm) = get_random_tensors(m, k, n, cpu)?; + assert_eq!(mm.dims(), [m, n]); + let dst = mm.flatten_all()?.to_vec1::<f32>()?; + let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]); + assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]); + + let rhs = quantized::QTensor::quantize::<BlockQ5K>(&rhs)?; + let rhs = quantized::QMatMul::from_qtensor(rhs); + let mm = rhs.forward(&lhs)?; + + assert_eq!(mm.dims(), [m, n]); + let dst = mm.flatten_all()?.to_vec1::<f32>()?; + let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]); + assert_eq!(dst, [1.192, 1.491, -0.18, 1.743]); + + //mirrored GGML unit test + //Expected: 0.000740408897 + ggml_matmul_error_test::<BlockQ5K>()?; + + Ok(()) +} + #[test] fn quantized_matmul_q6k() -> Result<()> { use k_quants::BlockQ6K;