diff --git a/candle-core/examples/cpu_benchmarks.rs b/candle-core/examples/cpu_benchmarks.rs
index 6c40269f..ef27131e 100644
--- a/candle-core/examples/cpu_benchmarks.rs
+++ b/candle-core/examples/cpu_benchmarks.rs
@@ -5,6 +5,7 @@ extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
 extern crate accelerate_src;
 
+use candle_core::quantized::GgmlType;
 use candle_core::{Device, Result, Tensor, D};
 use clap::{Parser, Subcommand};
@@ -81,6 +82,27 @@ impl Benchmark for Matmul {
     const ITERS: usize = 100;
 }
 
+// This benchmark is similar to:
+// https://github.com/ggerganov/llama.cpp/blob/master/examples/benchmark/benchmark-matmult.cpp
+struct QMatMul;
+impl Benchmark for QMatMul {
+    type PreProcessData = (candle_core::quantized::QMatMul, Tensor);
+    type RunResult = Tensor;
+    fn preprocess() -> Result<Self::PreProcessData> {
+        let zeros = vec![candle_core::quantized::k_quants::BlockQ4_0::zeros(); 4096 * 11008 / 32];
+        let mm = candle_core::quantized::QTensor::new(zeros, (4096, 11008));
+        let mm = candle_core::quantized::QMatMul::from_qtensor(mm);
+        let arg = Tensor::randn(0f32, 1., (128, 11008), &Device::Cpu)?;
+        Ok((mm, arg))
+    }
+
+    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+        d.0.forward(&d.1)
+    }
+
+    const ITERS: usize = 100;
+}
+
 struct Softmax;
 impl Benchmark for Softmax {
     type PreProcessData = Tensor;
@@ -116,6 +138,7 @@ enum Task {
     Conv1d,
     Conv2d,
     Matmul,
+    Qmatmul,
     Softmax,
 }
@@ -137,6 +160,7 @@ fn main() -> Result<()> {
         Task::Conv2d => run::<Conv2d>(args.iters)?,
         Task::Matmul => run::<Matmul>(args.iters)?,
         Task::Softmax => run::<Softmax>(args.iters)?,
+        Task::Qmatmul => run::<QMatMul>(args.iters)?,
     }
     Ok(())
 }
diff --git a/candle-core/src/quantized/k_quants.rs b/candle-core/src/quantized/k_quants.rs
index 2f622026..27d2ee3a 100644
--- a/candle-core/src/quantized/k_quants.rs
+++ b/candle-core/src/quantized/k_quants.rs
@@ -1,3 +1,8 @@
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
 use super::GgmlDType;
 use crate::Result;
 use half::f16;
@@ -620,6 +625,48 @@ impl GgmlType for BlockQ8K {
     }
 }
 
+#[cfg(target_feature = "avx")]
+#[inline(always)]
+unsafe fn sum_i16_pairs_float(x: __m256i) -> __m256 {
+    let ones = _mm256_set1_epi16(1);
+    let summed_pairs = _mm256_madd_epi16(ones, x);
+    _mm256_cvtepi32_ps(summed_pairs)
+}
+
+#[cfg(target_feature = "avx")]
+#[inline(always)]
+unsafe fn mul_sum_us8_pairs_float(ax: __m256i, sy: __m256i) -> __m256 {
+    let dot = _mm256_maddubs_epi16(ax, sy);
+    sum_i16_pairs_float(dot)
+}
+
+#[cfg(target_feature = "avx")]
+#[inline(always)]
+unsafe fn hsum_float_8(x: __m256) -> f32 {
+    let mut res = _mm256_extractf128_ps(x, 1);
+    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
+    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
+    res = _mm_add_ss(res, _mm_movehdup_ps(res));
+    _mm_cvtss_f32(res)
+}
+
+#[cfg(target_feature = "avx")]
+#[inline(always)]
+unsafe fn bytes_from_nibbles_32(rsi: *const u8) -> __m256i {
+    let tmp = _mm_loadu_si128(rsi as *const __m128i);
+    let bytes = _mm256_insertf128_si256::<1>(_mm256_castsi128_si256(tmp), _mm_srli_epi16(tmp, 4));
+    let low_mask = _mm256_set1_epi8(0xF);
+    _mm256_and_si256(low_mask, bytes)
+}
+
+#[cfg(target_feature = "avx")]
+#[inline(always)]
+unsafe fn mul_sum_i8_pairs_float(x: __m256i, y: __m256i) -> __m256 {
+    let ax = _mm256_sign_epi8(x, x);
+    let sy = _mm256_sign_epi8(y, x);
+    mul_sum_us8_pairs_float(ax, sy)
+}
+
 impl GgmlType for BlockQ4_0 {
     const DTYPE: GgmlDType = GgmlDType::Q4_0;
     const BLCK_SIZE: usize = QK4_0;
@@ -685,7 +732,35 @@
         Ok(())
     }
 
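+    // The AVX path below mirrors the scalar fallback: for each 32-element
+    // block it widens the packed 4-bit quants to bytes, subtracts the Q4_0
+    // offset of 8, dots them against the q8_0 quants, and accumulates the
+    // result scaled by the product of the two block scales.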
"avx")] + fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result { + let qk = QK8_0; + let nb = n / qk; + if n % QK8_0 != 0 { + crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}") + } + if nb % 2 != 0 { + crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even") + } + + unsafe { + // Generic implementation. + let mut acc = _mm256_setzero_ps(); + for (x, y) in xs.iter().zip(ys.iter()) { + let d = _mm256_set1_ps(f16::to_f32(x.d) * f16::to_f32(y.d)); + let bx = bytes_from_nibbles_32(x.qs.as_ptr()); + let off = _mm256_set1_epi8(8); + let bx = _mm256_sub_epi8(bx, off); + let by = _mm256_loadu_si256(y.qs.as_ptr() as *const __m256i); + let q = mul_sum_i8_pairs_float(bx, by); + acc = _mm256_fmadd_ps(d, q, acc); + } + Ok(hsum_float_8(acc)) + } + } + // https://github.com/ggerganov/llama.cpp/blob/b5ffb2849d23afe73647f68eec7b68187af09be6/ggml.c#L2361C10-L2361C122 + #[cfg(not(target_feature = "avx"))] fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result { let qk = QK8_0; let nb = n / qk; diff --git a/candle-core/tests/quantized_tests.rs b/candle-core/tests/quantized_tests.rs index b40a7fdb..a1318b9c 100644 --- a/candle-core/tests/quantized_tests.rs +++ b/candle-core/tests/quantized_tests.rs @@ -16,10 +16,10 @@ fn quantized_matmul() -> Result<()> { k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_t)?; k_quants::matmul((m, k, n), &lhs, &rhs_t, &mut dst)?; assert_eq!( - dst, + dst.iter().map(|x| x.round()).collect::>(), &[ - 85120.43, 214561.61, 345454.9, 474748.1, 213474.94, 604465.25, 1000686.4, 1388317.3, - 341875.88, 994283.0, 1655708.8, 2301518.3 + 85120.0, 214562.0, 345455.0, 474748.0, 213475.0, 604465.0, 1000686.0, 1388317.0, + 341876.0, 994283.0, 1655709.0, 2301518.0 ] ); let mm = tensor_lhs.matmul(&tensor_rhs)?; @@ -36,11 +36,11 @@ fn quantized_matmul() -> Result<()> { let matmul = quantized::QMatMul::from_qtensor(qtensor); let res = matmul.forward(&tensor_lhs)?; assert_eq!( - res.to_vec2::()?, + to_vec2_round(&res, 0)?, &[ - [85120.43, 214561.61, 345454.9, 474748.1], - [213474.94, 604465.25, 1000686.4, 1388317.3], - [341875.88, 994283.0, 1655708.8, 2301518.3] + [85120.0, 214562.0, 345455.0, 474748.0], + [213475.0, 604465.0, 1000686.0, 1388317.0], + [341876.0, 994283.0, 1655709.0, 2301518.0] ] ); @@ -64,10 +64,10 @@ fn quantized_matmul_neg() -> Result<()> { k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_t)?; k_quants::matmul((m, k, n), &lhs, &rhs_t, &mut dst)?; assert_eq!( - dst, + dst.iter().map(|x| x.round()).collect::>(), &[ - 243524.14, -19596.34, -285051.3, -549814.94, 23776.629, 21650.926, 19397.924, - 18366.586, -196472.1, 63011.6, 324584.56, 587901.9 + 243524.0, -19596.0, -285051.0, -549815.0, 23777.0, 21651.0, 19398.0, 18367.0, + -196472.0, 63012.0, 324585.0, 587902.0 ] ); let mm = tensor_lhs.matmul(&tensor_rhs)?;