AVX version of the vecdot for q4_0. (#474)
* AVX version of the vecdot for q4_0.
* Tweak the avx bits.
* Add a qmatmul benchmark.
* Fix the quantized test.
@@ -5,6 +5,7 @@ extern crate intel_mkl_src;
 #[cfg(feature = "accelerate")]
 extern crate accelerate_src;
 
+use candle_core::quantized::GgmlType;
 use candle_core::{Device, Result, Tensor, D};
 use clap::{Parser, Subcommand};
 
@@ -81,6 +82,27 @@ impl Benchmark for Matmul {
     const ITERS: usize = 100;
 }
 
+// This benchmark is similar to:
+// https://github.com/ggerganov/llama.cpp/blob/master/examples/benchmark/benchmark-matmult.cpp
+struct QMatMul;
+impl Benchmark for QMatMul {
+    type PreProcessData = (candle_core::quantized::QMatMul, Tensor);
+    type RunResult = Tensor;
+    fn preprocess() -> Result<Self::PreProcessData> {
+        let zeros = vec![candle_core::quantized::k_quants::BlockQ4_0::zeros(); 4096 * 11008 / 32];
+        let mm = candle_core::quantized::QTensor::new(zeros, (4096, 11008));
+        let mm = candle_core::quantized::QMatMul::from_qtensor(mm);
+        let arg = Tensor::randn(0f32, 1., (128, 11008), &Device::Cpu)?;
+        Ok((mm, arg))
+    }
+
+    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+        d.0.forward(&d.1)
+    }
+
+    const ITERS: usize = 100;
+}
+
 struct Softmax;
 impl Benchmark for Softmax {
     type PreProcessData = Tensor;
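For scale: the preprocess step above builds an all-zero Q4_0 weight of shape (4096, 11008), matching a LLaMA-7B feed-forward projection, so the benchmark exercises a realistic quantized-matmul workload. A quick standalone sketch of the sizing arithmetic; the 18-byte block figure is an assumption based on the standard ggml Q4_0 layout (an f16 scale plus 16 bytes of packed nibbles per 32 weights), not something this diff defines:

    // Sizing sketch for the benchmark weight above.
    fn main() {
        let (rows, cols) = (4096usize, 11008usize);
        let block_size = 32; // QK4_0: weights per quantized block
        let blocks = rows * cols / block_size; // length of the zeros vec above
        let q4_bytes = blocks * 18; // assumed: f16 scale (2) + packed nibbles (16)
        let f32_bytes = rows * cols * 4;
        println!("blocks: {blocks}");
        println!("q4_0 weight: {:.1} MiB", q4_bytes as f64 / (1024.0 * 1024.0));
        println!("f32 weight:  {:.1} MiB", f32_bytes as f64 / (1024.0 * 1024.0));
    }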
@@ -116,6 +138,7 @@ enum Task {
     Conv1d,
     Conv2d,
     Matmul,
+    Qmatmul,
     Softmax,
 }
 
@@ -137,6 +160,7 @@ fn main() -> Result<()> {
         Task::Conv2d => run::<Conv2d>(args.iters)?,
         Task::Matmul => run::<Matmul>(args.iters)?,
         Task::Softmax => run::<Softmax>(args.iters)?,
+        Task::Qmatmul => run::<QMatMul>(args.iters)?,
     }
     Ok(())
 }
@@ -1,3 +1,8 @@
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
 use super::GgmlDType;
 use crate::Result;
 use half::f16;
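Note that the new SIMD code is gated at compile time via cfg(target_feature = "avx") rather than detected at runtime, so the AVX path is only compiled in when the build enables the feature (e.g. RUSTFLAGS="-C target-cpu=native"). A minimal standalone sketch of the difference; the function names here are illustrative only, not part of the diff:

    // Compile-time gating, as used in this diff: the path is chosen at build time.
    #[cfg(target_feature = "avx")]
    fn vec_dot_path() -> &'static str {
        "avx"
    }

    #[cfg(not(target_feature = "avx"))]
    fn vec_dot_path() -> &'static str {
        "generic"
    }

    fn main() {
        // Runtime detection, for contrast: reports the CPU capability even if
        // the binary was compiled without the feature enabled.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        println!("cpu has avx: {}", is_x86_feature_detected!("avx"));
        println!("compiled path: {}", vec_dot_path());
    }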
@@ -620,6 +625,48 @@ impl GgmlType for BlockQ8K {
     }
 }
 
+#[cfg(target_feature = "avx")]
+#[inline(always)]
+unsafe fn sum_i16_pairs_float(x: __m256i) -> __m256 {
+    let ones = _mm256_set1_epi16(1);
+    let summed_pairs = _mm256_madd_epi16(ones, x);
+    _mm256_cvtepi32_ps(summed_pairs)
+}
+
+#[cfg(target_feature = "avx")]
+#[inline(always)]
+unsafe fn mul_sum_us8_pairs_float(ax: __m256i, sy: __m256i) -> __m256 {
+    let dot = _mm256_maddubs_epi16(ax, sy);
+    sum_i16_pairs_float(dot)
+}
+
+#[cfg(target_feature = "avx")]
+#[inline(always)]
+unsafe fn hsum_float_8(x: __m256) -> f32 {
+    let mut res = _mm256_extractf128_ps(x, 1);
+    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
+    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
+    res = _mm_add_ss(res, _mm_movehdup_ps(res));
+    _mm_cvtss_f32(res)
+}
+
+#[cfg(target_feature = "avx")]
+#[inline(always)]
+unsafe fn bytes_from_nibbles_32(rsi: *const u8) -> __m256i {
+    let tmp = _mm_loadu_si128(rsi as *const __m128i);
+    let bytes = _mm256_insertf128_si256::<1>(_mm256_castsi128_si256(tmp), _mm_srli_epi16(tmp, 4));
+    let low_mask = _mm256_set1_epi8(0xF);
+    _mm256_and_si256(low_mask, bytes)
+}
+
+#[cfg(target_feature = "avx")]
+#[inline(always)]
+unsafe fn mul_sum_i8_pairs_float(x: __m256i, y: __m256i) -> __m256 {
+    let ax = _mm256_sign_epi8(x, x);
+    let sy = _mm256_sign_epi8(y, x);
+    mul_sum_us8_pairs_float(ax, sy)
+}
+
 impl GgmlType for BlockQ4_0 {
     const DTYPE: GgmlDType = GgmlDType::Q4_0;
     const BLCK_SIZE: usize = QK4_0;
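These helpers port the llama.cpp AVX building blocks: bytes_from_nibbles_32 unpacks 16 bytes of 4-bit quants into 32 bytes, mul_sum_i8_pairs_float multiplies signed bytes and accumulates each group of four adjacent products into one of eight f32 lanes (the sign trick exists because _mm256_maddubs_epi16 requires one unsigned operand), and hsum_float_8 reduces the eight lanes to a single scalar. Scalar sketches of the first two, written as illustrative assumptions rather than code from the diff:

    // bytes_from_nibbles_32: byte i of the result holds the low nibble of
    // qs[i], byte i + 16 holds the high nibble, both zero-extended.
    fn bytes_from_nibbles_32_scalar(qs: &[u8; 16]) -> [u8; 32] {
        let mut out = [0u8; 32];
        for i in 0..16 {
            out[i] = qs[i] & 0x0F;
            out[i + 16] = qs[i] >> 4;
        }
        out
    }

    // mul_sum_i8_pairs_float: element-wise i8 products, summed over each group
    // of four adjacent bytes into one of eight f32 lanes. The AVX sign trick
    // (ax = |x|, sy = y * sign(x)) leaves each product x[i] * y[i] unchanged.
    fn mul_sum_i8_pairs_scalar(x: &[i8; 32], y: &[i8; 32]) -> [f32; 8] {
        let mut out = [0f32; 8];
        for i in 0..32 {
            out[i / 4] += (x[i] as i32 * y[i] as i32) as f32;
        }
        out
    }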
@@ -685,7 +732,35 @@ impl GgmlType for BlockQ4_0 {
         Ok(())
     }
 
+    #[cfg(target_feature = "avx")]
+    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
+        let qk = QK8_0;
+        let nb = n / qk;
+        if n % QK8_0 != 0 {
+            crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
+        }
+        if nb % 2 != 0 {
+            crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
+        }
+
+        unsafe {
+            // Generic implementation.
+            let mut acc = _mm256_setzero_ps();
+            for (x, y) in xs.iter().zip(ys.iter()) {
+                let d = _mm256_set1_ps(f16::to_f32(x.d) * f16::to_f32(y.d));
+                let bx = bytes_from_nibbles_32(x.qs.as_ptr());
+                let off = _mm256_set1_epi8(8);
+                let bx = _mm256_sub_epi8(bx, off);
+                let by = _mm256_loadu_si256(y.qs.as_ptr() as *const __m256i);
+                let q = mul_sum_i8_pairs_float(bx, by);
+                acc = _mm256_fmadd_ps(d, q, acc);
+            }
+            Ok(hsum_float_8(acc))
+        }
+    }
+
     // https://github.com/ggerganov/llama.cpp/blob/b5ffb2849d23afe73647f68eec7b68187af09be6/ggml.c#L2361C10-L2361C122
+    #[cfg(not(target_feature = "avx"))]
     fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         let qk = QK8_0;
         let nb = n / qk;
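Per block pair, the loop above computes d_x * d_y * sum((q_x[i] - 8) * q_y[i]) over 32 quantized values, with the low nibbles of x.qs pairing against y.qs[0..16] and the high nibbles against y.qs[16..32]. A scalar reference for a single block pair, sketched under the standard ggml Q4_0/Q8_0 layouts rather than taken from this diff:

    // One Q4_0 x Q8_0 block pair, scalar equivalent of the AVX loop body above.
    fn block_dot(x_d: f32, x_qs: &[u8; 16], y_d: f32, y_qs: &[i8; 32]) -> f32 {
        let mut sum = 0i32;
        for i in 0..16 {
            let lo = (x_qs[i] & 0x0F) as i32 - 8; // low nibble, recentered
            let hi = (x_qs[i] >> 4) as i32 - 8; // high nibble, recentered
            sum += lo * y_qs[i] as i32 + hi * y_qs[i + 16] as i32;
        }
        x_d * y_d * sum as f32
    }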
@@ -16,10 +16,10 @@ fn quantized_matmul() -> Result<()> {
     k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_t)?;
     k_quants::matmul((m, k, n), &lhs, &rhs_t, &mut dst)?;
     assert_eq!(
-        dst,
+        dst.iter().map(|x| x.round()).collect::<Vec<_>>(),
         &[
-            85120.43, 214561.61, 345454.9, 474748.1, 213474.94, 604465.25, 1000686.4, 1388317.3,
-            341875.88, 994283.0, 1655708.8, 2301518.3
+            85120.0, 214562.0, 345455.0, 474748.0, 213475.0, 604465.0, 1000686.0, 1388317.0,
+            341876.0, 994283.0, 1655709.0, 2301518.0
         ]
     );
     let mm = tensor_lhs.matmul(&tensor_rhs)?;
@@ -36,11 +36,11 @@ fn quantized_matmul() -> Result<()> {
     let matmul = quantized::QMatMul::from_qtensor(qtensor);
     let res = matmul.forward(&tensor_lhs)?;
     assert_eq!(
-        res.to_vec2::<f32>()?,
+        to_vec2_round(&res, 0)?,
         &[
-            [85120.43, 214561.61, 345454.9, 474748.1],
-            [213474.94, 604465.25, 1000686.4, 1388317.3],
-            [341875.88, 994283.0, 1655708.8, 2301518.3]
+            [85120.0, 214562.0, 345455.0, 474748.0],
+            [213475.0, 604465.0, 1000686.0, 1388317.0],
+            [341876.0, 994283.0, 1655709.0, 2301518.0]
         ]
     );
 
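The tests switch from exact float comparisons to rounded ones because the AVX path accumulates in a different order than the scalar code, so the low-order digits of the results legitimately differ between builds. to_vec2_round is a helper defined elsewhere in the test suite; a minimal sketch of what it presumably does, as an assumption rather than the actual helper:

    // Assumed shape of the to_vec2_round test helper: extract a 2D f32 tensor
    // and round every element to `digits` decimal places.
    fn to_vec2_round(t: &candle_core::Tensor, digits: i32) -> candle_core::Result<Vec<Vec<f32>>> {
        let b = 10f32.powi(digits);
        let v = t.to_vec2::<f32>()?;
        Ok(v.into_iter()
            .map(|row| row.into_iter().map(|x| (x * b).round() / b).collect())
            .collect())
    }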
@@ -64,10 +64,10 @@ fn quantized_matmul_neg() -> Result<()> {
     k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_t)?;
     k_quants::matmul((m, k, n), &lhs, &rhs_t, &mut dst)?;
     assert_eq!(
-        dst,
+        dst.iter().map(|x| x.round()).collect::<Vec<_>>(),
         &[
-            243524.14, -19596.34, -285051.3, -549814.94, 23776.629, 21650.926, 19397.924,
-            18366.586, -196472.1, 63011.6, 324584.56, 587901.9
+            243524.0, -19596.0, -285051.0, -549815.0, 23777.0, 21651.0, 19398.0, 18367.0,
+            -196472.0, 63012.0, 324585.0, 587902.0
         ]
     );
     let mm = tensor_lhs.matmul(&tensor_rhs)?;