Add the mmv kernels for small batch sizes. (#2075)

* Add the mmv kernels for smaller sizes.
* Support more mmv kernels.
* Use the new kernels.
* Fix the call.
* Silly fix.
* Improve the testing.
* Fix for dmmv.
* Add another dedicated test for the batching mmv.
2025-06-20 12:06:35 +00:00 · 2024-04-16 21:30:51 +02:00
parent 4d14777673
commit 2817643db9
3 changed files with 335 additions and 29 deletions
--- a/candle-core/tests/quantized_tests.rs
+++ b/candle-core/tests/quantized_tests.rs
@@ -170,12 +170,46 @@ fn quantized_matmul_neg(device: &Device) -> Result<()> {
    let res2 = matmul.forward(&lhs2)?;
    let res2 = res2.i(1)?;
    let diff = (res - res2)?.abs()?.sum_all()?.to_vec0::<f32>()?;
-    assert_eq!(diff, 0.);
+    if device.is_cuda() {
+        assert!(diff < 0.1);
+    } else {
+        assert_eq!(diff, 0.);
+    }
+    Ok(())
+}
+
+fn qmm_batch(dev: &Device) -> Result<()> { // repeated lhs rows must yield repeated outputs
+    let (lhs, rhs, _mm) = get_random_tensors(2, 256, 6, dev)?;
+    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q2K)?; // quantize rhs as Q2K
+    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
+    let mm = rhs.forward(&lhs)?; // reference result for the un-repeated lhs
+    assert_eq!(mm.shape().dims(), [2, 6]);
+    let lhs2 = Tensor::cat(&[&lhs, &lhs], 0)?; // lhs stacked twice along dim 0
+    let mm2 = rhs.forward(&lhs2)?;
+    assert_eq!(mm2.shape().dims(), [4, 6]);
+    let diff2 = (mm2.i(2..)? - &mm)?.abs()?.sum_all()?.to_vec0::<f32>()?;
+    assert_eq!(diff2, 0.0); // rows 2.. must equal the reference exactly, on every device
+    let lhs3 = Tensor::cat(&[&lhs2, &lhs], 0)?; // lhs stacked three times along dim 0
+    let mm3 = rhs.forward(&lhs3)?;
+    assert_eq!(mm3.shape().dims(), [6, 6]);
+    let diff3 = (mm3.i(2..4)? - &mm)?.abs()?.sum_all()?.to_vec0::<f32>()?;
+    if dev.is_cuda() { // rows 2..4: CUDA gets a 1e-4 tolerance, other devices must be exact
+        assert!(diff3 < 1e-4)
+    } else {
+        assert_eq!(diff3, 0.0)
+    };
+    let diff3 = (mm3.i(4..)? - &mm)?.abs()?.sum_all()?.to_vec0::<f32>()?;
+    if dev.is_cuda() { // rows 4..: same check, CUDA tolerance 1e-4, exact elsewhere
+        assert!(diff3 < 1e-4)
+    } else {
+        assert_eq!(diff3, 0.0)
+    };
    Ok(())
 }

 test_device!(quantized_matmul, qmm_cpu, qmm_cuda, qmm_metal);
 test_device!(quantized_matmul_neg, qmm_n_cpu, qmm_n_cuda, qmm_n_metal);
+test_device!(qmm_batch, qmm_b_cpu, qmm_b_cuda, qmm_b_metal);

 fn quantize_q4_0(device: &Device) -> Result<()> {
    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();