Cuda acceleration for quantized model. (#1754)

* Boilerplate for the quantized cuda support.
* More basic cuda support.
* More cuda quantization (quantize on cpu for now).
* Add the dequantization bit.
* Start adding some dedicated cuda kernels from llama.cpp.
* Move the kernel code.
* Start interfacing with the kernel.
* Tweak the kernel launch params.
* Bugfix for quantized metal.
* Fix some clippy lints.
* Tweak the launch parameters.
* Tweak cuda basics to perform a quantized matmul.
* Perform the dequantization on the cpu + use cublas for matmul.
* Add the dequantization kernel.
* Test the qmatmul.
* More kernels.
* Matmul-vec kernel.
* Add a couple kernels.
* More dequantization kernels.
2025-06-16 10:38:54 +00:00 · 2024-02-25 18:11:47 +01:00
parent 8d04f70f4d
commit 2f22afd80e
11 changed files with 1996 additions and 69 deletions
--- a/candle-core/examples/cuda_basics.rs
+++ b/candle-core/examples/cuda_basics.rs
@ -5,25 +5,32 @@ extern crate accelerate_src;
 extern crate intel_mkl_src;

 use anyhow::Result;
-use candle_core::{Device, Tensor};
+use candle_core::{Device, Module, Tensor};
+
+use candle_core::quantized::{QMatMul, QTensor};

 fn main() -> Result<()> {
    // Example entry point. All device-side tensors below are created on the
    // CUDA device with index 0; any failure is propagated to the caller via `?`.
    let device = Device::new_cuda(0)?;
-    let in_t = Tensor::rand(-1f32, 1f32, (1, 3, 12, 7), &device)?;
-    let k_t = Tensor::rand(-1f32, 1f32, (6, 3, 1, 1), &device)?;
-    let out_t = in_t.conv2d(&k_t, 0, 1, 1, 1)?;
-    println!("{out_t}");
-    let in_t = in_t.to_device(&Device::Cpu)?;
-    let k_t = k_t.to_device(&Device::Cpu)?;
-    let out_t2 = in_t.conv2d(&k_t, 0, 1, 1, 1)?;
-    let diff = (out_t.to_device(&Device::Cpu)? - out_t2)?
-        .sqr()?
-        .sum_all()?;
-    println!("{diff}");
    // Device path: draw a random (72, 32) tensor on the CUDA device and keep a
    // CPU copy of the same values (`q_cpu`) for the reference path further down.
    // The tensor is then quantized as Q4_0, wrapped in a `QMatMul`, applied to a
    // random (5, 32) input `x` through `forward`, and the result is printed.
+    let q = Tensor::randn(0f32, 1.0, (72, 32), &device)?;
+    let q_cpu = q.to_device(&Device::Cpu)?;
+    let q = QTensor::quantize(&q, candle_core::quantized::GgmlDType::Q4_0)?;
+    let q = QMatMul::from_qtensor(q)?;
+    let x = Tensor::randn(0f32, 1.0, (5, 32), &device)?;
+    let res_q_cuda = q.forward(&x)?;
+    println!("{res_q_cuda}");

-    let t = Tensor::randn(0f32, 1f32, (2, 4, 96, 96), &device)?;
-    let w = Tensor::randn(0f32, 1f32, (320, 4, 3, 3), &device)?;
-    let res = t.conv2d(&w, 1, 1, 1, 1)?;
-    println!("{res:?}");
    // CPU path: quantize the CPU copy with the same Q4_0 type. The quantized
    // tensor is dequantized once into a regular tensor (`q_cpu_tensor`) before
    // being moved into a `QMatMul`, so it can serve as matmul reference weights.
    // `res_q_cpu` is only printed; it does not take part in the `diff` below.
+    let q_cpu = QTensor::quantize(&q_cpu, candle_core::quantized::GgmlDType::Q4_0)?;
+    let q_cpu_tensor = q_cpu.dequantize(&Device::Cpu)?;
+    let q_cpu = QMatMul::from_qtensor(q_cpu)?;
+    let x_cpu = x.to_device(&Device::Cpu)?;
+    let res_q_cpu = q_cpu.forward(&x_cpu)?;
+    println!("{res_q_cpu}");
+
    // Reference check: multiply the CPU input by the transposed dequantized
    // weights and compare against the CUDA quantized-matmul result copied back
    // to the CPU. After `abs` + `flatten_all`, `max(0)` reduces over the single
    // remaining dimension, so the printed value is the largest absolute
    // element-wise difference.
    // NOTE(review): the `to_device` result is not unwrapped with its own `?`;
    // the subtraction is applied to it directly and the `?` after the closing
    // parenthesis handles the error — presumably relying on an operator impl
    // that accepts a `Result` operand; confirm against the tensor API.
+    let res_mm = x_cpu.matmul(&q_cpu_tensor.t()?)?;
+    let diff = (res_mm - res_q_cuda.to_device(&Device::Cpu))?
+        .abs()?
+        .flatten_all()?
+        .max(0)?;
+    println!("{diff}");
    Ok(())
 }