More ggml cuda kernels (#1977)

* Add more cuda kernels for quantized matmul.

* Add the vec-dot bits.

* Expose the quantized matmul-vec kernels.

* Also include the quantize-q8-1 kernel.

* Glue code for the q8-1 quantization.

* Compute the mm-vec product via q8-1 quantization (block layout sketched below).

* Add a test.

* Add a mm test.

* Get the test to return some sensible results.

* Also test dmmv.

* Fix the launch params.

* Allow for tweaking the force_dmmv parameter while it's experimental.
Author: Laurent Mazare
Date: 2024-04-01 00:15:48 +02:00
Committed by: GitHub
Parent: f9954b73ba
Commit: cd29c7ccd4
3 changed files with 1169 additions and 82 deletions
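For context on the q8-1 bits above: ggml's q8_1 format quantizes values in blocks of 32, storing an f16 scale together with a pre-computed f16 `d * sum(qs)` term that the vec-dot kernels can use to fold in the offset of formats such as q4_1 without a second pass over the block. Below is a minimal layout sketch; field names follow ggml, and the u16 fields stand in for IEEE f16 bit patterns so the snippet needs no external crates.

```rust
// Sketch of ggml's q8_1 block layout, the target of the quantize-q8-1 kernel.
// Each block covers 32 weights.
pub const QK8_1: usize = 32;

#[repr(C)]
pub struct BlockQ8_1 {
    pub d: u16,          // f16 bits: the scale (delta)
    pub s: u16,          // f16 bits: d * sum(qs), pre-computed for the vec-dot kernels
    pub qs: [i8; QK8_1], // the quantized values
}

fn main() {
    // 2 + 2 + 32 bytes per 32 weights.
    assert_eq!(std::mem::size_of::<BlockQ8_1>(), 36);
}
```

Keeping the activations in this form is what makes the mm-vec product via q8-1 quantization attractive on CUDA: the inner dot products can stay in integer arithmetic, with the f16 scale and sum terms applied once per block.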

@@ -235,6 +235,10 @@ struct Args {
     /// Group-Query Attention, use 8 for the 70B version of LLaMAv2.
     #[arg(long)]
     gqa: Option<usize>,
+    /// Use the (experimental) fast cuda kernels.
+    #[arg(long)]
+    fast_cuda: bool,
 }
 impl Args {
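Note that with clap's derive syntax, the new fast_cuda: bool field surfaces as a --fast-cuda flag that defaults to false, so the experimental kernels stay opt-in.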
@@ -341,6 +345,10 @@ fn main() -> anyhow::Result<()> {
     use tracing_subscriber::prelude::*;
     let args = Args::parse();
+    #[cfg(feature = "cuda")]
+    candle::quantized::cuda::set_force_dmmv(!args.fast_cuda);
     let temperature = if args.temperature == 0. {
         None
     } else {
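The inverted call set_force_dmmv(!args.fast_cuda) keeps the older dequantize-matmul-vec (dmmv) kernels as the default and only routes through the new q8-1 vec-dot path when --fast-cuda is passed. Here is a minimal sketch of how such a process-wide toggle might be wired up, assuming an atomic flag; only the set_force_dmmv name comes from the diff, and the two kernel stand-ins are illustrative assumptions rather than the actual candle implementation.

```rust
use std::sync::atomic::{AtomicBool, Ordering};

// Default to the older, well-tested dmmv path while the fast path is experimental.
static FORCE_DMMV: AtomicBool = AtomicBool::new(true);

pub fn set_force_dmmv(force: bool) {
    FORCE_DMMV.store(force, Ordering::Relaxed)
}

// Stand-in for the existing dequantize-then-multiply CUDA launch (dmmv).
fn dequantize_mul_mat_vec(ys: &mut [f32]) {
    ys.fill(0.0); // placeholder for the kernel launch
}

// Stand-in for the new path: quantize the activations to q8-1, then run
// the quantized vec-dot kernels.
fn mul_mat_vec_via_q8_1(ys: &mut [f32]) {
    ys.fill(0.0); // placeholder for the kernel launches
}

pub fn mul_mat_vec(ys: &mut [f32]) {
    if FORCE_DMMV.load(Ordering::Relaxed) {
        dequantize_mul_mat_vec(ys)
    } else {
        mul_mat_vec_via_q8_1(ys)
    }
}

fn main() {
    let mut ys = vec![0.0f32; 4];
    set_force_dmmv(false); // what passing --fast-cuda would do
    mul_mat_vec(&mut ys);
}
```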