Sketch a fast cuda kernel for reduce-sum. (#109)

* Sketch a fast cuda kernel for reduce-sum.
* Sketch the rust support code for the fast sum kernel.
* More work on the fast kernel.
* Add some testing ground.
* A couple fixes for the fast sum kernel.
2025-06-16 18:48:51 +00:00 · 2023-07-08 12:43:56 +01:00
parent 33479c5f1b
commit e676f85f00
3 changed files with 134 additions and 1 deletions
--- a/candle-core/examples/cuda_basics.rs
+++ b/candle-core/examples/cuda_basics.rs
@@ -0,0 +1,15 @@
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+
+use anyhow::Result;
+use candle::{Device, Tensor};
+
+/// Prints `Tensor::sum` over `[0]` then `[1]` for a one-row f32 tensor created on CUDA device 0.
+fn main() -> Result<()> {
+    let input = Tensor::new(&[[1f32, 2., 3., 4.2]], &Device::new_cuda(0)?)?;
+    for dim in [0, 1] {
+        let sum = input.sum(&[dim])?;
+        println!("{sum}");
+    }
+    Ok(())
+}