Cudnn support (#445)

* Add a cudnn feature to be used for conv2d.
* Allocate the proper workspace.
* Only create a single cudnn handle per cuda device.
* Proper cudnn usage.
* Bugfix.
2025-06-15 10:26:33 +00:00 · 2023-08-14 21:30:41 +01:00
parent c84883ecf2
commit 90374097dc
7 changed files with 195 additions and 12 deletions
--- a/README.md
+++ b/README.md
@ -28,7 +28,7 @@ Check out our [examples](./candle-examples/examples/):
 - [StarCoder](./candle-examples/examples/bigcode/): LLM specialized to code
  generation.
 - [Stable Diffusion](./candle-examples/examples/stable-diffusion/): text to
-  image generative model, only cpu support at the moment and on the slow side.
+  image generative model, yet to be optimized.

 Run them using the following commands:
 ```
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@ -35,6 +35,7 @@ clap = { workspace = true }

 [features]
 default = []
-cuda = ["dep:cudarc", "dep:candle-kernels"]
+cuda = ["cudarc", "dep:candle-kernels"]
+cudnn = ["cuda", "cudarc/cudnn"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
 accelerate = ["dep:libc", "dep:accelerate-src"]
--- a/candle-core/examples/cuda_basics.rs
+++ b/candle-core/examples/cuda_basics.rs
@ -9,10 +9,9 @@ use candle_core::{Device, Tensor};

 fn main() -> Result<()> {
    let device = Device::new_cuda(0)?;
-    let t = Tensor::new(&[[1f32, 2., 3., 4.2]], &device)?;
-    let sum = t.sum_keepdim(0)?;
-    println!("{sum}");
-    let sum = t.sum_keepdim(1)?;
-    println!("{sum}");
+    let t = Tensor::randn(0f32, 1f32, (2, 4, 96, 96), &device)?;
+    let w = Tensor::randn(0f32, 1f32, (320, 4, 3, 3), &device)?;
+    let res = t.conv2d(&w, 1, 1)?;
+    println!("{res:?}");
    Ok(())
 }
--- a/candle-core/src/cuda_backend.rs
+++ b/candle-core/src/cuda_backend.rs
@ -64,7 +64,7 @@ impl From<CudaError> for crate::Error {

 /// Unique identifier for cuda devices.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
-pub(crate) struct DeviceId(usize);
+pub struct DeviceId(usize);

 impl DeviceId {
    fn new() -> Self {
@ -111,6 +111,14 @@ impl<O, E: Into<CudaError>> WrapErr<O> for std::result::Result<O, E> {
 }

 impl CudaDevice {
+    /// Returns a shared handle to the underlying `cudarc` device.
+    ///
+    /// This clones the `Arc` held in `self.device`, so the caller receives another reference
+    /// to the same device rather than a new device.
+    pub fn cuda_device(&self) -> Arc<cudarc::driver::CudaDevice> {
+        self.device.clone()
+    }
+
+    /// Returns the `DeviceId` stored on this device (`DeviceId` is `Copy`, so it is returned
+    /// by value).
+    pub fn id(&self) -> DeviceId {
+        self.id
+    }
+
    fn const_impl(&self, v: f64, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
        let elem_count = shape.elem_count();
        let cfg = LaunchConfig::for_num_elems(elem_count as u32);
@ -936,17 +944,18 @@ impl<'a> Map2 for Conv2D<'a> {
        // Kernel shape: (c_out, c_in_k, w_k, h_k)
        // Input shape: (b_size, c_in, w_in, h_in)
        let p = &self.0;
+        let (out_w, out_h) = (p.out_w(), p.out_h());
+        let dst_el = p.c_out * out_w * out_h * p.b_size;
        let inp = &inp.slice(inp_l.start_offset()..);
        let k = &k.slice(k_l.start_offset()..);
        let shape = inp_l.shape();
        let dims = shape.dims();
        let el = shape.elem_count();
-        let (out_w, out_h) = (p.out_w(), p.out_h());
-        let dst_el = p.c_out * out_w * out_h * p.b_size;
-        let cfg = LaunchConfig::for_num_elems(dst_el as u32);
-        let func = dev.get_or_load_func(&kernel_name::<T>("conv2d"), kernels::CONV)?;
+
        // SAFETY: Set later by running the kernel.
        let out = unsafe { dev.alloc::<T>(dst_el) }.w()?;
+        let cfg = LaunchConfig::for_num_elems(dst_el as u32);
+        let func = dev.get_or_load_func(&kernel_name::<T>("conv2d"), kernels::CONV)?;
        let ds = if dims.len() == 4 {
            [dims, inp_l.stride(), k_l.dims(), k_l.stride()].concat()
        } else {
@ -1508,6 +1517,7 @@ impl BackendStorage for CudaStorage {
        Ok(Self { slice, device })
    }

+    #[cfg(not(feature = "cudnn"))]
    fn conv2d(
        &self,
        l: &Layout,
@ -1520,6 +1530,69 @@ impl BackendStorage for CudaStorage {
        Ok(Self { slice, device })
    }

+    /// cudnn-backed 2D convolution, compiled only when the `cudnn` feature is enabled.
+    ///
+    /// * `inp_l` - layout of the input held by `self`.
+    /// * `kernel` / `kernel_l` - storage and layout of the convolution kernel.
+    /// * `params` - convolution parameters; the output holds
+    ///   `c_out * out_w * out_h * b_size` elements.
+    ///
+    /// A non-contiguous kernel layout is routed to the `Conv2D` map instead of cudnn.
+    /// Otherwise the call is dispatched on dtype to `crate::cudnn::launch_conv2d`.
+    ///
+    /// # Errors
+    /// Returns an error for `u32` storage, for any other input/kernel storage pairing that
+    /// has no arm below (reported as a dtype mismatch), and when the allocation or the cudnn
+    /// launch fails.
+    #[cfg(feature = "cudnn")]
+    fn conv2d(
+        &self,
+        inp_l: &Layout,
+        kernel: &Self,
+        kernel_l: &Layout,
+        params: &crate::conv::ParamsConv2D,
+    ) -> Result<Self> {
+        let device = self.device().clone();
+        // cudnn is only used when the kernel is contiguous; otherwise go through `Conv2D`.
+        if !kernel_l.is_contiguous() {
+            let slice = Conv2D(params).map(&self.slice, inp_l, &kernel.slice, kernel_l, &device)?;
+            return Ok(Self { slice, device });
+        }
+        let (out_w, out_h) = (params.out_w(), params.out_h());
+        let dst_el = params.c_out * out_w * out_h * params.b_size;
+        // Every supported arm below is the same apart from the element type: skip to each
+        // layout's start offset, allocate the output, then run the cudnn convolution into it.
+        // NOTE(review): the `unsafe` allocations are presumably uninitialized and rely on
+        // `launch_conv2d` writing the whole output before it is read - confirm.
+        let slice = match (&self.slice, &kernel.slice) {
+            (S::U8(inp), S::U8(k)) => {
+                let inp = &inp.slice(inp_l.start_offset()..);
+                let k = &k.slice(kernel_l.start_offset()..);
+                let mut out = unsafe { device.alloc::<u8>(dst_el) }.w()?;
+                crate::cudnn::launch_conv2d::<u8>(inp, inp_l, k, &mut out, params, &device)
+                    .map_err(crate::Error::wrap)?;
+                S::U8(out)
+            }
+            (S::BF16(inp), S::BF16(k)) => {
+                let inp = &inp.slice(inp_l.start_offset()..);
+                let k = &k.slice(kernel_l.start_offset()..);
+                let mut out = unsafe { device.alloc::<bf16>(dst_el) }.w()?;
+                crate::cudnn::launch_conv2d::<bf16>(inp, inp_l, k, &mut out, params, &device)
+                    .map_err(crate::Error::wrap)?;
+                S::BF16(out)
+            }
+            (S::F16(inp), S::F16(k)) => {
+                let inp = &inp.slice(inp_l.start_offset()..);
+                let k = &k.slice(kernel_l.start_offset()..);
+                let mut out = unsafe { device.alloc::<f16>(dst_el) }.w()?;
+                crate::cudnn::launch_conv2d::<f16>(inp, inp_l, k, &mut out, params, &device)
+                    .map_err(crate::Error::wrap)?;
+                S::F16(out)
+            }
+
+            (S::F32(inp), S::F32(k)) => {
+                let inp = &inp.slice(inp_l.start_offset()..);
+                let k = &k.slice(kernel_l.start_offset()..);
+                let mut out = unsafe { device.alloc::<f32>(dst_el) }.w()?;
+                crate::cudnn::launch_conv2d::<f32>(inp, inp_l, k, &mut out, params, &device)
+                    .map_err(crate::Error::wrap)?;
+                S::F32(out)
+            }
+            (S::F64(inp), S::F64(k)) => {
+                let inp = &inp.slice(inp_l.start_offset()..);
+                let k = &k.slice(kernel_l.start_offset()..);
+                let mut out = unsafe { device.alloc::<f64>(dst_el) }.w()?;
+                crate::cudnn::launch_conv2d::<f64>(inp, inp_l, k, &mut out, params, &device)
+                    .map_err(crate::Error::wrap)?;
+                S::F64(out)
+            }
+            // u32 is rejected explicitly; any remaining pairing is reported as a mismatch.
+            (S::U32(_), S::U32(_)) => Err(CudaError::InternalError("conv2d does not support u32"))?,
+            _ => Err(CudaError::InternalError("dtype mismatch in conv2d"))?,
+        };
+        Ok(Self { slice, device })
+    }
+
    fn avg_pool2d(&self, l: &Layout, k: (usize, usize), stride: (usize, usize)) -> Result<Self> {
        let device = self.device().clone();
        let slice = Pool2D {
--- a/candle-core/src/cudnn.rs
+++ b/candle-core/src/cudnn.rs
@ -0,0 +1,107 @@
+use crate::WithDType;
+use cudarc;
+use cudarc::cudnn::safe::{Conv2dForward, Cudnn};
+use cudarc::driver::{CudaSlice, CudaView, DeviceRepr, ValidAsZeroBits};
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+// The cudnn handles are stored per thread here rather than on the `CudaDevice` as they are
+// neither `Send` nor `Sync`.
+thread_local! {
+    static CUDNN: RefCell<HashMap<crate::cuda_backend::DeviceId, Arc<Cudnn>>> = HashMap::new().into();
+}
+
+/// Converts a cudnn error into `crate::Error` by wrapping it with `Error::wrap`, so `?` can be
+/// applied to cudnn calls in functions returning `crate::Result`.
+impl From<cudarc::cudnn::CudnnError> for crate::Error {
+    fn from(err: cudarc::cudnn::CudnnError) -> Self {
+        crate::Error::wrap(err)
+    }
+}
+
+/// Converts a cuda driver error into `crate::Error` by wrapping it with `Error::wrap`, so `?`
+/// can be applied to driver calls (such as the workspace allocation in this module).
+impl From<cudarc::driver::DriverError> for crate::Error {
+    fn from(err: cudarc::driver::DriverError) -> Self {
+        crate::Error::wrap(err)
+    }
+}
+
+/// Runs a forward 2D convolution through cudnn, writing the result into `dst`.
+///
+/// * `src` - input buffer; expected to already start at the layout's offset.
+/// * `src_l` - input layout, used to choose between a plain NCHW descriptor (contiguous
+///   input) and a descriptor with explicit strides.
+/// * `filter` - kernel buffer, described to cudnn as an NCHW filter; no layout is consulted
+///   for it here.
+/// * `dst` - output buffer, described as an NCHW tensor of shape
+///   `[b_size, c_out, out_w, out_h]`.
+/// * `params` - convolution parameters (sizes, padding, stride).
+/// * `dev` - device whose id selects the per-thread cached cudnn handle.
+///
+/// # Errors
+/// Propagates any failure from handle creation, descriptor creation, algorithm selection,
+/// workspace allocation or the launch itself.
+pub(crate) fn launch_conv2d<
+    T: DeviceRepr + WithDType + ValidAsZeroBits + cudarc::cudnn::CudnnDataType,
+>(
+    src: &CudaView<T>,
+    src_l: &crate::Layout,
+    filter: &CudaView<T>,
+    dst: &mut CudaSlice<T>,
+    params: &crate::conv::ParamsConv2D,
+    dev: &crate::cuda_backend::CudaDevice,
+) -> crate::Result<()> {
+    let device_id = dev.id();
+    // Reuse this thread's handle for the device when one is cached; otherwise create one and
+    // cache it only if creation succeeded.
+    let cudnn = CUDNN.with(|cudnn| {
+        if let Some(cudnn) = cudnn.borrow().get(&device_id) {
+            return Ok(cudnn.clone());
+        }
+        let c = Cudnn::new(dev.cuda_device());
+        if let Ok(c) = &c {
+            cudnn.borrow_mut().insert(device_id, c.clone());
+        }
+        c
+    })?;
+    // The same padding and stride are applied on both spatial dimensions.
+    // NOTE(review): the `as i32` casts here and below wrap silently for values above
+    // `i32::MAX`.
+    let conv = cudnn.create_conv2d::<T>(
+        /* pad */ [params.padding as i32, params.padding as i32],
+        /* stride */ [params.stride as i32, params.stride as i32],
+        /* dilation */ [1, 1],
+        cudarc::cudnn::sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
+    )?;
+    let x_shape = [
+        params.b_size as i32,
+        params.c_in as i32,
+        params.i_w as i32,
+        params.i_h as i32,
+    ];
+    // Note that `src` already starts at the proper offset.
+    let x = if src_l.is_contiguous() {
+        cudnn.create_4d_tensor(
+            cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
+            x_shape,
+        )?
+    } else {
+        // Non-contiguous input: hand the layout's four strides to cudnn explicitly.
+        let s = src_l.stride();
+        cudnn.create_4d_tensor_ex(
+            x_shape,
+            [s[0] as i32, s[1] as i32, s[2] as i32, s[3] as i32],
+        )?
+    };
+    let w = cudnn.create_4d_filter(
+        cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
+        [
+            params.c_out as i32,
+            params.c_in as i32,
+            params.k_w as i32,
+            params.k_h as i32,
+        ],
+    )?;
+    let (w_out, h_out) = (params.out_w() as i32, params.out_h() as i32);
+    let y = cudnn.create_4d_tensor(
+        cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
+        [params.b_size as i32, params.c_out as i32, w_out, h_out],
+    )?;
+    let conv2d = Conv2dForward {
+        conv: &conv,
+        x: &x,
+        w: &w,
+        y: &y,
+    };
+    // The scratch buffer size depends on the algorithm cudnn picked for these descriptors.
+    let alg = conv2d.pick_algorithm()?;
+    let workspace_size = conv2d.get_workspace_size(alg)?;
+    let mut workspace = dev.cuda_device().alloc_zeros::<u8>(workspace_size)?;
+    // NOTE(review): `launch` is unsafe; presumably it requires `src`, `filter` and `dst` to be
+    // at least as large as the descriptors above claim, which is not checked here - confirm
+    // against the cudarc docs. `(T::one(), T::zero())` are presumably the alpha/beta scaling
+    // factors - confirm.
+    unsafe {
+        conv2d.launch::<CudaSlice<u8>, _, _, _>(
+            alg,
+            Some(&mut workspace),
+            (T::one(), T::zero()),
+            src,
+            filter,
+            dst,
+        )?;
+    }
+    Ok(())
+}
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@ -43,6 +43,8 @@ pub mod cpu_backend;
 pub mod cpu_kernels;
 #[cfg(feature = "cuda")]
 pub mod cuda_backend;
+#[cfg(feature = "cudnn")]
+pub mod cudnn;
 mod device;
 pub mod display;
 mod dtype;
--- a/candle-examples/Cargo.toml
+++ b/candle-examples/Cargo.toml
@ -47,6 +47,7 @@ anyhow = { workspace = true }
 default = []
 accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
 cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
+cudnn = ["candle/cudnn"]
 flash-attn = ["cuda", "dep:candle-flash-attn"]
 mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl", "candle-transformers/mkl"]
 nccl = ["cuda", "cudarc/nccl", "dep:half"]