Start adding support for cuda.

2025-06-16 10:38:54 +00:00 · 2023-06-21 18:11:56 +01:00
parent 7c317f9611
commit 2bfe8f18ab
5 changed files with 39 additions and 18 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -22,5 +22,5 @@ rand = "0.8.5"
 tokenizers = "0.13.3"

 [features]
-default = []
+default = ["cuda"]
 cuda = ["dep:cudarc"]
--- a/src/device.rs
+++ b/src/device.rs
@ -1,11 +1,19 @@
 use crate::{CpuStorage, DType, Result, Shape, Storage};

+/// A `DeviceLocation` represents a physical device, whereas multiple `Device`
+/// values can live on the same location (typically for CUDA devices).
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub enum Device {
+pub enum DeviceLocation {
    Cpu,
    Cuda { gpu_id: usize },
 }

+#[derive(Debug, Clone)]
+pub enum Device {
+    Cpu,
+    Cuda(std::sync::Arc<cudarc::driver::CudaDevice>),
+}
+
 // TODO: Should we back the cpu implementation using the NdArray crate or similar?
 pub trait NdArray {
    fn shape(&self) -> Result<Shape>;
@ -54,13 +62,22 @@ impl<S: crate::WithDType, const N: usize, const M: usize> NdArray for &[[S; N];
 }

 impl Device {
+    pub fn location(&self) -> DeviceLocation {
+        match self {
+            Self::Cpu => DeviceLocation::Cpu,
+            Self::Cuda(device) => DeviceLocation::Cuda {
+                gpu_id: device.ordinal(),
+            },
+        }
+    }
+
    pub(crate) fn ones(&self, shape: &Shape, dtype: DType) -> Result<Storage> {
        match self {
            Device::Cpu => {
                let storage = Storage::Cpu(CpuStorage::ones_impl(shape, dtype));
                Ok(storage)
            }
-            Device::Cuda { gpu_id: _ } => {
+            Device::Cuda(_) => {
                todo!()
            }
        }
@ -69,11 +86,12 @@ impl Device {
    pub(crate) fn zeros(&self, shape: &Shape, dtype: DType) -> Result<Storage> {
        match self {
            Device::Cpu => {
-                let storage = Storage::Cpu(CpuStorage::zeros_impl(shape, dtype));
-                Ok(storage)
+                let storage = CpuStorage::zeros_impl(shape, dtype);
+                Ok(Storage::Cpu(storage))
            }
-            Device::Cuda { gpu_id: _ } => {
-                todo!()
+            Device::Cuda(device) => {
+                let storage = device.alloc_zeros::<f32>(shape.elem_count())?;
+                Ok(Storage::Cuda(storage))
            }
        }
    }
@ -84,7 +102,7 @@ impl Device {
                let storage = Storage::Cpu(array.to_cpu_storage());
                Ok(storage)
            }
-            Device::Cuda { gpu_id: _ } => {
+            Device::Cuda(_) => {
                todo!()
            }
        }
--- a/src/error.rs
+++ b/src/error.rs
@ -1,4 +1,4 @@
-use crate::{DType, Device, Shape};
+use crate::{DType, DeviceLocation, Shape};

 /// Main library error type.
 #[derive(thiserror::Error, Debug)]
@ -15,8 +15,8 @@ pub enum Error {

    #[error("device mismatch in {op}, lhs: {lhs:?}, rhs: {rhs:?}")]
    DeviceMismatchBinaryOp {
-        lhs: Device,
-        rhs: Device,
+        lhs: DeviceLocation,
+        rhs: DeviceLocation,
        op: &'static str,
    },

@ -33,6 +33,9 @@ pub enum Error {
        got: usize,
        shape: Shape,
    },
+
+    #[error(transparent)]
+    Cudarc(#[from] cudarc::driver::DriverError),
 }

 pub type Result<T> = std::result::Result<T, Error>;
--- a/src/lib.rs
+++ b/src/lib.rs
@ -9,7 +9,7 @@ mod strided_index;
 mod tensor;

 pub use cpu_backend::CpuStorage;
-pub use device::Device;
+pub use device::{Device, DeviceLocation};
 pub use dtype::{DType, WithDType};
 pub use error::{Error, Result};
 pub use shape::Shape;
--- a/src/storage.rs
+++ b/src/storage.rs
@ -3,7 +3,7 @@ use crate::{CpuStorage, DType, Device, Error, Result, Shape};
 #[derive(Debug, Clone)]
 pub enum Storage {
    Cpu(CpuStorage),
-    Cuda { gpu_id: usize }, // TODO: Actually add the storage.
+    Cuda(cudarc::driver::CudaSlice<f32>),
 }

 pub(crate) trait UnaryOp {
@ -100,7 +100,7 @@ impl Storage {
    pub fn device(&self) -> Device {
        match self {
            Self::Cpu(_) => Device::Cpu,
-            Self::Cuda { gpu_id } => Device::Cuda { gpu_id: *gpu_id },
+            Self::Cuda(slice) => Device::Cuda(slice.device()),
        }
    }

@ -112,8 +112,8 @@ impl Storage {
    }

    pub(crate) fn same_device(&self, rhs: &Self, op: &'static str) -> Result<()> {
-        let lhs = self.device();
-        let rhs = rhs.device();
+        let lhs = self.device().location();
+        let rhs = rhs.device().location();
        if lhs != rhs {
            Err(Error::DeviceMismatchBinaryOp { lhs, rhs, op })
        } else {
@ -179,8 +179,8 @@ impl Storage {
                // Should not happen because of the same device check above but we're defensive
                // anyway.
                Err(Error::DeviceMismatchBinaryOp {
-                    lhs: lhs.device(),
-                    rhs: rhs.device(),
+                    lhs: lhs.device().location(),
+                    rhs: rhs.device().location(),
                    op: B::NAME,
                })
            }