From 71735c7a02018974a4f281d19d484950052965da Mon Sep 17 00:00:00 2001 From: laurent Date: Wed, 21 Jun 2023 19:43:25 +0100 Subject: [PATCH] Move the data between the host and the device. --- src/cuda_backend.rs | 80 +++++++++++++++++++++++++++++++++++++++++++++ src/device.rs | 23 +++++-------- src/lib.rs | 2 ++ src/storage.rs | 8 ++--- src/tensor.rs | 7 +++- 5 files changed, 100 insertions(+), 20 deletions(-) create mode 100644 src/cuda_backend.rs diff --git a/src/cuda_backend.rs b/src/cuda_backend.rs new file mode 100644 index 00000000..44e75575 --- /dev/null +++ b/src/cuda_backend.rs @@ -0,0 +1,80 @@ +use crate::{CpuStorage, DType, Result, Shape}; +use cudarc::driver::CudaSlice; + +#[derive(Debug, Clone)] +pub struct CudaDevice(std::sync::Arc<cudarc::driver::CudaDevice>); + +impl CudaDevice { + pub(crate) fn new(ordinal: usize) -> Result<Self> { + let device = cudarc::driver::CudaDevice::new(ordinal)?; + Ok(Self(device)) + } + + pub(crate) fn ordinal(&self) -> usize { + self.0.ordinal() + } + + pub(crate) fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> { + let elem_count = shape.elem_count(); + match dtype { + DType::F32 => { + let data = self.0.alloc_zeros::<f32>(elem_count)?; + Ok(CudaStorage::F32(data)) + } + DType::F64 => { + let data = self.0.alloc_zeros::<f64>(elem_count)?; + Ok(CudaStorage::F64(data)) + } + } + } + + pub(crate) fn cuda_from_cpu_storage(&self, storage: &CpuStorage) -> Result<CudaStorage> { + match storage { + CpuStorage::F32(storage) => { + let data = self.0.htod_sync_copy(storage)?; + Ok(CudaStorage::F32(data)) + } + CpuStorage::F64(storage) => { + let data = self.0.htod_sync_copy(storage)?; + Ok(CudaStorage::F64(data)) + } + } + } +} + +#[derive(Debug, Clone)] +pub enum CudaStorage { + F32(CudaSlice<f32>), + F64(CudaSlice<f64>), +} + +impl CudaStorage { + pub fn dtype(&self) -> DType { + match self { + Self::F32(_) => DType::F32, + Self::F64(_) => DType::F64, + } + } + + pub fn device(&self) -> CudaDevice { + match self { + Self::F32(slice) => CudaDevice(slice.device()), + Self::F64(slice) => 
CudaDevice(slice.device()), + } + } + + pub(crate) fn to_cpu_storage(&self) -> Result<CpuStorage> { + match self { + Self::F32(slice) => { + let dev = slice.device(); + let cpu_storage = dev.dtoh_sync_copy(slice)?; + Ok(CpuStorage::F32(cpu_storage)) + } + Self::F64(slice) => { + let dev = slice.device(); + let cpu_storage = dev.dtoh_sync_copy(slice)?; + Ok(CpuStorage::F64(cpu_storage)) + } + } + } +} diff --git a/src/device.rs b/src/device.rs index 19e1a302..e522cd42 100644 --- a/src/device.rs +++ b/src/device.rs @@ -11,7 +11,7 @@ #[derive(Debug, Clone)] pub enum Device { Cpu, - Cuda(std::sync::Arc<cudarc::driver::CudaDevice>), + Cuda(crate::CudaDevice), } // TODO: Should we back the cpu implementation using the NdArray crate or similar? @@ -63,8 +63,7 @@ impl<S: crate::WithDType, const N: usize, const M: usize> NdArray for &[[S; N]; M] impl Device { pub fn new_cuda(ordinal: usize) -> Result<Self> { - let device = cudarc::driver::CudaDevice::new(ordinal)?; - Ok(Self::Cuda(device)) + Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?)) } pub fn location(&self) -> DeviceLocation { @@ -85,7 +84,8 @@ impl Device { Device::Cuda(device) => { // TODO: Instead of allocating memory on the host and transfering it, // allocate some zeros on the device and use a shader to set them to 1. - let storage = device.htod_copy(vec![1f32; shape.elem_count()])?; + let storage = CpuStorage::ones_impl(shape, dtype); + let storage = device.cuda_from_cpu_storage(&storage)?; Ok(Storage::Cuda(storage)) } } @@ -98,7 +98,7 @@ impl Device { Ok(Storage::Cpu(storage)) } Device::Cuda(device) => { - let storage = device.alloc_zeros::<f32>(shape.elem_count())?; + let storage = device.zeros_impl(shape, dtype)?; Ok(Storage::Cuda(storage)) } } @@ -108,16 +108,9 @@ impl Device { match self { Device::Cpu => Ok(Storage::Cpu(array.to_cpu_storage())), Device::Cuda(device) => { - // TODO: Avoid making a copy through the cpu. 
- match array.to_cpu_storage() { - CpuStorage::F64(_) => { - todo!() - } - CpuStorage::F32(data) => { - let storage = device.htod_copy(data)?; - Ok(Storage::Cuda(storage)) - } - } + let storage = array.to_cpu_storage(); + let storage = device.cuda_from_cpu_storage(&storage)?; + Ok(Storage::Cuda(storage)) } } } diff --git a/src/lib.rs b/src/lib.rs index 9462bfa0..f09daa90 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ mod cpu_backend; +mod cuda_backend; mod device; mod dtype; mod error; @@ -9,6 +10,7 @@ mod strided_index; mod tensor; pub use cpu_backend::CpuStorage; +pub use cuda_backend::{CudaDevice, CudaStorage}; pub use device::{Device, DeviceLocation}; pub use dtype::{DType, WithDType}; pub use error::{Error, Result}; diff --git a/src/storage.rs b/src/storage.rs index 4c8ebcc3..7230104e 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -1,9 +1,9 @@ -use crate::{CpuStorage, DType, Device, Error, Result, Shape}; +use crate::{CpuStorage, CudaStorage, DType, Device, Error, Result, Shape}; #[derive(Debug, Clone)] pub enum Storage { Cpu(CpuStorage), - Cuda(cudarc::driver::CudaSlice<f32>), + Cuda(CudaStorage), } pub(crate) trait UnaryOp { @@ -100,14 +100,14 @@ impl Storage { pub fn device(&self) -> Device { match self { Self::Cpu(_) => Device::Cpu, - Self::Cuda(slice) => Device::Cuda(slice.device()), + Self::Cuda(storage) => Device::Cuda(storage.device()), } } pub fn dtype(&self) -> DType { match self { Self::Cpu(storage) => storage.dtype(), - Self::Cuda { .. } => todo!(), + Self::Cuda(storage) => storage.dtype(), } } diff --git a/src/tensor.rs b/src/tensor.rs index 5faf886f..be642329 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -250,7 +250,12 @@ impl Tensor { let data = S::cpu_storage_as_slice(cpu_storage)?; Ok(self.strided_index().map(|i| data[i]).collect()) } - Storage::Cuda(_) => todo!(), + Storage::Cuda(slice) => { + // TODO: Would it be possible to only fetch the necessary data? 
+ let cpu_storage = slice.to_cpu_storage()?; + let data = S::cpu_storage_as_slice(&cpu_storage)?; + Ok(self.strided_index().map(|i| data[i]).collect()) + } } }