Transfer tensors between devices.

2025-06-18 03:28:50 +00:00 · 2023-06-23 08:35:22 +01:00
parent fc41ccb5bb
commit 3b550a56dc
5 changed files with 63 additions and 0 deletions
--- a/src/cpu_backend.rs
+++ b/src/cpu_backend.rs
@ -4,6 +4,8 @@ use gemm::{gemm, Parallelism};
 // TODO: Think about whether we would be better off with a dtype and
 // a buffer as an owned slice of bytes.
 // TODO: Maybe we should not implement [Clone] here and instead have an explicit allocator +
 // intercept the oom errors to avoid panicking and provide a proper error.
 #[derive(Debug, Clone)]
 pub enum CpuStorage {
    F32(Vec<f32>),
--- a/src/cuda_backend.rs
+++ b/src/cuda_backend.rs
@ -28,8 +28,22 @@ pub enum CudaError {
 type Result<T> = std::result::Result<T, CudaError>;
 /// Unique identifier for cuda devices.
 ///
 /// Values are drawn from a process-wide atomic counter, so each call to `DeviceId::new`
 /// yields an id that differs from the ones handed out before it.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
 pub(crate) struct DeviceId(usize);
 impl DeviceId {
    /// Hands out the next identifier; the counter starts at 1 and grows by one per call.
    fn new() -> Self {
        // https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805
        use std::sync::atomic::{AtomicUsize, Ordering};
        static NEXT_ID: AtomicUsize = AtomicUsize::new(1);
        // `fetch_add` returns the value held *before* the increment.
        let id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
        Self(id)
    }
 }
 #[derive(Debug, Clone)]
 pub struct CudaDevice {
    id: DeviceId,
    device: Arc<cudarc::driver::CudaDevice>,
    #[allow(dead_code)]
    blas: Arc<cudarc::cublas::CudaBlas>,
@ -48,11 +62,16 @@ impl CudaDevice {
        let device = cudarc::driver::CudaDevice::new(ordinal)?;
        let blas = cudarc::cublas::CudaBlas::new(device.clone())?;
        Ok(Self {
            id: DeviceId::new(),
            device,
            blas: Arc::new(blas),
        })
    }
    /// Returns `true` when `self` and `rhs` carry the same `id` field.
    ///
    /// Only the identifier is compared; the ordinal and the underlying driver handle are not
    /// looked at.
    pub(crate) fn same_id(&self, rhs: &Self) -> bool {
        self.id == rhs.id
    }
    /// Returns the ordinal reported by the wrapped `cudarc` device handle.
    pub(crate) fn ordinal(&self) -> usize {
        self.device.ordinal()
    }
--- a/src/device.rs
+++ b/src/device.rs
@ -66,6 +66,14 @@ impl Device {
        Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?))
    }
    /// Tells whether `self` and `rhs` designate the same device.
    ///
    /// Two CPU devices always match, two CUDA devices match when the cuda backend's
    /// `same_id` reports so, and every other combination is a mismatch.
    pub fn same_id(&self, rhs: &Self) -> bool {
        // CUDA pairs are the only case that needs to consult the backend.
        if let (Self::Cuda(a), Self::Cuda(b)) = (self, rhs) {
            return a.same_id(b);
        }
        matches!((self, rhs), (Self::Cpu, Self::Cpu))
    }
    pub fn location(&self) -> DeviceLocation {
        match self {
            Self::Cpu => DeviceLocation::Cpu,
--- a/src/dummy_cuda_backend.rs
+++ b/src/dummy_cuda_backend.rs
@ -17,6 +17,10 @@ impl CudaDevice {
        Err(Error::NotCompiledWithCudaSupport)
    }
    /// Dummy-backend counterpart of the cuda `same_id`: ignores its argument and always
    /// returns `true`.
    // NOTE(review): presumably never reached in practice because this backend cannot hand out
    // a device when cuda support is compiled out — confirm against the constructor.
    pub(crate) fn same_id(&self, _: &Self) -> bool {
        true
    }
    /// Dummy-backend counterpart of the cuda `ordinal`: produces no ordinal and defers
    /// entirely to the `fail!` macro.
    // NOTE(review): `fail!` is defined outside this view; presumably it diverges (panics) —
    // confirm against the macro definition.
    pub(crate) fn ordinal(&self) -> usize {
        fail!()
    }
--- a/src/tensor.rs
+++ b/src/tensor.rs
@ -504,6 +504,36 @@ impl Tensor {
        Ok(Tensor(Arc::new(tensor_)))
    }
    /// Returns this tensor's data on `device`.
    ///
    /// If the target device is the same as the tensor device, only a shallow copy is performed.
    /// Otherwise the storage is copied to the target and wrapped in a new tensor that keeps the
    /// shape, stride and `is_variable` flag but gets a fresh id and no recorded op.
    pub fn to_device(&self, device: &Device) -> Result<Tensor> {
        // Nothing to transfer when source and target are the same device.
        if self.device().same_id(device) {
            return Ok(self.clone());
        }
        let storage = match (&self.storage, device) {
            (Storage::Cpu(src), Device::Cpu) => Storage::Cpu(src.clone()),
            (Storage::Cpu(src), Device::Cuda(dst)) => {
                Storage::Cuda(dst.cuda_from_cpu_storage(src)?)
            }
            (Storage::Cuda(src), Device::Cpu) => Storage::Cpu(src.to_cpu_storage()?),
            (Storage::Cuda(src), Device::Cuda(dst)) => {
                // TODO: Avoid passing through the cpu storage here, especially if the gpu ids
                // are the same.
                let staged = src.to_cpu_storage()?;
                Storage::Cuda(dst.cuda_from_cpu_storage(&staged)?)
            }
        };
        let transferred = Tensor_ {
            id: TensorId::new(),
            storage,
            shape: self.shape.clone(),
            stride: self.stride.clone(),
            op: None, // TODO: Have a proper op here.
            is_variable: self.is_variable,
        };
        Ok(Tensor(Arc::new(transferred)))
    }
    /// Return all the nodes that lead to this value in a topologically sorted vec, the first
    /// elements having dependencies on the latter ones, e.g. the first element if any is the
    /// argument.