Softmax cpu implementation.

2025-06-17 02:58:50 +00:00 · 2023-06-23 22:26:53 +01:00
parent 8443963d4f
commit d0a91db8fd
5 changed files with 55 additions and 8 deletions
--- a/src/cpu_backend.rs
+++ b/src/cpu_backend.rs
@ -147,8 +147,52 @@ impl CpuStorage {
        }
    }
-    pub(crate) fn divide_by_sum_over_dim(&mut self, _shape: &Shape, _dim: usize) {
+    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) -> Result<()> {
-        todo!()
+        // [self] stores data in a contiguous way.
        //
        // Normalizes the buffer in place along `dim`: every element is divided by
        // the sum of the `dims[dim]` elements that share all of its other
        // coordinates. The index arithmetic below assumes the last dimension
        // varies fastest, so two neighbours along `dim` are `prod_post_dim`
        // elements apart.
        //
        // Panics if `dim` is not a valid axis of `shape` (out-of-range indexing of
        // `dims`). `U32` storage is left untouched.
        let dims = shape.dims();
        let dim_len = dims[dim];
        let prod_pre_dim: usize = dims[..dim].iter().product();
        let prod_post_dim: usize = dims[dim + 1..].iter().product();
        // Number of elements covered by one index combination of the leading
        // dimensions, i.e. the distance between two consecutive `pre_idx` blocks.
        let block_len = dim_len * prod_post_dim;
        match self {
            Self::F32(storage) => {
                for pre_idx in 0..prod_pre_dim {
                    for post_idx in 0..prod_post_dim {
                        let first = pre_idx * block_len + post_idx;
                        // Visit exactly `dim_len` strided elements so the reduction
                        // never runs into the blocks of later `pre_idx` values.
                        // The sum is accumulated in f64 and narrowed once.
                        let mut sum = 0f64;
                        for slice_idx in 0..dim_len {
                            sum += storage[first + slice_idx * prod_post_dim] as f64;
                        }
                        let sum = sum as f32;
                        for slice_idx in 0..dim_len {
                            storage[first + slice_idx * prod_post_dim] /= sum;
                        }
                    }
                }
            }
            Self::F64(storage) => {
                for pre_idx in 0..prod_pre_dim {
                    for post_idx in 0..prod_post_dim {
                        let first = pre_idx * block_len + post_idx;
                        // Same bounded strided walk as the f32 branch.
                        let mut sum = 0f64;
                        for slice_idx in 0..dim_len {
                            sum += storage[first + slice_idx * prod_post_dim];
                        }
                        for slice_idx in 0..dim_len {
                            storage[first + slice_idx * prod_post_dim] /= sum;
                        }
                    }
                }
            }
            // Integer storage is deliberately a no-op here, as in the original.
            Self::U32(_) => {}
        }
        Ok(())
    }
    pub(crate) fn affine_impl(
--- a/src/cuda_backend.rs
+++ b/src/cuda_backend.rs
@ -291,7 +291,7 @@ impl CudaStorage {
        Ok(Self { slice, device })
    }
-    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) {
+    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()> {
        // Not implemented for this backend yet: both arguments are ignored and
        // calling this panics through `todo!()`. Only the signature was updated
        // to return `Result<()>`.
        todo!()
    }
--- a/src/dummy_cuda_backend.rs
+++ b/src/dummy_cuda_backend.rs
@ -62,7 +62,9 @@ impl CudaStorage {
        Err(Error::NotCompiledWithCudaSupport)
    }
-    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) {}
+    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()> {
        // Always fails with `NotCompiledWithCudaSupport`; the arguments are
        // ignored. Presumably this stub is what gets compiled when CUDA support
        // is disabled — confirm against the crate's feature gating.
        Err(Error::NotCompiledWithCudaSupport)
    }
    pub(crate) fn to_dtype(&self, _: &Shape, _: &[usize], _: DType) -> Result<Self> {
        Err(Error::NotCompiledWithCudaSupport)
--- a/src/storage.rs
+++ b/src/storage.rs
@ -72,11 +72,12 @@ impl Storage {
        }
    }
-    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) {
+    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) -> Result<()> {
        // Forwards to the backend-specific implementation for the active variant.
        // Any error returned by the backend is propagated to the caller via `?`;
        // on success the storage has been modified in place.
        match self {
-            Storage::Cpu(storage) => storage.divide_by_sum_over_dim(shape, dim),
+            Storage::Cpu(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
-            Self::Cuda(storage) => storage.divide_by_sum_over_dim(shape, dim),
+            Self::Cuda(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
        }
        Ok(())
    }
    pub(crate) fn to_dtype(&self, shape: &Shape, stride: &[usize], dtype: DType) -> Result<Self> {
--- a/src/tensor.rs
+++ b/src/tensor.rs
@ -301,7 +301,7 @@ impl Tensor {
            .storage
            .unary_impl::<crate::op::Exp>(shape, self.stride())?;
        // The resulting storage is contiguous.
-        storage.divide_by_sum_over_dim(shape, dim);
+        storage.divide_by_sum_over_dim(shape, dim)?;
        let op = if self.track_op() {
            Some(Op::Softmax(self.clone(), dim))
        } else {