From d0a91db8fd02f90694d4bc96222a2a069304a770 Mon Sep 17 00:00:00 2001
From: laurent
Date: Fri, 23 Jun 2023 22:26:53 +0100
Subject: [PATCH] Softmax cpu implementation.

---
 src/cpu_backend.rs        | 50 ++++++++++++++++++++++++++++++++++++++---
 src/cuda_backend.rs       |  2 +-
 src/dummy_cuda_backend.rs |  4 +++-
 src/storage.rs            |  7 ++++---
 src/tensor.rs             |  2 +-
 5 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/src/cpu_backend.rs b/src/cpu_backend.rs
index fecbe643..472d093e 100644
--- a/src/cpu_backend.rs
+++ b/src/cpu_backend.rs
@@ -147,8 +147,54 @@ impl CpuStorage {
         }
     }
 
-    pub(crate) fn divide_by_sum_over_dim(&mut self, _shape: &Shape, _dim: usize) {
-        todo!()
+    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) -> Result<()> {
+        // [self] stores data in a contiguous way.
+        let dims = shape.dims();
+        let number_of_slices = dims[dim];
+        let prod_pre_dim = dims[..dim].iter().product();
+        let prod_post_dim = dims[dim + 1..].iter().product();
+        match self {
+            Self::F32(storage) => {
+                for pre_idx in 0..prod_pre_dim {
+                    // Only iterate over the elements of the current pre-index block.
+                    let max_idx = (pre_idx + 1) * prod_post_dim * number_of_slices;
+                    for post_idx in 0..prod_post_dim {
+                        let mut sum = 0f64;
+                        let mut idx = pre_idx * prod_post_dim * number_of_slices + post_idx;
+                        while idx < max_idx {
+                            sum += storage[idx] as f64;
+                            idx += prod_post_dim
+                        }
+                        let sum = sum as f32;
+                        let mut idx = pre_idx * prod_post_dim * number_of_slices + post_idx;
+                        while idx < max_idx {
+                            storage[idx] /= sum;
+                            idx += prod_post_dim
+                        }
+                    }
+                }
+            }
+            Self::F64(storage) => {
+                for pre_idx in 0..prod_pre_dim {
+                    let max_idx = (pre_idx + 1) * prod_post_dim * number_of_slices;
+                    for post_idx in 0..prod_post_dim {
+                        let mut sum = 0f64;
+                        let mut idx = pre_idx * prod_post_dim * number_of_slices + post_idx;
+                        while idx < max_idx {
+                            sum += storage[idx];
+                            idx += prod_post_dim
+                        }
+                        let mut idx = pre_idx * prod_post_dim * number_of_slices + post_idx;
+                        while idx < max_idx {
+                            storage[idx] /= sum;
+                            idx += prod_post_dim
+                        }
+                    }
+                }
+            }
+            Self::U32(_) => {}
+        }
+        Ok(())
     }
 
     pub(crate) fn affine_impl(
diff --git a/src/cuda_backend.rs b/src/cuda_backend.rs
index f2a2ce43..90437ed0 100644
--- a/src/cuda_backend.rs
+++ b/src/cuda_backend.rs
@@ -291,7 +291,7 @@ impl CudaStorage {
         Ok(Self { slice, device })
     }
 
-    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) {
+    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()> {
         todo!()
     }
 
diff --git a/src/dummy_cuda_backend.rs b/src/dummy_cuda_backend.rs
index fb50f8f2..b3199f65 100644
--- a/src/dummy_cuda_backend.rs
+++ b/src/dummy_cuda_backend.rs
@@ -62,7 +62,9 @@ impl CudaStorage {
         Err(Error::NotCompiledWithCudaSupport)
     }
 
-    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) {}
+    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
 
     pub(crate) fn to_dtype(&self, _: &Shape, _: &[usize], _: DType) -> Result<Self> {
         Err(Error::NotCompiledWithCudaSupport)
diff --git a/src/storage.rs b/src/storage.rs
index c5544478..38a48b72 100644
--- a/src/storage.rs
+++ b/src/storage.rs
@@ -72,11 +72,12 @@ impl Storage {
         }
     }
 
-    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) {
+    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) -> Result<()> {
         match self {
-            Storage::Cpu(storage) => storage.divide_by_sum_over_dim(shape, dim),
-            Self::Cuda(storage) => storage.divide_by_sum_over_dim(shape, dim),
+            Storage::Cpu(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
+            Self::Cuda(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
         }
+        Ok(())
     }
 
     pub(crate) fn to_dtype(&self, shape: &Shape, stride: &[usize], dtype: DType) -> Result<Self> {
diff --git a/src/tensor.rs b/src/tensor.rs
index 82de7e17..2e6d1014 100644
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -301,7 +301,7 @@ impl Tensor {
             .storage
             .unary_impl::<crate::op::Exp>(shape, self.stride())?;
         // The resulting storage is contiguous.
-        storage.divide_by_sum_over_dim(shape, dim);
+        storage.divide_by_sum_over_dim(shape, dim)?;
         let op = if self.track_op() {
             Some(Op::Softmax(self.clone(), dim))
         } else {
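
Note on the indexing scheme: the CPU kernel views the contiguous buffer as a
[pre, dim, post] layout, so the elements of one softmax slice sit
`prod_post_dim` apart in memory, starting at
`pre_idx * prod_post_dim * number_of_slices + post_idx`. The sum is accumulated
in f64 even for f32 storage to limit rounding error, and U32 storage is left
untouched. Below is a minimal standalone sketch of the same traversal,
assuming a contiguous row-major buffer; the names (`data`, `dims`) are
illustrative and not part of the patch:

    /// Normalize `data`, viewed as a contiguous row-major tensor of shape
    /// `dims`, so that every slice taken along `dim` sums to one.
    fn divide_by_sum_over_dim(data: &mut [f32], dims: &[usize], dim: usize) {
        let number_of_slices = dims[dim];
        let prod_pre_dim: usize = dims[..dim].iter().product();
        let prod_post_dim: usize = dims[dim + 1..].iter().product();
        for pre_idx in 0..prod_pre_dim {
            for post_idx in 0..prod_post_dim {
                // Consecutive elements of a slice are `prod_post_dim` apart.
                let start = pre_idx * prod_post_dim * number_of_slices + post_idx;
                let sum: f64 = (0..number_of_slices)
                    .map(|s| data[start + s * prod_post_dim] as f64)
                    .sum();
                for s in 0..number_of_slices {
                    data[start + s * prod_post_dim] /= sum as f32;
                }
            }
        }
    }

    fn main() {
        // Shape (2, 3), normalizing over dim 1: each row then sums to one.
        let mut data = vec![1f32, 1., 2., 1., 3., 4.];
        divide_by_sum_over_dim(&mut data, &[2, 3], 1);
        assert_eq!(data, [0.25, 0.25, 0.5, 0.125, 0.375, 0.5]);
    }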