From d0a91db8fd02f90694d4bc96222a2a069304a770 Mon Sep 17 00:00:00 2001
From: laurent
Date: Fri, 23 Jun 2023 22:26:53 +0100
Subject: [PATCH] Softmax cpu implementation.

---
 src/cpu_backend.rs        | 50 ++++++++++++++++++++++++++++++++++++++---
 src/cuda_backend.rs       |  2 +-
 src/dummy_cuda_backend.rs |  4 +++-
 src/storage.rs            |  7 ++++---
 src/tensor.rs             |  2 +-
 5 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/src/cpu_backend.rs b/src/cpu_backend.rs
index fecbe643..472d093e 100644
--- a/src/cpu_backend.rs
+++ b/src/cpu_backend.rs
@@ -147,8 +147,54 @@ impl CpuStorage {
         }
     }
 
-    pub(crate) fn divide_by_sum_over_dim(&mut self, _shape: &Shape, _dim: usize) {
-        todo!()
+    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) -> Result<()> {
+        // [self] stores data in a contiguous way.
+        let dims = shape.dims();
+        let number_of_slices = dims[dim];
+        let prod_pre_dim = dims[..dim].iter().product();
+        let prod_post_dim = dims[dim + 1..].iter().product();
+        match self {
+            Self::F32(storage) => {
+                for pre_idx in 0..prod_pre_dim {
+                    // Only iterate over the elements of the current pre-index block.
+                    let max_idx = (pre_idx + 1) * prod_post_dim * number_of_slices;
+                    for post_idx in 0..prod_post_dim {
+                        let mut sum = 0f64;
+                        let mut idx = pre_idx * prod_post_dim * number_of_slices + post_idx;
+                        while idx < max_idx {
+                            sum += storage[idx] as f64;
+                            idx += prod_post_dim
+                        }
+                        let sum = sum as f32;
+                        let mut idx = pre_idx * prod_post_dim * number_of_slices + post_idx;
+                        while idx < max_idx {
+                            storage[idx] /= sum;
+                            idx += prod_post_dim
+                        }
+                    }
+                }
+            }
+            Self::F64(storage) => {
+                for pre_idx in 0..prod_pre_dim {
+                    let max_idx = (pre_idx + 1) * prod_post_dim * number_of_slices;
+                    for post_idx in 0..prod_post_dim {
+                        let mut sum = 0f64;
+                        let mut idx = pre_idx * prod_post_dim * number_of_slices + post_idx;
+                        while idx < max_idx {
+                            sum += storage[idx];
+                            idx += prod_post_dim
+                        }
+                        let mut idx = pre_idx * prod_post_dim * number_of_slices + post_idx;
+                        while idx < max_idx {
+                            storage[idx] /= sum;
+                            idx += prod_post_dim
+                        }
+                    }
+                }
+            }
+            Self::U32(_) => {}
+        }
+        Ok(())
     }
 
     pub(crate) fn affine_impl(
diff --git a/src/cuda_backend.rs b/src/cuda_backend.rs
index f2a2ce43..90437ed0 100644
--- a/src/cuda_backend.rs
+++ b/src/cuda_backend.rs
@@ -291,7 +291,7 @@ impl CudaStorage {
         Ok(Self { slice, device })
     }
 
-    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) {
+    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()> {
         todo!()
     }
 
diff --git a/src/dummy_cuda_backend.rs b/src/dummy_cuda_backend.rs
index fb50f8f2..b3199f65 100644
--- a/src/dummy_cuda_backend.rs
+++ b/src/dummy_cuda_backend.rs
@@ -62,7 +62,9 @@ impl CudaStorage {
         Err(Error::NotCompiledWithCudaSupport)
     }
 
-    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) {}
+    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
 
     pub(crate) fn to_dtype(&self, _: &Shape, _: &[usize], _: DType) -> Result<Self> {
         Err(Error::NotCompiledWithCudaSupport)
diff --git a/src/storage.rs b/src/storage.rs
index c5544478..38a48b72 100644
--- a/src/storage.rs
+++ b/src/storage.rs
@@ -72,11 +72,12 @@ impl Storage {
         }
     }
 
-    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) {
+    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) -> Result<()> {
         match self {
-            Storage::Cpu(storage) => storage.divide_by_sum_over_dim(shape, dim),
-            Self::Cuda(storage) => storage.divide_by_sum_over_dim(shape, dim),
+            Storage::Cpu(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
+            Self::Cuda(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
         }
+        Ok(())
     }
 
     pub(crate) fn to_dtype(&self, shape: &Shape, stride: &[usize], dtype: DType) -> Result<Self> {
diff --git a/src/tensor.rs b/src/tensor.rs
index 82de7e17..2e6d1014 100644
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -301,7 +301,7 @@ impl Tensor {
             .storage
             .unary_impl::<crate::op::Exp>(shape, self.stride())?;
         // The resulting storage is contiguous.
-        storage.divide_by_sum_over_dim(shape, dim);
+        storage.divide_by_sum_over_dim(shape, dim)?;
         let op = if self.track_op() {
             Some(Op::Softmax(self.clone(), dim))
         } else {
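
Note on the indexing scheme: the CPU kernel views the contiguous buffer as a
[pre, dim, post] layout, so the elements of one softmax slice sit
`prod_post_dim` apart in memory, starting at
`pre_idx * prod_post_dim * number_of_slices + post_idx`. The sum is accumulated
in f64 even for f32 storage to limit rounding error, and U32 storage is left
untouched. Below is a minimal standalone sketch of the same traversal,
assuming a contiguous row-major buffer; the names (`data`, `dims`) are
illustrative and not part of the patch:

    /// Normalize `data`, viewed as a contiguous row-major tensor of shape
    /// `dims`, so that every slice taken along `dim` sums to one.
    fn divide_by_sum_over_dim(data: &mut [f32], dims: &[usize], dim: usize) {
        let number_of_slices = dims[dim];
        let prod_pre_dim: usize = dims[..dim].iter().product();
        let prod_post_dim: usize = dims[dim + 1..].iter().product();
        for pre_idx in 0..prod_pre_dim {
            for post_idx in 0..prod_post_dim {
                // Consecutive elements of a slice are `prod_post_dim` apart.
                let start = pre_idx * prod_post_dim * number_of_slices + post_idx;
                let sum: f64 = (0..number_of_slices)
                    .map(|s| data[start + s * prod_post_dim] as f64)
                    .sum();
                for s in 0..number_of_slices {
                    data[start + s * prod_post_dim] /= sum as f32;
                }
            }
        }
    }

    fn main() {
        // Shape (2, 3), normalizing over dim 1: each row then sums to one.
        let mut data = vec![1f32, 1., 2., 1., 3., 4.];
        divide_by_sum_over_dim(&mut data, &[2, 3], 1);
        assert_eq!(data, [0.25, 0.25, 0.5, 0.125, 0.375, 0.5]);
    }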