Add a softmax bench. (#433)

* Add a softmax bench.

* Add the vectorized sum reduce.
Laurent Mazare
2023-08-13 21:09:18 +02:00
committed by GitHub
parent 9af438ac1b
commit d379a76a9e
3 changed files with 61 additions and 9 deletions

View File

@@ -5,9 +5,18 @@ extern crate intel_mkl_src;
 #[cfg(feature = "accelerate")]
 extern crate accelerate_src;
 
-use candle_core::{Device, Result, Tensor};
+use candle_core::{Device, Result, Tensor, D};
 use clap::{Parser, Subcommand};
 
+fn softmax<D: candle_core::shape::Dim>(xs: &Tensor, dim: D) -> Result<Tensor> {
+    let dim = dim.to_index(xs.shape(), "softmax")?;
+    let max = xs.max_keepdim(dim)?;
+    let diff = xs.broadcast_sub(&max)?;
+    let num = diff.exp()?;
+    let den = num.sum_keepdim(dim)?;
+    num.broadcast_div(&den)
+}
+
 trait Benchmark {
     type PreProcessData;
     type RunResult;
@@ -72,6 +81,23 @@ impl Benchmark for Matmul {
     const ITERS: usize = 100;
 }
 
+struct Softmax;
+impl Benchmark for Softmax {
+    type PreProcessData = Tensor;
+    type RunResult = Tensor;
+    fn preprocess() -> Result<Self::PreProcessData> {
+        // Typical whisper tiny size.
+        let x = Tensor::randn(0f32, 1., (1, 6, 200, 1500), &Device::Cpu)?;
+        Ok(x)
+    }
+
+    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+        softmax(d, D::Minus1)
+    }
+
+    const ITERS: usize = 100;
+}
+
 fn run<B: Benchmark>(iters: Option<usize>) -> Result<()> {
     use std::hint::black_box;
@@ -90,6 +116,7 @@ enum Task {
     Conv1d,
     Conv2d,
     Matmul,
+    Softmax,
 }
 
 #[derive(Parser, Debug)]
@@ -109,6 +136,7 @@ fn main() -> Result<()> {
         Task::Conv1d => run::<Conv1d>(args.iters)?,
         Task::Conv2d => run::<Conv2d>(args.iters)?,
         Task::Matmul => run::<Matmul>(args.iters)?,
+        Task::Softmax => run::<Softmax>(args.iters)?,
     }
     Ok(())
 }
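
The helper added in the first hunk is the numerically stable softmax: subtracting the per-row maximum before exponentiating keeps every argument of exp() at or below zero, so large logits cannot overflow, and the shift cancels after normalization since exp(x - m) / sum(exp(x - m)) = exp(x) / sum(exp(x)). Below is a quick sanity-check sketch, not part of the commit, assuming the candle_core API used above with the softmax helper in scope:

use candle_core::{Device, Result, Tensor, D};

fn check_softmax() -> Result<()> {
    // exp(1e5) overflows f32 under a naive softmax; the max subtraction
    // turns the second row into exp(0) three times instead.
    let xs = Tensor::new(&[[1f32, 2., 3.], [1e5, 1e5, 1e5]], &Device::Cpu)?;
    let ys = softmax(&xs, D::Minus1)?;
    // Every row of a softmax sums to ~1; the second row comes out uniform.
    println!("{}", ys.sum_keepdim(D::Minus1)?);
    Ok(())
}

With the new Task variant and match arm, the benchmark is selected via the clap subcommand, so passing softmax to the example binary runs it on the (1, 6, 200, 1500) whisper-tiny shape.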

View File

@@ -278,17 +278,17 @@ impl Map1Any for ReduceIndex {
     }
 }
 
-struct Reduce<'a> {
+struct ReduceSum<'a> {
     dst_shape: &'a Shape,
     reduce_dims: &'a [usize],
     reduce_dims_and_stride: Vec<(usize, usize)>,
 }
 
-impl<'a> Reduce<'a> {
+impl<'a> ReduceSum<'a> {
     #[inline(always)]
     fn fold_impl<T, F>(&self, src: &[T], src_l: &Layout, start_elt: T, f: F) -> Result<Vec<T>>
     where
-        T: Clone + Copy,
+        T: WithDType,
         F: Fn(T, T) -> T,
     {
         let mut dst = vec![start_elt; self.dst_shape.elem_count()];
@@ -312,9 +312,13 @@ impl<'a> Reduce<'a> {
                 .product::<usize>();
             for (dst_i, dst_v) in dst.iter_mut().enumerate() {
                 let src_i = dst_i * reduce_sz;
-                for &s in src[src_i..src_i + reduce_sz].iter() {
-                    *dst_v = f(*dst_v, s)
-                }
+                unsafe {
+                    T::vec_reduce_sum(
+                        src[src_i..src_i + reduce_sz].as_ptr(),
+                        dst_v,
+                        reduce_sz,
+                    )
+                };
             }
             return Ok(dst);
         };
@@ -346,7 +350,7 @@ impl<'a> Reduce<'a> {
     }
 }
 
-impl<'a> Map1 for Reduce<'a> {
+impl<'a> Map1 for ReduceSum<'a> {
     #[inline(always)]
     fn f<T: WithDType>(&self, src: &[T], src_l: &Layout) -> Result<Vec<T>> {
         self.fold_impl(src, src_l, T::zero(), |x, y| x + y)
@@ -1697,7 +1701,7 @@ impl BackendStorage for CpuStorage {
             .iter()
             .map(|&d| (src_dims[d], src_dims[d + 1..].iter().product::<usize>()))
             .collect();
-        Reduce {
+        ReduceSum {
            dst_shape: &dst_shape,
            reduce_dims: &reduce_dims,
            reduce_dims_and_stride,

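The rewritten loop above is ReduceSum's fast path: when the reduced dimensions are the trailing ones, the source is laid out as one contiguous run of reduce_sz elements per destination element, so each run can be handed to a single vectorizable kernel instead of a scalar fold (the T: Clone + Copy bound becomes T: WithDType so that T::vec_reduce_sum is reachable from fold_impl). A plain-Rust sketch of the same layout, using a hypothetical free function that is not part of the diff:

/// Scalar equivalent of the contiguous fast path: one reduce_sz-long
/// run of consecutive source elements per destination element.
fn sum_last_dims(src: &[f32], dst_len: usize, reduce_sz: usize) -> Vec<f32> {
    (0..dst_len)
        .map(|dst_i| {
            let src_i = dst_i * reduce_sz;
            // The commit swaps this scalar sum for T::vec_reduce_sum so a
            // backend can run a SIMD kernel over the same slice.
            src[src_i..src_i + reduce_sz].iter().sum()
        })
        .collect()
}
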
View File

@@ -12,6 +12,20 @@ pub trait VecDot: num_traits::NumAssign + Copy {
             *res += *lhs.add(i) * *rhs.add(i)
         }
     }
+
+    /// Sum of all elements in a vector.
+    ///
+    /// # Safety
+    ///
+    /// The length of `xs` must be at least `len`. `res` has to point to a valid
+    /// element.
+    #[inline(always)]
+    unsafe fn vec_reduce_sum(xs: *const Self, res: *mut Self, len: usize) {
+        *res = Self::zero();
+        for i in 0..len {
+            *res += *xs.add(i)
+        }
+    }
 }
 
 impl VecDot for f32 {
@@ -19,6 +33,12 @@ impl VecDot for f32 {
     unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, len: usize) {
         ggblas::ggml::vec_dot_f32(lhs, rhs, res, len)
     }
+
+    // TODO: enable the following once the updated ggblas is available.
+    // #[inline(always)]
+    // unsafe fn vec_reduce_sum(xs: *const Self, res: *mut Self, len: usize) {
+    //     ggblas::ggml::vec_reduce_sum(xs, res, len)
+    // }
 }
 
 impl VecDot for f64 {}
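
The default vec_reduce_sum added to the VecDot trait is a plain scalar loop; per-type impls such as f32 are expected to override it with a SIMD kernel once ggblas ships one, which is what the commented-out block anticipates. Since the method is unsafe with a length contract, a slice-based wrapper discharges that contract automatically; a sketch with a hypothetical helper, not part of this commit:

use num_traits::Zero;

fn reduce_sum_slice<T: VecDot>(xs: &[T]) -> T {
    let mut res = T::zero();
    // SAFETY: xs.as_ptr() is valid for xs.len() reads and res points to a
    // valid element, exactly the contract vec_reduce_sum documents.
    unsafe { T::vec_reduce_sum(xs.as_ptr(), &mut res, xs.len()) };
    res
}

This mirrors the call site in the CPU backend, where the pointer and the length both come from the same src[src_i..src_i + reduce_sz] slice.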