/// This example contains some simple benchmarks so that it's easy to run them under perf etc.
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;

#[cfg(feature = "accelerate")]
extern crate accelerate_src;

// use candle::quantized::GgmlType;
use candle::{DType, Device, Result, Tensor};
// use clap::{Parser, Subcommand};

// fn softmax<D: candle::shape::Dim>(xs: &Tensor, dim: D) -> Result<Tensor> {
//     let dim = dim.to_index(xs.shape(), "softmax")?;
//     let max = xs.max_keepdim(dim)?;
//     let diff = xs.broadcast_sub(&max)?;
//     let num = diff.exp()?;
//     let den = num.sum_keepdim(dim)?;
//     num.broadcast_div(&den)
// }

// A benchmark prepares its input data once in `preprocess`, then has `run_one`
// timed over `ITERS` iterations (or a caller-provided count).
trait Benchmark {
    type PreProcessData;
    type RunResult;

    fn preprocess() -> Result<Self::PreProcessData>;
    fn run_one(_: &Self::PreProcessData) -> Result<Self::RunResult>;

    const ITERS: usize;
}

// Times a 1024x1024 f32 matrix multiplication on the CPU.
struct Matmul;
impl Benchmark for Matmul {
    type PreProcessData = (Tensor, Tensor);
    type RunResult = Tensor;
    fn preprocess() -> Result<Self::PreProcessData> {
        let lhs = Tensor::randn((1024, 1024), DType::F32, &Device::Cpu, 1.0, 0.0)?;
        let rhs = Tensor::randn((1024, 1024), DType::F32, &Device::Cpu, 1.0, 0.0)?;
        Ok((lhs, rhs))
    }

    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
        d.0.matmul(&d.1)
    }

    const ITERS: usize = 100;
}

// struct Softmax;
// impl Benchmark for Softmax {
//     type PreProcessData = Tensor;
//     type RunResult = Tensor;
//     fn preprocess() -> Result<Self::PreProcessData> {
//         // Typical whisper tiny size.
//         let x = Tensor::randn(0f32, 1., (1, 6, 200, 1500), &Device::Cpu)?;
//         Ok(x)
//     }
//
//     fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
//         softmax(d, D::Minus1)
//     }
//
//     const ITERS: usize = 100;
// }

// Runs the benchmark `iters` times (defaulting to `B::ITERS`) and prints the
// average duration of a single iteration.
fn run<B: Benchmark>(iters: Option<usize>) -> Result<()> {
    use std::hint::black_box;

    let iters = iters.unwrap_or(B::ITERS);
    let d = B::preprocess()?;
    let start = std::time::Instant::now();
    for _iter in 0..iters {
        // black_box keeps the compiler from optimizing the benchmarked call away.
        let _res = black_box(B::run_one(black_box(&d))?);
    }
    println!("{:?}", start.elapsed() / iters as u32);
    Ok(())
}

fn main() -> Result<()> {
    run::<Matmul>(None)?;
    Ok(())
}
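// A quick sketch of how this might be profiled under perf, assuming the file
// is registered as a Cargo example named `benchmarks` (the actual example name
// depends on the enclosing crate's Cargo.toml):
//
//     cargo build --release --example benchmarks
//     perf record --call-graph dwarf ./target/release/examples/benchmarks
//     perf report
//
// Building in release mode matters here: the debug-mode matmul timings are not
// representative of the optimized kernels.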