diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml
index bf57a91c..b5d74e12 100644
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@@ -30,6 +30,7 @@ zip = { workspace = true }
 
 [dev-dependencies]
 anyhow = { workspace = true }
+clap = { workspace = true }
 
 [features]
 default = []
diff --git a/candle-core/examples/conv1d_benchmark.rs b/candle-core/examples/conv1d_benchmark.rs
deleted file mode 100644
index 52fae5e8..00000000
--- a/candle-core/examples/conv1d_benchmark.rs
+++ /dev/null
@@ -1,24 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use anyhow::Result;
-use candle_core::{Device, Tensor};
-
-pub const N_ITERS: usize = 5;
-
-fn main() -> Result<()> {
-    let inp = Tensor::randn(0f32, 1., (1, 384, 3000), &Device::Cpu)?;
-    let w = Tensor::randn(0f32, 1., (384, 384, 3), &Device::Cpu)?;
-    let res = inp.conv1d(&w, 0, 1);
-    println!("{res:?}");
-    let start = std::time::Instant::now();
-    for i in 0..N_ITERS {
-        let res = inp.conv1d(&w, 0, 1);
-        println!("{i} {res:?}");
-    }
-    println!("{:?}", start.elapsed() / N_ITERS as u32);
-    Ok(())
-}
diff --git a/candle-core/examples/cpu_benchmarks.rs b/candle-core/examples/cpu_benchmarks.rs
new file mode 100644
index 00000000..4cc710fb
--- /dev/null
+++ b/candle-core/examples/cpu_benchmarks.rs
@@ -0,0 +1,95 @@
+/// This example contains some simple benchmarks so that it's easy to run them in perf etc.
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+
+use candle_core::{Device, Result, Tensor};
+use clap::{Parser, Subcommand};
+
+trait Benchmark {
+    type PreProcessData;
+    type RunResult;
+
+    fn preprocess() -> Result<Self::PreProcessData>;
+    fn run_one(_: &Self::PreProcessData) -> Result<Self::RunResult>;
+
+    const ITERS: usize;
+}
+
+// Conv1d example as used in whisper.
+struct Conv1d;
+impl Benchmark for Conv1d {
+    type PreProcessData = (Tensor, Tensor);
+    type RunResult = Tensor;
+    fn preprocess() -> Result<Self::PreProcessData> {
+        let inp = Tensor::randn(0f32, 1., (1, 384, 3000), &Device::Cpu)?;
+        let w = Tensor::randn(0f32, 1., (384, 384, 3), &Device::Cpu)?;
+        Ok((inp, w))
+    }
+
+    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+        d.0.conv1d(&d.1, 0, 1)
+    }
+
+    const ITERS: usize = 5;
+}
+
+// Conv2d example as used in stable-diffusion.
+struct Conv2d;
+impl Benchmark for Conv2d {
+    type PreProcessData = (Tensor, Tensor);
+    type RunResult = Tensor;
+
+    fn preprocess() -> Result<Self::PreProcessData> {
+        let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?;
+        let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?;
+        Ok((inp, w))
+    }
+
+    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+        d.0.conv2d(&d.1, 0, 1)
+    }
+
+    const ITERS: usize = 1;
+}
+
+fn run<B: Benchmark>(iters: Option<usize>) -> Result<()> {
+    use std::hint::black_box;
+
+    let iters = iters.unwrap_or(B::ITERS);
+    let d = B::preprocess()?;
+    let start = std::time::Instant::now();
+    for _iter in 0..iters {
+        let _res = black_box(B::run_one(black_box(&d))?);
+    }
+    println!("{:?}", start.elapsed() / iters as u32);
+    Ok(())
+}
+
+#[derive(Subcommand, Debug, Clone)]
+enum Task {
+    Conv1d,
+    Conv2d,
+}
+
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+pub struct Args {
+    /// The benchmark to be run.
+    #[command(subcommand)]
+    task: Task,
+
+    #[arg(long)]
+    iters: Option<usize>,
+}
+
+fn main() -> Result<()> {
+    let args = Args::parse();
+    match args.task {
+        Task::Conv1d => run::<Conv1d>(args.iters)?,
+        Task::Conv2d => run::<Conv2d>(args.iters)?,
+    }
+    Ok(())
+}