From 559944146fea5ba0baa0f23fcf67062f04ca22da Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Sun, 10 Sep 2023 16:56:28 +0100
Subject: [PATCH] Add an im2col based benchmark. (#800)

* Add an im2col based benchmark.

* Reshape the final result.
---
Note (review): this patch was recovered from a copy in which line breaks were
collapsed and `<...>` generic arguments were stripped. The generic arguments
below were restored from the surrounding code: `as_slice::<f32>` follows from
the `vec![0f32; ..]` destination, `Result<Self::...>` from the associated types
declared next to each method, and `run::<X>` assumes every benchmark struct is
named after its `Task` variant (not visible here for `Softmax` and
`SoftmaxLastDim`). The `fn run<B: Benchmark>(...)` hunk-header text is likewise
reconstructed. Verify against the upstream commit before applying.

 candle-nn/examples/cpu_benchmarks.rs | 73 +++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/candle-nn/examples/cpu_benchmarks.rs b/candle-nn/examples/cpu_benchmarks.rs
index 20c92dbb..012456ec 100644
--- a/candle-nn/examples/cpu_benchmarks.rs
+++ b/candle-nn/examples/cpu_benchmarks.rs
@@ -6,7 +6,7 @@ extern crate intel_mkl_src;
 extern crate accelerate_src;
 
 use candle::quantized::GgmlType;
-use candle::{Device, Result, Tensor, D};
+use candle::{CpuStorage, Device, Layout, Result, Shape, Tensor, D};
 use clap::{Parser, Subcommand};
 
 trait Benchmark {
@@ -19,6 +19,48 @@ trait Benchmark {
     const ITERS: usize;
 }
 
+struct Im2Col(usize, usize);
+impl candle::CustomOp1 for Im2Col {
+    fn name(&self) -> &'static str {
+        "im2col"
+    }
+
+    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)> {
+        let &Self(h_k, w_k) = self;
+        let (b, c, h, w) = layout.shape().dims4()?;
+        let (h_out, w_out) = (h - h_k + 1, w - w_k + 1);
+        let slice = storage.as_slice::<f32>()?;
+        let src = match layout.contiguous_offsets() {
+            None => candle::bail!("input has to be contiguous"),
+            Some((o1, o2)) => &slice[o1..o2],
+        };
+        let mut dst = vec![0f32; b * h_out * w_out * c * h_k * w_k];
+        let (s_b, s_c, s_h) = (c * h * w, h * w, w);
+        for b_idx in 0..b {
+            let src_idx = b_idx * s_b;
+            let dst_idx = b_idx * h_out * w_out * c * h_k * w_k;
+            for h_idx in 0..h_out {
+                let dst_idx = dst_idx + h_idx * w_out * c * h_k * w_k;
+                for w_idx in 0..w_out {
+                    let dst_idx = dst_idx + w_idx * c * h_k * w_k;
+                    for c_idx in 0..c {
+                        let dst_idx = dst_idx + c_idx * h_k * w_k;
+                        let src_idx = c_idx * s_c + src_idx;
+                        for h_k_idx in 0..h_k {
+                            let src_idx = src_idx + (h_idx + h_k_idx) * s_h + w_idx;
+                            let dst_idx = dst_idx + h_k_idx * w_k;
+                            dst[dst_idx..dst_idx + w_k]
+                                .copy_from_slice(&src[src_idx..src_idx + w_k])
+                        }
+                    }
+                }
+            }
+        }
+        let storage = candle::WithDType::to_cpu_storage_owned(dst);
+        Ok((storage, (b * h_out * w_out, c * h_k * w_k).into()))
+    }
+}
+
 // Conv1d example as used in whisper.
 struct Conv1d;
 impl Benchmark for Conv1d {
@@ -53,7 +95,32 @@ impl Benchmark for Conv2d {
         d.0.conv2d(&d.1, 0, 1, 1, 1)
     }
 
-    const ITERS: usize = 1;
+    const ITERS: usize = 5;
+}
+
+// Conv2d example as used in stable-diffusion, im2col implementation.
+struct Conv2dIm2Col;
+impl Benchmark for Conv2dIm2Col {
+    type PreProcessData = (Tensor, Tensor);
+    type RunResult = Tensor;
+
+    fn preprocess() -> Result<Self::PreProcessData> {
+        let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?;
+        let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?;
+        Ok((inp, w))
+    }
+
+    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+        // d.0.conv2d(&d.1, 0, 1, 1, 1)
+        let (b, _, h, w) = d.0.dims4()?;
+        let (h_k, w_k) = (3, 3);
+        let (h_out, w_out) = (h - h_k + 1, w - w_k + 1);
+        let col = d.0.apply_op1_no_bwd(&Im2Col(h_k, w_k))?;
+        let res = col.matmul(&d.1.flatten_from(1)?.t()?)?;
+        res.reshape((b, (), h_out, w_out))
+    }
+
+    const ITERS: usize = 5;
 }
 
 struct Matmul;
@@ -145,6 +212,7 @@ fn run<B: Benchmark>(iters: Option<usize>) -> Result<()> {
 enum Task {
     Conv1d,
     Conv2d,
+    Conv2dIm2Col,
     Matmul,
     Qmatmul,
     Softmax,
@@ -167,6 +235,7 @@ fn main() -> Result<()> {
     match args.task {
         Task::Conv1d => run::<Conv1d>(args.iters)?,
         Task::Conv2d => run::<Conv2d>(args.iters)?,
+        Task::Conv2dIm2Col => run::<Conv2dIm2Col>(args.iters)?,
         Task::Matmul => run::<Matmul>(args.iters)?,
         Task::Softmax => run::<Softmax>(args.iters)?,
         Task::SoftmaxLastDim => run::<SoftmaxLastDim>(args.iters)?,