Mirror of https://github.com/huggingface/candle.git, synced 2025-06-15 18:28:24 +00:00
Optimize the cat operation on contiguous tensors (#1855)
* Add a specialized kernel for copy2d (the idea is sketched after this list).
* Move the cat operations.
* Avoid transpositions in cat.
* Bugfix.
* Bugfix for the cuda kernel.
* Add a benchmark.
* Add more testing.
* Test fix.
* Faster kernel.
* Add the missing kernel.
* Tweak the test.
* Add a metal kernel.
* Fix for the metal kernel.
* Get the tests to pass on metal.
* Also use this opportunity to fix the metal kernel for ELU.
* Add some bf16 kernels.
* Clippy fixes.
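The copy2d kernel is the core of the optimization: when both inputs to cat are contiguous, concatenating along a middle dimension reduces to a strided 2D copy, one block of rows per input, so no transposition is needed. Below is a minimal CPU sketch of that idea; the function name, signature, and parameters are illustrative assumptions and do not reproduce the kernel added in this commit.

// Illustrative sketch only (not the candle kernel): copy `d1` rows of `d2`
// contiguous elements each, where source and destination rows may use
// different strides. Concatenating two contiguous tensors along a middle
// dimension then takes one such call per input.
fn copy2d<T: Copy>(
    src: &[T],
    dst: &mut [T],
    d1: usize,         // number of rows to copy
    d2: usize,         // contiguous elements per row
    src_stride: usize, // distance between consecutive source rows
    dst_stride: usize, // distance between consecutive destination rows
) {
    for i in 0..d1 {
        let s = &src[i * src_stride..i * src_stride + d2];
        let d = &mut dst[i * dst_stride..i * dst_stride + d2];
        d.copy_from_slice(s);
    }
}

fn main() {
    // Concatenate a 2x3 block and a 2x1 block along the last dim into a 2x4 buffer.
    let lhs = [1.0f32, 2., 3., 4., 5., 6.]; // 2 rows of 3
    let rhs = [7.0f32, 8.];                 // 2 rows of 1
    let mut out = [0.0f32; 8];              // 2 rows of 4
    copy2d(&lhs, &mut out, 2, 3, 3, 4);      // lhs fills columns 0..3
    copy2d(&rhs, &mut out[3..], 2, 1, 1, 4); // rhs fills column 3
    assert_eq!(out, [1., 2., 3., 7., 4., 5., 6., 8.]);
}

For the benchmark shapes in the diff below, concatenating (1, 32, 2000, 128) with (1, 32, 1, 128) along dim 2 becomes 32 row copies of 2000*128 and 1*128 elements respectively, which is essentially the access pattern of appending one step to an attention KV cache.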
@@ -238,6 +238,23 @@ impl Benchmark for QMatMul {
     const ITERS: usize = 100;
 }
 
+struct Cat;
+impl Benchmark for Cat {
+    type PreProcessData = (Tensor, Tensor);
+    type RunResult = Tensor;
+    fn preprocess() -> Result<Self::PreProcessData> {
+        let lhs = Tensor::randn(0f32, 1., (1, 32, 2000, 128), &Device::Cpu)?;
+        let rhs = Tensor::randn(0f32, 1., (1, 32, 1, 128), &Device::Cpu)?;
+        Ok((lhs, rhs))
+    }
+
+    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+        Tensor::cat(&[&d.0, &d.1], 2)
+    }
+
+    const ITERS: usize = 1000;
+}
+
 struct Softmax;
 impl Benchmark for Softmax {
     type PreProcessData = Tensor;
@@ -295,6 +312,7 @@ enum Task {
     Qmatmul,
     Softmax,
     SoftmaxLastDim,
+    Cat,
 }
 
 #[derive(Parser, Debug)]
@@ -319,6 +337,7 @@ fn main() -> Result<()> {
         Task::Softmax => run::<Softmax>(args.iters)?,
         Task::SoftmaxLastDim => run::<SoftmaxLastDim>(args.iters)?,
         Task::Qmatmul => run::<QMatMul>(args.iters)?,
+        Task::Cat => run::<Cat>(args.iters)?,
     }
     Ok(())
 }
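For context, the Benchmark trait and the run driver called by Task::Cat => run::<Cat>(args.iters)? are defined earlier in the same file and do not appear in the hunks above. The standalone sketch below shows the overall shape they are assumed to have, based only on what the diff uses (associated PreProcessData and RunResult types, preprocess, run_one, and a default ITERS): preprocess once, then time run_one over the requested number of iterations. The concrete signatures and the VecCat stand-in benchmark are illustrative, not the file's actual code.

use std::time::Instant;

type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;

// Assumed shape of the harness trait; only the items referenced by the diff are included.
trait Benchmark {
    type PreProcessData;
    type RunResult;
    fn preprocess() -> Result<Self::PreProcessData>;
    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult>;
    const ITERS: usize;
}

// Stand-in benchmark so the sketch runs without candle: concatenate two Vecs,
// loosely mirroring one attention head of the real Cat benchmark's shapes.
struct VecCat;

impl Benchmark for VecCat {
    type PreProcessData = (Vec<f32>, Vec<f32>);
    type RunResult = Vec<f32>;
    fn preprocess() -> Result<Self::PreProcessData> {
        Ok((vec![0.0; 2000 * 128], vec![0.0; 128]))
    }
    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
        let mut out = Vec::with_capacity(d.0.len() + d.1.len());
        out.extend_from_slice(&d.0);
        out.extend_from_slice(&d.1);
        Ok(out)
    }
    const ITERS: usize = 1000;
}

// Preprocess once, then time `run_one` for the requested (or default) iteration count.
fn run<B: Benchmark>(iters: Option<usize>) -> Result<()> {
    let data = B::preprocess()?;
    let iters = iters.unwrap_or(B::ITERS);
    let start = Instant::now();
    for _ in 0..iters {
        let _res = B::run_one(&data)?;
    }
    println!("{iters} iters in {:.2?}", start.elapsed());
    Ok(())
}

fn main() -> Result<()> {
    run::<VecCat>(None)
}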