diff --git a/candle-core/benches/bench_main.rs b/candle-core/benches/bench_main.rs
index 9cb1cf8b..990246c0 100644
--- a/candle-core/benches/bench_main.rs
+++ b/candle-core/benches/bench_main.rs
@@ -4,11 +4,12 @@ use criterion::criterion_main;
 
 criterion_main!(
     benchmarks::affine::benches,
+    benchmarks::copy::benches,
+    benchmarks::conv_transpose2d::benches,
     benchmarks::matmul::benches,
+    benchmarks::qmatmul::benches,
     benchmarks::random::benches,
     benchmarks::reduce::benches,
+    benchmarks::unary::benches,
     benchmarks::where_cond::benches,
-    benchmarks::conv_transpose2d::benches,
-    benchmarks::qmatmul::benches,
-    benchmarks::unary::benches
 );
diff --git a/candle-core/benches/benchmarks/copy.rs b/candle-core/benches/benchmarks/copy.rs
new file mode 100644
index 00000000..f850266a
--- /dev/null
+++ b/candle-core/benches/benchmarks/copy.rs
@@ -0,0 +1,38 @@
+use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
+use candle_core::{Device, Tensor, WithDType};
+use criterion::{black_box, criterion_group, Criterion, Throughput};
+use std::time::Instant;
+
+fn run_copy_mask_benchmark<D: WithDType>(c: &mut Criterion, device: &Device, name: &str) {
+    let batch_size = 128;
+    let in_seq_len = 1;
+    let kv_seq_len = 1024;
+
+    let attn_mask = vec![vec![vec![D::zero(); kv_seq_len]; in_seq_len]; batch_size];
+    let size_in_bytes = batch_size * in_seq_len * kv_seq_len * D::DTYPE.size_in_bytes();
+
+    let mut group = c.benchmark_group(device.bench_name(name));
+    group.throughput(Throughput::Bytes(size_in_bytes as u64));
+    group.bench_function("iter", move |b| {
+        b.iter_custom(|iters| {
+            let attn_masks = vec![attn_mask.clone(); iters as usize];
+            let start = Instant::now();
+            for attn_mask in attn_masks.into_iter() {
+                let tensor = Tensor::new(black_box(attn_mask), device).unwrap();
+                black_box(tensor);
+            }
+            device.sync().unwrap();
+            start.elapsed()
+        })
+    });
+    group.finish();
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let handler = BenchDeviceHandler::new().unwrap();
+    for device in handler.devices {
+        run_copy_mask_benchmark::<f32>(c, &device, "copy_mask");
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
diff --git a/candle-core/benches/benchmarks/mod.rs b/candle-core/benches/benchmarks/mod.rs
index b0d2244f..34f45d3d 100644
--- a/candle-core/benches/benchmarks/mod.rs
+++ b/candle-core/benches/benchmarks/mod.rs
@@ -1,5 +1,6 @@
 pub(crate) mod affine;
 pub(crate) mod conv_transpose2d;
+pub(crate) mod copy;
 pub(crate) mod matmul;
 pub(crate) mod qmatmul;
 pub(crate) mod random;
diff --git a/candle-core/src/device.rs b/candle-core/src/device.rs
index 130be7e0..8d0b8b35 100644
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@@ -103,7 +103,63 @@ impl<S: NdArray> NdArray for Vec<S> {
+impl<S: WithDType> NdArray for Vec<S> {
+    fn shape(&self) -> Result<Shape> {
+        Ok(Shape::from(self.len()))
+    }
+
+    fn to_cpu_storage(&self) -> CpuStorage {
+        S::to_cpu_storage(self.as_slice())
+    }
+}
+
+impl<S: WithDType> NdArray for Vec<&[S]> {
+    fn shape(&self) -> Result<Shape> {
+        if self.is_empty() {
+            crate::bail!("empty array")
+        }
+        let n = self.len();
+        let m = self[0].len();
+        for v in self.iter() {
+            if v.len() != m {
+                crate::bail!("two elements have different len {m} {}", v.len())
+            }
+        }
+        Ok(Shape::from((n, m)))
+    }
+
+    fn to_cpu_storage(&self) -> CpuStorage {
+        let data = self.iter().copied().flatten().copied().collect::<Vec<_>>();
+        S::to_cpu_storage_owned(data)
+    }
+}
+
+impl<S: WithDType> NdArray for Vec<Vec<S>> {
+    fn shape(&self) -> Result<Shape> {
+        if self.is_empty() {
+            crate::bail!("empty array")
+        }
+        let n = self.len();
+        let m = self[0].len();
+        for v in self.iter() {
+            if v.len() != m {
+                crate::bail!("two elements have different len {m} {}", v.len())
+            }
+        }
+        Ok(Shape::from((n, m)))
+    }
+
+    fn to_cpu_storage(&self) -> CpuStorage {
+        let len: usize = self.iter().map(|v| v.len()).sum();
+        let mut dst = Vec::with_capacity(len);
+        for v in self.iter() {
+            dst.extend(v.iter().copied());
+        }
+        S::to_cpu_storage_owned(dst)
+    }
+}
+
+impl<S: WithDType> NdArray for Vec<Vec<Vec<S>>> {
     fn shape(&self) -> Result<Shape> {
         if self.is_empty() {
             crate::bail!("empty array")
@@ -120,9 +176,57 @@ impl<S: NdArray> NdArray for Vec<S> {
     }
 
     fn to_cpu_storage(&self) -> CpuStorage {
-        // This allocates intermediary memory and shouldn't be necessary.
-        let storages = self.iter().map(|v| v.to_cpu_storage()).collect::<Vec<_>>();
-        CpuStorage::concat(storages.as_slice()).unwrap()
+        if self.is_empty() {
+            return S::to_cpu_storage_owned(vec![]);
+        }
+        let len: usize = self
+            .iter()
+            .map(|v| v.iter().map(|v| v.len()).sum::<usize>())
+            .sum();
+        let mut dst = Vec::with_capacity(len);
+        for v1 in self.iter() {
+            for v2 in v1.iter() {
+                dst.extend(v2.iter().copied());
+            }
+        }
+        S::to_cpu_storage_owned(dst)
+    }
+}
+
+impl<S: WithDType> NdArray for Vec<Vec<Vec<Vec<S>>>> {
+    fn shape(&self) -> Result<Shape> {
+        if self.is_empty() {
+            crate::bail!("empty array")
+        }
+        let shape0 = self[0].shape()?;
+        let n = self.len();
+        for v in self.iter() {
+            let shape = v.shape()?;
+            if shape != shape0 {
+                crate::bail!("two elements have different shapes {shape:?} {shape0:?}")
+            }
+        }
+        Ok(Shape::from([[n].as_slice(), shape0.dims()].concat()))
+    }
+
+    fn to_cpu_storage(&self) -> CpuStorage {
+        let len: usize = self
+            .iter()
+            .map(|v| {
+                v.iter()
+                    .map(|v| v.iter().map(|v| v.len()).sum::<usize>())
+                    .sum::<usize>()
+            })
+            .sum();
+        let mut dst = Vec::with_capacity(len);
+        for v1 in self.iter() {
+            for v2 in v1.iter() {
+                for v3 in v2.iter() {
+                    dst.extend(v3.iter().copied());
+                }
+            }
+        }
+        S::to_cpu_storage_owned(dst)
     }
 }
diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs
index 8767bc8c..309e705e 100644
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@@ -1811,3 +1811,26 @@ fn test_flip_3d_channels() -> Result<()> {
     candle_core::test_utils::assert_tensor_eq(&flipped, &expected)?;
     Ok(())
 }
+
+#[test]
+fn tensor_new() -> Result<()> {
+    let t1 = Tensor::new(vec![1f32, 2.0, 3.0], &Device::Cpu)?;
+    assert_eq!(t1.to_vec1::<f32>()?, [1.0, 2.0, 3.0]);
+    let t2 = Tensor::new(vec![vec![1f32, 2., 3.], vec![4., 5., 6.]], &Device::Cpu)?;
+    assert_eq!(t2.to_vec2::<f32>()?, [[1., 2., 3.], [4., 5., 6.]]);
+    let t3 = Tensor::new(
+        vec![
+            vec![vec![1f32, 2., 3.], vec![4., 5., 6.]],
+            vec![vec![3f32, 1., 4.], vec![1., 5., 9.]],
+        ],
+        &Device::Cpu,
+    )?;
+    assert_eq!(
+        t3.to_vec3::<f32>()?,
+        [
+            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
+            [[3.0, 1.0, 4.0], [1.0, 5.0, 9.0]]
+        ]
+    );
+    Ok(())
+}
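Usage note (not part of the diff): a minimal sketch of what the new nested-`Vec` `NdArray` impls enable, namely constructing a tensor directly from a `Vec<Vec<Vec<f32>>>` with a single flattening pass instead of the old per-element `CpuStorage` concat. The shape `(4, 1, 8)` and the standalone `main` are illustrative only; the calls (`Tensor::new`, `Device::Cpu`, `Tensor::dims`) are the same public candle_core API exercised by the `copy_mask` benchmark and the `tensor_new` test above.

    use candle_core::{Device, Result, Tensor};

    fn main() -> Result<()> {
        // A small (batch, seq, kv) mask, analogous to the attn_mask built in the
        // copy benchmark; the Vec<Vec<Vec<S>>> impl flattens it in one pass.
        let mask: Vec<Vec<Vec<f32>>> = vec![vec![vec![0f32; 8]; 1]; 4];
        let t = Tensor::new(mask, &Device::Cpu)?;
        assert_eq!(t.dims(), &[4, 1, 8]);
        Ok(())
    }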