diff --git a/candle-core/examples/basics.rs b/candle-core/examples/basics.rs
index d028db66..9e1a5d78 100644
--- a/candle-core/examples/basics.rs
+++ b/candle-core/examples/basics.rs
@@ -5,6 +5,8 @@ use anyhow::Result;
 use candle_core::{Device, Tensor};
 
 fn main() -> Result<()> {
+    let mut file = std::fs::File::open("ggml.bin")?;
+    let data = candle_core::ggml::Content::read(&mut file, &Device::Cpu)?;
     let a = Tensor::randn(0f32, 1., (2, 3), &Device::Cpu)?;
     let b = Tensor::randn(0f32, 1., (3, 4), &Device::Cpu)?;
     let c = a.matmul(&b)?;
diff --git a/candle-core/src/cpu_backend.rs b/candle-core/src/cpu_backend.rs
index 8563721c..483632ce 100644
--- a/candle-core/src/cpu_backend.rs
+++ b/candle-core/src/cpu_backend.rs
@@ -1010,12 +1010,18 @@ impl Map2 for MatMul {
 
         };
         let c_skip: usize = m * n;
-        let dst_shape: Shape = (m, n).into();
-        let dst_strides = dst_shape.stride_contiguous();
-        let dst_rs = dst_strides[0];
-        let dst_cs = dst_strides[1];
-
         let mut dst = vec![T::zero(); b * m * n];
+
+        let (dst_rs, dst_cs) = if m == 1 {
+            (1, 1)
+        } else if n == 1 {
+            (1, 1)
+        } else {
+            let dst_shape: Shape = (m, n).into();
+            let dst_strides = dst_shape.stride_contiguous();
+            (dst_strides[0], dst_strides[1])
+        };
+
         let num_threads = crate::utils::get_num_threads();
         let parallelism = if num_threads > 1 {
             Parallelism::Rayon(num_threads)
diff --git a/candle-examples/examples/llama2-c/weights.rs b/candle-examples/examples/llama2-c/weights.rs
index b78418ce..2a32cafd 100644
--- a/candle-examples/examples/llama2-c/weights.rs
+++ b/candle-examples/examples/llama2-c/weights.rs
@@ -111,6 +111,7 @@ impl TransformerWeights {
         // matrix column major rather than row major. This ends up speeding up text generation from
         // 120 token/s to 220 token/s on a Ryzen 2600X.
         let tr = device.is_cpu() && !candle::utils::has_mkl();
+        let tr = false;
         let tr = |x: Tensor| if tr { x.t()?.contiguous()?.t() } else { Ok(x) };
         let mut ws = std::collections::HashMap::new();
         let mut insert = |name: &str, t: Tensor| {