Compare commits

..

22 Commits

Author SHA1 Message Date
101a4c8389 Moondream first bits. 2024-03-17 17:49:56 +01:00
ce9fbc3682 Optimize the cat operation on contiguous tensors (#1855)
* Add a specialized kernel for copy2d.

* Move the cat operations.

* Avoid transpositions in cat.

* Bugfix.

* Bugfix for the cuda kernel.

* Add a benchmark.

* Add more testing.

* Test fix.

* Faster kernel.

* Add the missing kernel.

* Tweak the test.

* Add a metal kernel.

* Fix for the metal kernel.

* Get the tests to pass on metal.

* Also use this opportunity to fix the metal kernel for ELU.

* Add some bf16 kernels.

* Clippy fixes.
2024-03-17 10:49:13 +01:00
db8b24ae92 Add support for index u8/i64 and input f16/bf16 scatter-add on metal (#1849)
* add support and tests for scatter add on metal

* add support for all datatypes
2024-03-17 08:09:43 +01:00
74bf6994b1 Move the image tensor to the appropriate device. (#1856) 2024-03-16 22:25:46 +01:00
cdc4c172c4 Implement the error trait for DTypeParseError. (#1852) 2024-03-15 08:37:27 +01:00
e1f9c3776d StableLM-2 models were updated to use GPT-2 tokenization. (#1847) 2024-03-14 21:01:36 +01:00
3318fe30fb Update gemma README (#1843)
* Update gemma README

* Fixit
2024-03-13 21:41:36 +01:00
2bb9c683b9 Update README.md (#1840)
Adds candle-einops to the README as an external resource.
2024-03-13 14:36:25 +01:00
ff03fd3fb3 Expose some helper functions to create quantized models. (#1837) 2024-03-12 11:30:24 +01:00
df5f69444e Properly handle the batch dimension in cuda quantized matmul. (#1832) 2024-03-10 20:23:43 +01:00
0c5eecbc0f Add some tracing to metavoice. (#1826) 2024-03-09 12:24:11 +01:00
56c9d3ee7b Fix the model path for rwkv. (#1825) 2024-03-09 11:21:48 +01:00
dd00482ea3 Quantized version of the metavoice model. (#1824)
* Quantized version of the metavoice model.

* Integrate the quantized version of metavoice.
2024-03-09 11:06:04 +01:00
936f6a4840 Fix dequantization. (#1823) 2024-03-08 23:12:13 +01:00
3440cec3a0 Fast CPU kernel for transposed 1d convolutions. (#1822)
* Fast CPU kernel for transposed 1d convolutions.

* Bugfix.
2024-03-08 22:43:07 +01:00
e7fc1daa21 Bump the crate versions to 0.4.2. (#1821) 2024-03-08 22:01:51 +01:00
be5b68cd0b Metal random-generation bug fixes (#1811)
* The use_resource API was misunderstood: it is not additive, so several usages must be bit-ORed together.

* The seeding was incorrect and used the address instead of the value of the passed-in seed.

* Add a check that is likely to catch failures to update the seed between generations of random tensors.

* Buffer overrun: the length given to the std::ptr::copy call was in bytes, not in 32-bit units.

* By default seed the RNG with a time-based value, so that different runs may produce different output, just like the CPU engine.
Use device.set_seed if determinism is warranted.

* Revert "By default seed the RNG with a time-based value, so that different runs may produce different output, just like the CPU engine. Use device.set_seed if determinism is warranted."

This reverts commit d7302de9

Discussion in https://github.com/huggingface/candle/pull/1811#issuecomment-1983079119

* The Metal random kernel failed to set element N/2 of tensors with N elements when N is even. The reason was that all threads but thread 0 created two random samples each, while thread 0 created only one, i.e. an odd number in total. To produce an even number of samples, the early termination of thread 0 should only ever occur for odd-sized tensors.

* Add a test catching any deterministic tensor element in rand and randn output.

---------

Co-authored-by: niklas <niklas@appli.se>
Co-authored-by: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com>
2024-03-08 16:11:50 +01:00
ea984d0421 Expose more printer options. (#1817) 2024-03-08 15:04:18 +01:00
9634583781 Expose a couple layout methods. (#1816) 2024-03-08 10:52:22 +01:00
758366160e add clone to candle dropout (#1814) 2024-03-08 08:18:01 +01:00
0a3487a776 Add a --seed argument to the stable-diffusion example. (#1812)
* Add a --seed argument to the stable-diffusion example.

* When no seed is specified, do not set one and use the engine's default instead. This makes the CPU engine work again when no --seed is given, and causes a bailout when a seed is provided, as the engine does not currently support it.

---------

Co-authored-by: niklas <niklas@appli.se>
2024-03-08 08:17:36 +01:00
0c09d10f32 Improve metal buffer usage (#1807)
* Improve metal buffer usage

* Clone the CPU storage when loading to reduce wait_until_complete calls.
* Use powers of two for buffer sizes so reuse is more likely.
* Select the best available buffer by size.
* Add a count to MetalStorage so a buffer of a different size can be used.

Co-authored-by: Chris Fleetwood <christopher.fleetwood@huggingface.co>

* Simplify new buffer creation without blit copy. Revert &[] -> Vec

* Add documentation on newBufferWithBytes safety / synchronization

* Drop unused buffers after command buffer is done syncing.

---------

Co-authored-by: Chris Fleetwood <christopher.fleetwood@huggingface.co>
2024-03-07 09:42:34 +01:00
61 changed files with 1808 additions and 520 deletions

View File

@ -19,7 +19,7 @@ exclude = [
resolver = "2"
[workspace.package]
version = "0.4.1"
version = "0.4.2"
edition = "2021"
description = "Minimalist ML framework."
repository = "https://github.com/huggingface/candle"
@ -31,14 +31,14 @@ license = "MIT OR Apache-2.0"
accelerate-src = { version = "0.3.2" }
anyhow = { version = "1", features = ["backtrace"] }
byteorder = "1.4.3"
candle = { path = "./candle-core", package = "candle-core", version = "0.4.1" }
candle-datasets = { path = "./candle-datasets", version = "0.4.1" }
candle-flash-attn = { path = "./candle-flash-attn", version = "0.4.1" }
candle-kernels = { path = "./candle-kernels", version = "0.4.1" }
candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.4.1" }
candle-nn = { path = "./candle-nn", version = "0.4.1" }
candle-onnx = { path = "./candle-onnx", version = "0.4.1" }
candle-transformers = { path = "./candle-transformers", version = "0.4.1" }
candle = { path = "./candle-core", package = "candle-core", version = "0.4.2" }
candle-datasets = { path = "./candle-datasets", version = "0.4.2" }
candle-flash-attn = { path = "./candle-flash-attn", version = "0.4.2" }
candle-kernels = { path = "./candle-kernels", version = "0.4.2" }
candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.4.2" }
candle-nn = { path = "./candle-nn", version = "0.4.2" }
candle-onnx = { path = "./candle-onnx", version = "0.4.2" }
candle-transformers = { path = "./candle-transformers", version = "0.4.2" }
clap = { version = "4.2.4", features = ["derive"] }
criterion = { version = "0.5.1", default-features=false }
cudarc = { version = "0.10.0", features = ["f16"] }

View File

@ -175,6 +175,7 @@ And then head over to
- [`kalosm`](https://github.com/floneum/floneum/tree/master/interfaces/kalosm): A multi-modal meta-framework in Rust for interfacing with local pre-trained models with support for controlled generation, custom samplers, in-memory vector databases, audio transcription, and more.
- [`candle-sampling`](https://github.com/EricLBuehler/candle-sampling): Sampling techniques for Candle.
- [`gpt-from-scratch-rs`](https://github.com/jeroenvlek/gpt-from-scratch-rs): A port of Andrej Karpathy's _Let's build GPT_ tutorial on YouTube showcasing the Candle API on a toy problem.
- [`candle-einops`](https://github.com/tomsanbear/candle-einops): A pure rust implementation of the python [einops](https://github.com/arogozhnikov/einops) library.
If you have an addition to this list, please submit a pull request.

View File

@ -98,6 +98,19 @@ pub trait BackendStorage: Sized {
) -> Result<Self>;
fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()>;
#[allow(clippy::too_many_arguments)]
// Similar to cudaMemcpy2D, though values are in elements and not in bytes.
fn copy2d(
&self,
_: &mut Self,
_d1: usize,
_d2: usize,
_src_stride1: usize,
_dst_stride1: usize,
_src_offset: usize,
_dst_offset: usize,
) -> Result<()>;
}
pub trait BackendDevice: Sized + std::fmt::Debug + Clone {

View File

@ -5,6 +5,7 @@ use half::{bf16, f16};
use rayon::prelude::*;
const USE_IM2COL_CONV1D: bool = true;
const USE_IM2COL_CONV1D_TR: bool = true;
const USE_IM2COL_CONV2D: bool = true;
// TODO: Maybe we should not implement [Clone] here and instead have an explicit allocator +
@ -1022,6 +1023,26 @@ impl<'a, I: IntDType> Map2 for IndexAdd<'a, I> {
}
}
#[allow(clippy::too_many_arguments)]
fn copy2d_<T: Copy>(
src: &[T],
dst: &mut [T],
d1: usize,
d2: usize,
src_stride1: usize,
dst_stride1: usize,
src_offset: usize,
dst_offset: usize,
) {
for i1 in 0..d1 {
let dst_idx = i1 * dst_stride1 + dst_offset;
let src_idx = i1 * src_stride1 + src_offset;
let dst = &mut dst[dst_idx..dst_idx + d2];
let src = &src[src_idx..src_idx + d2];
dst.copy_from_slice(src)
}
}
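To make the copy2d semantics concrete, here is a minimal standalone sketch of the helper above, monomorphized to i32; the shapes and values are illustrative only, not code from the commit.

fn copy2d_(
    src: &[i32],
    dst: &mut [i32],
    d1: usize,
    d2: usize,
    src_stride1: usize,
    dst_stride1: usize,
    src_offset: usize,
    dst_offset: usize,
) {
    for i1 in 0..d1 {
        let dst_idx = i1 * dst_stride1 + dst_offset;
        let src_idx = i1 * src_stride1 + src_offset;
        dst[dst_idx..dst_idx + d2].copy_from_slice(&src[src_idx..src_idx + d2]);
    }
}

fn main() {
    // Copy a 2x3 block out of a row-major 3x4 source into a row-major 3x5
    // destination, starting at row 1 in both. Counts are in elements, not bytes.
    let src: Vec<i32> = (0..12).collect();
    let mut dst = vec![0i32; 15];
    copy2d_(&src, &mut dst, 2, 3, 4, 5, 4, 5);
    assert_eq!(&dst[5..8], &[4, 5, 6]);
    assert_eq!(&dst[10..13], &[8, 9, 10]);
}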
fn copy_strided_src_<T: Copy>(src: &[T], dst: &mut [T], dst_offset: usize, src_l: &Layout) {
match src_l.strided_blocks() {
crate::StridedBlocks::SingleBlock { start_offset, len } => {
@ -1256,6 +1277,34 @@ impl Map1 for Im2Col {
}
}
struct Col2Im1D {
stride: usize,
}
impl Map1 for Col2Im1D {
fn f<T: WithDType>(&self, col: &[T], l: &Layout) -> Result<Vec<T>> {
let (b_size, l_in, c_out, k_size) = l.shape().dims4()?;
let stride = self.stride;
let l_out = (l_in - 1) * stride + k_size;
let mut im = vec![T::zero(); b_size * c_out * l_out];
let (dst_s0, dst_s1) = (c_out * l_out, l_out);
let (src_s0, src_s1, src_s2) = (c_out * k_size * l_in, c_out * k_size, k_size);
for l_in_i in 0..l_in {
for k_i in 0..k_size {
let l_out_i = l_in_i * stride + k_i;
for b_i in 0..b_size {
for c_i in 0..c_out {
let dst_idx = b_i * dst_s0 + c_i * dst_s1 + l_out_i;
let src_idx = b_i * src_s0 + l_in_i * src_s1 + c_i * src_s2 + k_i;
im[dst_idx] += col[src_idx]
}
}
}
}
Ok(im)
}
}
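The output length used by Col2Im1D follows the standard transposed-convolution formula for the no-padding, no-dilation case handled here; a small standalone sketch of the shape arithmetic, not code from the commit:

// l_out = (l_in - 1) * stride + k_size, matching the l_out computation above.
fn l_out(l_in: usize, stride: usize, k_size: usize) -> usize {
    (l_in - 1) * stride + k_size
}

fn main() {
    // Shape walk-through of the matmul + col2im path below:
    //   input  (b, c_in, l_in)   -- transpose -->  (b, l_in, c_in)
    //   kernel (c_in, c_out, k)  -- viewed as -->  (c_in, c_out * k)
    //   matmul                   -->               (b, l_in, c_out * k)
    //   col2im (stride s)        -->               (b, c_out, (l_in - 1) * s + k)
    assert_eq!(l_out(5, 1, 3), 7); // matches the [1, 2, 7] conv_transpose1d test
    assert_eq!(l_out(5, 2, 3), 11);
}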
struct ConvTranspose1D<'a>(&'a crate::conv::ParamsConvTranspose1D);
impl<'a> Map2 for ConvTranspose1D<'a> {
@ -2423,6 +2472,48 @@ impl BackendStorage for CpuStorage {
}
}
fn copy2d(
&self,
dst: &mut Self,
d1: usize,
d2: usize,
src_s: usize,
dst_s: usize,
src_o: usize,
dst_o: usize,
) -> Result<()> {
match (self, dst) {
(Self::U8(src), Self::U8(dst)) => copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o),
(Self::U32(src), Self::U32(dst)) => {
copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
}
(Self::I64(src), Self::I64(dst)) => {
copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
}
(Self::BF16(src), Self::BF16(dst)) => {
copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
}
(Self::F16(src), Self::F16(dst)) => {
copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
}
(Self::F32(src), Self::F32(dst)) => {
copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
}
(Self::F64(src), Self::F64(dst)) => {
copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
}
(_, dst) => {
return Err(Error::DTypeMismatchBinaryOp {
lhs: self.dtype(),
rhs: dst.dtype(),
op: "copy2d",
}
.bt());
}
}
Ok(())
}
fn copy_strided_src(&self, dst: &mut Self, dst_offset: usize, src_l: &Layout) -> Result<()> {
match (self, dst) {
(Self::U8(src), Self::U8(dst)) => copy_strided_src_(src, dst, dst_offset, src_l),
@ -2511,7 +2602,52 @@ impl BackendStorage for CpuStorage {
kernel_l: &Layout,
params: &crate::conv::ParamsConvTranspose1D,
) -> Result<Self> {
ConvTranspose1D(params).map(self, l, kernel, kernel_l)
let can_use_col2im = kernel_l.is_contiguous()
&& params.dilation == 1
&& params.padding == 0
&& params.output_padding == 0;
if USE_IM2COL_CONV1D_TR && can_use_col2im {
let (b_size, c_in, l_in) = l.shape().dims3()?;
let (c_in2, c_out, k_size) = kernel_l.shape().dims3()?;
if !kernel_l.is_contiguous() {
crate::bail!(
"convtr1d: the second argument (kernel) has to be contiguous {kernel_l:?}"
)
}
if c_in != c_in2 {
crate::bail!(
"convtr1d: shape mismatch on c_in {:?} {:?}",
l.shape(),
kernel_l.shape()
)
}
let col = {
// This merges the last two dimensions of the kernel together.
let kernel_l_mm = Layout::new(
(b_size, c_in, k_size * c_out).into(),
vec![0, k_size * c_out, 1],
kernel_l.start_offset(),
);
self.matmul(
kernel,
(
b_size,
/* m */ l_in,
/* n */ c_out * k_size,
/* k */ c_in,
),
&l.transpose(1, 2)?,
&kernel_l_mm,
)?
};
let col_l = Layout::contiguous((b_size, l_in, c_out, k_size));
Col2Im1D {
stride: params.stride,
}
.map(&col, &col_l)
} else {
ConvTranspose1D(params).map(self, l, kernel, kernel_l)
}
}
fn conv2d(

View File

@ -2145,6 +2145,67 @@ impl BackendStorage for CudaStorage {
Ok(Self { slice, device })
}
fn copy2d(
&self,
dst: &mut Self,
d1: usize,
d2: usize,
src_s: usize,
dst_s: usize,
src_o: usize,
dst_o: usize,
) -> Result<()> {
let dev = &self.device;
let d1 = d1 as u32;
let d2 = d2 as u32;
let dst_s = dst_s as u32;
let src_s = src_s as u32;
let (src, dst, kname) = match (&self.slice, &mut dst.slice) {
(S::U8(s), S::U8(d)) => (
*s.slice(src_o..).device_ptr(),
*d.slice(dst_o..).device_ptr(),
"copy2d_u8",
),
(S::U32(s), S::U32(d)) => (
*s.slice(src_o..).device_ptr(),
*d.slice(dst_o..).device_ptr(),
"copy2d_u32",
),
(S::I64(s), S::I64(d)) => (
*s.slice(src_o..).device_ptr(),
*d.slice(dst_o..).device_ptr(),
"copy2d_i64",
),
(S::BF16(s), S::BF16(d)) => (
*s.slice(src_o..).device_ptr(),
*d.slice(dst_o..).device_ptr(),
"copy2d_bf16",
),
(S::F16(s), S::F16(d)) => (
*s.slice(src_o..).device_ptr(),
*d.slice(dst_o..).device_ptr(),
"copy2d_f16",
),
(S::F32(s), S::F32(d)) => (
*s.slice(src_o..).device_ptr(),
*d.slice(dst_o..).device_ptr(),
"copy2d_f32",
),
(S::F64(s), S::F64(d)) => (
*s.slice(src_o..).device_ptr(),
*d.slice(dst_o..).device_ptr(),
"copy2d_f64",
),
_ => Err(CudaError::InternalError("dtype mismatch in copy2d"))?,
};
let func = dev.get_or_load_func(kname, kernels::FILL)?;
let cfg = LaunchConfig::for_num_elems(d1 * d2);
let params = (src, dst, d1, d2, src_s, dst_s);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }.w()?;
Ok(())
}
fn copy_strided_src(&self, dst: &mut Self, dst_offset: usize, src_l: &Layout) -> Result<()> {
let src_shape = src_l.shape();
let dims = src_shape.dims();

View File

@ -65,12 +65,13 @@ impl std::fmt::Debug for Tensor {
}
/// Options for Tensor pretty printing
#[derive(Debug, Clone)]
pub struct PrinterOptions {
precision: usize,
threshold: usize,
edge_items: usize,
line_width: usize,
sci_mode: Option<bool>,
pub precision: usize,
pub threshold: usize,
pub edge_items: usize,
pub line_width: usize,
pub sci_mode: Option<bool>,
}
static PRINT_OPTS: std::sync::Mutex<PrinterOptions> =
@ -89,6 +90,10 @@ impl PrinterOptions {
}
}
pub fn print_options() -> &'static std::sync::Mutex<PrinterOptions> {
&PRINT_OPTS
}
pub fn set_print_options(options: PrinterOptions) {
*PRINT_OPTS.lock().unwrap() = options
}
@ -117,6 +122,26 @@ pub fn set_print_options_full() {
}
}
pub fn set_line_width(line_width: usize) {
PRINT_OPTS.lock().unwrap().line_width = line_width
}
pub fn set_precision(precision: usize) {
PRINT_OPTS.lock().unwrap().precision = precision
}
pub fn set_edge_items(edge_items: usize) {
PRINT_OPTS.lock().unwrap().edge_items = edge_items
}
pub fn set_threshold(threshold: usize) {
PRINT_OPTS.lock().unwrap().threshold = threshold
}
pub fn set_sci_mode(sci_mode: Option<bool>) {
PRINT_OPTS.lock().unwrap().sci_mode = sci_mode
}
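A hypothetical usage sketch of these setters, assuming the display module is reachable as candle_core::display (the exact re-export path may differ):

use candle_core::{Device, Tensor};

fn main() -> candle_core::Result<()> {
    candle_core::display::set_precision(2);
    candle_core::display::set_line_width(80);
    candle_core::display::set_sci_mode(Some(false));
    let t = Tensor::randn(0f32, 1f32, (2, 3), &Device::Cpu)?;
    println!("{t}"); // printed with 2 decimal digits, no scientific notation
    Ok(())
}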
struct FmtSize {
current_size: usize,
}

View File

@ -23,7 +23,15 @@ pub enum DType {
}
#[derive(Debug, PartialEq, Eq)]
pub struct DTypeParseError;
pub struct DTypeParseError(String);
impl std::fmt::Display for DTypeParseError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "cannot parse '{}' as a dtype", self.0)
}
}
impl std::error::Error for DTypeParseError {}
impl std::str::FromStr for DType {
type Err = DTypeParseError;
@ -36,7 +44,7 @@ impl std::str::FromStr for DType {
"f16" => Ok(Self::F16),
"f32" => Ok(Self::F32),
"f64" => Ok(Self::F64),
_ => Err(DTypeParseError),
_ => Err(DTypeParseError(s.to_string())),
}
}
}
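With the error now carrying the offending string and implementing Display and std::error::Error, a parse round-trip looks like this (a sketch against the public candle_core API shown in the diff):

use candle_core::DType;

fn main() {
    let dt: DType = "f16".parse().unwrap();
    assert_eq!(dt, DType::F16);
    let err = "f8".parse::<DType>().unwrap_err();
    println!("{err}"); // prints: cannot parse 'f8' as a dtype
}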

View File

@ -154,6 +154,19 @@ impl crate::backend::BackendStorage for CudaStorage {
Err(Error::NotCompiledWithCudaSupport)
}
fn copy2d(
&self,
_: &mut Self,
_: usize,
_: usize,
_: usize,
_: usize,
_: usize,
_: usize,
) -> Result<()> {
Err(Error::NotCompiledWithCudaSupport)
}
fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
Err(Error::NotCompiledWithCudaSupport)
}

View File

@ -166,6 +166,19 @@ impl crate::backend::BackendStorage for MetalStorage {
Err(Error::NotCompiledWithMetalSupport)
}
fn copy2d(
&self,
_: &mut Self,
_: usize,
_: usize,
_: usize,
_: usize,
_: usize,
_: usize,
) -> Result<()> {
Err(Error::NotCompiledWithMetalSupport)
}
fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}

View File

@ -70,7 +70,7 @@ impl Layout {
self.shape.is_fortran_contiguous(&self.stride)
}
pub(crate) fn narrow(&self, dim: usize, start: usize, len: usize) -> Result<Self> {
pub fn narrow(&self, dim: usize, start: usize, len: usize) -> Result<Self> {
let dims = self.shape().dims();
if dim >= dims.len() {
Err(Error::DimOutOfRange {
@ -99,7 +99,7 @@ impl Layout {
})
}
pub(crate) fn transpose(&self, dim1: usize, dim2: usize) -> Result<Self> {
pub fn transpose(&self, dim1: usize, dim2: usize) -> Result<Self> {
let rank = self.shape.rank();
if rank <= dim1 || rank <= dim2 {
Err(Error::UnexpectedNumberOfDims {
@ -120,7 +120,7 @@ impl Layout {
})
}
pub(crate) fn permute(&self, idxs: &[usize]) -> Result<Self> {
pub fn permute(&self, idxs: &[usize]) -> Result<Self> {
let is_permutation =
idxs.len() == self.shape.rank() && (0..idxs.len()).all(|i| idxs.contains(&i));
if !is_permutation {

View File

@ -67,6 +67,7 @@ pub mod shape;
mod storage;
mod strided_index;
mod tensor;
mod tensor_cat;
pub mod test_utils;
pub mod utils;
mod variable;

View File

@ -9,7 +9,7 @@ use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger}
use std::collections::HashMap;
use std::ffi::c_void;
use std::path::Path;
use std::sync::{Arc, Mutex, RwLock, TryLockError};
use std::sync::{Arc, Mutex, RwLock, RwLockWriteGuard, TryLockError};
/// Simple way to catch lock error without
/// depending on T
@ -60,7 +60,8 @@ impl From<String> for MetalError {
}
}
type AllocatedBuffers = Arc<RwLock<HashMap<(NSUInteger, MTLResourceOptions), Vec<Arc<Buffer>>>>>;
type BufferMap = HashMap<(NSUInteger, MTLResourceOptions), Vec<Arc<Buffer>>>;
type AllocatedBuffers = Arc<RwLock<BufferMap>>;
#[derive(Clone)]
pub struct MetalDevice {
@ -68,7 +69,7 @@ pub struct MetalDevice {
device: metal::Device,
/// Single command queue for the entire device.
command_queue: metal::CommandQueue,
command_queue: CommandQueue,
/// One command buffer at a time.
/// The scheduler works by allowing multiple
/// [ComputeCommandEncoder](https://developer.apple.com/documentation/metal/mtlcomputecommandencoder?language=objc)
@ -78,7 +79,7 @@ pub struct MetalDevice {
/// Despite what the documentation says, command buffers are NOT ordered. They are ordered
/// for their START time, but there's no guarantee that command buffer1 will finish before
/// command buffer2 starts (or there are metal bugs there)
command_buffer: Arc<RwLock<metal::CommandBuffer>>,
command_buffer: Arc<RwLock<CommandBuffer>>,
/// Keeps track of the current amount of compute command encoders on the current
/// command buffer
/// Arc, RwLock because of the interior mutability.
@ -87,7 +88,7 @@ pub struct MetalDevice {
compute_per_buffer: usize,
/// Simple keeper struct to keep track of the already compiled kernels so we can reuse them.
/// Heavily used by [`candle_metal_kernels`]
kernels: Arc<candle_metal_kernels::Kernels>,
kernels: Arc<Kernels>,
/// Simple allocator struct.
/// The buffers are stored in size buckets since ML tends to use similar shapes over and over.
/// We store the buffers in [`Arc`] because it's much faster than Obj-c internal ref counting
@ -99,7 +100,7 @@ pub struct MetalDevice {
/// operation, so that this buffer is not being used by another kernel at the same time.
/// Arc is the CPU reference count, it doesn't mean anything on the GPU side of things.
///
/// Whenever we actually allocate a new buffer, we make a full sweep to cleanup unused buffers
/// Whenever we actually allocate a new buffer, we make a full sweep to clean up unused buffers
/// (strong_count = 1).
buffers: AllocatedBuffers,
/// Seed for random number generation.
@ -145,6 +146,8 @@ impl MetalDevice {
command_buffer = self.command_queue.new_command_buffer().to_owned();
*command_buffer_lock = command_buffer.clone();
*index = 0;
self.drop_unused_buffers()?;
}
*index += 1;
Ok(command_buffer)
@ -163,6 +166,7 @@ impl MetalDevice {
command_buffer.commit();
command_buffer.wait_until_completed();
*command_buffer = self.command_queue.new_command_buffer().to_owned();
Ok(())
}
@ -199,39 +203,25 @@ impl MetalDevice {
}
/// Creates a new buffer from data.
/// The buffer is [MTLPrivate](https://developer.apple.com/documentation/metal/mtlstoragemode)
/// The buffer is [MTLManaged](https://developer.apple.com/documentation/metal/mtlstoragemode)
///
/// This method will block the computation because of the
/// lack of lifetime management through the GPU.
/// Internal comment for technical details.
/// Does not require synchronization, as [newBufferWithBytes](https://developer.apple.com/documentation/metal/mtldevice/1433429-newbufferwithbytes)
/// allocates the buffer and copies over the existing data before returning the MTLBuffer.
pub fn new_buffer_with_data<T>(&self, data: &[T]) -> Result<Arc<Buffer>> {
let size = core::mem::size_of_val(data) as NSUInteger;
let tmp = self.device.new_buffer_with_data(
data.as_ptr() as *const core::ffi::c_void,
let new_buffer = self.device.new_buffer_with_data(
data.as_ptr() as *const c_void,
size,
metal::MTLResourceOptions::StorageModeManaged,
MTLResourceOptions::StorageModeManaged,
);
let real = self.allocate_buffer(
size,
metal::MTLResourceOptions::StorageModePrivate,
"with_data",
)?;
let command_buffer = self.command_buffer()?;
command_buffer.set_label("with_data");
let blit = command_buffer.new_blit_command_encoder();
blit.set_label("with_data_blit");
blit.copy_from_buffer(&tmp, 0, &real, 0, tmp.length());
blit.end_encoding();
let mut buffers = self.buffers.try_write().map_err(MetalError::from)?;
let subbuffers = buffers
.entry((size, MTLResourceOptions::StorageModeManaged))
.or_insert(vec![]);
// This is necessary for mmapped safetensors:
// because of the unsafe slice cast we're doing,
// the slice might not live long enough for Metal
// to actually fill the GPU buffer.
// Putting this wait forces the GPU buffer to be filled
// with the actual data, allowing the CPU storage to
// deallocate properly.
self.wait_until_completed()?;
Ok(real)
let new_buffer = Arc::new(new_buffer);
subbuffers.push(new_buffer.clone());
Ok(new_buffer)
}
pub fn allocate_zeros(&self, size_in_bytes: usize) -> Result<Arc<Buffer>> {
@ -255,6 +245,40 @@ impl MetalDevice {
Ok(buffer)
}
fn find_available_buffer(
&self,
size: NSUInteger,
option: MTLResourceOptions,
buffers: &RwLockWriteGuard<BufferMap>,
) -> Option<Arc<Buffer>> {
let mut best_buffer: Option<&Arc<Buffer>> = None;
let mut best_buffer_size: NSUInteger = NSUInteger::MAX;
for ((buffer_size, buffer_option), subbuffers) in buffers.iter() {
if buffer_size >= &size && buffer_size < &best_buffer_size && buffer_option == &option {
for sub in subbuffers {
if Arc::strong_count(sub) == 1 {
best_buffer = Some(sub);
best_buffer_size = *buffer_size;
}
}
}
}
return best_buffer.map(|b| b.clone());
}
fn drop_unused_buffers(&self) -> Result<()> {
let mut buffers = self.buffers.try_write().map_err(MetalError::from)?;
for subbuffers in buffers.values_mut() {
let newbuffers = subbuffers
.iter()
.filter(|s| Arc::strong_count(*s) > 1)
.map(Arc::clone)
.collect();
*subbuffers = newbuffers;
}
Ok(())
}
/// The critical allocator algorithm
fn allocate_buffer(
&self,
@ -263,24 +287,18 @@ impl MetalDevice {
_name: &str,
) -> Result<Arc<Buffer>> {
let mut buffers = self.buffers.try_write().map_err(MetalError::from)?;
if let Some(b) = self.find_available_buffer(size, option, &buffers) {
// Cloning also ensures we increment the strong count
return Ok(b.clone());
}
let size = buf_size(size);
let subbuffers = buffers.entry((size, option)).or_insert(vec![]);
for sub in &mut *subbuffers {
if Arc::strong_count(sub) == 1 {
return Ok(sub.clone());
}
}
let new_buffer = self.device.new_buffer(size as NSUInteger, option);
let new_buffer = Arc::new(new_buffer);
subbuffers.push(new_buffer.clone());
for subbuffers in buffers.values_mut() {
let newbuffers = subbuffers
.iter()
.filter(|s| Arc::strong_count(s) > 1)
.map(Arc::clone)
.collect();
*subbuffers = newbuffers;
}
Ok(new_buffer)
}
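The allocator's notion of "unused" is simply Arc::strong_count == 1: only the allocator's own map still holds the buffer. A minimal standalone sketch of that invariant, with a Vec standing in for the Metal buffer:

use std::sync::Arc;

fn main() {
    let buf = Arc::new(vec![0u8; 16]); // stands in for an Arc<Buffer> in the map
    assert_eq!(Arc::strong_count(&buf), 1); // only the allocator holds it: reusable
    let in_use = buf.clone(); // a MetalStorage takes a reference
    assert_eq!(Arc::strong_count(&buf), 2); // in use, must not be handed out again
    drop(in_use);
    assert_eq!(Arc::strong_count(&buf), 1); // released, reusable (or droppable) again
}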
@ -305,6 +323,8 @@ pub struct MetalStorage {
buffer: Arc<metal::Buffer>,
/// a reference to the device owning this buffer
device: MetalDevice,
/// The count of allocated elements in the buffer
count: usize,
/// The dtype is kept since buffers are untyped.
dtype: DType,
}
@ -386,7 +406,7 @@ impl BackendStorage for MetalStorage {
)
.map_err(MetalError::from)?;
}
Ok(Self::new(buffer, device.clone(), dtype))
Ok(Self::new(buffer, device.clone(), el, dtype))
}
fn powf(&self, layout: &Layout, pow: f64) -> Result<Self> {
@ -402,6 +422,7 @@ impl BackendStorage for MetalStorage {
let name = match self.dtype {
DType::F32 => "powf_f32",
DType::F16 => "powf_f16",
DType::BF16 => "powf_bf16",
dtype => crate::bail!("Metal contiguous powf {dtype:?} not implemented"),
};
candle_metal_kernels::call_powf(
@ -419,6 +440,7 @@ impl BackendStorage for MetalStorage {
let name = match self.dtype {
DType::F32 => "powf_f32_strided",
DType::F16 => "powf_f16_strided",
DType::BF16 => "powf_bf16_strided",
dtype => crate::bail!("Metal strided powf {dtype:?} not implemented"),
};
candle_metal_kernels::call_powf_strided(
@ -435,7 +457,7 @@ impl BackendStorage for MetalStorage {
)
.map_err(MetalError::from)?;
}
Ok(Self::new(buffer, device.clone(), dtype))
Ok(Self::new(buffer, device.clone(), el, dtype))
}
fn elu(&self, layout: &Layout, alpha: f64) -> Result<Self> {
@ -451,6 +473,7 @@ impl BackendStorage for MetalStorage {
let name = match self.dtype {
DType::F32 => "elu_f32",
DType::F16 => "elu_f16",
DType::BF16 => "elu_bf16",
dtype => crate::bail!("Metal contiguous elu {dtype:?} not implemented"),
};
candle_metal_kernels::call_elu(
@ -468,6 +491,7 @@ impl BackendStorage for MetalStorage {
let name = match self.dtype {
DType::F32 => "elu_f32_strided",
DType::F16 => "elu_f16_strided",
DType::BF16 => "elu_bf16_strided",
dtype => crate::bail!("Metal strided elu {dtype:?} not implemented"),
};
candle_metal_kernels::call_elu_strided(
@ -484,7 +508,7 @@ impl BackendStorage for MetalStorage {
)
.map_err(MetalError::from)?;
}
Ok(Self::new(buffer, device.clone(), dtype))
Ok(Self::new(buffer, device.clone(), el, dtype))
}
fn reduce_op(&self, op: ReduceOp, layout: &Layout, sum_dims: &[usize]) -> Result<Self> {
@ -562,7 +586,7 @@ impl BackendStorage for MetalStorage {
)
.map_err(MetalError::from)?;
Ok(Self::new(buffer, device, dtype))
Ok(Self::new(buffer, device, dst_el, dtype))
}
fn cmp(&self, op: CmpOp, rhs: &Self, lhs_l: &Layout, rhs_l: &Layout) -> Result<Self> {
@ -654,7 +678,7 @@ impl BackendStorage for MetalStorage {
.map_err(MetalError::from)?;
}
command_buffer.set_label("to_dtype");
Ok(Self::new(buffer, device.clone(), dtype))
Ok(Self::new(buffer, device.clone(), el_count, dtype))
}
fn unary_impl<B: UnaryOpT>(&self, layout: &Layout) -> Result<Self> {
@ -774,7 +798,7 @@ impl BackendStorage for MetalStorage {
)
.map_err(MetalError::from)?;
}
Ok(Self::new(buffer, device.clone(), dtype))
Ok(Self::new(buffer, device.clone(), el_count, dtype))
}
fn binary_impl<B: BinaryOpT>(
@ -835,7 +859,7 @@ impl BackendStorage for MetalStorage {
&buffer,
)
.map_err(MetalError::from)?;
Ok(Self::new(buffer, device, dtype))
Ok(Self::new(buffer, device, el, dtype))
}
fn conv1d(
@ -880,6 +904,7 @@ impl BackendStorage for MetalStorage {
let col = Self {
buffer: dst,
device,
count: dst_el,
dtype: self.dtype,
};
let l_out = params.l_out();
@ -964,6 +989,7 @@ impl BackendStorage for MetalStorage {
let col = Self {
buffer: dst,
device,
count: dst_el,
dtype: self.dtype,
};
let h_out = params.out_h();
@ -1049,7 +1075,7 @@ impl BackendStorage for MetalStorage {
&buffer,
)
.map_err(MetalError::from)?;
Ok(Self::new(buffer, self.device.clone(), self.dtype))
Ok(Self::new(buffer, self.device.clone(), dst_el, self.dtype))
}
fn gather(&self, src_l: &Layout, ids: &Self, ids_l: &Layout, dim: usize) -> Result<Self> {
@ -1083,7 +1109,7 @@ impl BackendStorage for MetalStorage {
&buffer,
)
.map_err(MetalError::from)?;
Ok(Self::new(buffer, device.clone(), dtype))
Ok(Self::new(buffer, device.clone(), dst_el, dtype))
}
fn scatter_add(
@ -1106,7 +1132,15 @@ impl BackendStorage for MetalStorage {
None => Err(crate::Error::RequiresContiguous { op: "scatter-add" }.bt())?,
};
let name = match (ids.dtype, self.dtype) {
(DType::U8, DType::F32) => "sa_u8_f32",
(DType::U8, DType::F16) => "sa_u8_f16",
(DType::U8, DType::BF16) => "sa_u8_bf16",
(DType::U32, DType::F32) => "sa_u32_f32",
(DType::U32, DType::F16) => "sa_u32_f16",
(DType::U32, DType::BF16) => "sa_u32_bf16",
(DType::I64, DType::F32) => "sa_i64_f32",
(DType::I64, DType::F16) => "sa_i64_f16",
(DType::I64, DType::BF16) => "sa_i64_bf16",
_ => Err(MetalError::UnexpectedDType {
msg: "scatter-add ids should be u8/u32/i64",
expected: DType::U32,
@ -1172,7 +1206,7 @@ impl BackendStorage for MetalStorage {
&buffer,
)
.map_err(MetalError::from)?;
Ok(Self::new(buffer, device.clone(), dtype))
Ok(Self::new(buffer, device.clone(), dst_el, dtype))
}
fn index_add(
@ -1254,7 +1288,73 @@ impl BackendStorage for MetalStorage {
&buffer,
)
.map_err(MetalError::from)?;
Ok(Self::new(buffer, self.device.clone(), self.dtype()))
Ok(Self::new(
buffer,
self.device.clone(),
b * m * n,
self.dtype(),
))
}
fn copy2d(
&self,
dst: &mut Self,
d1: usize,
d2: usize,
src_s: usize,
dst_s: usize,
src_o: usize,
dst_o: usize,
) -> Result<()> {
if self.dtype() != dst.dtype() {
crate::bail!(
"copy2d with inconsistent dtypes {:?} {:?}",
self.dtype(),
dst.dtype()
)
}
let command_buffer = self.device.command_buffer()?;
if src_s == d2 && dst_s == d2 {
command_buffer.set_label("copy2d_contiguous");
let blit = command_buffer.new_blit_command_encoder();
blit.set_label("copy2d_contiguous");
let src_offset = (src_o * self.dtype.size_in_bytes()) as NSUInteger;
let length = (d1 * d2 * self.dtype.size_in_bytes()) as NSUInteger;
let dst_offset = (dst_o * dst.dtype().size_in_bytes()) as NSUInteger;
blit.copy_from_buffer(&self.buffer, src_offset, dst.buffer(), dst_offset, length);
blit.end_encoding();
} else {
let el_count = d1 * d2;
if el_count == 0 {
return Ok(());
}
let kernel_name = match self.dtype {
DType::F32 => candle_metal_kernels::copy2d::FLOAT,
DType::F16 => candle_metal_kernels::copy2d::HALF,
DType::BF16 => candle_metal_kernels::copy2d::BFLOAT,
DType::I64 => candle_metal_kernels::copy2d::I64,
DType::U32 => candle_metal_kernels::copy2d::U32,
DType::U8 => candle_metal_kernels::copy2d::U8,
dtype => crate::bail!("Metal copy2d {dtype:?} not implemented"),
};
candle_metal_kernels::call_copy2d(
&self.device.device,
&command_buffer,
&self.device.kernels,
kernel_name,
&self.buffer,
&dst.buffer,
d1,
d2,
src_s,
dst_s,
src_o * self.dtype.size_in_bytes(),
dst_o * self.dtype.size_in_bytes(),
)
.map_err(MetalError::from)?;
command_buffer.set_label("copy2d");
}
Ok(())
}
fn copy_strided_src(&self, dst: &mut Self, dst_offset: usize, src_l: &Layout) -> Result<()> {
@ -1303,10 +1403,11 @@ impl BackendStorage for MetalStorage {
}
impl MetalStorage {
pub fn new(buffer: Arc<Buffer>, device: MetalDevice, dtype: DType) -> Self {
pub fn new(buffer: Arc<Buffer>, device: MetalDevice, count: usize, dtype: DType) -> Self {
Self {
buffer,
device,
count,
dtype,
}
}
@ -1521,29 +1622,23 @@ impl MetalStorage {
(buffer, dtype)
};
command_buffer.set_label("binary");
Ok(Self::new(buffer, device.clone(), dtype))
Ok(Self::new(buffer, device.clone(), el_count, dtype))
}
pub(crate) fn to_cpu<T: Clone>(&self) -> Result<Vec<T>> {
let length = self.buffer.length() as usize;
let size = self.dtype.size_in_bytes();
if length % size != 0 {
crate::bail!(
"The Metal buffer length is not aligned with dtype {:?}",
self.dtype
);
}
let buffer = self.device.new_buffer_managed(self.buffer.length())?;
let size = (self.count * self.dtype.size_in_bytes()) as NSUInteger;
let buffer = self.device.new_buffer_managed(size)?;
{
let command_buffer = self.device.command_buffer()?;
command_buffer.set_label("to_cpu");
let blit = command_buffer.new_blit_command_encoder();
blit.set_label("blit_to_cpu");
blit.copy_from_buffer(&self.buffer, 0, &buffer, 0, self.buffer.length());
blit.copy_from_buffer(&self.buffer, 0, &buffer, 0, size);
blit.end_encoding();
}
self.device.wait_until_completed()?;
Ok(read_to_vec(&buffer, length / size))
Ok(read_to_vec(&buffer, self.count))
}
}
@ -1561,7 +1656,7 @@ impl BackendDevice for MetalDevice {
let buffers = Arc::new(RwLock::new(HashMap::new()));
let compute_per_buffer = match std::env::var("CANDLE_METAL_COMPUTE_PER_BUFFER") {
Ok(val) => val.parse()?,
_ => 10,
_ => 50,
};
let seed = Arc::new(Mutex::new(device.new_buffer_with_data(
[299792458].as_ptr() as *const c_void,
@ -1593,7 +1688,12 @@ impl BackendDevice for MetalDevice {
fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<MetalStorage> {
let size = shape.elem_count() * dtype.size_in_bytes();
let buffer = self.allocate_zeros(size)?;
Ok(MetalStorage::new(buffer, self.clone(), dtype))
Ok(MetalStorage::new(
buffer,
self.clone(),
shape.elem_count(),
dtype,
))
}
fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result<Self::Storage> {
@ -1603,16 +1703,21 @@ impl BackendDevice for MetalDevice {
}
fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<Self::Storage> {
let buffer = match storage {
CpuStorage::U8(storage) => self.new_buffer_with_data(storage),
CpuStorage::U32(storage) => self.new_buffer_with_data(storage),
CpuStorage::I64(storage) => self.new_buffer_with_data(storage),
CpuStorage::BF16(storage) => self.new_buffer_with_data(storage),
CpuStorage::F16(storage) => self.new_buffer_with_data(storage),
CpuStorage::F32(storage) => self.new_buffer_with_data(storage),
CpuStorage::F64(storage) => self.new_buffer_with_data(storage),
}?;
Ok(Self::Storage::new(buffer, self.clone(), storage.dtype()))
let (count, buffer) = match storage {
CpuStorage::U8(storage) => (storage.len(), self.new_buffer_with_data(storage)),
CpuStorage::U32(storage) => (storage.len(), self.new_buffer_with_data(storage)),
CpuStorage::I64(storage) => (storage.len(), self.new_buffer_with_data(storage)),
CpuStorage::BF16(storage) => (storage.len(), self.new_buffer_with_data(storage)),
CpuStorage::F16(storage) => (storage.len(), self.new_buffer_with_data(storage)),
CpuStorage::F32(storage) => (storage.len(), self.new_buffer_with_data(storage)),
CpuStorage::F64(storage) => (storage.len(), self.new_buffer_with_data(storage)),
};
Ok(Self::Storage::new(
buffer?,
self.clone(),
count,
storage.dtype(),
))
}
fn rand_uniform(
@ -1643,7 +1748,12 @@ impl BackendDevice for MetalDevice {
)
.map_err(MetalError::from)?;
Ok(Self::Storage::new(buffer, self.clone(), dtype))
Ok(Self::Storage::new(
buffer,
self.clone(),
shape.elem_count(),
dtype,
))
}
fn rand_normal(
@ -1674,7 +1784,12 @@ impl BackendDevice for MetalDevice {
)
.map_err(MetalError::from)?;
Ok(Self::Storage::new(buffer, self.clone(), dtype))
Ok(Self::Storage::new(
buffer,
self.clone(),
shape.elem_count(),
dtype,
))
}
fn set_seed(&self, seed: u64) -> Result<()> {
@ -1685,7 +1800,7 @@ impl BackendDevice for MetalDevice {
let seed_buffer = self.seed.try_lock().map_err(MetalError::from)?;
let contents = seed_buffer.contents();
unsafe {
std::ptr::copy([seed].as_ptr(), contents as *mut u32, 4);
std::ptr::copy([seed].as_ptr(), contents as *mut u32, 1);
}
seed_buffer.did_modify_range(metal::NSRange::new(0, 4));
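This is the buffer-overrun fix from the commit message: std::ptr::copy counts elements of T, not bytes, so the previous count of 4 wrote four u32 values (16 bytes) into a 4-byte seed buffer. A standalone sketch of the semantics:

fn main() {
    // The count is in elements of T, not bytes: copying one u32 writes 4 bytes.
    let src: [u32; 1] = [0x11C0FFEE];
    let mut dst: [u32; 2] = [0, 0];
    unsafe { std::ptr::copy(src.as_ptr(), dst.as_mut_ptr(), 1) };
    assert_eq!(dst, [0x11C0FFEE, 0]); // only the first element was written
}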
@ -1693,6 +1808,10 @@ impl BackendDevice for MetalDevice {
}
}
fn buf_size(size: NSUInteger) -> NSUInteger {
(size - 1).next_power_of_two() as NSUInteger
}
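A standalone sketch of the bucketing above: requested sizes are rounded to a power of two so that future requests of similar sizes land in the same bucket and can reuse an existing buffer.

fn buf_size(size: u64) -> u64 {
    (size - 1).next_power_of_two()
}

fn main() {
    assert_eq!(buf_size(6), 8);
    assert_eq!(buf_size(1000), 1024);
    assert_eq!(buf_size(1024), 1024); // exact powers of two stay as-is
}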
fn read_to_vec<T: Clone>(buffer: &Buffer, n: usize) -> Vec<T> {
let ptr = buffer.contents() as *const T;
assert!(!ptr.is_null());

View File

@ -313,7 +313,7 @@ impl QCudaStorage {
}
let data_f32 = self.dequantize(n * k)?;
let rhs_l = crate::Layout::new((k, n).into(), vec![1, k], 0);
let rhs_l = crate::Layout::new((k, n).into(), vec![1, k], 0).broadcast_as((b, k, n))?;
let out = storage.matmul(&data_f32, (b, m, n, k), layout, &rhs_l)?;
let mut out_shape = layout.shape().dims().to_vec();
out_shape.pop();
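The fix broadcasts the single (k, n) dequantized weight across the batch so that a (b, m, k) input sees a (b, k, n) right-hand side. A sketch of the intended shape behavior using the public Tensor API, with broadcast_matmul standing in for the internal layout broadcast:

use candle_core::{Device, Tensor};

fn main() -> candle_core::Result<()> {
    let (b, m, k, n) = (2, 3, 4, 5);
    let lhs = Tensor::randn(0f32, 1f32, (b, m, k), &Device::Cpu)?;
    let w = Tensor::randn(0f32, 1f32, (k, n), &Device::Cpu)?;
    let out = lhs.broadcast_matmul(&w)?; // weight broadcast over the batch dim
    assert_eq!(out.dims(), [b, m, n]);
    Ok(())
}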

View File

@ -106,7 +106,12 @@ impl QMetalStorage {
}
let buffer = self.device.new_buffer_with_data(&out)?;
Ok(MetalStorage::new(buffer, self.device.clone(), DType::F32))
Ok(MetalStorage::new(
buffer,
self.device.clone(),
elem_count,
DType::F32,
))
}
pub fn quantize(&mut self, src: &MetalStorage) -> Result<()> {
@ -170,7 +175,7 @@ impl QMetalStorage {
&dst,
)
.map_err(MetalError::from)?;
let dst_storage = crate::MetalStorage::new(dst, device, DType::F32);
let dst_storage = crate::MetalStorage::new(dst, device, dst_shape.elem_count(), DType::F32);
Ok((dst_storage, dst_shape))
}
}

View File

@ -398,7 +398,7 @@ impl QMatMul {
_ => DEQUANTIZE_ALL.with(|b| *b),
};
let t = if dequantize {
let tensor = qtensor.dequantize(&Device::Cpu)?;
let tensor = qtensor.dequantize(&qtensor.device())?;
Self::Tensor(tensor)
} else {
Self::QTensor(qtensor)

View File

@ -701,4 +701,32 @@ impl Storage {
.bt()),
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn copy2d(
&self,
dst: &mut Self,
d1: usize,
d2: usize,
src_s: usize,
dst_s: usize,
src_o: usize,
dst_o: usize,
) -> Result<()> {
match (self, dst) {
(Self::Cpu(src), Self::Cpu(dst)) => src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o),
(Self::Cuda(src), Self::Cuda(dst)) => {
Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?)
}
(Self::Metal(src), Self::Metal(dst)) => {
Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?)
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "copy2d",
}
.bt()),
}
}
}

View File

@ -666,7 +666,7 @@ impl Tensor {
Ok(from_storage(storage, self.shape(), op, false))
}
fn check_dim(&self, dim: usize, op: &'static str) -> Result<()> {
pub(crate) fn check_dim(&self, dim: usize, op: &'static str) -> Result<()> {
if dim >= self.dims().len() {
Err(Error::DimOutOfRange {
shape: self.shape().clone(),
@ -2149,152 +2149,6 @@ impl Tensor {
Self::cat(&args, dim)
}
/// Concatenates two or more tensors along a particular dimension.
///
/// All tensors must be of the same rank, and the output will have
/// the same rank.
///
/// ```rust
/// # use candle_core::{Tensor, DType, Device};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
/// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
///
/// let c = Tensor::cat(&[&a, &b], 0)?;
/// assert_eq!(c.shape().dims(), &[4, 3]);
///
/// let c = Tensor::cat(&[&a, &b], 1)?;
/// assert_eq!(c.shape().dims(), &[2, 6]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn cat<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
if args.is_empty() {
Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
}
let arg0 = args[0].as_ref();
if args.len() == 1 {
return Ok(arg0.clone());
}
let dim = dim.to_index(arg0.shape(), "cat")?;
for arg in args {
arg.as_ref().check_dim(dim, "cat")?;
}
for (arg_idx, arg) in args.iter().enumerate() {
let arg = arg.as_ref();
if arg0.rank() != arg.rank() {
Err(Error::UnexpectedNumberOfDims {
expected: arg0.rank(),
got: arg.rank(),
shape: arg.shape().clone(),
}
.bt())?
}
for (dim_idx, (v1, v2)) in arg0
.shape()
.dims()
.iter()
.zip(arg.shape().dims().iter())
.enumerate()
{
if dim_idx != dim && v1 != v2 {
Err(Error::ShapeMismatchCat {
dim: dim_idx,
first_shape: arg0.shape().clone(),
n: arg_idx + 1,
nth_shape: arg.shape().clone(),
}
.bt())?
}
}
}
if dim == 0 {
Self::cat0(args)
} else {
// TODO: Avoid these transpositions and have an implementation that works
// for dim != 0...
let args: Vec<Tensor> = args
.iter()
.map(|a| a.as_ref().transpose(0, dim))
.collect::<Result<Vec<_>>>()?;
let cat = Self::cat0(&args)?;
cat.transpose(0, dim)
}
}
fn cat0<A: AsRef<Tensor>>(args: &[A]) -> Result<Self> {
if args.is_empty() {
Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
}
let arg0 = args[0].as_ref();
if args.len() == 1 {
return Ok(arg0.clone());
}
let rank = arg0.rank();
let device = arg0.device();
let dtype = arg0.dtype();
let first_dims = arg0.shape().dims();
let mut cat_dims = first_dims.to_vec();
cat_dims[0] = 0;
let mut offsets = vec![0usize];
for (arg_idx, arg) in args.iter().enumerate() {
let arg = arg.as_ref();
if arg.dtype() != dtype {
Err(Error::DTypeMismatchBinaryOp {
lhs: dtype,
rhs: arg.dtype(),
op: "cat",
}
.bt())?
}
if arg.device().location() != device.location() {
Err(Error::DeviceMismatchBinaryOp {
lhs: device.location(),
rhs: arg.device().location(),
op: "cat",
}
.bt())?
}
if rank != arg.rank() {
Err(Error::UnexpectedNumberOfDims {
expected: rank,
got: arg.rank(),
shape: arg.shape().clone(),
}
.bt())?
}
for (dim_idx, (v1, v2)) in arg0
.shape()
.dims()
.iter()
.zip(arg.shape().dims().iter())
.enumerate()
{
if dim_idx == 0 {
cat_dims[0] += v2;
}
if dim_idx != 0 && v1 != v2 {
Err(Error::ShapeMismatchCat {
dim: dim_idx,
first_shape: arg0.shape().clone(),
n: arg_idx + 1,
nth_shape: arg.shape().clone(),
}
.bt())?
}
}
let next_offset = offsets.last().unwrap() + arg.elem_count();
offsets.push(next_offset);
}
let shape = Shape::from(cat_dims);
let op = BackpropOp::new(args, |args| Op::Cat(args, 0));
let mut storage = device.zeros(&shape, dtype)?;
for (arg, &offset) in args.iter().zip(offsets.iter()) {
let arg = arg.as_ref();
arg.storage()
.copy_strided_src(&mut storage, offset, arg.layout())?;
}
Ok(from_storage(storage, shape, op, false))
}
/// Pad the input tensor using 0s along dimension `dim`. This adds `left` elements before the
/// input tensor values and `right` elements after.
pub fn pad_with_zeros<D: Dim>(&self, dim: D, left: usize, right: usize) -> Result<Self> {

View File

@ -0,0 +1,240 @@
use crate::{shape::Dim, Error, Result, Shape, Tensor};
impl Tensor {
/// Concatenates two or more tensors along a particular dimension.
///
/// All tensors must be of the same rank, and the output will have
/// the same rank.
///
/// ```rust
/// # use candle_core::{Tensor, DType, Device};
/// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
/// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
///
/// let c = Tensor::cat(&[&a, &b], 0)?;
/// assert_eq!(c.shape().dims(), &[4, 3]);
///
/// let c = Tensor::cat(&[&a, &b], 1)?;
/// assert_eq!(c.shape().dims(), &[2, 6]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn cat<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
if args.is_empty() {
Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
}
let arg0 = args[0].as_ref();
if args.len() == 1 {
return Ok(arg0.clone());
}
let dim = dim.to_index(arg0.shape(), "cat")?;
for arg in args {
arg.as_ref().check_dim(dim, "cat")?;
}
for (arg_idx, arg) in args.iter().enumerate() {
let arg = arg.as_ref();
if arg0.rank() != arg.rank() {
Err(Error::UnexpectedNumberOfDims {
expected: arg0.rank(),
got: arg.rank(),
shape: arg.shape().clone(),
}
.bt())?
}
for (dim_idx, (v1, v2)) in arg0
.shape()
.dims()
.iter()
.zip(arg.shape().dims().iter())
.enumerate()
{
if dim_idx != dim && v1 != v2 {
Err(Error::ShapeMismatchCat {
dim: dim_idx,
first_shape: arg0.shape().clone(),
n: arg_idx + 1,
nth_shape: arg.shape().clone(),
}
.bt())?
}
}
}
if dim == 0 {
Self::cat0(args)
} else {
let all_contiguous = args.iter().all(|v| v.as_ref().is_contiguous());
if all_contiguous {
Self::cat_contiguous(args, dim)
} else {
let args: Vec<Tensor> = args
.iter()
.map(|a| a.as_ref().transpose(0, dim))
.collect::<Result<Vec<_>>>()?;
let cat = Self::cat0(&args)?;
cat.transpose(0, dim)
}
}
}
fn cat0<A: AsRef<Tensor>>(args: &[A]) -> Result<Self> {
if args.is_empty() {
Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
}
let arg0 = args[0].as_ref();
if args.len() == 1 {
return Ok(arg0.clone());
}
let rank = arg0.rank();
let device = arg0.device();
let dtype = arg0.dtype();
let first_dims = arg0.shape().dims();
let mut cat_dims = first_dims.to_vec();
cat_dims[0] = 0;
let mut offsets = vec![0usize];
for (arg_idx, arg) in args.iter().enumerate() {
let arg = arg.as_ref();
if arg.dtype() != dtype {
Err(Error::DTypeMismatchBinaryOp {
lhs: dtype,
rhs: arg.dtype(),
op: "cat",
}
.bt())?
}
if arg.device().location() != device.location() {
Err(Error::DeviceMismatchBinaryOp {
lhs: device.location(),
rhs: arg.device().location(),
op: "cat",
}
.bt())?
}
if rank != arg.rank() {
Err(Error::UnexpectedNumberOfDims {
expected: rank,
got: arg.rank(),
shape: arg.shape().clone(),
}
.bt())?
}
for (dim_idx, (v1, v2)) in arg0
.shape()
.dims()
.iter()
.zip(arg.shape().dims().iter())
.enumerate()
{
if dim_idx == 0 {
cat_dims[0] += v2;
}
if dim_idx != 0 && v1 != v2 {
Err(Error::ShapeMismatchCat {
dim: dim_idx,
first_shape: arg0.shape().clone(),
n: arg_idx + 1,
nth_shape: arg.shape().clone(),
}
.bt())?
}
}
let next_offset = offsets.last().unwrap() + arg.elem_count();
offsets.push(next_offset);
}
let shape = Shape::from(cat_dims);
let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, 0));
let mut storage = device.zeros(&shape, dtype)?;
for (arg, &offset) in args.iter().zip(offsets.iter()) {
let arg = arg.as_ref();
arg.storage()
.copy_strided_src(&mut storage, offset, arg.layout())?;
}
Ok(crate::tensor::from_storage(storage, shape, op, false))
}
fn cat_contiguous<A: AsRef<Tensor>>(args: &[A], dim: usize) -> Result<Self> {
if args.is_empty() {
Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
}
let arg0 = args[0].as_ref();
if args.len() == 1 {
return Ok(arg0.clone());
}
let rank = arg0.rank();
let device = arg0.device();
let dtype = arg0.dtype();
let first_dims = arg0.shape().dims();
let mut cat_dims = first_dims.to_vec();
cat_dims[dim] = 0;
for (arg_idx, arg) in args.iter().enumerate() {
let arg = arg.as_ref();
if arg.dtype() != dtype {
Err(Error::DTypeMismatchBinaryOp {
lhs: dtype,
rhs: arg.dtype(),
op: "cat",
}
.bt())?
}
if arg.device().location() != device.location() {
Err(Error::DeviceMismatchBinaryOp {
lhs: device.location(),
rhs: arg.device().location(),
op: "cat",
}
.bt())?
}
if rank != arg.rank() {
Err(Error::UnexpectedNumberOfDims {
expected: rank,
got: arg.rank(),
shape: arg.shape().clone(),
}
.bt())?
}
for (dim_idx, (v1, v2)) in arg0
.shape()
.dims()
.iter()
.zip(arg.shape().dims().iter())
.enumerate()
{
if dim_idx == dim {
cat_dims[dim] += v2;
}
if dim_idx != dim && v1 != v2 {
Err(Error::ShapeMismatchCat {
dim: dim_idx,
first_shape: arg0.shape().clone(),
n: arg_idx + 1,
nth_shape: arg.shape().clone(),
}
.bt())?
}
}
}
let cat_target_dim_len = cat_dims[dim];
let block_size: usize = cat_dims.iter().skip(1 + dim).product();
let shape = Shape::from(cat_dims);
let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, dim));
let mut storage = device.zeros(&shape, dtype)?;
let mut dst_o = 0;
for arg in args.iter() {
let arg = arg.as_ref();
let arg_dims = arg.shape().dims();
let d1: usize = arg_dims.iter().take(dim).product();
let d2 = block_size * arg_dims[dim];
let dst_s = block_size * cat_target_dim_len;
let src_o = arg.layout().start_offset();
arg.storage().copy2d(
&mut storage,
d1,
d2,
/* src_s */ d2,
dst_s,
src_o,
dst_o,
)?;
dst_o += d2;
}
Ok(crate::tensor::from_storage(storage, shape, op, false))
}
}
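To see how cat_contiguous maps onto copy2d, here is a standalone sketch of the parameter computation for concatenating contiguous (2, 3, 4) and (2, 5, 4) tensors along dim 1; the shapes are illustrative only:

fn main() {
    let (a_dims, b_dims) = ([2usize, 3, 4], [2usize, 5, 4]);
    let dim = 1;
    let block_size: usize = a_dims.iter().skip(1 + dim).product(); // 4
    let cat_dim_len = a_dims[dim] + b_dims[dim]; // 3 + 5 = 8
    let dst_s = block_size * cat_dim_len; // destination row stride: 32
    for (name, dims, dst_o) in [("a", a_dims, 0), ("b", b_dims, block_size * a_dims[dim])] {
        let d1: usize = dims.iter().take(dim).product(); // 2 outer rows each
        let d2 = block_size * dims[dim]; // contiguous elements per row: 12 or 20
        println!("{name}: copy2d(d1={d1}, d2={d2}, src_s={d2}, dst_s={dst_s}, dst_o={dst_o})");
    }
}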

View File

@ -53,26 +53,36 @@ fn conv1d(dev: &Device) -> Result<()> {
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352]
);
let res = t.conv_transpose1d(&w.transpose(0, 1)?, 0, 0, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 2, 7]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[
0.0699, -1.2899, 8.3018, 5.5873, 2.4572, -2.6143, -0.0706, 1.8765, 4.8318, 1.1538,
4.7076, -5.9745, -0.8276, 1.621
],
);
let res = t.conv_transpose1d(&w.transpose(0, 1)?, 0, 0, 1, 1, 2)?;
assert_eq!(res.dims(), [1, 4, 7]);
assert_eq!(
test_utils::to_vec2_round(&res.squeeze(0)?, 4)?,
[
[-1.5596, -1.8099, 2.0407, 4.8764, -0.1743, -0.735, -0.7819],
[0.7816, 3.8152, -0.5926, 2.2515, -5.1844, -0.3157, 1.4721],
[1.6295, 0.52, 6.2611, 0.7109, 2.6315, -1.8793, 0.7113],
[1.0949, 1.0166, 1.7464, 2.4561, -0.79, -0.5119, 0.1488]
]
);
// Conv-transposes are not implemented for Metal.
if dev.is_metal() {
return Ok(());
}
let w = w.transpose(0, 1)?;
// The CPU kernels applied in the contiguous and non-contiguous cases are different.
for w in [w.clone(), w.contiguous()?] {
let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 1)?;
assert_eq!(res.dims(), [1, 2, 7]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[
0.0699, -1.2899, 8.3018, 5.5873, 2.4572, -2.6143, -0.0706, 1.8765, 4.8318, 1.1538,
4.7076, -5.9745, -0.8276, 1.621
],
);
let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 2)?;
assert_eq!(res.dims(), [1, 4, 7]);
assert_eq!(
test_utils::to_vec2_round(&res.squeeze(0)?, 4)?,
[
[-1.5596, -1.8099, 2.0407, 4.8764, -0.1743, -0.735, -0.7819],
[0.7816, 3.8152, -0.5926, 2.2515, -5.1844, -0.3157, 1.4721],
[1.6295, 0.52, 6.2611, 0.7109, 2.6315, -1.8793, 0.7113],
[1.0949, 1.0166, 1.7464, 2.4561, -0.79, -0.5119, 0.1488]
]
);
}
Ok(())
}
@ -158,31 +168,33 @@ fn conv2d(dev: &Device) -> Result<()> {
10.389, 3.6023, -4.2808, 0.2672, 5.3646, -5.2023, -2.1955, -9.4075
]
);
let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
assert_eq!(res.dims(), [1, 2, 7, 7]);
assert_eq!(
test_utils::to_vec3_round(&res.i(0)?, 4)?,
[
if !dev.is_metal() {
let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
assert_eq!(res.dims(), [1, 2, 7, 7]);
assert_eq!(
test_utils::to_vec3_round(&res.i(0)?, 4)?,
[
[-1.9918, 2.6797, -0.4599, -1.6037, 1.4131, -2.4012, 2.9277],
[1.8016, -3.5361, 1.0757, 3.5395, -8.2168, -3.2023, 0.5375],
[0.8243, 1.8675, 7.8929, -4.0746, -6.4415, 5.1139, 1.6889],
[0.2722, 8.9679, 3.3477, 1.8514, -4.2896, -3.8228, -7.5632],
[-8.5412, -5.8142, -7.1587, -1.6095, 0.4651, 0.2748, -2.0985],
[2.0833, -0.6482, -12.1692, -4.1284, -2.9765, -0.0656, -4.5114],
[5.307, 2.6957, 2.3087, 1.0478, 0.7808, -1.1519, -0.9579]
],
[
[1.089, 0.1872, -0.6408, -0.9897, 0.8503, 1.1019, -0.9211],
[-0.1741, -0.2915, 4.2472, 1.9417, 1.65, 0.6303, -4.7131],
[1.6555, 2.4026, -2.9293, 2.9953, 0.5328, 3.5873, -0.9621],
[-1.4289, -3.2787, 4.1747, -6.0341, -4.6341, -5.7945, 4.142],
[7.5973, 6.4431, 5.9872, 2.1639, -8.6566, 3.3143, -3.4059],
[-0.8775, -3.048, 11.6543, 0.6442, 2.3218, -0.4765, 1.1516],
[-5.5423, -2.5188, 1.0754, -0.0563, -2.9386, -1.1504, 1.0171]
[
[-1.9918, 2.6797, -0.4599, -1.6037, 1.4131, -2.4012, 2.9277],
[1.8016, -3.5361, 1.0757, 3.5395, -8.2168, -3.2023, 0.5375],
[0.8243, 1.8675, 7.8929, -4.0746, -6.4415, 5.1139, 1.6889],
[0.2722, 8.9679, 3.3477, 1.8514, -4.2896, -3.8228, -7.5632],
[-8.5412, -5.8142, -7.1587, -1.6095, 0.4651, 0.2748, -2.0985],
[2.0833, -0.6482, -12.1692, -4.1284, -2.9765, -0.0656, -4.5114],
[5.307, 2.6957, 2.3087, 1.0478, 0.7808, -1.1519, -0.9579]
],
[
[1.089, 0.1872, -0.6408, -0.9897, 0.8503, 1.1019, -0.9211],
[-0.1741, -0.2915, 4.2472, 1.9417, 1.65, 0.6303, -4.7131],
[1.6555, 2.4026, -2.9293, 2.9953, 0.5328, 3.5873, -0.9621],
[-1.4289, -3.2787, 4.1747, -6.0341, -4.6341, -5.7945, 4.142],
[7.5973, 6.4431, 5.9872, 2.1639, -8.6566, 3.3143, -3.4059],
[-0.8775, -3.048, 11.6543, 0.6442, 2.3218, -0.4765, 1.1516],
[-5.5423, -2.5188, 1.0754, -0.0563, -2.9386, -1.1504, 1.0171]
]
]
]
);
);
}
// Dilations.
let res = t.conv2d(&w, 0, 1, 2, 1)?;
assert_eq!(res.dims(), [1, 2, 1, 1]);
@ -191,36 +203,44 @@ fn conv2d(dev: &Device) -> Result<()> {
[2.45, -2.3504],
);
// Transpose and dilations.
let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 2)?;
assert_eq!(res.dims(), [1, 2, 9, 9]);
assert_eq!(
test_utils::to_vec3_round(&res.i(0)?, 4)?,
[
if !dev.is_metal() {
// Transpose and dilations.
let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 2)?;
assert_eq!(res.dims(), [1, 2, 9, 9]);
assert_eq!(
test_utils::to_vec3_round(&res.i(0)?, 4)?,
[
[-1.9918, 3.1652, -0.6778, -4.3442, 4.4351, 0.6652, -3.0124, -0.6031, 2.9277],
[2.7036, -1.7156, -0.3969, 1.0516, 1.6381, -2.8886, -0.205, 2.4682, -1.0499],
[-0.9459, 3.1631, 3.707, -4.8369, -8.5166, -1.4496, -2.7559, -3.2698, 1.4376],
[-0.2157, 3.7786, -2.0252, -4.2633, 3.6731, -1.5142, 5.9391, -0.2622, -0.141],
[-6.8121, -3.1744, 1.5945, 3.0637, -9.6088, 1.4446, 2.9489, -3.0082, -7.3822],
[0.2371, 3.3303, 0.3861, 2.2646, -4.6784, 4.1235, -0.0109, 0.3176, -0.03],
[-2.5339, -2.9564, -3.4518, -4.4594, -9.1873, -1.9709, -0.4676, 0.51, -3.5024],
[4.007, 0.3067, -2.2954, 1.1105, -0.1992, 1.6372, -2.9268, 0.2807, -1.2787],
[5.307, 1.1317, 1.3518, 0.9049, 3.8116, -0.4075, -0.8874, -0.2241, -0.9579]
],
[
[1.089, -0.6483, 0.0726, -0.4752, -1.3283, 1.7103, 1.0703, 0.1076, -0.9211],
[-0.8629, 0.1376, 0.3202, 2.0955, 0.9696, 2.8988, -1.0012, 1.5049, -0.1278],
[1.9286, -1.5255, -2.9563, 2.4589, 3.3611, -0.6951, 0.3525, -1.7724, -5.9861],
[1.1226, 2.1561, 3.6417, 4.7546, -0.692, 4.4126, -5.1902, 6.0805, 2.3185],
[1.0111, 0.3604, 0.6432, -3.6605, 7.9517, -9.2955, -5.2988, -3.7803, -2.0642],
[3.3172, -1.7967, -3.6576, -2.0942, 1.3158, 0.112, -1.7405, 2.9167, 0.7957],
[5.1001, 1.8995, -1.8639, 1.1262, 9.9629, 2.683, -3.6319, -1.1607, 0.5856],
[-4.8445, -0.5642, 4.2317, 0.0856, 1.2267, -0.5712, 1.736, 1.0997, 0.6908],
[-5.5423, -1.1831, -1.2176, 0.0843, 0.0446, -0.7545, -2.4798, -0.0827, 1.0171]
[
[-1.9918, 3.1652, -0.6778, -4.3442, 4.4351, 0.6652, -3.0124, -0.6031, 2.9277],
[2.7036, -1.7156, -0.3969, 1.0516, 1.6381, -2.8886, -0.205, 2.4682, -1.0499],
[-0.9459, 3.1631, 3.707, -4.8369, -8.5166, -1.4496, -2.7559, -3.2698, 1.4376],
[-0.2157, 3.7786, -2.0252, -4.2633, 3.6731, -1.5142, 5.9391, -0.2622, -0.141],
[-6.8121, -3.1744, 1.5945, 3.0637, -9.6088, 1.4446, 2.9489, -3.0082, -7.3822],
[0.2371, 3.3303, 0.3861, 2.2646, -4.6784, 4.1235, -0.0109, 0.3176, -0.03],
[
-2.5339, -2.9564, -3.4518, -4.4594, -9.1873, -1.9709, -0.4676, 0.51,
-3.5024
],
[4.007, 0.3067, -2.2954, 1.1105, -0.1992, 1.6372, -2.9268, 0.2807, -1.2787],
[5.307, 1.1317, 1.3518, 0.9049, 3.8116, -0.4075, -0.8874, -0.2241, -0.9579]
],
[
[1.089, -0.6483, 0.0726, -0.4752, -1.3283, 1.7103, 1.0703, 0.1076, -0.9211],
[-0.8629, 0.1376, 0.3202, 2.0955, 0.9696, 2.8988, -1.0012, 1.5049, -0.1278],
[1.9286, -1.5255, -2.9563, 2.4589, 3.3611, -0.6951, 0.3525, -1.7724, -5.9861],
[1.1226, 2.1561, 3.6417, 4.7546, -0.692, 4.4126, -5.1902, 6.0805, 2.3185],
[1.0111, 0.3604, 0.6432, -3.6605, 7.9517, -9.2955, -5.2988, -3.7803, -2.0642],
[3.3172, -1.7967, -3.6576, -2.0942, 1.3158, 0.112, -1.7405, 2.9167, 0.7957],
[5.1001, 1.8995, -1.8639, 1.1262, 9.9629, 2.683, -3.6319, -1.1607, 0.5856],
[-4.8445, -0.5642, 4.2317, 0.0856, 1.2267, -0.5712, 1.736, 1.0997, 0.6908],
[
-5.5423, -1.1831, -1.2176, 0.0843, 0.0446, -0.7545, -2.4798, -0.0827,
1.0171
]
]
]
]
);
);
}
Ok(())
}
@ -274,6 +294,12 @@ fn conv2d_small(dev: &Device) -> Result<()> {
0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000
]
);
// Conv-transposes are not implemented for Metal.
if dev.is_metal() {
return Ok(());
}
let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
assert_eq!(res.dims(), [1, 1, 3, 3]);
assert_eq!(
@ -375,6 +401,10 @@ print(w.grad.shape)
print(w.grad[0])
*/
fn conv2d_grad(dev: &Device) -> Result<()> {
// Conv-transposes are not implemented for Metal.
if dev.is_metal() {
return Ok(());
}
use candle_core::Var;
let t = Var::from_slice(
&[

View File

@ -1,3 +1,4 @@
#![allow(clippy::approx_constant)]
use anyhow::{Context, Result};
use candle_core::{test_device, test_utils, Device, Shape, Tensor, Var};
@ -96,24 +97,24 @@ fn unary_grad(device: &Device) -> Result<()> {
let grads = y.backward()?;
let grad_x = grads.get(x).context("no grad for x")?;
assert_eq!(
y.to_vec1::<f32>()?,
[20.085537, 2.7182817, 54.59815, 1.1618342]
test_utils::to_vec1_round(&y, 4)?,
[20.0855, 2.7183, 54.5982, 1.1618]
);
assert_eq!(
grad_x.to_vec1::<f32>()?,
[20.085537, 2.7182817, 54.59815, 1.1618342]
test_utils::to_vec1_round(grad_x, 4)?,
[20.0855, 2.7183, 54.5982, 1.1618]
);
let y = x.exp()?.sqr()?;
let grads = y.backward()?;
let grad_x = grads.get(x).context("no grad for x")?;
assert_eq!(
y.to_vec1::<f32>()?,
[403.4288, 7.3890557, 2980.9578, 1.3498588]
test_utils::to_vec1_round(&y, 3)?,
[403.429, 7.389, 2980.958, 1.35]
);
// exp(x)^2 = exp(2*x)
assert_eq!(
grad_x.to_vec1::<f32>()?,
[806.8576, 14.778111, 5961.9155, 2.6997175]
test_utils::to_vec1_round(grad_x, 2)?,
[806.86, 14.78, 5961.92, 2.7]
);
let y = x.sin()?;
let grads = y.backward()?;
@ -261,6 +262,7 @@ fn unary_grad(device: &Device) -> Result<()> {
let y = elu_x.elu(2.)?;
let grads = y.backward()?;
let grad_x = grads.get(&elu_x).context("no grad for x")?;
assert_eq!(
test_utils::to_vec1_round(&y, 4)?,
[-1.2642, 0.0000, -1.7293, 3.0000]

View File

@ -2,6 +2,9 @@ use candle_core::{test_device, test_utils, Device, IndexOp, Result, Tensor};
// https://github.com/huggingface/candle/issues/364
fn avg_pool2d(dev: &Device) -> Result<()> {
if dev.is_metal() {
return Ok(());
}
let data: Vec<f32> = vec![
1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
];
@ -19,6 +22,9 @@ fn avg_pool2d(dev: &Device) -> Result<()> {
}
fn max_pool2d(dev: &Device) -> Result<()> {
if dev.is_metal() {
return Ok(());
}
let data: Vec<f32> = vec![
1., 2., 1., 3., 0., 0., 1., 1., 1., 1., 1., 1., 5., 1., 1., 1.,
];
@ -43,6 +49,9 @@ res = torch.nn.functional.avg_pool2d(t, 2)
print(res)
*/
fn avg_pool2d_pytorch(dev: &Device) -> Result<()> {
if dev.is_metal() {
return Ok(());
}
let t = Tensor::new(
&[
0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616,

View File

@ -672,6 +672,31 @@ fn cat(device: &Device) -> Result<()> {
[2.0, 7.0, 1.0, 8.0, 2.0, 2.0, 7.0, 1.0, 8.0, 2.0]
]
);
// 3D
let t1 = Tensor::arange(0, 48i64, device)?.reshape((2, 6, 4))?;
let t2 = Tensor::arange(100, 124i64, device)?.reshape((2, 3, 4))?;
let t3 = Tensor::arange(10000, 10032i64, device)?.reshape((2, 4, 4))?;
let t_cat = Tensor::cat(&[&t1, &t2, &t3], 1)?;
let t1 = t1.t()?.contiguous()?.t()?;
let t2 = t2.t()?.contiguous()?.t()?;
let t3 = t3.t()?.contiguous()?.t()?;
let t_cat2 = Tensor::cat(&[&t1, &t2, &t3], 1)?;
let diff = t_cat.eq(&t_cat2)?.to_dtype(DType::F32)?.sum_all()?;
assert_eq!(diff.to_vec0::<f32>()?, 104.0);
assert_eq!(t_cat.i((0, 0, 0))?.to_vec0::<i64>()?, 0);
assert_eq!(t_cat.i((0, 4, 0))?.to_vec0::<i64>()?, 16);
assert_eq!(t_cat.i((0, 5, 0))?.to_vec0::<i64>()?, 20);
assert_eq!(t_cat.i((1, 5, 0))?.to_vec0::<i64>()?, 44);
assert_eq!(t_cat.i((0, 6, 0))?.to_vec0::<i64>()?, 100);
assert_eq!(t_cat.i((1, 6, 0))?.to_vec0::<i64>()?, 112);
assert_eq!(t_cat.i((0, 6, 1))?.to_vec0::<i64>()?, 101);
assert_eq!(t_cat.i((0, 7, 1))?.to_vec0::<i64>()?, 105);
assert_eq!(t_cat.i((0, 12, 1))?.to_vec0::<i64>()?, 10013);
assert_eq!(t_cat.i((1, 12, 3))?.to_vec0::<i64>()?, 10031);
Ok(())
}
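For reference, what this new test pins down: `t.t()?.contiguous()?.t()?` leaves the values unchanged but gives each tensor a transposed, non-contiguous layout, so this `cat` must take the strided copy path instead of the contiguous fast path; the eq/sum check (104 = 2 * 13 * 4 elements) then asserts that both concatenations agree element-wise. A minimal sketch of the same layout trick, using only calls already present in the test plus `narrow`/`to_vec3` for the comparison:

```rust
use candle_core::{Device, Result, Tensor};

fn cat_with_mixed_layouts() -> Result<()> {
    let dev = Device::Cpu;
    let a = Tensor::arange(0, 24i64, &dev)?.reshape((2, 3, 4))?;
    // Same values, but the t()/contiguous()/t() round-trip leaves `b`
    // with a transposed (non-contiguous) stride pattern.
    let b = a.t()?.contiguous()?.t()?;
    let cat = Tensor::cat(&[&a, &b], 1)?;
    assert_eq!(cat.dims(), [2, 6, 4]);
    // Both halves hold identical data despite the different layouts.
    assert_eq!(
        cat.narrow(1, 0, 3)?.to_vec3::<i64>()?,
        cat.narrow(1, 3, 3)?.to_vec3::<i64>()?
    );
    Ok(())
}
```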
@ -1080,8 +1105,33 @@ fn broadcasting(device: &Device) -> Result<()> {
fn randn(device: &Device) -> Result<()> {
let tensor = Tensor::randn(0f32, 1f32, (5, 3), device)?;
assert_eq!(tensor.dims(), [5, 3]);
// Check that the seed gets updated: a new series of numbers
// should be generated on each call.
let tensor2 = Tensor::randn(0f32, 1f32, (5, 3), device)?;
assert_ne!(tensor.to_vec2::<f32>()?, tensor2.to_vec2::<f32>()?);
let tensor = Tensor::rand(0f32, 1f32, (5, 3), device)?;
assert_eq!(tensor.dims(), [5, 3]);
// Check that the seed gets updated: a new series of numbers
// should be generated on each call.
let tensor2 = Tensor::rand(0f32, 1f32, (5, 3), device)?;
assert_ne!(tensor.to_vec2::<f32>()?, tensor2.to_vec2::<f32>()?);
// We do not expect deterministic elements at any index:
// there was once a bug that produced a deterministic zero element in evenly sized tensors.
const N: usize = 2;
let v = (0..100)
.map(|_| Tensor::randn(0f32, 1f32, N, device).and_then(|t| t.to_vec1::<f32>()))
.collect::<Result<Vec<_>>>()?;
assert!(
(0..N).all(|i| v.windows(2).any(|pair| pair[0][i] != pair[1][i])),
"There are deterministic values in the randn tensors"
);
let v = (0..100)
.map(|_| Tensor::rand(0f32, 1f32, N, device).and_then(|t| t.to_vec1::<f32>()))
.collect::<Result<Vec<_>>>()?;
assert!(
(0..N).all(|i| v.windows(2).any(|pair| pair[0][i] != pair[1][i])),
"There are deterministic values in the rand tensors"
);
Ok(())
}
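The asserts above check that consecutive draws differ; the complementary property, reproducibility, comes from seeding the device explicitly. A minimal sketch, assuming the backend accepts `Device::set_seed` (the same API the stable-diffusion example further down calls; the CPU backend may reject it):

```rust
use candle_core::{Device, Result, Tensor};

// Hypothetical helper: re-seeding before each draw should make the
// series reproducible on backends that support `set_seed`.
fn reproducible_randn(device: &Device) -> Result<()> {
    device.set_seed(42)?;
    let a = Tensor::randn(0f32, 1f32, (5, 3), device)?;
    device.set_seed(42)?;
    let b = Tensor::randn(0f32, 1f32, (5, 3), device)?;
    // With the same seed, the two draws should match element-wise.
    assert_eq!(a.to_vec2::<f32>()?, b.to_vec2::<f32>()?);
    Ok(())
}
```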

View File

@ -100,6 +100,4 @@ required-features = ["candle-datasets"]
name = "encodec"
required-features = ["symphonia"]
[[example]]
name = "metavoice"
required-features = ["symphonia"]

View File

@ -28,7 +28,7 @@ pub fn main() -> anyhow::Result<()> {
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
println!("loaded image {image:?}");
let model_file = match args.model {

View File

@ -93,7 +93,7 @@ pub fn main() -> anyhow::Result<()> {
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
println!("loaded image {image:?}");
let model_file = match args.model {

View File

@ -31,7 +31,7 @@ pub fn main() -> anyhow::Result<()> {
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
println!("loaded image {image:?}");
let model_file = match args.model {

View File

@ -47,7 +47,7 @@ pub fn main() -> anyhow::Result<()> {
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
println!("loaded image {image:?}");
let model_file = match args.model {

View File

@ -66,7 +66,7 @@ pub fn main() -> anyhow::Result<()> {
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
println!("loaded image {image:?}");
let model_file = match args.model {

View File

@ -1,4 +1,4 @@
# candle-mistral: 2b and 7b LLMs from Google DeepMind
# candle-gemma: 2b and 7b LLMs from Google DeepMind
[Gemma](https://ai.google.dev/gemma/docs) is a collection of lightweight open
models published by Google DeepMind with a 2b and a 7b variant.

View File

@ -10,9 +10,8 @@ use std::io::Write;
use candle_transformers::generation::LogitsProcessor;
use candle_transformers::models::encodec;
use candle_transformers::models::metavoice::{
adapters, gpt, speaker_encoder, tokenizers, transformer,
};
use candle_transformers::models::metavoice::{adapters, gpt, tokenizers, transformer};
use candle_transformers::models::quantized_metavoice::transformer as qtransformer;
use candle::{DType, IndexOp, Tensor};
use candle_nn::VarBuilder;
@ -21,60 +20,6 @@ use rand::{distributions::Distribution, SeedableRng};
pub const ENCODEC_NTOKENS: u32 = 1024;
fn conv<T>(samples: &mut Vec<f32>, data: std::borrow::Cow<symphonia::core::audio::AudioBuffer<T>>)
where
T: symphonia::core::sample::Sample,
f32: symphonia::core::conv::FromSample<T>,
{
use symphonia::core::audio::Signal;
use symphonia::core::conv::FromSample;
samples.extend(data.chan(0).iter().map(|v| f32::from_sample(*v)))
}
fn pcm_decode<P: AsRef<std::path::Path>>(path: P) -> anyhow::Result<(Vec<f32>, u32)> {
use symphonia::core::audio::{AudioBufferRef, Signal};
let src = std::fs::File::open(path)?;
let mss = symphonia::core::io::MediaSourceStream::new(Box::new(src), Default::default());
let hint = symphonia::core::probe::Hint::new();
let meta_opts: symphonia::core::meta::MetadataOptions = Default::default();
let fmt_opts: symphonia::core::formats::FormatOptions = Default::default();
let probed = symphonia::default::get_probe().format(&hint, mss, &fmt_opts, &meta_opts)?;
let mut format = probed.format;
let track = format
.tracks()
.iter()
.find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL)
.expect("no supported audio tracks");
let mut decoder = symphonia::default::get_codecs()
.make(&track.codec_params, &Default::default())
.expect("unsupported codec");
let track_id = track.id;
let sample_rate = track.codec_params.sample_rate.unwrap_or(0);
let mut pcm_data = Vec::new();
while let Ok(packet) = format.next_packet() {
while !format.metadata().is_latest() {
format.metadata().pop();
}
if packet.track_id() != track_id {
continue;
}
match decoder.decode(&packet)? {
AudioBufferRef::F32(buf) => pcm_data.extend(buf.chan(0)),
AudioBufferRef::U8(data) => conv(&mut pcm_data, data),
AudioBufferRef::U16(data) => conv(&mut pcm_data, data),
AudioBufferRef::U24(data) => conv(&mut pcm_data, data),
AudioBufferRef::U32(data) => conv(&mut pcm_data, data),
AudioBufferRef::S8(data) => conv(&mut pcm_data, data),
AudioBufferRef::S16(data) => conv(&mut pcm_data, data),
AudioBufferRef::S24(data) => conv(&mut pcm_data, data),
AudioBufferRef::S32(data) => conv(&mut pcm_data, data),
AudioBufferRef::F64(data) => conv(&mut pcm_data, data),
}
}
Ok((pcm_data, sample_rate))
}
#[derive(Clone, Debug, Copy, PartialEq, Eq, clap::ValueEnum)]
enum ArgDType {
F32,
@ -82,6 +27,11 @@ enum ArgDType {
Bf16,
}
enum Transformer {
Normal(transformer::Model),
Quantized(qtransformer::Model),
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
@ -96,6 +46,10 @@ struct Args {
#[arg(long)]
prompt: String,
/// Use the quantized version of the model.
#[arg(long)]
quantized: bool,
/// The guidance scale.
#[arg(long, default_value_t = 3.0)]
guidance_scale: f64,
@ -125,14 +79,9 @@ struct Args {
#[arg(long)]
second_stage_weights: Option<String>,
#[arg(long)]
speaker_encoder_weights: Option<String>,
#[arg(long)]
encodec_weights: Option<String>,
/// The speaker embeddings, either an audio file, in which case they are extracted, or a
/// safetensors file with the embeddings already extracted.
#[arg(long)]
spk_emb: Option<String>,
@ -140,13 +89,6 @@ struct Args {
dtype: ArgDType,
}
fn mel_filters() -> Result<Vec<f32>> {
let mel_bytes = include_bytes!("melfilters40.bytes").as_slice();
let mut mel_filters = vec![0f32; mel_bytes.len() / 4];
<byteorder::LittleEndian as byteorder::ByteOrder>::read_f32_into(mel_bytes, &mut mel_filters);
Ok(mel_filters)
}
fn main() -> Result<()> {
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;
@ -184,10 +126,6 @@ fn main() -> Result<()> {
};
let fs_tokenizer = tokenizers::BPE::from_json(first_stage_tokenizer, 512)?;
let first_stage_weights = match &args.first_stage_weights {
Some(w) => std::path::PathBuf::from(w),
None => repo.get("first_stage.safetensors")?,
};
let second_stage_weights = match &args.second_stage_weights {
Some(w) => std::path::PathBuf::from(w),
None => repo.get("second_stage.safetensors")?,
@ -203,10 +141,27 @@ fn main() -> Result<()> {
ArgDType::F16 => DType::F16,
ArgDType::Bf16 => DType::BF16,
};
let first_stage_vb =
unsafe { VarBuilder::from_mmaped_safetensors(&[first_stage_weights], dtype, &device)? };
let first_stage_config = transformer::Config::cfg1b_v0_1();
let mut first_stage_model = transformer::Model::new(&first_stage_config, first_stage_vb)?;
let mut first_stage_model = if args.quantized {
let filename = match &args.first_stage_weights {
Some(w) => std::path::PathBuf::from(w),
None => repo.get("first_stage_q4k.gguf")?,
};
let vb =
candle_transformers::quantized_var_builder::VarBuilder::from_gguf(filename, &device)?;
let first_stage_model = qtransformer::Model::new(&first_stage_config, vb)?;
Transformer::Quantized(first_stage_model)
} else {
let first_stage_weights = match &args.first_stage_weights {
Some(w) => std::path::PathBuf::from(w),
None => repo.get("first_stage.safetensors")?,
};
let first_stage_vb =
unsafe { VarBuilder::from_mmaped_safetensors(&[first_stage_weights], dtype, &device)? };
let first_stage_model = transformer::Model::new(&first_stage_config, first_stage_vb)?;
Transformer::Normal(first_stage_model)
};
let second_stage_vb =
unsafe { VarBuilder::from_mmaped_safetensors(&[second_stage_weights], dtype, &device)? };
@ -227,41 +182,16 @@ fn main() -> Result<()> {
let prompt_tokens = fs_tokenizer.encode(&args.prompt)?;
let mut tokens = prompt_tokens.clone();
println!("{tokens:?}");
let safetensors_embeddings = args
.spk_emb
.as_ref()
.map_or(true, |v| v.ends_with("safetensors"));
let spk_emb = if safetensors_embeddings {
let spk_emb_file = match &args.spk_emb {
Some(w) => std::path::PathBuf::from(w),
None => repo.get("spk_emb.safetensors")?,
};
let spk_emb = candle::safetensors::load(&spk_emb_file, &candle::Device::Cpu)?;
match spk_emb.get("spk_emb") {
None => anyhow::bail!("missing spk_emb tensor in {spk_emb_file:?}"),
Some(spk_emb) => spk_emb.to_dtype(dtype)?.to_device(&device)?,
}
} else {
let weights = match &args.speaker_encoder_weights {
Some(w) => std::path::PathBuf::from(w),
None => repo.get("speaker_encoder.safetensors")?,
};
let mel_filters = mel_filters()?;
let config = speaker_encoder::Config::cfg();
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[weights], dtype, &device)? };
let model = speaker_encoder::Model::new(config, vb)?;
let (pcm, sample_rate) = pcm_decode(&args.spk_emb.unwrap())?;
if sample_rate != 16_000 {
eprintln!("WARNING: speaker embedding input should use a 16kHz sample rate!")
}
model.embed_utterance(
&pcm,
&mel_filters,
/* rate */ 1.3,
/* min_c */ 0.75,
&device,
)?
let spk_emb_file = match &args.spk_emb {
Some(w) => std::path::PathBuf::from(w),
None => repo.get("spk_emb.safetensors")?,
};
let spk_emb = candle::safetensors::load(&spk_emb_file, &candle::Device::Cpu)?;
let spk_emb = match spk_emb.get("spk_emb") {
None => anyhow::bail!("missing spk_emb tensor in {spk_emb_file:?}"),
Some(spk_emb) => spk_emb.to_dtype(dtype)?,
};
let spk_emb = spk_emb.to_device(&device)?;
let mut logits_processor = LogitsProcessor::new(args.seed, Some(args.temperature), Some(0.95));
// First stage generation.
@ -271,7 +201,12 @@ fn main() -> Result<()> {
let ctxt = &tokens[start_pos..];
let input = Tensor::new(ctxt, &device)?;
let input = Tensor::stack(&[&input, &input], 0)?;
let logits = first_stage_model.forward(&input, &spk_emb, tokens.len() - context_size)?;
let logits = match &mut first_stage_model {
Transformer::Normal(m) => m.forward(&input, &spk_emb, tokens.len() - context_size)?,
Transformer::Quantized(m) => {
m.forward(&input, &spk_emb, tokens.len() - context_size)?
}
};
let logits0 = logits.i((0, 0))?;
let logits1 = logits.i((1, 0))?;
let logits = ((logits0 * args.guidance_scale)? + logits1 * (1. - args.guidance_scale))?;
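For context, the stacked input produces a conditioned row (`logits0`) and an unconditioned row (`logits1`), blended with the guidance scale; with the default scale of 3.0 the result extrapolates past the conditioned logits. The arithmetic, in a plain sketch:

```rust
// Guidance blend as in the snippet above:
// combined = g * cond + (1 - g) * uncond.
fn blend(cond: f32, uncond: f32, guidance_scale: f32) -> f32 {
    guidance_scale * cond + (1.0 - guidance_scale) * uncond
}

fn main() {
    // With g = 3.0 the conditioned signal is amplified and the
    // unconditioned one subtracted: 3 * 1.0 + (1 - 3) * 0.5 = 2.0.
    assert_eq!(blend(1.0, 0.5, 3.0), 2.0);
}
```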

View File

@ -63,7 +63,7 @@ pub fn main() -> anyhow::Result<()> {
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
println!("loaded image {image:?}");
let model_file = match args.model {

View File

@ -78,7 +78,7 @@ pub fn main() -> anyhow::Result<()> {
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
println!("loaded image {image:?}");
let model_file = match args.model {

View File

@ -45,7 +45,7 @@ pub fn main() -> anyhow::Result<()> {
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
println!("loaded image {image:?}");
let model_file = match args.model {

View File

@ -141,7 +141,7 @@ impl std::fmt::Display for Which {
impl Which {
fn model_id(&self) -> &'static str {
match self {
Self::Eagle7b => "RWKV/HF_v5-Eagle-7B",
Self::Eagle7b => "RWKV/v5-Eagle-7B-HF",
Self::World1b5 => "RWKV/rwkv-5-world-1b5",
Self::World3b => "RWKV/rwkv-5-world-3b",
Self::World6_1b6 => "paperfun/rwkv",

View File

@ -96,6 +96,10 @@ struct Args {
/// information.
#[arg(long, default_value_t = 0.8)]
img2img_strength: f64,
/// The seed to use when generating random samples.
#[arg(long)]
seed: Option<u64>,
}
#[derive(Debug, Clone, Copy, clap::ValueEnum, PartialEq, Eq)]
@ -374,6 +378,7 @@ fn run(args: Args) -> Result<()> {
use_flash_attn,
img2img,
img2img_strength,
seed,
..
} = args;
@ -427,6 +432,9 @@ fn run(args: Args) -> Result<()> {
let scheduler = sd_config.build_scheduler(n_steps)?;
let device = candle_examples::device(cpu)?;
if let Some(seed) = seed {
device.set_seed(seed)?;
}
let use_guide_scale = guidance_scale > 1.0;
let which = match sd_version {

View File

@ -10,11 +10,6 @@ order to be able to use it.
Other available models are Stable-Code-3B, StableLM-2 and Zephyr variants.
StableLM-2 uses a Tiktoken based GPT-3.5/GPT-4 tokenizer not supported by
Candle, so to run it you can download a somewhat compatible
[tokenizer.json](https://huggingface.co/Xenova/gpt-4/resolve/main/tokenizer.json?download=true)
and pass it via the --tokenizer-file argument.
## Running some example
```bash

View File

@ -239,14 +239,7 @@ fn main() -> Result<()> {
));
let tokenizer_filename = match args.tokenizer_file {
Some(file) => std::path::PathBuf::from(file),
None => match args.which {
Which::V1Orig | Which::V1 | Which::V1Zephyr | Which::Code => {
repo.get("tokenizer.json")?
}
Which::V2 | Which::V2Zephyr => api
.model("lmz/candle-stablelm".to_string())
.get("tokenizer-gpt4.json")?,
},
None => repo.get("tokenizer.json")?,
};
let filenames = match args.weight_files {
Some(files) => files

View File

@ -33,7 +33,7 @@ struct Args {
pub fn main() -> anyhow::Result<()> {
let args = Args::parse();
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
println!("loaded image {image:?}");

View File

@ -28,7 +28,7 @@ pub fn main() -> anyhow::Result<()> {
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
println!("loaded image {image:?}");
let model_file = match args.model {

View File

@ -1,6 +1,6 @@
[package]
name = "candle-flash-attn"
version = "0.4.1"
version = "0.4.2"
edition = "2021"
description = "Flash attention layer for the candle ML framework."
@ -11,7 +11,7 @@ license = "MIT OR Apache-2.0"
readme = "README.md"
[dependencies]
candle = { path = "../candle-core", features = ["cuda"], package = "candle-core", version = "0.4.1" }
candle = { path = "../candle-core", features = ["cuda"], package = "candle-core", version = "0.4.2" }
half = { version = "2.3.1", features = ["num-traits"] }
[build-dependencies]

View File

@ -1,6 +1,6 @@
[package]
name = "candle-kernels"
version = "0.4.1"
version = "0.4.2"
edition = "2021"
description = "CUDA kernels for Candle"

View File

@ -10,11 +10,39 @@ __device__ void fill_with(T *buf, T value, const size_t numel) {
extern "C" __global__ void fill_u8(uint8_t *buf, uint8_t value, const size_t numel) { fill_with(buf, value, numel); }
extern "C" __global__ void fill_u32(uint32_t *buf, uint32_t value, const size_t numel) { fill_with(buf, value, numel); }
extern "C" __global__ void fill_i64(int64_t *buf, int64_t value, const size_t numel) { fill_with(buf, value, numel); }
extern "C" __global__ void fill_f16(__half *buf, __half value, const size_t numel) { fill_with(buf, value, numel); }
extern "C" __global__ void fill_f32(float *buf, float value, const size_t numel) { fill_with(buf, value, numel); }
extern "C" __global__ void fill_f64(double *buf, double value, const size_t numel) { fill_with(buf, value, numel); }
template<typename T>
__device__ void copy2d(const T *src, T *dst, uint32_t d1, uint32_t d2, uint32_t src_s, uint32_t dst_s) {
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= d1 * d2) {
return;
}
uint32_t idx1 = idx / d2;
uint32_t idx2 = idx - d2 * idx1;
dst[idx1 * dst_s + idx2] = src[idx1 * src_s + idx2];
}
#define COPY2D_OP(TYPENAME, FNNAME) \
extern "C" __global__ \
void FNNAME(const TYPENAME *src, TYPENAME *dst, uint32_t d1, uint32_t d2, uint32_t src_s, uint32_t dst_s) { \
copy2d(src, dst, d1, d2, src_s, dst_s); \
} \
COPY2D_OP(float, copy2d_f32)
COPY2D_OP(double, copy2d_f64)
COPY2D_OP(uint8_t, copy2d_u8)
COPY2D_OP(uint32_t, copy2d_u32)
COPY2D_OP(int64_t, copy2d_i64)
#if __CUDA_ARCH__ >= 530
extern "C" __global__ void fill_f16(__half *buf, __half value, const size_t numel) { fill_with(buf, value, numel); }
COPY2D_OP(__half, copy2d_f16)
#endif
#if __CUDA_ARCH__ >= 800
#include <cuda_bf16.h>
extern "C" __global__ void fill_bf16(__nv_bfloat16 *buf, __nv_bfloat16 value, const size_t numel) { fill_with(buf, value, numel); }
COPY2D_OP(__nv_bfloat16, copy2d_bf16)
#endif
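The copy2d kernel maps one thread per element of a `d1 x d2` grid: `idx1` is the row, `idx2` the column, rows are contiguous runs of `d2` elements, and `src_s`/`dst_s` are the per-row strides on each side, which is exactly what a cat along a middle dimension needs. A host-side Rust reference of the same index math (a hypothetical helper, for illustration):

```rust
/// Reference for the copy2d index math: copy a d1 x d2 block where
/// source rows are src_s elements apart and destination rows dst_s apart.
fn copy2d_ref<T: Copy>(src: &[T], dst: &mut [T], d1: usize, d2: usize, src_s: usize, dst_s: usize) {
    for idx in 0..d1 * d2 {
        let idx1 = idx / d2; // row
        let idx2 = idx - d2 * idx1; // column within the row
        dst[idx1 * dst_s + idx2] = src[idx1 * src_s + idx2];
    }
}

fn main() {
    // Copy two rows of three elements out of a 2x4 source into a 2x5
    // destination: only the strides differ, the row contents move as-is.
    let src = [1, 2, 3, 0, 4, 5, 6, 0];
    let mut dst = [0i32; 10];
    copy2d_ref(&src, &mut dst, 2, 3, 4, 5);
    assert_eq!(dst, [1, 2, 3, 0, 0, 4, 5, 6, 0, 0]);
}
```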

View File

@ -1,6 +1,6 @@
[package]
name = "candle-metal-kernels"
version = "0.4.1"
version = "0.4.2"
edition = "2021"
description = "Metal kernels for Candle"

View File

@ -89,7 +89,7 @@ kernel void FN_NAME( \
return; \
} \
const TYPENAME x = input[id]; \
output[id] = TYPENAME((x > 0)?x: mul * exp(x - 1)); \
output[id] = TYPENAME((x > 0)?x: mul * (exp(x) - 1)); \
} \
kernel void FN_NAME##_strided( \
constant size_t &dim, \
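The corrected branch matches the standard ELU definition, `x` for `x > 0` and `alpha * (exp(x) - 1)` otherwise; with `alpha = 2` the values line up with the rounded expectations in the `unary_grad` test above (inputs inferred as `[-1, 0, -2, 3]`). A quick Rust check of the fixed formula:

```rust
// ELU with the corrected negative branch: mul * (exp(x) - 1),
// not the previous mul * exp(x - 1).
fn elu(x: f32, mul: f32) -> f32 {
    if x > 0.0 { x } else { mul * (x.exp() - 1.0) }
}

fn main() {
    // Matches the rounded expectations in the unary_grad test:
    // inputs [-1, 0, -2, 3] with alpha = 2 -> [-1.2642, 0, -1.7293, 3].
    for (x, want) in [(-1.0, -1.2642), (0.0, 0.0), (-2.0, -1.7293), (3.0, 3.0)] {
        assert!((elu(x, 2.0) - want).abs() < 1e-4);
    }
}
```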

View File

@ -167,11 +167,16 @@ kernel void NAME( \
INDEX_OP(is_u32_f32, uint, float)
INDEX_OP(is_u32_f16, uint, half)
GATHER_OP(gather_u32_f32, uint, float)
GATHER_OP(gather_u32_f16, uint, half)
SCATTER_ADD_OP(sa_u32_f32, uint, float)
SCATTER_ADD_OP(sa_u32_f16, uint, half)
SCATTER_ADD_OP(sa_u32_f32, uint32_t, float)
SCATTER_ADD_OP(sa_u8_f32, uint8_t, float)
SCATTER_ADD_OP(sa_i64_f32, int64_t, float)
SCATTER_ADD_OP(sa_u32_f16, uint32_t, half)
SCATTER_ADD_OP(sa_u8_f16, uint8_t, half)
SCATTER_ADD_OP(sa_i64_f16, int64_t, half)
#if defined(__HAVE_BFLOAT__)
INDEX_OP(is_u32_bf16, uint32_t, bfloat)
@ -180,6 +185,10 @@ INDEX_OP(is_u8_bf16, uint8_t, bfloat)
INDEX_ADD_OP(ia_i64_bf16, int64_t, bfloat)
INDEX_ADD_OP(ia_u32_bf16, uint32_t, bfloat)
INDEX_ADD_OP(ia_u8_bf16, uint8_t, bfloat)
SCATTER_ADD_OP(sa_u32_bf16, uint32_t, bfloat)
SCATTER_ADD_OP(sa_u8_bf16, uint8_t, bfloat)
SCATTER_ADD_OP(sa_i64_bf16, int64_t, bfloat)
#endif
INDEX_ADD_OP(ia_u32_f16, uint32_t, half)

View File

@ -127,6 +127,16 @@ pub enum Source {
Quantized,
}
pub mod copy2d {
pub struct Kernel(pub &'static str);
pub const FLOAT: Kernel = Kernel("copy2d_f32");
pub const HALF: Kernel = Kernel("copy2d_f16");
pub const BFLOAT: Kernel = Kernel("copy2d_bf16");
pub const I64: Kernel = Kernel("copy2d_i64");
pub const U32: Kernel = Kernel("copy2d_u32");
pub const U8: Kernel = Kernel("copy2d_u8");
}
macro_rules! ops{
($($name:ident),+) => {
@ -365,6 +375,46 @@ pub fn call_unary_contiguous(
Ok(())
}
#[allow(clippy::too_many_arguments)]
pub fn call_copy2d(
device: &Device,
command_buffer: &CommandBufferRef,
kernels: &Kernels,
name: copy2d::Kernel,
input: &Buffer,
output: &Buffer,
d1: usize,
d2: usize,
src_s: usize,
dst_s: usize,
src_o_in_bytes: usize,
dst_o_in_bytes: usize,
) -> Result<(), MetalKernelError> {
let pipeline = kernels.load_pipeline(device, Source::Unary, name.0)?;
let encoder = command_buffer.new_compute_command_encoder();
encoder.set_compute_pipeline_state(&pipeline);
set_params!(
encoder,
(
d1,
d2,
src_s,
dst_s,
(input, src_o_in_bytes),
(output, dst_o_in_bytes)
)
);
let width: usize = d1 * d2;
let (thread_group_count, thread_group_size) = linear_split(&pipeline, width);
encoder.use_resource(input, metal::MTLResourceUsage::Read);
encoder.use_resource(output, metal::MTLResourceUsage::Write);
encoder.dispatch_thread_groups(thread_group_count, thread_group_size);
encoder.end_encoding();
Ok(())
}
#[allow(clippy::too_many_arguments)]
pub fn call_unary_strided(
device: &Device,
@ -1558,8 +1608,10 @@ pub fn call_random_uniform(
set_params!(encoder, (length, min, max, seed, buffer));
encoder.use_resource(seed, metal::MTLResourceUsage::Read);
encoder.use_resource(seed, metal::MTLResourceUsage::Write);
encoder.use_resource(
seed,
metal::MTLResourceUsage::Read | metal::MTLResourceUsage::Write,
);
encoder.use_resource(buffer, metal::MTLResourceUsage::Write);
encoder.dispatch_thread_groups(thread_group_count, thread_group_size);
encoder.end_encoding();
@ -1589,8 +1641,10 @@ pub fn call_random_normal(
set_params!(encoder, (length, mean, stddev, seed, buffer));
encoder.use_resource(seed, metal::MTLResourceUsage::Read);
encoder.use_resource(seed, metal::MTLResourceUsage::Write);
encoder.use_resource(
seed,
metal::MTLResourceUsage::Read | metal::MTLResourceUsage::Write,
);
encoder.use_resource(buffer, metal::MTLResourceUsage::Write);
encoder.dispatch_thread_groups(thread_group_count, thread_group_size);
encoder.end_encoding();

View File

@ -123,16 +123,20 @@ template<typename T> METAL_FUNC void rand_uniform(
return;
}
// Evenly sized vectors need an offset when writing the mirror element.
uint off = 1 - size % 2;
float diff = abs(min - max);
HybridTaus rng = HybridTaus::init({ulong(seed), tid, 1, 1});
uint s = atomic_load_explicit(seed, memory_order_relaxed);
HybridTaus rng = HybridTaus::init({ulong(s), tid, 1, 1});
out[tid] = static_cast<T>(rng.rand() * diff + min);
if (tid == 0) {
atomic_store_explicit(seed, uint(rng.rand() * UNIF01_NORM32), memory_order_relaxed);
// Return early if tid == 0, otherwise we will write to out[size].
return;
// Return early if tid == 0 && off == 0, otherwise we will write to out[size].
if (off == 0)
return;
}
// Use symmetry to fill the other half of the array.
out[size - tid] = static_cast<T>(rng.rand() * diff + min);
out[size - off - tid] = static_cast<T>(rng.rand() * diff + min);
}
// Create Gaussian normal distribution using Box-Muller transform:
@ -148,7 +152,10 @@ template<typename T> METAL_FUNC void normal(
if (tid >= size) {
return;
}
HybridTaus rng = HybridTaus::init({ulong(seed), tid, 1, 1});
// Evenly sized vectors need an offset when writing the mirror element.
uint off = 1 - size % 2;
uint s = atomic_load_explicit(seed, memory_order_relaxed);
HybridTaus rng = HybridTaus::init({ulong(s), tid, 1, 1});
float u1 = rng.rand();
float u2 = rng.rand();
@ -162,11 +169,12 @@ template<typename T> METAL_FUNC void normal(
if (tid == 0) {
atomic_store_explicit(seed, uint(rng.rand() * UNIF01_NORM32), memory_order_relaxed);
// Return early if tid == 0, otherwise we will write to out[size].
return;
// Return early if tid == 0 && off == 0, otherwise we will write to out[size].
if (off == 0)
return;
}
// Use symmetry to fill the other half of the array.
out[size - tid] = static_cast<T>(z1);
out[size - off - tid] = static_cast<T>(z1);
}
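The fix can be replayed on the CPU: thread `tid` writes `out[tid]` and, unless it returns early, the mirror `out[size - off - tid]`. A small simulation, assuming the grid covers `ceil(size / 2)` threads, verifying every element is written exactly once for both parities (before the fix, even sizes left element `size / 2` unwritten):

```rust
// Replay the kernel's write pattern for a vector of `size` elements,
// assuming the grid covers ceil(size / 2) threads.
fn writes_per_element(size: usize) -> Vec<usize> {
    let mut writes = vec![0usize; size];
    let off = 1 - size % 2; // offset for evenly sized vectors
    for tid in 0..(size + 1) / 2 {
        writes[tid] += 1; // out[tid] = ...
        if tid == 0 && off == 0 {
            continue; // early return: the mirror would be out[size]
        }
        writes[size - off - tid] += 1; // mirror element
    }
    writes
}

fn main() {
    for size in [1, 2, 3, 4, 5, 8, 9] {
        assert!(writes_per_element(size).iter().all(|&w| w == 1), "size {size}");
    }
}
```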
#define UNIFORM_OP(NAME, T) \

View File

@ -1066,3 +1066,107 @@ fn random() {
validate_random!(f16);
validate_random!(bf16);
}
fn run_scatter_add<T: Clone, I: Clone + std::fmt::Debug>(
input: &[T],
ids: &[I],
shape: &[usize],
dim: usize,
name: &'static str,
) -> Vec<T> {
let device = device();
let kernels = Kernels::new();
let command_queue = device.new_command_queue();
let command_buffer = command_queue.new_command_buffer();
let options = MTLResourceOptions::StorageModeManaged;
let input_buffer = new_buffer(&device, input);
let ids_buffer = new_buffer(&device, ids);
let output = device.new_buffer(std::mem::size_of_val(input) as u64, options);
call_scatter_add(
&device,
command_buffer,
&kernels,
name,
shape,
shape,
dim,
&input_buffer,
0,
&ids_buffer,
0,
&output,
)
.unwrap();
command_buffer.commit();
command_buffer.wait_until_completed();
read_to_vec(&output, input.len())
}
#[test]
fn scatter_add() {
let ids_u8 = [0u8, 0, 1, 0, 2, 2, 3, 3];
let ids_u32 = [0u32, 0, 1, 0, 2, 2, 3, 3];
let ids_i64 = [0i64, 0, 1, 0, 2, 2, 3, 3];
let input_f32 = [5.0f32, 1.0, 7.0, 2.0, 3.0, 2.0, 1.0, 3.0];
let input_f16 = input_f32
.iter()
.map(|v| f16::from_f32(*v))
.collect::<Vec<_>>();
let input_bf16 = input_f32
.iter()
.map(|v| bf16::from_f32(*v))
.collect::<Vec<_>>();
let output_dim1_f32 = vec![8.0, 7.0, 5.0, 4.0, 0.0, 0.0, 0.0, 0.0];
let output_dim1_f16 = output_dim1_f32
.iter()
.map(|v| f16::from_f32(*v))
.collect::<Vec<_>>();
let output_dim1_bf16 = output_dim1_f32
.iter()
.map(|v| bf16::from_f32(*v))
.collect::<Vec<_>>();
let output_dim2_f32 = vec![5.0, 3.0, 7.0, 0.0, 3.0, 2.0, 1.0, 3.0];
let output_dim2_f16 = output_dim2_f32
.iter()
.map(|v| f16::from_f32(*v))
.collect::<Vec<_>>();
let output_dim2_bf16 = output_dim2_f32
.iter()
.map(|v| bf16::from_f32(*v))
.collect::<Vec<_>>();
for (shape, output_f32, output_f16, output_bf16) in [
(vec![8], output_dim1_f32, output_dim1_f16, output_dim1_bf16),
(
vec![4, 2],
output_dim2_f32,
output_dim2_f16,
output_dim2_bf16,
),
] {
for results in [
run_scatter_add(&input_f32, &ids_u8, &shape, 0, "sa_u8_f32"),
run_scatter_add(&input_f32, &ids_u32, &shape, 0, "sa_u32_f32"),
run_scatter_add(&input_f32, &ids_i64, &shape, 0, "sa_i64_f32"),
] {
assert_eq!(results, output_f32);
}
for results in [
run_scatter_add(&input_f16, &ids_u8, &shape, 0, "sa_u8_f16"),
run_scatter_add(&input_f16, &ids_u32, &shape, 0, "sa_u32_f16"),
run_scatter_add(&input_f16, &ids_i64, &shape, 0, "sa_i64_f16"),
] {
assert_eq!(results, output_f16);
}
for results in [
run_scatter_add(&input_bf16, &ids_u8, &shape, 0, "sa_u8_bf16"),
run_scatter_add(&input_bf16, &ids_u32, &shape, 0, "sa_u32_bf16"),
run_scatter_add(&input_bf16, &ids_i64, &shape, 0, "sa_i64_bf16"),
] {
assert_eq!(results, output_bf16);
}
}
}
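For the 1-D shape the expected vectors can be derived by hand: scatter-add along dim 0 accumulates `out[ids[i]] += input[i]` into a zero-initialized buffer. A plain-Rust reference reproducing `output_dim1_f32` from the data above:

```rust
// Reference scatter-add along dim 0 for the 1-D case:
// out[ids[i]] += input[i], with out zero-initialized.
fn scatter_add_ref(input: &[f32], ids: &[usize], out_len: usize) -> Vec<f32> {
    let mut out = vec![0f32; out_len];
    for (&v, &i) in input.iter().zip(ids.iter()) {
        out[i] += v;
    }
    out
}

fn main() {
    let input = [5.0f32, 1.0, 7.0, 2.0, 3.0, 2.0, 1.0, 3.0];
    let ids = [0usize, 0, 1, 0, 2, 2, 3, 3];
    // id 0 appears at positions 0, 1, 3 -> 5 + 1 + 2 = 8, and so on.
    assert_eq!(
        scatter_add_ref(&input, &ids, 8),
        [8.0, 7.0, 5.0, 4.0, 0.0, 0.0, 0.0, 0.0]
    );
}
```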

View File

@ -102,6 +102,30 @@ UNARY(NAME, half, NAME##_f16, NAME##_f16_strided);
#define BFLOAT_UNARY_OP(NAME) \
UNARY(NAME, bfloat, NAME##_bf16, NAME##_bf16_strided);
#define COPY2D(FN_NAME, TYPENAME) \
kernel void FN_NAME( \
constant size_t &d1, \
constant size_t &d2, \
constant size_t &src_s, \
constant size_t &dst_s, \
device const TYPENAME *input, \
device TYPENAME *output, \
uint tid [[ thread_position_in_grid ]] \
) { \
if (tid >= d1 * d2) { \
return; \
} \
size_t idx1 = tid / d2; \
size_t idx2 = tid - idx1 * d2; \
size_t src_idx = idx1 * src_s + idx2; \
size_t dst_idx = idx1 * dst_s + idx2; \
output[dst_idx] = input[src_idx]; \
}
COPY2D(copy2d_f32, float)
COPY2D(copy2d_f16, half)
COPY2D(copy2d_u8, uint8_t)
COPY2D(copy2d_u32, uint32_t)
UNARY_OP(cos)
UNARY_OP(sin)
@ -128,6 +152,7 @@ UNARY(id, uint32_t, copy_u32, copy_u32_strided)
#if __METAL_VERSION__ >= 220
UNARY(id, int64_t, copy_i64, copy_i64_strided)
COPY2D(copy2d_i64, int64_t)
#endif
#if defined(__HAVE_BFLOAT__)
@ -151,4 +176,6 @@ BFLOAT_UNARY_OP(recip)
BFLOAT_UNARY_OP(relu)
UNARY(id, bfloat, copy_bf16, copy_bf16_strided)
COPY2D(copy2d_bf16, bfloat)
#endif

View File

@ -238,6 +238,23 @@ impl Benchmark for QMatMul {
const ITERS: usize = 100;
}
struct Cat;
impl Benchmark for Cat {
type PreProcessData = (Tensor, Tensor);
type RunResult = Tensor;
fn preprocess() -> Result<Self::PreProcessData> {
let lhs = Tensor::randn(0f32, 1., (1, 32, 2000, 128), &Device::Cpu)?;
let rhs = Tensor::randn(0f32, 1., (1, 32, 1, 128), &Device::Cpu)?;
Ok((lhs, rhs))
}
fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
Tensor::cat(&[&d.0, &d.1], 2)
}
const ITERS: usize = 1000;
}
struct Softmax;
impl Benchmark for Softmax {
type PreProcessData = Tensor;
@ -295,6 +312,7 @@ enum Task {
Qmatmul,
Softmax,
SoftmaxLastDim,
Cat,
}
#[derive(Parser, Debug)]
@ -319,6 +337,7 @@ fn main() -> Result<()> {
Task::Softmax => run::<Softmax>(args.iters)?,
Task::SoftmaxLastDim => run::<SoftmaxLastDim>(args.iters)?,
Task::Qmatmul => run::<QMatMul>(args.iters)?,
Task::Cat => run::<Cat>(args.iters)?,
}
Ok(())
}

View File

@ -74,7 +74,7 @@ pub fn dropout(xs: &Tensor, drop_p: f32) -> Result<Tensor> {
xs * mask
}
#[derive(Debug)]
#[derive(Clone, Debug)]
pub struct Dropout {
drop_p: f32,
}
@ -238,7 +238,8 @@ impl candle::CustomOp1 for SoftmaxLastDim {
&output,
)
.unwrap();
let newstorage = candle::MetalStorage::new(output, device.clone(), storage.dtype());
let newstorage =
candle::MetalStorage::new(output, device.clone(), elem_count, storage.dtype());
Ok((newstorage, layout.shape().clone()))
}
}

View File

@ -1,6 +1,6 @@
[package]
name = "candle-onnx"
version = "0.4.1"
version = "0.4.2"
edition = "2021"
description = "ONNX support for Candle"
@ -10,8 +10,8 @@ categories = ["science"]
license = "MIT OR Apache-2.0"
[dependencies]
candle = { path = "../candle-core", package = "candle-core", version = "0.4.1" }
candle-nn = { path = "../candle-nn", version = "0.4.1" }
candle = { path = "../candle-core", package = "candle-core", version = "0.4.2" }
candle-nn = { path = "../candle-nn", version = "0.4.2" }
prost = "0.12.1"
[build-dependencies]

View File

@ -2,7 +2,7 @@ use candle::{DType, Device, Error as E, IndexOp, Module, Result, Tensor, D};
use candle_nn::{embedding, linear_b, rms_norm, Embedding, Linear, RmsNorm, VarBuilder};
// Equivalent to torch.repeat_interleave
fn repeat_interleave(img: &Tensor, repeats: usize, dim: usize) -> Result<Tensor> {
pub(crate) fn repeat_interleave(img: &Tensor, repeats: usize, dim: usize) -> Result<Tensor> {
let img = img.unsqueeze(dim + 1)?;
let mut dims = img.dims().to_vec();
dims[dim + 1] = repeats;
@ -55,12 +55,12 @@ pub mod speaker_encoder {
layer_idx,
..Default::default()
};
let in_c = if layer_idx == 0 {
cfg.mel_n_channels
} else {
cfg.model_hidden_size
};
let lstm = candle_nn::lstm(in_c, cfg.model_hidden_size, c, vb_l.clone())?;
let lstm = candle_nn::lstm(
cfg.mel_n_channels,
cfg.model_hidden_size,
c,
vb_l.pp(layer_idx),
)?;
lstms.push(lstm)
}
let linear = linear_b(
@ -143,7 +143,7 @@ pub mod speaker_encoder {
.iter()
.flat_map(|s| [mel[s.0], mel[s.1]])
.collect::<Vec<_>>();
let mels = Tensor::from_vec(mels, (1, mel_slices.len(), 2), device)?;
let mels = Tensor::from_vec(mels, (mel_slices.len(), 2), device)?;
let partial_embeds = self.forward(&mels)?;
let raw_embed = partial_embeds.mean(0)?;
let norm = raw_embed.sqr()?.sum_all()?.sqrt()?;
@ -181,6 +181,7 @@ pub mod tokenizers {
pub end_of_text: usize,
pub offset: usize,
pub ranks: HashMap<Vec<u8>, Rank>,
span: tracing::Span,
}
impl BPE {
@ -231,6 +232,7 @@ pub mod tokenizers {
end_of_text,
offset,
ranks,
span: tracing::span!(tracing::Level::TRACE, "bpe"),
})
}
@ -310,6 +312,7 @@ pub mod tokenizers {
}
pub fn encode(&self, text: &str) -> Result<Vec<u32>> {
let _enter = self.span.enter();
let mut bpe_tokens: Vec<u32> = Vec::new();
for word in self.re.find_iter(text) {
let word = word.map_err(E::wrap)?;
@ -426,6 +429,7 @@ pub mod gpt {
c_attn: Linear,
c_proj: Linear,
n_head: usize,
span: tracing::Span,
}
impl SelfAttention {
@ -444,12 +448,14 @@ pub mod gpt {
c_attn,
c_proj,
n_head: cfg.n_head,
span: tracing::span!(tracing::Level::TRACE, "self-attn"),
})
}
}
impl Module for SelfAttention {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
let _enter = self.span.enter();
let (b, t, c) = xs.dims3()?;
let c_x = xs
.apply(&self.c_attn)?
@ -474,11 +480,13 @@ pub mod gpt {
Gelu {
c_fc: Linear,
c_proj: Linear,
span: tracing::Span,
},
Swiglu {
w1: Linear,
w3: Linear,
c_proj: Linear,
span: tracing::Span,
},
}
@ -489,7 +497,11 @@ pub mod gpt {
NonLinearityType::Gelu => {
let c_fc = linear_b(cfg.n_embd, hidden_dim, cfg.bias, vb.pp("c_fc"))?;
let c_proj = linear_b(hidden_dim, cfg.n_embd, cfg.bias, vb.pp("c_proj"))?;
Self::Gelu { c_fc, c_proj }
Self::Gelu {
c_fc,
c_proj,
span: tracing::span!(tracing::Level::TRACE, "mlp-gelu"),
}
}
NonLinearityType::Swiglu => {
let hidden_dim = (2 * hidden_dim) / 3;
@ -502,7 +514,12 @@ pub mod gpt {
let w1 = linear_b(cfg.n_embd, hidden_dim, cfg.bias, vb.pp("w1"))?;
let w3 = linear_b(cfg.n_embd, hidden_dim, cfg.bias, vb.pp("w3"))?;
let c_proj = linear_b(hidden_dim, cfg.n_embd, cfg.bias, vb.pp("c_proj"))?;
Self::Swiglu { w1, w3, c_proj }
Self::Swiglu {
w1,
w3,
c_proj,
span: tracing::span!(tracing::Level::TRACE, "mlp-swiglu"),
}
}
};
Ok(slf)
@ -512,8 +529,17 @@ pub mod gpt {
impl Module for MLP {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
match self {
Self::Gelu { c_fc, c_proj } => xs.apply(c_fc)?.gelu()?.apply(c_proj),
Self::Swiglu { w1, w3, c_proj } => {
Self::Gelu { c_fc, c_proj, span } => {
let _enter = span.enter();
xs.apply(c_fc)?.gelu()?.apply(c_proj)
}
Self::Swiglu {
w1,
w3,
c_proj,
span,
} => {
let _enter = span.enter();
let w1 = xs.apply(w1)?;
let w3 = xs.apply(w3)?;
(w1.silu()? * w3)?.apply(c_proj)
@ -528,6 +554,7 @@ pub mod gpt {
ln_2: Norm,
attn: SelfAttention,
mlp: MLP,
span: tracing::Span,
}
impl Block {
@ -541,12 +568,14 @@ pub mod gpt {
ln_2,
attn,
mlp,
span: tracing::span!(tracing::Level::TRACE, "gpt-block"),
})
}
}
impl Module for Block {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
let _enter = self.span.enter();
let xs = (xs + xs.apply(&self.ln_1)?.apply(&self.attn))?;
let xs = (&xs + xs.apply(&self.ln_2)?.apply(&self.mlp))?;
Ok(xs)
@ -563,6 +592,7 @@ pub mod gpt {
lm_heads: Vec<Linear>,
cfg: Config,
dtype: DType,
span: tracing::Span,
}
impl Model {
@ -598,6 +628,7 @@ pub mod gpt {
lm_heads,
cfg,
dtype: vb.dtype(),
span: tracing::span!(tracing::Level::TRACE, "gpt"),
})
}
@ -606,6 +637,7 @@ pub mod gpt {
}
pub fn forward(&self, idx: &Tensor) -> Result<Vec<Tensor>> {
let _enter = self.span.enter();
let device = idx.device();
let (b, _num_hierarchies, t) = idx.dims3()?;
let pos = Tensor::arange(0u32, t as u32, device)?;
@ -664,15 +696,15 @@ pub mod transformer {
}
}
fn n_local_heads(&self) -> usize {
pub(crate) fn n_local_heads(&self) -> usize {
self.n_local_heads.unwrap_or(self.n_head)
}
fn head_dim(&self) -> usize {
pub(crate) fn head_dim(&self) -> usize {
self.dim / self.n_head
}
fn intermediate_size(&self) -> usize {
pub(crate) fn intermediate_size(&self) -> usize {
match self.intermediate_size {
Some(intermediate_size) => intermediate_size,
None => {
@ -689,6 +721,7 @@ pub mod transformer {
w1: Linear,
w2: Linear,
w3: Linear,
span: tracing::Span,
}
impl FeedForward {
@ -697,12 +730,18 @@ pub mod transformer {
let w1 = linear_b(cfg.dim, i_size, false, vb.pp("swiglu.w1"))?;
let w2 = linear_b(i_size, cfg.dim, false, vb.pp("w2"))?;
let w3 = linear_b(cfg.dim, i_size, false, vb.pp("swiglu.w3"))?;
Ok(Self { w1, w2, w3 })
Ok(Self {
w1,
w2,
w3,
span: tracing::span!(tracing::Level::TRACE, "feed-forward"),
})
}
}
impl Module for FeedForward {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
let _enter = self.span.enter();
let swiglu = (candle_nn::ops::silu(&xs.apply(&self.w1)?)? * xs.apply(&self.w3))?;
swiglu.apply(&self.w2)
}
@ -718,6 +757,7 @@ pub mod transformer {
head_dim: usize,
n_head: usize,
kv_cache: Option<(Tensor, Tensor)>,
span: tracing::Span,
}
impl Attention {
@ -736,10 +776,12 @@ pub mod transformer {
head_dim,
n_head: cfg.n_head,
kv_cache: None,
span: tracing::span!(tracing::Level::TRACE, "attention"),
})
}
fn forward(&mut self, xs: &Tensor, _pos: usize, mask: &Tensor) -> Result<Tensor> {
let _enter = self.span.enter();
let (b_sz, seqlen, _) = xs.dims3()?;
let qkv = xs.apply(&self.wqkv)?;
@ -793,6 +835,7 @@ pub mod transformer {
feed_forward: FeedForward,
ffn_norm: RmsNorm,
attention_norm: RmsNorm,
span: tracing::Span,
}
impl Block {
@ -806,10 +849,12 @@ pub mod transformer {
feed_forward,
ffn_norm,
attention_norm,
span: tracing::span!(tracing::Level::TRACE, "block"),
})
}
fn forward(&mut self, xs: &Tensor, pos: usize, mask: &Tensor) -> Result<Tensor> {
let _enter = self.span.enter();
let hs = xs.apply(&self.attention_norm)?;
let hs = (xs + self.attention.forward(&hs, pos, mask))?;
&hs + hs.apply(&self.ffn_norm)?.apply(&self.feed_forward)
@ -829,6 +874,7 @@ pub mod transformer {
norm: RmsNorm,
output: Linear,
spk_cond_mask: Tensor,
span: tracing::Span,
}
impl Model {
@ -865,6 +911,7 @@ pub mod transformer {
norm,
output,
spk_cond_mask,
span: tracing::span!(tracing::Level::TRACE, "transformer"),
})
}
@ -875,6 +922,7 @@ pub mod transformer {
}
pub fn forward(&mut self, xs: &Tensor, spk_emb: &Tensor, pos: usize) -> Result<Tensor> {
let _enter = self.span.enter();
let (_b_sz, seqlen) = xs.dims2()?;
let mask: Vec<_> = (0..seqlen)
.flat_map(|i| (0..seqlen).map(move |j| if i < j { f32::NEG_INFINITY } else { 0. }))
@ -905,14 +953,19 @@ pub mod adapters {
// https://github.com/metavoiceio/metavoice-src/blob/9078234c496d76adbec06df789b6b04b1875f129/fam/llm/adapters/tilted_encodec.py
pub struct TiltedEncodec {
end_of_audio_token: u32,
span: tracing::Span,
}
impl TiltedEncodec {
pub fn new(end_of_audio_token: u32) -> Self {
Self { end_of_audio_token }
Self {
end_of_audio_token,
span: tracing::span!(tracing::Level::TRACE, "tilted-encodec"),
}
}
pub fn decode(&self, tokens: &[Vec<u32>]) -> (Vec<u32>, Vec<Vec<u32>>) {
let _enter = self.span.enter();
let mut text_ids = vec![];
let mut extracted_audio_ids = vec![];
let mut min_audio_ids_len = usize::MAX;
@ -941,14 +994,19 @@ pub mod adapters {
// https://github.com/metavoiceio/metavoice-src/blob/9078234c496d76adbec06df789b6b04b1875f129/fam/llm/adapters/flattened_encodec.py#L4
pub struct FlattenedInterleavedEncodec2Codebook {
end_of_audio_token: u32,
span: tracing::Span,
}
impl FlattenedInterleavedEncodec2Codebook {
pub fn new(end_of_audio_token: u32) -> Self {
Self { end_of_audio_token }
Self {
end_of_audio_token,
span: tracing::span!(tracing::Level::TRACE, "encodec2codebook"),
}
}
pub fn decode(&self, tokens: &[u32]) -> (Vec<u32>, Vec<u32>, Vec<u32>) {
let _enter = self.span.enter();
let mut text_ids = vec![];
let mut audio_ids1 = vec![];
let mut audio_ids2 = vec![];

View File

@ -23,6 +23,7 @@ pub mod mistral;
pub mod mixformer;
pub mod mixtral;
pub mod mobileone;
pub mod moondream;
pub mod mpt;
pub mod persimmon;
pub mod phi;
@ -30,6 +31,7 @@ pub mod quantized_blip;
pub mod quantized_blip_text;
pub mod quantized_llama;
pub mod quantized_llama2_c;
pub mod quantized_metavoice;
pub mod quantized_mistral;
pub mod quantized_mixformer;
pub mod quantized_mpt;

View File

@ -0,0 +1,174 @@
#![allow(unused)]
use crate::models::phi;
use candle::{Module, Result, Tensor};
use candle_nn::{linear_b, Linear, VarBuilder};
// https://github.com/vikhyat/moondream/blob/main/moondream/configuration_moondream.py
#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
pub struct Config {
phi_config: phi::Config,
vision_config: VisionConfig,
}
#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
pub struct VisionConfig {
image_embedding_dim: usize,
model_dim: usize,
hidden_dim: usize,
act: candle_nn::Activation,
}
impl VisionConfig {
pub fn v2() -> Self {
Self {
image_embedding_dim: 1152,
model_dim: 2048,
hidden_dim: 2048 * 4,
act: candle_nn::Activation::Silu,
}
}
}
impl Config {
pub fn v2() -> Self {
let phi_config = phi::Config {
vocab_size: 51200,
hidden_size: 2048,
intermediate_size: 8192,
num_hidden_layers: 24,
num_attention_heads: 32,
num_key_value_heads: None,
hidden_act: candle_nn::Activation::NewGelu,
max_position_embeddings: 2048,
tie_word_embeddings: false,
layer_norm_eps: 1e-5,
rope_theta: 10_000.,
partial_rotary_factor: 0.5,
qk_layernorm: false,
};
let vision_config = VisionConfig::v2();
Self {
phi_config,
vision_config,
}
}
}
#[derive(Debug, Clone)]
struct LinearPatchEmbedding {
linear: Linear,
}
#[derive(Debug, Clone)]
struct Encoder {}
impl Encoder {
fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
todo!()
}
}
impl Module for Encoder {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
todo!()
}
}
#[derive(Debug, Clone)]
struct Mlp {
fc1: Linear,
act: candle_nn::Activation,
fc2: Linear,
}
impl Mlp {
fn new(
in_f: usize,
hidden_f: usize,
out_f: usize,
act: candle_nn::Activation,
vb: VarBuilder,
) -> Result<Self> {
let fc1 = linear_b(in_f, hidden_f, true, vb.pp("fc1"))?;
let fc2 = linear_b(hidden_f, out_f, true, vb.pp("fc2"))?;
Ok(Self { fc1, act, fc2 })
}
}
impl Module for Mlp {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
xs.apply(&self.fc1)?.apply(&self.act)?.apply(&self.fc2)
}
}
#[derive(Debug, Clone)]
struct VisionProjection {
mlp: Mlp,
}
impl VisionProjection {
fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
let mlp = Mlp::new(
cfg.image_embedding_dim,
cfg.hidden_dim,
cfg.model_dim,
cfg.act,
vb.pp("mlp"),
)?;
Ok(Self { mlp })
}
}
impl Module for VisionProjection {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
xs.apply(&self.mlp)
}
}
#[derive(Debug, Clone)]
struct VisionEncoder {
encoder: Encoder,
projection: VisionProjection,
}
impl VisionEncoder {
pub fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
let encoder = Encoder::new(cfg, vb.pp("vision.trunk"))?;
let projection = VisionProjection::new(cfg, vb.pp("projection"))?;
Ok(Self {
encoder,
projection,
})
}
}
impl Module for VisionEncoder {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
let (b, c, hp1, wp2) = xs.dims4()?;
let (p1, p2) = (14, 14);
let h = hp1 / p1;
let w = wp2 / p2;
let xs = xs
.reshape((b, c, h, p1, w, p2))?
.permute((0, 2, 4, 1, 3, 5))?
.reshape((b, h * w, c * p1 * p2))?;
xs.apply(&self.encoder)?.apply(&self.projection)
}
}
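The forward pass above is a 14x14 patchify: the image is cut into `p1 x p2` patches and each patch becomes one token of length `c * p1 * p2`. A shape walk-through on a small synthetic input (sizes chosen for illustration only):

```rust
use candle_core::{DType, Device, Result, Tensor};

// A (1, 3, 28, 28) image with 14x14 patches gives h = w = 2,
// i.e. 4 tokens of length 3 * 14 * 14 = 588.
fn patchify_shapes() -> Result<()> {
    let dev = Device::Cpu;
    let (b, c, p1, p2) = (1, 3, 14, 14);
    let (h, w) = (2, 2);
    let xs = Tensor::zeros((b, c, h * p1, w * p2), DType::F32, &dev)?;
    let xs = xs
        .reshape((b, c, h, p1, w, p2))?
        .permute((0, 2, 4, 1, 3, 5))?
        .reshape((b, h * w, c * p1 * p2))?;
    assert_eq!(xs.dims(), [1, 4, 588]);
    Ok(())
}
```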
#[derive(Debug, Clone)]
pub struct Model {
text_model: phi::Model,
vision_encoder: VisionEncoder,
}
impl Model {
pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
let text_model = phi::Model::new(&cfg.phi_config, vb.pp("text_model"))?;
let vision_encoder = VisionEncoder::new(&cfg.vision_config, vb.pp("vision_encoder"))?;
Ok(Self {
text_model,
vision_encoder,
})
}
}

View File

@ -106,7 +106,7 @@ impl Module for MLP {
}
}
#[derive(Clone)]
#[derive(Clone, Debug)]
struct Attention {
q_proj: Linear,
k_proj: Linear,
@ -265,7 +265,7 @@ impl Attention {
}
}
#[derive(Clone)]
#[derive(Clone, Debug)]
struct DecoderLayer {
self_attn: Attention,
mlp: MLP,
@ -304,7 +304,7 @@ impl DecoderLayer {
}
}
#[derive(Clone)]
#[derive(Clone, Debug)]
pub struct Model {
embed_tokens: Embedding,
layers: Vec<DecoderLayer>,

View File

@ -0,0 +1,242 @@
use crate::quantized_nn::{linear_b, Embedding, Linear, RmsNorm};
pub use crate::quantized_var_builder::VarBuilder;
use crate::models::metavoice::repeat_interleave;
use candle::{Module, Result, Tensor, D};
pub mod transformer {
use super::*;
type Config = crate::models::metavoice::transformer::Config;
#[derive(Debug, Clone)]
struct FeedForward {
w1: Linear,
w2: Linear,
w3: Linear,
span: tracing::Span,
}
impl FeedForward {
fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
let i_size = cfg.intermediate_size();
let w1 = linear_b(cfg.dim, i_size, false, vb.pp("swiglu.w1"))?;
let w2 = linear_b(i_size, cfg.dim, false, vb.pp("w2"))?;
let w3 = linear_b(cfg.dim, i_size, false, vb.pp("swiglu.w3"))?;
Ok(Self {
w1,
w2,
w3,
span: tracing::span!(tracing::Level::TRACE, "feed-forward"),
})
}
}
impl Module for FeedForward {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
let _enter = self.span.enter();
let swiglu = (candle_nn::ops::silu(&xs.apply(&self.w1)?)? * xs.apply(&self.w3))?;
swiglu.apply(&self.w2)
}
}
#[derive(Debug, Clone)]
struct Attention {
wqkv: Linear,
wo: Linear,
dim: usize,
kv_size: usize,
n_local_heads: usize,
head_dim: usize,
n_head: usize,
kv_cache: Option<(Tensor, Tensor)>,
span: tracing::Span,
}
impl Attention {
fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
let n_local_heads = cfg.n_local_heads();
let head_dim = cfg.head_dim();
let total_head_dim = (cfg.n_head + 2 * n_local_heads) * head_dim;
let wqkv = linear_b(cfg.dim, total_head_dim, false, vb.pp("wqkv"))?;
let wo = linear_b(cfg.dim, cfg.dim, false, vb.pp("wo"))?;
Ok(Self {
wqkv,
wo,
dim: cfg.dim,
kv_size: n_local_heads * head_dim,
n_local_heads,
head_dim,
n_head: cfg.n_head,
kv_cache: None,
span: tracing::span!(tracing::Level::TRACE, "attention"),
})
}
fn forward(&mut self, xs: &Tensor, _pos: usize, mask: &Tensor) -> Result<Tensor> {
let _enter = self.span.enter();
let (b_sz, seqlen, _) = xs.dims3()?;
let qkv = xs.apply(&self.wqkv)?;
let q = qkv.narrow(D::Minus1, 0, self.dim)?;
let k = qkv.narrow(D::Minus1, self.dim, self.kv_size)?;
let v = qkv.narrow(D::Minus1, self.dim + self.kv_size, self.kv_size)?;
let q = q
.reshape((b_sz, seqlen, self.n_head, self.head_dim))?
.transpose(1, 2)?
.contiguous()?;
let k = k
.reshape((b_sz, seqlen, self.n_local_heads, self.head_dim))?
.transpose(1, 2)?;
let v = v
.reshape((b_sz, seqlen, self.n_local_heads, self.head_dim))?
.transpose(1, 2)?;
let (k, v) = match &self.kv_cache {
None => (k, v),
Some((prev_k, prev_v)) => {
let k = Tensor::cat(&[prev_k, &k], 2)?;
let v = Tensor::cat(&[prev_v, &v], 2)?;
(k, v)
}
};
self.kv_cache = Some((k.clone(), v.clone()));
let k = repeat_interleave(&k, self.n_head / self.n_local_heads, 1)?;
let v = repeat_interleave(&v, self.n_head / self.n_local_heads, 1)?;
let scale = 1f64 / f64::sqrt(self.head_dim as f64);
let attn_weights = (q.matmul(&k.transpose(2, 3)?)? * scale)?;
let attn_weights = attn_weights.broadcast_add(mask)?;
let attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?;
let attn_output = attn_weights.matmul(&v)?;
attn_output
.transpose(1, 2)?
.reshape((b_sz, seqlen, self.dim))?
.apply(&self.wo)
}
fn clear_kv_cache(&mut self) {
self.kv_cache = None
}
}
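`repeat_interleave` here expands the grouped key/value heads back to the full query head count: with `r = n_head / n_local_heads`, each kv head is repeated `r` times consecutively along dim 1. A tiny sketch of the element-level behavior:

```rust
// Element-level behavior of repeat_interleave along one axis:
// each entry is repeated `repeats` times consecutively,
// e.g. [k0, k1] with repeats = 2 -> [k0, k0, k1, k1].
fn repeat_interleave_1d<T: Copy>(xs: &[T], repeats: usize) -> Vec<T> {
    xs.iter()
        .flat_map(|&x| std::iter::repeat(x).take(repeats))
        .collect()
}

fn main() {
    // Two kv heads expanded to serve four query heads (r = 2).
    assert_eq!(
        repeat_interleave_1d(&["kv0", "kv1"], 2),
        ["kv0", "kv0", "kv1", "kv1"]
    );
}
```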
#[derive(Debug, Clone)]
struct Block {
attention: Attention,
feed_forward: FeedForward,
ffn_norm: RmsNorm,
attention_norm: RmsNorm,
span: tracing::Span,
}
impl Block {
fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
let attention = Attention::new(cfg, vb.pp("attention"))?;
let feed_forward = FeedForward::new(cfg, vb.pp("feed_forward"))?;
let ffn_norm = RmsNorm::new(cfg.dim, cfg.norm_eps, vb.pp("ffn_norm"))?;
let attention_norm = RmsNorm::new(cfg.dim, cfg.norm_eps, vb.pp("attention_norm"))?;
Ok(Self {
attention,
feed_forward,
ffn_norm,
attention_norm,
span: tracing::span!(tracing::Level::TRACE, "block"),
})
}
fn forward(&mut self, xs: &Tensor, pos: usize, mask: &Tensor) -> Result<Tensor> {
let _enter = self.span.enter();
let hs = xs.apply(&self.attention_norm)?;
let hs = (xs + self.attention.forward(&hs, pos, mask))?;
&hs + hs.apply(&self.ffn_norm)?.apply(&self.feed_forward)
}
fn clear_kv_cache(&mut self) {
self.attention.clear_kv_cache()
}
}
#[derive(Debug, Clone)]
pub struct Model {
tok_embeddings: Embedding,
pos_embeddings: Embedding,
speaker_cond_pos: Linear,
layers: Vec<Block>,
norm: RmsNorm,
output: Linear,
spk_cond_mask: Tensor,
span: tracing::Span,
}
impl Model {
pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
let tok_embeddings = Embedding::new(cfg.vocab_size, cfg.dim, vb.pp("tok_embeddings"))?;
let pos_embeddings = Embedding::new(cfg.block_size, cfg.dim, vb.pp("pos_embeddings"))?;
let speaker_cond_pos = linear_b(
cfg.speaker_emb_dim,
cfg.dim,
false,
vb.pp("speaker_cond_pos"),
)?;
let mut layers = Vec::with_capacity(cfg.n_layer);
let vb_l = vb.pp("layers");
for layer_idx in 0..cfg.n_layer {
let layer = Block::new(cfg, vb_l.pp(layer_idx))?;
layers.push(layer)
}
let norm = RmsNorm::new(cfg.dim, cfg.norm_eps, vb.pp("norm"))?;
let output = linear_b(cfg.dim, cfg.vocab_size, false, vb.pp("output"))?;
let spk_cond_mask = Tensor::cat(
&[
Tensor::ones((1, 1, cfg.dim), candle::DType::F32, vb.device())?,
Tensor::zeros((1, 1, cfg.dim), candle::DType::F32, vb.device())?,
],
0,
)?;
Ok(Self {
tok_embeddings,
pos_embeddings,
speaker_cond_pos,
layers,
norm,
output,
spk_cond_mask,
span: tracing::span!(tracing::Level::TRACE, "qtransformer"),
})
}
pub fn clear_kv_cache(&mut self) {
for layer in self.layers.iter_mut() {
layer.clear_kv_cache()
}
}
pub fn forward(&mut self, xs: &Tensor, spk_emb: &Tensor, pos: usize) -> Result<Tensor> {
let _enter = self.span.enter();
let (_b_sz, seqlen) = xs.dims2()?;
let mask: Vec<_> = (0..seqlen)
.flat_map(|i| (0..seqlen).map(move |j| if i < j { f32::NEG_INFINITY } else { 0. }))
.collect();
let mask = Tensor::from_slice(&mask, (1, 1, seqlen, seqlen), xs.device())?;
let input_pos = Tensor::arange(pos as u32, (pos + seqlen) as u32, xs.device())?;
let tok_embeddings = xs.apply(&self.tok_embeddings)?;
let pos_embeddings = input_pos.apply(&self.pos_embeddings)?;
let mut xs = tok_embeddings
.broadcast_add(&pos_embeddings)?
.broadcast_add(
&spk_emb
.apply(&self.speaker_cond_pos)?
.broadcast_mul(&self.spk_cond_mask)?,
)?;
let mask = mask.to_dtype(xs.dtype())?;
for layer in self.layers.iter_mut() {
xs = layer.forward(&xs, pos, &mask)?
}
xs.narrow(1, seqlen - 1, 1)?
.apply(&self.norm)?
.apply(&self.output)
}
}
}

View File

@ -116,6 +116,12 @@ impl QMatMul {
let span = tracing::span!(tracing::Level::TRACE, "qmatmul");
Ok(Self { inner, span })
}
pub fn from_weights(ws: std::sync::Arc<candle::quantized::QTensor>) -> Result<Self> {
let inner = candle::quantized::QMatMul::from_arc(ws)?;
let span = tracing::span!(tracing::Level::TRACE, "qmatmul");
Ok(Self { inner, span })
}
}
impl Module for QMatMul {

View File

@ -35,6 +35,14 @@ pub struct Linear {
}
impl Linear {
pub fn from_arc(
weight: std::sync::Arc<candle::quantized::QTensor>,
bias: Option<Tensor>,
) -> Result<Self> {
let weight = QMatMul::from_weights(weight)?;
Ok(Self { weight, bias })
}
pub fn from_weights(weight: QMatMul, bias: Option<Tensor>) -> Self {
Self { weight, bias }
}
@ -50,6 +58,16 @@ impl Module for Linear {
}
}
pub fn linear_b(in_dim: usize, out_dim: usize, bias: bool, vb: VarBuilder) -> Result<Linear> {
let bias = if bias {
Some(vb.get(out_dim, "bias")?.dequantize(vb.device())?)
} else {
None
};
let weight = QMatMul::new(in_dim, out_dim, vb)?;
Ok(Linear { weight, bias })
}
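A hedged usage sketch for the new quantized `linear_b` helper; the gguf file name and tensor prefixes below are placeholders, not names from the repo:

```rust
use candle_core::{Device, Result};
use candle_transformers::quantized_nn::linear_b;
use candle_transformers::quantized_var_builder::VarBuilder;

// Hypothetical: load a quantized linear layer, with and without bias,
// from a gguf checkpoint. Path, prefixes, and sizes are illustrative.
fn load_layers(device: &Device) -> Result<()> {
    let vb = VarBuilder::from_gguf("model_q4k.gguf", device)?;
    let with_bias = linear_b(2048, 8192, true, vb.pp("ffn.up"))?;
    let no_bias = linear_b(2048, 8192, false, vb.pp("ffn.gate"))?;
    let _ = (with_bias, no_bias);
    Ok(())
}
```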
pub fn linear(in_dim: usize, out_dim: usize, vb: VarBuilder) -> Result<Linear> {
let bias = vb.get(out_dim, "bias")?.dequantize(vb.device())?;
let weight = QMatMul::new(in_dim, out_dim, vb)?;

View File

@ -3,6 +3,7 @@ use candle::{Device, Result, Shape};
use std::sync::Arc;
// VarBuilder specialized for QTensors
#[derive(Clone)]
pub struct VarBuilder {
data: Arc<std::collections::HashMap<String, Arc<QTensor>>>,
path: Vec<String>,