Fixed matmul (display is still broken unless the tensor is cast back to the CPU first?)

Temporary state.
Fixing the kernels and their launches to make them faster.
2025-06-17 11:08:52 +00:00 · 2023-11-10 20:09:25 +01:00 · 2023-11-10 15:35:46 +01:00 · 2023-11-10 11:14:51 +01:00 · 2023-11-10 02:18:14 +01:00 · 2023-11-10 01:24:49 +01:00
40 changed files with 449 additions and 2282 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -51,7 +51,6 @@ rayon = "1.7.0"
 rusttype = { version = "0.9", default-features = false }
 safetensors = "0.3.1"
 serde = { version = "1.0.171", features = ["derive"] }
-serde_plain = "1.0.2"
 serde_json = "1.0.99"
 thiserror = "1"
 tokenizers = { version = "0.13.4", default-features = false }
--- a/README.md
+++ b/README.md
@ -69,8 +69,6 @@ We also provide a some command line based examples using state of the art models
  performance larger than all publicly available 13b models as of 2023-09-28.
 - [StarCoder](./candle-examples/examples/bigcode/): LLM specialized to code generation.
 - [Replit-code-v1.5](./candle-examples/examples/replit-code/): a 3.3b LLM specialized for code completion.
- [Yi-6B / Yi-34B](./candle-examples/examples/yi/): two bilingual
-  (English/Chinese) general LLMs with 6b and 34b parameters.
 - [Quantized LLaMA](./candle-examples/examples/quantized/): quantized version of
  the LLaMA model using the same quantization techniques as
  [llama.cpp](https://github.com/ggerganov/llama.cpp).
@ -176,9 +174,8 @@ If you have an addition to this list, please submit a pull request.
        - StableLM-3B-4E1T.
        - Replit-code-v1.5-3B.
        - Bert.
-        - Yi-6B and Yi-34B.
    - Text to text.
-        - T5 and its variants: FlanT5, UL2, MADLAD400 (translation), CoEdit (Grammar correction).
+        - T5 and its variants: FlanT5, MADLAD400 (translation), CoEdit (Grammar correction).
        - Marian MT (Machine Translation).
    - Whisper (multi-lingual support).
    - Text to image.
--- a/candle-core/src/metal_backend.rs
+++ b/candle-core/src/metal_backend.rs
@ -4,10 +4,13 @@ use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
 use crate::{CpuStorage, DType, Layout, Result, Shape};
 use candle_metal_kernels;
 use candle_metal_kernels::Kernels;
-use half::f16;
+use core::mem;
+use half::{bf16, f16};
 use metal;
-use metal::{Buffer, CommandBuffer, CommandQueue, HeapDescriptor, MTLResourceOptions, NSUInteger};
-use std::sync::{Arc, RwLock};
+use metal::mps::matrix::encode_gemm;
+use metal::mps::Float32;
+use metal::{Buffer, CommandQueue, MTLResourceOptions, NSUInteger};
+use std::sync::Arc;

 /// Metal related errors
 #[derive(thiserror::Error, Debug)]
@ -35,8 +38,6 @@ impl From<String> for MetalError {
 pub struct MetalDevice {
    device: metal::Device,
    command_queue: metal::CommandQueue,
-    heap: metal::Heap,
-    command_buffer: Arc<RwLock<metal::CommandBuffer>>,
    kernels: Arc<candle_metal_kernels::Kernels>,
 }

@ -55,6 +56,10 @@ impl std::ops::Deref for MetalDevice {
 }

 impl MetalDevice {
+    // pub fn metal_device(&self) -> &metal::DeviceRef {
+    //     self.device.as_ref()
+    // }
+
    pub fn id(&self) -> NSUInteger {
        self.registry_id()
    }
@ -63,35 +68,6 @@ impl MetalDevice {
        &self.command_queue
    }

-    pub fn command_buffer(&self) -> std::sync::RwLockReadGuard<CommandBuffer> {
-        self.command_buffer.read().unwrap()
-    }
-
-    pub fn commit_wait_until_completed(&self) {
-        let mut old = self.command_buffer.try_write().unwrap();
-        let status = old.status();
-        use metal::MTLCommandBufferStatus::{
-            Committed, Completed, Enqueued, Error, NotEnqueued, Scheduled,
-        };
-        // match old.status() {}
-        if old.status() == metal::MTLCommandBufferStatus::Completed {
-            return;
-        }
-        old.commit();
-        old.wait_until_completed();
-        // let count = old.retain_count();
-        // println!("Count {count:?}");
-        let command_buffer = self.command_queue.new_command_buffer().to_owned();
-
-        *old = command_buffer;
-        // let count = old.retain_count();
-        // // println!("Count after {count:?}");
-        // old.release();
-        // let count = old.retain_count();
-        // println!("Count after release {count:?}");
-        // self.command_buffer.replace_with(|_| command_buffer)
-    }
-
    pub fn kernels(&self) -> &Kernels {
        &self.kernels
    }
@ -102,21 +78,9 @@ impl MetalDevice {

    pub fn new_buffer(&self, element_count: usize, dtype: DType) -> Buffer {
        let size = (element_count * dtype.size_in_bytes()) as NSUInteger;
-        // println!("Creating buffer {size}");
-        let buffer = self
-            .heap
-            .new_buffer(size, MTLResourceOptions::StorageModeShared)
-            .expect("New buffer");
-        // println!("{:?}", self.heap.used_size());
-        buffer
-    }
-
-    pub fn new_buffer_with_data<T>(&self, data: &[T]) -> Buffer {
-        let size = core::mem::size_of_val(data) as NSUInteger;
-        let option = metal::MTLResourceOptions::StorageModeShared;
-        // println!("Creating data buffer {size}");
+        // debug!("Allocate 1 - buffer size {size}");
        self.device
-            .new_buffer_with_data(data.as_ptr() as *const core::ffi::c_void, size, option)
+            .new_buffer(size, MTLResourceOptions::StorageModeManaged)
    }
 }

@ -143,11 +107,11 @@ impl BackendStorage for MetalStorage {
    }

    fn to_cpu_storage(&self) -> Result<CpuStorage> {
-        self.device.commit_wait_until_completed();
-
+        // TODO Is this necessary
+        // self.buffer.synchronize();
        match self.dtype {
            DType::U8 => Ok(CpuStorage::U8(
-                self.buffer.read_to_vec(self.buffer.length() as usize),
+                self.buffer.read_to_vec(self.buffer.length() as usize / 1),
            )),
            DType::U32 => Ok(CpuStorage::U32(
                self.buffer.read_to_vec(self.buffer.length() as usize / 4),
@ -177,52 +141,29 @@ impl BackendStorage for MetalStorage {
        let el = shape.elem_count();
        let dtype = self.dtype;

+        assert!(layout.is_contiguous());
+        assert_eq!(dtype, DType::F32);
+
        let mut buffer = device.new_buffer(el, self.dtype);
-        let command_buffer = self.device.command_buffer();
-        if layout.is_contiguous() && layout.start_offset() == 0 {
-            let name = match self.dtype {
-                DType::F32 => "affine_float",
-                DType::F16 => "affine_half",
-                dtype => todo!("Affine {dtype:?}"),
-            };
-            candle_metal_kernels::call_affine(
-                &device.device,
-                &command_buffer,
-                &device.kernels,
-                name,
-                el,
-                &self.buffer,
-                &mut buffer,
-                mul as f32,
-                add as f32,
-            )
-            .unwrap();
-        } else {
-            let name = match self.dtype {
-                DType::F32 => "affine_float_strided",
-                DType::F16 => "affine_half_strided",
-                dtype => todo!("Affine {dtype:?}"),
-            };
-            candle_metal_kernels::call_affine_strided(
-                &device.device,
-                &command_buffer,
-                &device.kernels,
-                name,
-                layout.dims(),
-                &self.buffer,
-                layout.stride(),
-                layout.start_offset() * dtype.size_in_bytes(),
-                &mut buffer,
-                mul as f32,
-                add as f32,
-            )
-            .unwrap();
-        }
-        Ok(Self {
+        let command_buffer = self.device.command_queue.new_command_buffer();
+        candle_metal_kernels::call_affine(
+            &device.device,
+            &command_buffer,
+            &device.kernels,
+            el,
+            &self.buffer,
+            &mut buffer,
+            mul as f32,
+            add as f32,
+        )
+        .unwrap();
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
+        return Ok(Self {
            buffer,
            device: device.clone(),
            dtype,
-        })
+        });
    }

    fn powf(&self, _: &Layout, _: f64) -> Result<Self> {
@ -234,10 +175,10 @@ impl BackendStorage for MetalStorage {
    }

    fn reduce_op(&self, op: ReduceOp, layout: &Layout, sum_dims: &[usize]) -> Result<Self> {
+        // debug!("TODO reduce_op {op:?} {sum_dims:?}");
        assert!(sum_dims.len() == 1);
        assert!(sum_dims[0] == layout.shape().rank() - 1);
        assert!(layout.is_contiguous());
-        assert!(layout.start_offset() == 0);
        let device = self.device.clone();
        let src_stride = layout.stride();
        let src_dims = layout.shape().dims();
@ -273,7 +214,7 @@ impl BackendStorage for MetalStorage {
        }
        let dtype = if return_index { DType::U32 } else { self.dtype };
        let mut buffer = device.new_buffer(dst_el, dtype);
-        let command_buffer = self.device.command_buffer();
+        let command_buffer = self.device.command_queue.new_command_buffer();
        candle_metal_kernels::call_reduce_contiguous(
            &device.device,
            &command_buffer,
@ -285,6 +226,8 @@ impl BackendStorage for MetalStorage {
            &mut buffer,
        )
        .map_err(MetalError::from)?;
+        command_buffer.commit();
+        command_buffer.wait_until_completed();

        Ok(Self {
            buffer,
@ -302,12 +245,10 @@ impl BackendStorage for MetalStorage {
        let shape = layout.shape();
        let el_count = shape.elem_count();
        let mut buffer = device.new_buffer(el_count, dtype);
-        let command_buffer = device.command_buffer();
+        let command_buffer = device.command_queue.new_command_buffer();
        if layout.is_contiguous() {
            let kernel_name = match (self.dtype, dtype) {
                (DType::U32, DType::F32) => "cast_u32_f32",
-                (DType::F32, DType::F16) => "cast_f32_f16",
-                (DType::F16, DType::F32) => "cast_f16_f32",
                (left, right) => todo!("to dtype {left:?} - {right:?}"),
            };
            candle_metal_kernels::call_cast_contiguous(
@ -321,26 +262,22 @@ impl BackendStorage for MetalStorage {
            )
            .map_err(MetalError::from)?;
        } else {
-            let kernel_name = match (self.dtype, dtype) {
-                (DType::U32, DType::F32) => "cast_u32_f32_strided",
-                (DType::F32, DType::F16) => "cast_f32_f16_strided",
-                (DType::F16, DType::F32) => "cast_f16_f32_strided",
-                (left, right) => todo!("to dtype {left:?} - {right:?}"),
-            };
-            candle_metal_kernels::call_cast_strided(
-                &device.device,
-                &command_buffer,
-                &device.kernels,
-                kernel_name,
-                layout.dims(),
-                &self.buffer,
-                layout.stride(),
-                layout.start_offset() * self.dtype.size_in_bytes(),
-                &mut buffer,
-            )
-            .map_err(MetalError::from)?;
+            todo!(
+                "TODO Implement the kernel calling cast {:?}-{:?}",
+                self.dtype,
+                dtype
+            );
        }

+        command_buffer.commit();
+        command_buffer.wait_until_completed();
+        // command_buffer.wait_until_scheduled();
+        // debug!(
+        //     "cast {:?} - {:?} - {:?}",
+        //     dtype,
+        //     self.buffer.length(),
+        //     buffer.length()
+        // );
        Ok(Self {
            buffer,
            device: device.clone(),
@ -354,96 +291,35 @@ impl BackendStorage for MetalStorage {
        let shape = layout.shape();
        let el_count = shape.elem_count();
        let mut buffer = device.new_buffer(el_count, dtype);
-        {
-            let command_buffer = device.command_buffer();
-            if layout.is_contiguous() && layout.start_offset() == 0 {
-                use candle_metal_kernels::unary::contiguous;
+        let command_buffer = device.command_queue.new_command_buffer();
+        if layout.is_contiguous() {
+            use candle_metal_kernels::unary::contiguous;

-                let kernel_name = match (B::KERNEL, dtype) {
-                    ("ucos", DType::F32) => contiguous::cos::FLOAT,
-                    ("usin", DType::F32) => contiguous::sin::FLOAT,
-                    ("usqr", DType::F32) => contiguous::sqr::FLOAT,
-                    ("usqrt", DType::F32) => contiguous::sqrt::FLOAT,
-                    ("uneg", DType::F32) => contiguous::neg::FLOAT,
-                    ("uexp", DType::F32) => contiguous::exp::FLOAT,
-                    ("ulog", DType::F32) => contiguous::log::FLOAT,
-                    ("ugelu", DType::F32) => contiguous::gelu::FLOAT,
-                    ("ugelu_erf", DType::F32) => contiguous::gelu_erf::FLOAT,
-                    ("uerf", DType::F32) => contiguous::erf::FLOAT,
-                    ("uceil", DType::F32) => contiguous::ceil::FLOAT,
-                    ("ufloor", DType::F32) => contiguous::floor::FLOAT,
-                    ("uround", DType::F32) => contiguous::round::FLOAT,
-                    ("ucos", DType::F16) => contiguous::cos::HALF,
-                    ("usin", DType::F16) => contiguous::sin::HALF,
-                    ("usqr", DType::F16) => contiguous::sqr::HALF,
-                    ("usqrt", DType::F16) => contiguous::sqrt::HALF,
-                    ("uneg", DType::F16) => contiguous::neg::HALF,
-                    ("uexp", DType::F16) => contiguous::exp::HALF,
-                    ("ulog", DType::F16) => contiguous::log::HALF,
-                    ("ugelu", DType::F16) => contiguous::gelu::HALF,
-                    ("ugelu_erf", DType::F16) => contiguous::gelu_erf::HALF,
-                    ("uerf", DType::F16) => contiguous::erf::HALF,
-                    ("uceil", DType::F16) => contiguous::ceil::HALF,
-                    ("ufloor", DType::F16) => contiguous::floor::HALF,
-                    ("uround", DType::F16) => contiguous::round::HALF,
-                    (name, dtype) => todo!("Match {name} - {dtype:?}"),
-                };
-                candle_metal_kernels::call_unary_contiguous(
-                    &device.device,
-                    &command_buffer,
-                    &device.kernels,
-                    kernel_name,
-                    el_count,
-                    &self.buffer,
-                    &mut buffer,
-                )
-                .map_err(MetalError::from)?;
-            } else {
-                use candle_metal_kernels::unary::strided;
-                let kernel_name = match (B::KERNEL, dtype) {
-                    ("ucos", DType::F32) => strided::cos::FLOAT,
-                    ("usin", DType::F32) => strided::sin::FLOAT,
-                    ("usqr", DType::F32) => strided::sqr::FLOAT,
-                    ("usqrt", DType::F32) => strided::sqrt::FLOAT,
-                    ("uneg", DType::F32) => strided::neg::FLOAT,
-                    ("uexp", DType::F32) => strided::exp::FLOAT,
-                    ("ulog", DType::F32) => strided::log::FLOAT,
-                    ("ugelu", DType::F32) => strided::gelu::FLOAT,
-                    ("ugelu_erf", DType::F32) => strided::gelu_erf::FLOAT,
-                    ("uerf", DType::F32) => strided::erf::FLOAT,
-                    ("uceil", DType::F32) => strided::ceil::FLOAT,
-                    ("ufloor", DType::F32) => strided::floor::FLOAT,
-                    ("uround", DType::F32) => strided::round::FLOAT,
-                    ("ucos", DType::F16) => strided::cos::HALF,
-                    ("usin", DType::F16) => strided::sin::HALF,
-                    ("usqr", DType::F16) => strided::sqr::HALF,
-                    ("usqrt", DType::F16) => strided::sqrt::HALF,
-                    ("uneg", DType::F16) => strided::neg::HALF,
-                    ("uexp", DType::F16) => strided::exp::HALF,
-                    ("ulog", DType::F16) => strided::log::HALF,
-                    ("ugelu", DType::F16) => strided::gelu::HALF,
-                    ("ugelu_erf", DType::F16) => strided::gelu_erf::HALF,
-                    ("uerf", DType::F16) => strided::erf::HALF,
-                    ("uceil", DType::F16) => strided::ceil::HALF,
-                    ("ufloor", DType::F16) => strided::floor::HALF,
-                    ("uround", DType::F16) => strided::round::HALF,
-                    (name, dtype) => todo!("Match {name} - {dtype:?}"),
-                };
-                candle_metal_kernels::call_unary_strided(
-                    &device.device,
-                    &command_buffer,
-                    &device.kernels,
-                    kernel_name,
-                    layout.dims(),
-                    &self.buffer,
-                    layout.stride(),
-                    layout.start_offset() * self.dtype.size_in_bytes(),
-                    &mut buffer,
-                    0,
-                )
-                .map_err(MetalError::from)?;
-            }
+            let kernel_name = match (B::KERNEL, dtype) {
+                ("ucos", DType::F32) => contiguous::cos::FLOAT,
+                ("usin", DType::F32) => contiguous::sin::FLOAT,
+                ("usqr", DType::F32) => contiguous::sqr::FLOAT,
+                ("usqrt", DType::F32) => contiguous::sqrt::FLOAT,
+                ("uneg", DType::F32) => contiguous::neg::FLOAT,
+                ("uexp", DType::F32) => contiguous::exp::FLOAT,
+                (name, dtype) => todo!("Match {name} - {dtype:?}"),
+            };
+            candle_metal_kernels::call_unary_contiguous(
+                &device.device,
+                &command_buffer,
+                &device.kernels,
+                kernel_name,
+                el_count,
+                &self.buffer,
+                &mut buffer,
+            )
+            .map_err(MetalError::from)?;
+        } else {
+            todo!("TODO Implement the kernel calling {}", B::KERNEL);
        }
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
+
        Ok(Self {
            buffer,
            device: device.clone(),
@ -462,10 +338,8 @@ impl BackendStorage for MetalStorage {
        let shape = lhs_l.shape();
        let el_count = shape.elem_count();
        let mut buffer = device.new_buffer(el_count, dtype);
-        let command_buffer = device.command_buffer();
-        if (lhs_l.is_contiguous() && lhs_l.start_offset() == 0)
-            && (rhs_l.is_contiguous() && rhs_l.start_offset() == 0)
-        {
+        let command_buffer = device.command_queue.new_command_buffer();
+        if lhs_l.is_contiguous() && rhs_l.is_contiguous() {
            use candle_metal_kernels::binary::contiguous;

            let kernel_name = match (B::KERNEL, dtype) {
@ -477,14 +351,6 @@ impl BackendStorage for MetalStorage {
                ("bmul", DType::F32) => contiguous::mul::FLOAT,
                ("div", DType::F32) => contiguous::div::FLOAT,
                ("bdiv", DType::F32) => contiguous::div::FLOAT,
-                ("add", DType::F16) => contiguous::add::HALF,
-                ("badd", DType::F16) => contiguous::add::HALF,
-                ("sub", DType::F16) => contiguous::sub::HALF,
-                ("bsub", DType::F16) => contiguous::sub::HALF,
-                ("mul", DType::F16) => contiguous::mul::HALF,
-                ("bmul", DType::F16) => contiguous::mul::HALF,
-                ("div", DType::F16) => contiguous::div::HALF,
-                ("bdiv", DType::F16) => contiguous::div::HALF,
                (name, dtype) => todo!("Match {name} - {dtype:?}"),
            };
            candle_metal_kernels::call_binary_contiguous(
@ -506,10 +372,6 @@ impl BackendStorage for MetalStorage {
                ("bsub", DType::F32) => strided::sub::FLOAT,
                ("bmul", DType::F32) => strided::mul::FLOAT,
                ("bdiv", DType::F32) => strided::div::FLOAT,
-                ("badd", DType::F16) => strided::add::HALF,
-                ("bsub", DType::F16) => strided::sub::HALF,
-                ("bmul", DType::F16) => strided::mul::HALF,
-                ("bdiv", DType::F16) => strided::div::HALF,
                (name, dtype) => todo!("Match {name} - {dtype:?}"),
            };
            candle_metal_kernels::call_binary_strided(
@ -519,15 +381,18 @@ impl BackendStorage for MetalStorage {
                kernel_name,
                lhs_l.dims(),
                &self.buffer,
-                lhs_l.stride(),
-                lhs_l.start_offset() * self.dtype.size_in_bytes(),
+                &lhs_l.stride(),
+                lhs_l.start_offset(),
                &rhs.buffer,
-                rhs_l.stride(),
-                rhs_l.start_offset() * rhs.dtype.size_in_bytes(),
+                &rhs_l.stride(),
+                rhs_l.start_offset(),
                &mut buffer,
            )
            .map_err(MetalError::from)?;
        }
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
+
        Ok(Self {
            buffer,
            device: device.clone(),
@ -549,25 +414,24 @@ impl BackendStorage for MetalStorage {
        let el = shape.elem_count();
        let dtype = t.dtype;
        let mut buffer = self.device.new_buffer(el, dtype);
-        let command_buffer = self.device.command_buffer();
+        let command_buffer = self.device.command_queue.new_command_buffer();
        candle_metal_kernels::call_where_cond_strided(
            &device.device,
            &command_buffer,
            &device.kernels,
            "where_u8_f32",
-            dims,
+            &dims,
            &self.buffer,
-            (
-                layout.stride(),
-                layout.start_offset() * self.dtype.size_in_bytes(),
-            ),
+            (layout.stride(), layout.start_offset()),
            &t.buffer,
-            (&t_l.stride(), t_l.start_offset() * t.dtype.size_in_bytes()),
+            (&t_l.stride(), t_l.start_offset()),
            &f.buffer,
-            (&f_l.stride(), f_l.start_offset() * f.dtype.size_in_bytes()),
+            (&f_l.stride(), f_l.start_offset()),
            &mut buffer,
        )
        .map_err(MetalError::from)?;
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
        Ok(Self {
            buffer,
            device,
@ -649,9 +513,7 @@ impl BackendStorage for MetalStorage {

    fn index_select(&self, ids: &Self, src_l: &Layout, ids_l: &Layout, dim: usize) -> Result<Self> {
        assert!(src_l.is_contiguous());
-        assert!(src_l.start_offset() == 0);
        assert!(ids_l.is_contiguous());
-        assert!(ids_l.start_offset() == 0);
        let left_size: usize = src_l.dims()[..dim].iter().product();
        let right_size: usize = src_l.dims()[dim + 1..].iter().product();
        let ids_el = ids_l.shape().elem_count();
@ -659,12 +521,13 @@ impl BackendStorage for MetalStorage {
        let dtype = self.dtype;
        let device = self.device();
        let mut buffer = device.new_buffer(dst_el, dtype);
+        let out = self.to_cpu_storage().unwrap();
        let name = match (ids.dtype, self.dtype) {
            (DType::U32, DType::F32) => "is_u32_f32",
-            (DType::U32, DType::F16) => "is_u32_f16",
            (left, right) => todo!("index select metal {left:?} {right:?}"),
        };
-        let command_buffer = self.device.command_buffer();
+        let command_buffer = self.device.command_queue.new_command_buffer();
+        // println!("INDEX SELECT");
        candle_metal_kernels::call_index_select(
            &device.device,
            &command_buffer,
@ -678,6 +541,8 @@ impl BackendStorage for MetalStorage {
            &mut buffer,
        )
        .map_err(MetalError::from)?;
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
        Ok(Self {
            buffer,
            device: device.clone(),
@ -706,18 +571,8 @@ impl BackendStorage for MetalStorage {
    ) -> Result<Self> {
        // Create descriptors
        use metal::mps::matrix::*;
-
-        let (type_id, size) = match self.dtype {
-            DType::F32 => (
-                metal::mps::MPS_FLOATBIT_ENCODING | 32,
-                core::mem::size_of::<f32>() as NSUInteger,
-            ),
-            DType::F16 => (
-                metal::mps::MPS_FLOATBIT_ENCODING | 16,
-                core::mem::size_of::<f16>() as NSUInteger,
-            ),
-            dtype => todo!("Dtype for matmul {dtype:?} is not supported"),
-        };
+        let type_id = metal::mps::MPS_FLOATBIT_ENCODING | 32;
+        let size = core::mem::size_of::<f32>() as NSUInteger;

        let elem_count = b * m * n;

@ -750,26 +605,7 @@ impl BackendStorage for MetalStorage {
                mnk: (m, n, k),
            })?
        };
-        let stride_left: u64 = match lhs_stride[..lhs_stride.len() - 2] {
-            [s1, stride] if s1 == stride * lhs_l.dims()[1] => stride,
-            [stride] => stride,
-            [] => m * k,
-            _ => Err(MetalError::MatMulNonContiguous {
-                lhs_stride: lhs_stride.to_vec(),
-                rhs_stride: rhs_stride.to_vec(),
-                mnk: (m, n, k),
-            })?,
-        } as u64;
-        let stride_right: u64 = match rhs_stride[..rhs_stride.len() - 2] {
-            [s1, stride] if s1 == stride * rhs_l.dims()[1] => stride,
-            [stride] => stride,
-            [] => n * k,
-            _ => Err(MetalError::MatMulNonContiguous {
-                lhs_stride: lhs_stride.to_vec(),
-                rhs_stride: rhs_stride.to_vec(),
-                mnk: (m, n, k),
-            })?,
-        } as u64;
+        // println!("{transpose_left} {transpose_right}");

        let b = b as NSUInteger;
        let m = m as NSUInteger;
@ -788,64 +624,56 @@ impl BackendStorage for MetalStorage {
        };
        let result_descriptor = MatrixDescriptor::init_single(m, n, n * size, type_id);

+        // Create matrix objects
+        let left_matrix = Matrix::init_with_buffer_descriptor(&self.buffer, &left_descriptor)
+            .ok_or_else(|| {
+                MetalError::from("Failed to create matrix multiplication kernel".to_string())
+            })?;
+        let right_matrix = Matrix::init_with_buffer_descriptor(&rhs.buffer, &right_descriptor)
+            .ok_or_else(|| {
+                MetalError::from("Failed to create matrix multiplication kernel".to_string())
+            })?;
+
        let out_buffer = self.device.new_buffer(elem_count, self.dtype);
+        let result_matrix = Matrix::init_with_buffer_descriptor(&out_buffer, &result_descriptor)
+            .ok_or_else(|| {
+                MetalError::from("Failed to create matrix multiplication kernel".to_string())
+            })?;

-        {
-            let command_buffer = self.device.command_buffer();
-            for bi in 0..b {
-                // Create matrix objects
-                let left_matrix = Matrix::init_with_buffer_descriptor(
-                    &self.buffer,
-                    (bi * stride_left + lhs_l.start_offset() as u64) * size,
-                    &left_descriptor,
-                )
-                .ok_or_else(|| {
-                    MetalError::from("Failed to create matrix multiplication kernel".to_string())
-                })?;
-                let right_matrix = Matrix::init_with_buffer_descriptor(
-                    &rhs.buffer,
-                    (bi * stride_right + rhs_l.start_offset() as u64) * size,
-                    &right_descriptor,
-                )
-                .ok_or_else(|| {
-                    MetalError::from("Failed to create matrix multiplication kernel".to_string())
-                })?;
+        let alpha = 1.0f64;
+        let beta = 0.0f64;
+        // Create kernel
+        let matrix_multiplication = MatrixMultiplication::init(
+            &self.device,
+            transpose_left,
+            transpose_right,
+            m,
+            n,
+            k,
+            alpha,
+            beta,
+        )
+        .ok_or_else(|| {
+            MetalError::from("Failed to create matrix multiplication kernel".to_string())
+        })?;

-                let result_matrix = Matrix::init_with_buffer_descriptor(
-                    &out_buffer,
-                    bi * m * n * size,
-                    &result_descriptor,
-                )
-                .ok_or_else(|| {
-                    MetalError::from("Failed to create matrix multiplication kernel".to_string())
-                })?;
+        matrix_multiplication.set_batch_size(b);

-                let alpha = 1.0f64;
-                let beta = 0.0f64;
-                // Create kernel
-                let matrix_multiplication = MatrixMultiplication::init(
-                    &self.device,
-                    transpose_left,
-                    transpose_right,
-                    m,
-                    n,
-                    k,
-                    alpha,
-                    beta,
-                )
-                .ok_or_else(|| {
-                    MetalError::from("Failed to create matrix multiplication kernel".to_string())
-                })?;
+        // Encode kernel to command buffer
+        let command_buffer = self.device.command_queue.new_command_buffer();
+        matrix_multiplication.encode_to_command_buffer(
+            command_buffer,
+            &left_matrix,
+            &right_matrix,
+            &result_matrix,
+        );
+        command_buffer.commit();
+        command_buffer.wait_until_completed();

-                // Encode kernel to command buffer
-                matrix_multiplication.encode_to_command_buffer(
-                    &command_buffer,
-                    &left_matrix,
-                    &right_matrix,
-                    &result_matrix,
-                );
-            }
-        }
+        // let left = self.buffer.read_to_vec::<f32>(10);
+        // let right = rhs.buffer.read_to_vec::<f32>(10);
+        // let out = out_buffer.read_to_vec::<f32>(40);
+        // todo!("Out {left:?} {right:?} {out:?}");

        Ok(Self {
            buffer: out_buffer,
@ -860,12 +688,11 @@ impl BackendStorage for MetalStorage {
        if el_count == 0 {
            return Ok(());
        }
-        let command_buffer = self.device.command_buffer();
+        let command_buffer = self.device.command_queue.new_command_buffer();
        let kernel_name = match self.dtype {
            DType::F32 => candle_metal_kernels::unary::strided::copy::FLOAT,
            DType::F16 => candle_metal_kernels::unary::strided::copy::HALF,
            DType::BF16 => candle_metal_kernels::unary::strided::copy::BFLOAT,
-            DType::U32 => candle_metal_kernels::unary::strided::copy::U32,
            dtype => todo!("copy_strided not implemented for {dtype:?}"),
        };
        candle_metal_kernels::call_unary_strided(
@ -875,12 +702,16 @@ impl BackendStorage for MetalStorage {
            kernel_name,
            src_l.dims(),
            &self.buffer,
-            src_l.stride(),
-            src_l.start_offset() * self.dtype.size_in_bytes(),
+            &src_l.stride(),
+            src_l.start_offset(),
            &mut dst.buffer,
-            dst_offset * dst.dtype.size_in_bytes(),
+            dst_offset,
        )
        .map_err(MetalError::from)?;
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
+        // todo!("Output {:?}", dst.buffer.read_to_vec::<f32>(10));
+        // }
        Ok(())
    }
 }
@ -905,22 +736,24 @@ impl BackendDevice for MetalDevice {
    fn new(ordinal: usize) -> Result<Self> {
        let device = metal::Device::all().swap_remove(ordinal);

-        let command_queue = device.new_command_queue();
+        // let capture = metal::CaptureManager::shared();
+        // let descriptor = metal::CaptureDescriptor::new();
+        // descriptor.set_destination(metal::MTLCaptureDestination::GpuTraceDocument);
+        // descriptor.set_capture_device(&device);
+        // let mut dir = std::env::current_dir()?;
+        // dir.push("out.gputrace");
+        // descriptor.set_output_url(dir);

-        let descriptor = HeapDescriptor::new();
-        let mut size =
-            device.heap_buffer_size_and_align(100_000_000, MTLResourceOptions::StorageModeShared);
-        size.size += (size.size & (size.align - 1)) + size.align;
-        descriptor.set_size(size.size);
-        descriptor.set_storage_mode(metal::MTLStorageMode::Shared);
-        let heap = device.new_heap(&descriptor);
-        let command_buffer = Arc::new(RwLock::new(command_queue.new_command_buffer().to_owned()));
+        // capture
+        //     .start_capture(&descriptor)
+        //     .map_err(MetalError::from)?;
+        let command_queue = device.new_command_queue();
+        // let command_buffer = _command_queue.new_owned_command_buffer();
        let kernels = Arc::new(Kernels::new());
        Ok(Self {
            device,
-            heap,
            command_queue,
-            command_buffer,
+            // command_buffer,
            kernels,
        })
    }
@ -940,12 +773,9 @@ impl BackendDevice for MetalDevice {
    }

    fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<MetalStorage> {
-        let buffer = self.new_buffer(shape.elem_count(), dtype);
-        Ok(MetalStorage {
-            buffer,
-            device: self.clone(),
-            dtype,
-        })
+        // TODO Is there a faster way ?
+        let cpu_storage = crate::cpu_backend::CpuDevice.zeros_impl(shape, dtype)?;
+        self.storage_from_cpu_storage(&cpu_storage)
    }

    fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result<Self::Storage> {
@ -955,15 +785,47 @@ impl BackendDevice for MetalDevice {
    }

    fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<Self::Storage> {
+        let option = metal::MTLResourceOptions::StorageModeManaged;
        let buffer = match storage {
-            CpuStorage::U8(storage) => self.new_buffer_with_data(storage),
-            CpuStorage::U32(storage) => self.new_buffer_with_data(storage),
-            CpuStorage::I64(storage) => self.new_buffer_with_data(storage),
-            CpuStorage::BF16(storage) => self.new_buffer_with_data(storage),
-            CpuStorage::F16(storage) => self.new_buffer_with_data(storage),
-            CpuStorage::F32(storage) => self.new_buffer_with_data(storage),
-            CpuStorage::F64(storage) => self.new_buffer_with_data(storage),
+            CpuStorage::U8(storage) => self.device.new_buffer_with_data(
+                storage.as_ptr() as *const core::ffi::c_void,
+                (storage.len() * mem::size_of::<u8>()) as NSUInteger,
+                option,
+            ),
+            CpuStorage::U32(storage) => self.device.new_buffer_with_data(
+                storage.as_ptr() as *const core::ffi::c_void,
+                (storage.len() * mem::size_of::<u32>()) as NSUInteger,
+                option,
+            ),
+            CpuStorage::I64(storage) => self.device.new_buffer_with_data(
+                storage.as_ptr() as *const core::ffi::c_void,
+                (storage.len() * mem::size_of::<i64>()) as NSUInteger,
+                option,
+            ),
+            CpuStorage::BF16(storage) => self.device.new_buffer_with_data(
+                storage.as_ptr() as *const core::ffi::c_void,
+                (storage.len() * mem::size_of::<bf16>()) as NSUInteger,
+                option,
+            ),
+            CpuStorage::F16(storage) => self.device.new_buffer_with_data(
+                storage.as_ptr() as *const core::ffi::c_void,
+                (storage.len() * mem::size_of::<f16>()) as NSUInteger,
+                option,
+            ),
+            CpuStorage::F32(storage) => self.device.new_buffer_with_data(
+                storage.as_ptr() as *const core::ffi::c_void,
+                (storage.len() * mem::size_of::<f32>()) as NSUInteger,
+                option,
+            ),
+            CpuStorage::F64(storage) => self.device.new_buffer_with_data(
+                storage.as_ptr() as *const core::ffi::c_void,
+                (storage.len() * mem::size_of::<f64>()) as NSUInteger,
+                option,
+            ),
        };
+        // TODO is that necessary ?
+        // buffer.did_modify_range(metal::NSRange::new(0, buffer.length()));
+        // debug!("Allocate 2 - buffer size {}", buffer.length());
        Ok(Self::Storage {
            buffer,
            device: self.clone(),
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@ -593,8 +593,7 @@ unary_op!(Recip, "recip", v, v.recip());
 unary_op!(Sqr, "sqr", v, v * v, vs_sqr, vd_sqr);
 unary_op!(Sqrt, "sqrt", v, v.sqrt(), vs_sqrt, vd_sqrt);

-/// Tanh based approximation of the `gelu` operation
-/// GeluErf is the more precise one.
+/// `gelu` operation
 /// <https://en.wikipedia.org/wiki/Activation_function#Comparison_of_activation_functions>
 impl UnaryOpT for Gelu {
    const NAME: &'static str = "gelu";
--- a/candle-core/src/tensor.rs
+++ b/candle-core/src/tensor.rs
@ -157,6 +157,8 @@ pub(crate) fn from_storage<S: Into<Shape>>(
 ) -> Tensor {
    let dtype = storage.dtype();
    let device = storage.device();
+    let shape = shape.into();
+    // println!("{:?} {storage:?}", shape);
    let tensor_ = Tensor_ {
        id: TensorId::new(),
        storage: Arc::new(RwLock::new(storage)),
@ -166,7 +168,11 @@ pub(crate) fn from_storage<S: Into<Shape>>(
        dtype,
        device,
    };
-    Tensor(Arc::new(tensor_))
+    let result = Tensor(Arc::new(tensor_));
+    // todo!(" from_storage");
+    // let result = result.to_device(&Device::Cpu).unwrap();
+    // todo!(" {result}");
+    result
 }

 impl Tensor {
@ -856,20 +862,6 @@ impl Tensor {
        self.sum_impl(mean_dims, false)? * scale
    }

-    /// Returns the unbiased variance over the selected dimension.
-    pub fn var_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
-        let dim = dim.to_index(self.shape(), "var")?;
-        let mean = self.mean_keepdim(dim)?;
-        let squares = self.broadcast_sub(&mean)?.sqr()?;
-        squares.sum_impl(dim, true)? / (self.dim(dim)? - 1) as f64
-    }
-
-    /// Returns the unbiased variance over the selected dimension.
-    pub fn var<D: Dim>(&self, dim: D) -> Result<Self> {
-        let dim = dim.to_index(self.shape(), "var")?;
-        self.var_keepdim(dim)?.squeeze(dim)
-    }
-
    /// Gathers the maximum value across the selected dimension. The resulting shape has the same
    /// number of dimensions as the original tensor and the select dimension has a single element.
    pub fn max_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
@ -1863,10 +1855,7 @@ impl Tensor {
                    Storage::Metal(metal.storage_from_cpu_storage(storage)?)
                }
                (Storage::Cuda(storage), Device::Cpu) => Storage::Cpu(storage.to_cpu_storage()?),
-                (Storage::Metal(storage), Device::Cpu) => {
-                    println!("{storage:?} - {:?}", storage.to_cpu_storage()?);
-                    Storage::Cpu(storage.to_cpu_storage()?)
-                }
+                (Storage::Metal(storage), Device::Cpu) => Storage::Cpu(storage.to_cpu_storage()?),
                (Storage::Cuda(storage), Device::Cuda(cuda)) => {
                    // TODO: Avoid passing through the cpu storage here, especially if the gpu ids
                    // are the same.
--- a/candle-core/src/test_utils.rs
+++ b/candle-core/src/test_utils.rs
@ -4,7 +4,7 @@ use crate::{Result, Tensor};
 macro_rules! test_device {
    // TODO: Switch to generating the two last arguments automatically once concat_idents is
    // stable. https://github.com/rust-lang/rust/issues/29599
-    ($fn_name: ident, $test_cpu: ident, $test_cuda: ident, $test_metal: ident) => {
+    ($fn_name: ident, $test_cpu: ident, $test_cuda: ident) => {
        #[test]
        fn $test_cpu() -> Result<()> {
            $fn_name(&Device::Cpu)
@ -15,12 +15,6 @@ macro_rules! test_device {
        fn $test_cuda() -> Result<()> {
            $fn_name(&Device::new_cuda(0)?)
        }
-
-        #[cfg(feature = "metal")]
-        #[test]
-        fn $test_metal() -> Result<()> {
-            $fn_name(&Device::new_metal(0)?)
-        }
    };
 }

--- a/candle-core/tests/conv_tests.rs
+++ b/candle-core/tests/conv_tests.rs
@ -563,35 +563,14 @@ fn conv2d_grad(dev: &Device) -> Result<()> {
    Ok(())
 }

-test_device!(conv1d, conv1d_cpu, conv1d_gpu, conv1d_metal);
-test_device!(
-    conv1d_small,
-    conv1d_small_cpu,
-    conv1d_small_gpu,
-    conv1d_small_metal
-);
-test_device!(conv2d, conv2d_cpu, conv2d_gpu, conv2d_metal);
+test_device!(conv1d, conv1d_cpu, conv1d_gpu);
+test_device!(conv1d_small, conv1d_small_cpu, conv1d_small_gpu);
+test_device!(conv2d, conv2d_cpu, conv2d_gpu);
 test_device!(
    conv2d_non_square,
    conv2d_non_square_cpu,
-    conv2d_non_square_gpu,
-    conv2d_non_square_metal
-);
-test_device!(
-    conv2d_small,
-    conv2d_small_cpu,
-    conv2d_small_gpu,
-    conv2d_small_metal
-);
-test_device!(
-    conv2d_smaller,
-    conv2d_smaller_cpu,
-    conv2d_smaller_gpu,
-    conv2d_smaller_metal
-);
-test_device!(
-    conv2d_grad,
-    conv2d_grad_cpu,
-    conv2d_grad_gpu,
-    conv2_grad_metal
+    conv2d_non_square_gpu
 );
+test_device!(conv2d_small, conv2d_small_cpu, conv2d_small_gpu);
+test_device!(conv2d_smaller, conv2d_smaller_cpu, conv2d_smaller_gpu);
+test_device!(conv2d_grad, conv2d_grad_cpu, conv2d_grad_gpu);
--- a/candle-core/tests/grad_tests.rs
+++ b/candle-core/tests/grad_tests.rs
@ -315,29 +315,9 @@ fn binary_grad(device: &Device) -> Result<()> {
    Ok(())
 }

-test_device!(
-    simple_grad,
-    simple_grad_cpu,
-    simple_grad_gpu,
-    simple_grad_metal
-);
-test_device!(sum_grad, sum_grad_cpu, sum_grad_gpu, sum_grad_metal);
-test_device!(
-    matmul_grad,
-    matmul_grad_cpu,
-    matmul_grad_gpu,
-    matmul_grad_metal
-);
-test_device!(
-    grad_descent,
-    grad_descent_cpu,
-    grad_descent_gpu,
-    grad_descent_metal
-);
-test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu, unary_grad_metal);
-test_device!(
-    binary_grad,
-    binary_grad_cpu,
-    binary_grad_gpu,
-    binary_grad_metal
-);
+test_device!(simple_grad, simple_grad_cpu, simple_grad_gpu);
+test_device!(sum_grad, sum_grad_cpu, sum_grad_gpu);
+test_device!(matmul_grad, matmul_grad_cpu, matmul_grad_gpu);
+test_device!(grad_descent, grad_descent_cpu, grad_descent_gpu);
+test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu);
+test_device!(binary_grad, binary_grad_cpu, binary_grad_gpu);
--- a/candle-core/tests/layout_tests.rs
+++ b/candle-core/tests/layout_tests.rs
@ -49,7 +49,7 @@ fn contiguous(device: &Device) -> Result<()> {
    Ok(())
 }

-test_device!(contiguous, contiguous_cpu, contiguous_gpu, contiguous_metal);
+test_device!(contiguous, contiguous_cpu, contiguous_gpu);

 #[test]
 fn strided_blocks() -> Result<()> {
--- a/candle-core/tests/pool_tests.rs
+++ b/candle-core/tests/pool_tests.rs
@ -98,17 +98,15 @@ fn upsample_nearest2d(dev: &Device) -> Result<()> {
    Ok(())
 }

-test_device!(avg_pool2d, avg_pool2d_cpu, avg_pool2d_gpu, avg_pool2d_metal);
+test_device!(avg_pool2d, avg_pool2d_cpu, avg_pool2d_gpu);
 test_device!(
    avg_pool2d_pytorch,
    avg_pool2d_pytorch_cpu,
-    avg_pool2d_pytorch_gpu,
-    avg_pool2d_pytorch_metal
+    avg_pool2d_pytorch_gpu
 );
-test_device!(max_pool2d, max_pool2d_cpu, max_pool2d_gpu, max_pool2d_metal);
+test_device!(max_pool2d, max_pool2d_cpu, max_pool2d_gpu);
 test_device!(
    upsample_nearest2d,
    upsample_nearest2d_cpu,
-    upsample_nearest2d_gpu,
-    upsample_nearest2d_metal
+    upsample_nearest2d_gpu
 );
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@ -180,22 +180,6 @@ fn transpose(device: &Device) -> Result<()> {
    Ok(())
 }

-fn var(device: &Device) -> Result<()> {
-    // Values taken from https://pytorch.org/docs/stable/generated/torch.var.html
-    let data = &[
-        [0.2035f32, 1.2959, 1.8101, -0.4644],
-        [1.5027, -0.3270, 0.5905, 0.6538],
-        [-1.5745, 1.3330, -0.5596, -0.6548],
-        [0.1264, -0.5080, 1.6420, 0.1992],
-    ];
-    let tensor = Tensor::new(data, device)?;
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.var_keepdim(1)?, 4)?,
-        &[[1.0631], [0.559], [1.4893], [0.8258]]
-    );
-    Ok(())
-}
-
 fn sum(device: &Device) -> Result<()> {
    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
    let tensor = Tensor::new(data, device)?;
@ -1070,60 +1054,34 @@ fn randn(device: &Device) -> Result<()> {
    Ok(())
 }

-test_device!(zeros, zeros_cpu, zeros_gpu, zeros_metal);
-test_device!(ones, ones_cpu, ones_gpu, ones_metal);
-test_device!(arange, arange_cpu, arange_gpu, arange_metal);
-test_device!(add_mul, add_mul_cpu, add_mul_gpu, add_mul_metal);
-test_device!(tensor_2d, tensor_2d_cpu, tensor_2d_gpu, tensor_2d_metal);
-test_device!(narrow, narrow_cpu, narrow_gpu, narrow_metal);
-test_device!(broadcast, broadcast_cpu, broadcast_gpu, broadcast_metal);
-test_device!(cat, cat_cpu, cat_gpu, cat_metal);
-test_device!(sum, sum_cpu, sum_gpu, sum_metal);
-test_device!(min, min_cpu, min_gpu, min_metal);
-test_device!(max, max_cpu, max_gpu, max_metal);
-test_device!(argmax, argmax_cpu, argmax_gpu, argmax_metal);
-test_device!(argmin, argmin_cpu, argmin_gpu, argmin_metal);
-test_device!(transpose, transpose_cpu, transpose_gpu, transpose_metal);
-test_device!(unary_op, unary_op_cpu, unary_op_gpu, unary_op_metal);
-test_device!(binary_op, binary_op_cpu, binary_op_gpu, binary_op_metal);
-test_device!(embeddings, embeddings_cpu, embeddings_gpu, embeddings_metal);
-test_device!(cmp, cmp_cpu, cmp_gpu, cmp_metal);
-test_device!(matmul, matmul_cpu, matmul_gpu, matmul_metal);
-test_device!(
-    broadcast_matmul,
-    broadcast_matmul_cpu,
-    broadcast_matmul_gpu,
-    broadcast_matmul_metal
-);
-test_device!(
-    broadcasting,
-    broadcasting_cpu,
-    broadcasting_gpu,
-    broadcasting_metal
-);
-test_device!(
-    index_select,
-    index_select_cpu,
-    index_select_gpu,
-    index_select_metal
-);
-test_device!(index_add, index_add_cpu, index_add_gpu, index_add_metal);
-test_device!(gather, gather_cpu, gather_gpu, gather_metal);
-test_device!(
-    scatter_add,
-    scatter_add_cpu,
-    scatter_add_gpu,
-    scatter_add_metal
-);
-test_device!(
-    slice_scatter,
-    slice_scatter_cpu,
-    slice_scatter_gpu,
-    slice_scatter_metal
-);
-test_device!(randn, randn_cpu, randn_gpu, randn_metal);
-test_device!(clamp, clamp_cpu, clamp_gpu, clamp_metal);
-test_device!(var, var_cpu, var_gpu, var_metal);
+test_device!(zeros, zeros_cpu, zeros_gpu);
+test_device!(ones, ones_cpu, ones_gpu);
+test_device!(arange, arange_cpu, arange_gpu);
+test_device!(add_mul, add_mul_cpu, add_mul_gpu);
+test_device!(tensor_2d, tensor_2d_cpu, tensor_2d_gpu);
+test_device!(narrow, narrow_cpu, narrow_gpu);
+test_device!(broadcast, broadcast_cpu, broadcast_gpu);
+test_device!(cat, cat_cpu, cat_gpu);
+test_device!(sum, sum_cpu, sum_gpu);
+test_device!(min, min_cpu, min_gpu);
+test_device!(max, max_cpu, max_gpu);
+test_device!(argmax, argmax_cpu, argmax_gpu);
+test_device!(argmin, argmin_cpu, argmin_gpu);
+test_device!(transpose, transpose_cpu, transpose_gpu);
+test_device!(unary_op, unary_op_cpu, unary_op_gpu);
+test_device!(binary_op, binary_op_cpu, binary_op_gpu);
+test_device!(embeddings, embeddings_cpu, embeddings_gpu);
+test_device!(cmp, cmp_cpu, cmp_gpu);
+test_device!(matmul, matmul_cpu, matmul_gpu);
+test_device!(broadcast_matmul, broadcast_matmul_cpu, broadcast_matmul_gpu);
+test_device!(broadcasting, broadcasting_cpu, broadcasting_gpu);
+test_device!(index_select, index_select_cpu, index_select_gpu);
+test_device!(index_add, index_add_cpu, index_add_gpu);
+test_device!(gather, gather_cpu, gather_gpu);
+test_device!(scatter_add, scatter_add_cpu, scatter_add_gpu);
+test_device!(slice_scatter, slice_scatter_cpu, slice_scatter_gpu);
+test_device!(randn, randn_cpu, randn_gpu);
+test_device!(clamp, clamp_cpu, clamp_gpu);

 // There was originally a bug on the CPU implementation for randn
 // https://github.com/huggingface/candle/issues/381
--- a/candle-examples/examples/llama2-c/main.rs
+++ b/candle-examples/examples/llama2-c/main.rs
@ -329,14 +329,18 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {
        .get_ids()
        .to_vec();

+    println!("{tokens:?}");
+
    let start_gen = std::time::Instant::now();
-    for index in 0.. {
+    for index in 0..1 {
        if tokens.len() >= config.seq_len {
            break;
        }
        let context_size = if index > 0 { 1 } else { tokens.len() };
        let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
        let input = Tensor::new(ctxt, &device)?.unsqueeze(0)?;
+        // println!("Input {}", input);
+        // println!("Input {}", input.to_device(&candle::Device::Cpu)?);
        let logits = model.forward(&input, index_pos)?;
        let logits = logits.i((0, logits.dim(1)? - 1))?;
        let logits = if common_args.repeat_penalty == 1. || tokens.is_empty() {
--- a/candle-examples/examples/quantized/main.rs
+++ b/candle-examples/examples/quantized/main.rs
@ -325,11 +325,10 @@ fn main() -> anyhow::Result<()> {
    };

    let mut pre_prompt_tokens = vec![];
-    for prompt_index in 0.. {
+    loop {
        let prompt_str = match &prompt {
            Prompt::One(prompt) => prompt.clone(),
            Prompt::Interactive | Prompt::Chat => {
-                let is_interactive = matches!(prompt, Prompt::Interactive);
                print!("> ");
                std::io::stdout().flush()?;
                let mut prompt = String::new();
@ -341,11 +340,7 @@ fn main() -> anyhow::Result<()> {
                    }
                }
                if args.which.is_zephyr() {
-                    if prompt_index == 0 || is_interactive {
-                        format!("<|system|>\n</s>\n<|user|>\n{prompt}</s>\n<|assistant|>",)
-                    } else {
-                        format!("<|user|>\n{prompt}</s>\n<|assistant|>")
-                    }
+                    format!("<|system|>\n</s>\n<|user|>\n{prompt}</s>\n<|assistant|>")
                } else if args.which.is_mistral() {
                    format!("[INST] {prompt} [/INST]")
                } else {
--- a/candle-examples/examples/t5/README.md
+++ b/candle-examples/examples/t5/README.md
@ -9,8 +9,6 @@ $ cargo run --example t5 --release -- --model-id "t5-small" --prompt "translate
 9 tokens generated (2.42 token/s)
 ```

-Variants such as [flan-t5](https://huggingface.co/google/flan-t5-small), [flan-ul2](https://huggingface.co/google/flan-ul2) (with `--revision "refs/pr/25"`), and [Co-EdIT](https://huggingface.co/grammarly/coedit-large) are also supported.
-
 ## Translation with [MADLAD-400](https://arxiv.org/abs/2309.04662)

 MADLAD-400 is a series of multilingual machine translation T5 models trained on 250 billion tokens covering over 450 languages using publicly available data. These models are competitive with significantly larger models.
@ -24,7 +22,7 @@ cargo run --example t5 --release  -- \
 Wie geht es dir, mein Freund?
 ```

-## Sentence embedding example
+## Sentence embedding example:

 ```bash
 $ cargo run --example t5 --release -- --model-id "t5-small" --prompt "A beautiful candle."
--- a/candle-examples/examples/t5/main.rs
+++ b/candle-examples/examples/t5/main.rs
@ -104,17 +104,6 @@ impl T5ModelBuilder {
                api.get("model-00004-of-00005.safetensors")?,
                api.get("model-00005-of-00005.safetensors")?,
            ]
-        } else if model_id == "google/flan-ul2" {
-            vec![
-                api.get("model-00001-of-00008.safetensors")?,
-                api.get("model-00002-of-00008.safetensors")?,
-                api.get("model-00003-of-00008.safetensors")?,
-                api.get("model-00004-of-00008.safetensors")?,
-                api.get("model-00005-of-00008.safetensors")?,
-                api.get("model-00006-of-00008.safetensors")?,
-                api.get("model-00007-of-00008.safetensors")?,
-                api.get("model-00008-of-00008.safetensors")?,
-            ]
        } else {
            vec![api.get("model.safetensors")?]
        };
--- a/candle-examples/examples/trocr/assets/trocr.png
+++ b/candle-examples/examples/trocr/assets/trocr.png
--- a/candle-examples/examples/trocr/image_processor.rs
+++ b/candle-examples/examples/trocr/image_processor.rs
@ -1,154 +0,0 @@
-use image::{DynamicImage, ImageBuffer};
-use serde::Deserialize;
-use std::collections::HashMap;
-
-use candle::{DType, Device, Result, Tensor};
-
-#[derive(Debug, Clone, PartialEq, Deserialize)]
-pub struct ProcessorConfig {
-    do_resize: bool,
-    height: u32,
-    width: u32,
-    do_rescale: bool,
-    do_normalize: bool,
-    image_mean: Vec<f32>,
-    image_std: Vec<f32>,
-}
-
-impl Default for ProcessorConfig {
-    fn default() -> Self {
-        Self {
-            do_resize: true,
-            height: 384,
-            width: 384,
-            do_rescale: true,
-            do_normalize: true,
-            image_mean: vec![0.5, 0.5, 0.5],
-            image_std: vec![0.5, 0.5, 0.5],
-        }
-    }
-}
-
-pub struct ViTImageProcessor {
-    do_resize: bool,
-    height: u32,
-    width: u32,
-    do_normalize: bool,
-    image_mean: Vec<f32>,
-    image_std: Vec<f32>,
-}
-
-impl ViTImageProcessor {
-    pub fn new(config: &ProcessorConfig) -> Self {
-        Self {
-            do_resize: config.do_resize,
-            height: config.height,
-            width: config.width,
-            do_normalize: config.do_normalize,
-            image_mean: config.image_mean.clone(),
-            image_std: config.image_std.clone(),
-        }
-    }
-
-    pub fn preprocess(&self, images: Vec<&str>) -> Result<Tensor> {
-        let height = self.height as usize;
-        let width = self.width as usize;
-        let channels = 3;
-
-        let images = self.load_images(images)?;
-
-        let resized_images: Vec<DynamicImage> = if self.do_resize {
-            images
-                .iter()
-                .map(|image| self.resize(image.clone(), None).unwrap())
-                .collect()
-        } else {
-            images
-        };
-
-        let normalized_images: Vec<Tensor> = if self.do_normalize {
-            resized_images
-                .iter()
-                .map(|image| self.normalize(image.clone(), None, None).unwrap())
-                .collect()
-        } else {
-            let resized_images: Vec<ImageBuffer<image::Rgb<u8>, Vec<u8>>> =
-                resized_images.iter().map(|image| image.to_rgb8()).collect();
-            let data = resized_images
-                .into_iter()
-                .map(|image| image.into_raw())
-                .collect::<Vec<Vec<u8>>>();
-
-            data.iter()
-                .map(|image| {
-                    Tensor::from_vec(image.clone(), (height, width, channels), &Device::Cpu)
-                        .unwrap()
-                        .permute((2, 0, 1))
-                        .unwrap()
-                })
-                .collect::<Vec<Tensor>>()
-        };
-
-        Tensor::stack(&normalized_images, 0)
-    }
-
-    fn resize(
-        &self,
-        image: image::DynamicImage,
-        size: Option<HashMap<String, u32>>,
-    ) -> Result<image::DynamicImage> {
-        let (height, width) = match &size {
-            Some(size) => (size.get("height").unwrap(), size.get("width").unwrap()),
-            None => (&self.height, &self.width),
-        };
-
-        let resized_image =
-            image.resize_exact(*width, *height, image::imageops::FilterType::Triangle);
-
-        Ok(resized_image)
-    }
-
-    fn normalize(
-        &self,
-        image: image::DynamicImage,
-        mean: Option<Vec<f32>>,
-        std: Option<Vec<f32>>,
-    ) -> Result<Tensor> {
-        let mean = match mean {
-            Some(mean) => mean,
-            None => self.image_mean.clone(),
-        };
-
-        let std = match std {
-            Some(std) => std,
-            None => self.image_std.clone(),
-        };
-
-        let mean = Tensor::from_vec(mean, (3, 1, 1), &Device::Cpu)?;
-        let std = Tensor::from_vec(std, (3, 1, 1), &Device::Cpu)?;
-
-        let image = image.to_rgb8();
-        let data = image.into_raw();
-
-        let height = self.height as usize;
-        let width = self.width as usize;
-        let channels = 3;
-
-        let data =
-            Tensor::from_vec(data, &[height, width, channels], &Device::Cpu)?.permute((2, 0, 1))?;
-
-        (data.to_dtype(DType::F32)? / 255.)?
-            .broadcast_sub(&mean)?
-            .broadcast_div(&std)
-    }
-
-    pub fn load_images(&self, image_path: Vec<&str>) -> Result<Vec<image::DynamicImage>> {
-        let mut images: Vec<image::DynamicImage> = Vec::new();
-        for path in image_path {
-            let img = image::io::Reader::open(path)?.decode().unwrap();
-            images.push(img);
-        }
-
-        Ok(images)
-    }
-}
--- a/candle-examples/examples/trocr/main.rs
+++ b/candle-examples/examples/trocr/main.rs
@ -1,132 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use anyhow::Error as E;
-use clap::{Parser, ValueEnum};
-
-use candle::{DType, Tensor};
-use candle_examples::token_output_stream::TokenOutputStream;
-use candle_nn::VarBuilder;
-use candle_transformers::models::trocr;
-
-use tokenizers::Tokenizer;
-mod image_processor;
-
-#[derive(Clone, Debug, Copy, ValueEnum)]
-enum Which {
-    Base,
-    Large,
-}
-
-#[derive(Parser, Debug)]
-struct Args {
-    #[arg(long)]
-    model: Option<String>,
-
-    /// Choose the variant of the model to run.
-    #[arg(long, default_value = "base")]
-    which: Which,
-
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Text to be translated
-    #[arg(long)]
-    image: String,
-}
-
-pub fn main() -> anyhow::Result<()> {
-    use hf_hub::api::sync::Api;
-    let args = Args::parse();
-
-    let tokenizer_dec = {
-        let tokenizer = Api::new()?
-            .model(String::from("ToluClassics/candle-trocr-tokenizer"))
-            .get("tokenizer.json")?;
-
-        Tokenizer::from_file(&tokenizer).map_err(E::msg)?
-    };
-
-    let mut tokenizer_dec = TokenOutputStream::new(tokenizer_dec);
-
-    let device = candle_examples::device(args.cpu)?;
-
-    let vb = {
-        let model = match args.model {
-            Some(model) => std::path::PathBuf::from(model),
-            None => match args.which {
-                Which::Base => Api::new()?
-                    .repo(hf_hub::Repo::with_revision(
-                        "microsoft/trocr-base-handwritten".to_string(),
-                        hf_hub::RepoType::Model,
-                        "refs/pr/3".to_string(),
-                    ))
-                    .get("model.safetensors")?,
-                Which::Large => Api::new()?
-                    .repo(hf_hub::Repo::with_revision(
-                        "microsoft/trocr-large-handwritten".to_string(),
-                        hf_hub::RepoType::Model,
-                        "refs/pr/6".to_string(),
-                    ))
-                    .get("model.safetensors")?,
-            },
-        };
-        println!("model: {:?}", model);
-        unsafe { VarBuilder::from_mmaped_safetensors(&[model], DType::F32, &device)? }
-    };
-
-    let encoder_config = match args.which {
-        Which::Base => candle_transformers::models::vit::Config::microsoft_trocr_base_handwritten(),
-        Which::Large => {
-            candle_transformers::models::vit::Config::microsoft_trocr_base_handwritten()
-        }
-    };
-
-    let decoder_config = trocr::TrOCRConfig::default();
-    let mut model = trocr::TrOCRModel::new(&encoder_config, &decoder_config, vb)?;
-
-    let config = image_processor::ProcessorConfig::default();
-    let processor = image_processor::ViTImageProcessor::new(&config);
-
-    let image = vec![args.image.as_str()];
-    let image = processor.preprocess(image)?;
-
-    let encoder_xs = model.encoder().forward(&image)?;
-
-    let mut logits_processor =
-        candle_transformers::generation::LogitsProcessor::new(1337, None, None);
-
-    let mut token_ids: Vec<u32> = vec![decoder_config.decoder_start_token_id];
-    for index in 0..1000 {
-        let context_size = if index >= 1 { 1 } else { token_ids.len() };
-        let start_pos = token_ids.len().saturating_sub(context_size);
-        let input_ids = Tensor::new(&token_ids[start_pos..], &device)?.unsqueeze(0)?;
-
-        let logits = model.decode(&input_ids, &encoder_xs, start_pos)?;
-
-        let logits = logits.squeeze(0)?;
-        let logits = logits.get(logits.dim(0)? - 1)?;
-        let token = logits_processor.sample(&logits)?;
-        token_ids.push(token);
-
-        if let Some(t) = tokenizer_dec.next_token(token)? {
-            use std::io::Write;
-            print!("{t}");
-            std::io::stdout().flush()?;
-        }
-        if token == decoder_config.eos_token_id {
-            break;
-        }
-    }
-
-    if let Some(rest) = tokenizer_dec.decode_rest().map_err(E::msg)? {
-        print!("{rest}");
-    }
-    println!();
-
-    Ok(())
-}
--- a/candle-examples/examples/trocr/readme.md
+++ b/candle-examples/examples/trocr/readme.md
@ -1,16 +0,0 @@
-# candle-trocr
-
-`TrOCR` is a transformer OCR Model. In this example it is used to
-transcribe image text. See the associated [model
-card](https://huggingface.co/microsoft/trocr-base-printed) for details on
-the model itself.
-
-## Running an example
-
-```bash
-cargo run --example trocr --release --  --which base --cpu --image assets/trocr.png
-```
-
-```
-<s> industry , Mr. Brown commented icily . " Let us have a</s>
-```
--- a/candle-examples/examples/yi/main.rs
+++ b/candle-examples/examples/yi/main.rs
@ -1,268 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use anyhow::{Error as E, Result};
-use clap::{Parser, ValueEnum};
-
-use candle_transformers::models::yi::{Config, Model};
-
-use candle::{DType, Device, Tensor};
-use candle_examples::token_output_stream::TokenOutputStream;
-use candle_nn::VarBuilder;
-use candle_transformers::generation::LogitsProcessor;
-use hf_hub::{api::sync::Api, Repo, RepoType};
-use tokenizers::Tokenizer;
-
-#[derive(Clone, Debug, Copy, PartialEq, Eq, ValueEnum)]
-enum Which {
-    #[value(name = "6b")]
-    L6b,
-    #[value(name = "34b")]
-    L34b,
-}
-
-struct TextGeneration {
-    model: Model,
-    device: Device,
-    tokenizer: TokenOutputStream,
-    logits_processor: LogitsProcessor,
-    repeat_penalty: f32,
-    repeat_last_n: usize,
-}
-
-impl TextGeneration {
-    #[allow(clippy::too_many_arguments)]
-    fn new(
-        model: Model,
-        tokenizer: Tokenizer,
-        seed: u64,
-        temp: Option<f64>,
-        top_p: Option<f64>,
-        repeat_penalty: f32,
-        repeat_last_n: usize,
-        device: &Device,
-    ) -> Self {
-        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
-        Self {
-            model,
-            tokenizer: TokenOutputStream::new(tokenizer),
-            logits_processor,
-            repeat_penalty,
-            repeat_last_n,
-            device: device.clone(),
-        }
-    }
-
-    fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
-        use std::io::Write;
-        self.tokenizer.clear();
-        let mut tokens = self
-            .tokenizer
-            .tokenizer()
-            .encode(prompt, true)
-            .map_err(E::msg)?
-            .get_ids()
-            .to_vec();
-        for &t in tokens.iter() {
-            if let Some(t) = self.tokenizer.next_token(t)? {
-                print!("{t}")
-            }
-        }
-        std::io::stdout().flush()?;
-
-        let mut generated_tokens = 0usize;
-        let eos_token = match self.tokenizer.get_token("</s>") {
-            Some(token) => token,
-            None => anyhow::bail!("cannot find the </s> token"),
-        };
-        let start_gen = std::time::Instant::now();
-        for index in 0..sample_len {
-            let context_size = if index > 0 { 1 } else { tokens.len() };
-            let start_pos = tokens.len().saturating_sub(context_size);
-            let ctxt = &tokens[start_pos..];
-            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
-            let logits = self.model.forward(&input, start_pos)?;
-            let logits = logits.squeeze(0)?.squeeze(0)?.to_dtype(DType::F32)?;
-            let logits = if self.repeat_penalty == 1. {
-                logits
-            } else {
-                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
-                candle_transformers::utils::apply_repeat_penalty(
-                    &logits,
-                    self.repeat_penalty,
-                    &tokens[start_at..],
-                )?
-            };
-
-            let next_token = self.logits_processor.sample(&logits)?;
-            tokens.push(next_token);
-            generated_tokens += 1;
-            if next_token == eos_token {
-                break;
-            }
-            if let Some(t) = self.tokenizer.next_token(next_token)? {
-                print!("{t}");
-                std::io::stdout().flush()?;
-            }
-        }
-        let dt = start_gen.elapsed();
-        if let Some(rest) = self.tokenizer.decode_rest().map_err(E::msg)? {
-            print!("{rest}");
-        }
-        std::io::stdout().flush()?;
-        println!(
-            "\n{generated_tokens} tokens generated ({:.2} token/s)",
-            generated_tokens as f64 / dt.as_secs_f64(),
-        );
-        Ok(())
-    }
-}
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Enable tracing (generates a trace-timestamp.json file).
-    #[arg(long)]
-    tracing: bool,
-
-    #[arg(long)]
-    prompt: String,
-
-    /// The temperature used to generate samples.
-    #[arg(long)]
-    temperature: Option<f64>,
-
-    /// Nucleus sampling probability cutoff.
-    #[arg(long)]
-    top_p: Option<f64>,
-
-    /// The seed to use when generating random samples.
-    #[arg(long, default_value_t = 299792458)]
-    seed: u64,
-
-    /// The length of the sample to generate (in tokens).
-    #[arg(long, short = 'n', default_value_t = 100)]
-    sample_len: usize,
-
-    #[arg(long, default_value = "01-ai/Yi-6B")]
-    model_id: String,
-
-    #[arg(long, default_value = "main")]
-    revision: String,
-
-    #[arg(long)]
-    tokenizer_file: Option<String>,
-
-    #[arg(long)]
-    weight_files: Option<String>,
-
-    /// Penalty to be applied for repeating tokens, 1. means no penalty.
-    #[arg(long, default_value_t = 1.1)]
-    repeat_penalty: f32,
-
-    /// The context size to consider for the repeat penalty.
-    #[arg(long, default_value_t = 64)]
-    repeat_last_n: usize,
-
-    /// The model size to use.
-    #[arg(long, default_value = "6b")]
-    which: Which,
-}
-
-fn main() -> Result<()> {
-    use tracing_chrome::ChromeLayerBuilder;
-    use tracing_subscriber::prelude::*;
-
-    let args = Args::parse();
-    let _guard = if args.tracing {
-        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
-        tracing_subscriber::registry().with(chrome_layer).init();
-        Some(guard)
-    } else {
-        None
-    };
-    println!(
-        "avx: {}, neon: {}, simd128: {}, f16c: {}",
-        candle::utils::with_avx(),
-        candle::utils::with_neon(),
-        candle::utils::with_simd128(),
-        candle::utils::with_f16c()
-    );
-    println!(
-        "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
-        args.temperature.unwrap_or(0.),
-        args.repeat_penalty,
-        args.repeat_last_n
-    );
-
-    let start = std::time::Instant::now();
-    let api = Api::new()?;
-    let repo = api.repo(Repo::with_revision(
-        args.model_id,
-        RepoType::Model,
-        args.revision,
-    ));
-    let tokenizer_filename = match args.tokenizer_file {
-        Some(file) => std::path::PathBuf::from(file),
-        None => repo.get("tokenizer.json")?,
-    };
-    let filenames = match args.weight_files {
-        Some(files) => files
-            .split(',')
-            .map(std::path::PathBuf::from)
-            .collect::<Vec<_>>(),
-        None => match args.which {
-            Which::L6b => vec![
-                repo.get("model-00001-of-00002.safetensors")?,
-                repo.get("model-00002-of-00002.safetensors")?,
-            ],
-            Which::L34b => vec![
-                repo.get("model-00001-of-00007.safetensors")?,
-                repo.get("model-00002-of-00007.safetensors")?,
-                repo.get("model-00003-of-00007.safetensors")?,
-                repo.get("model-00004-of-00007.safetensors")?,
-                repo.get("model-00005-of-00007.safetensors")?,
-                repo.get("model-00006-of-00007.safetensors")?,
-                repo.get("model-00007-of-00007.safetensors")?,
-            ],
-        },
-    };
-    println!("retrieved the files in {:?}", start.elapsed());
-    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
-
-    let start = std::time::Instant::now();
-    let config = match args.which {
-        Which::L6b => Config::config_6b(),
-        Which::L34b => Config::config_34b(),
-    };
-    let device = candle_examples::device(args.cpu)?;
-    let dtype = if device.is_cuda() {
-        DType::BF16
-    } else {
-        DType::F32
-    };
-    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
-    let model = Model::new(&config, vb)?;
-
-    println!("loaded the model in {:?}", start.elapsed());
-
-    let mut pipeline = TextGeneration::new(
-        model,
-        tokenizer,
-        args.seed,
-        args.temperature,
-        args.top_p,
-        args.repeat_penalty,
-        args.repeat_last_n,
-        &device,
-    );
-    pipeline.run(&args.prompt, args.sample_len)?;
-    Ok(())
-}
--- a/candle-flash-attn/src/lib.rs
+++ b/candle-flash-attn/src/lib.rs
@ -233,8 +233,8 @@ impl FlashAttnVarLen {

        let (seqlens_q, seqlens_q_layout) = self.seqlens_q.storage_and_layout();
        let seqlens_q = match &*seqlens_q {
+            candle::Storage::Cpu(_) => candle::bail!("seqlens_q must be a cuda tensor"),
            candle::Storage::Cuda(c) => c.as_cuda_slice::<u32>()?, // Should be i32!
-            _ => candle::bail!("seqlens_q must be a cuda tensor"),
        };
        let seqlens_q = match seqlens_q_layout.contiguous_offsets() {
            Some((o1, o2)) => seqlens_q.slice(o1..o2),
@ -243,8 +243,8 @@ impl FlashAttnVarLen {

        let (seqlens_k, seqlens_k_layout) = self.seqlens_k.storage_and_layout();
        let seqlens_k = match &*seqlens_k {
+            candle::Storage::Cpu(_) => candle::bail!("seqlens_k must be a cuda tensor"),
            candle::Storage::Cuda(c) => c.as_cuda_slice::<u32>()?, // Should be i32!
-            _ => candle::bail!("seqlens_k must be a cuda tensor"),
        };
        let seqlens_k = match seqlens_k_layout.contiguous_offsets() {
            Some((o1, o2)) => seqlens_k.slice(o1..o2),
--- a/candle-metal-kernels/examples/affine.rs
+++ b/candle-metal-kernels/examples/affine.rs
@ -50,7 +50,6 @@ fn run_affine_bench<T: Clone>(device: &Device, kernels: &Kernels, v: &[T]) {
                &device,
                command_buffer,
                &kernels,
-                "affine_float",
                v.len(),
                &input,
                &mut output,
--- a/candle-metal-kernels/examples/binary.rs
+++ b/candle-metal-kernels/examples/binary.rs
--- a/candle-metal-kernels/examples/cast.rs
+++ b/candle-metal-kernels/examples/cast.rs
--- a/candle-metal-kernels/examples/unary.rs
+++ b/candle-metal-kernels/examples/unary.rs
@ -147,7 +147,7 @@ fn run_unary_bench<T: Clone>(
        println!(
            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
            type_name::<T>().split("::").last().unwrap(),
-            kernel_name.0,
+            kernel_name.to_string(),
            v.len(),
            iterations,
            total_time,
@ -159,7 +159,7 @@ fn run_unary_bench<T: Clone>(
    let shape = vec![2, 5_000];
    let strides = vec![2, 1];
    let offset = 0;
-    for kernel_name in &strided {
+    for kernel_name in strided {
        let total_time = autoreleasepool(|| {
            let command_buffer = command_queue.new_command_buffer();
            let start = Instant::now();
@ -187,7 +187,7 @@ fn run_unary_bench<T: Clone>(
        println!(
            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
            type_name::<T>().split("::").last().unwrap(),
-            kernel_name.0,
+            kernel_name.to_string(),
            v.len(),
            iterations,
            total_time,
--- a/candle-metal-kernels/src/affine.metal
+++ b/candle-metal-kernels/src/affine.metal
@ -33,24 +33,6 @@ kernel void FN_NAME( \
    const TYPENAME a = TYPENAME(add); \
    output[id] = input[id] * m + a; \
 } \
-kernel void FN_NAME##_strided( \
-    constant size_t &dim, \
-    constant size_t &num_dims, \
-    constant size_t *dims, \
-    constant size_t *strides, \
-    constant float &mul, \
-    constant float &add, \
-    device const TYPENAME *input,  \
-    device TYPENAME *output, \
-    uint id [[ thread_position_in_grid ]] \
-) { \
-    if (id >= dim) { \
-        return; \
-    } \
-    const TYPENAME m = TYPENAME(mul); \
-    const TYPENAME a = TYPENAME(add); \
-    output[id] = input[get_strided_index(id, num_dims, dims, strides)] * m + a; \
-} \

 AFFINE(affine_float, float)
 AFFINE(affine_half, half)
--- a/candle-metal-kernels/src/cast.metal
+++ b/candle-metal-kernels/src/cast.metal
@ -46,8 +46,6 @@ kernel void FN_NAME_STRIDED( \
 } \

 CAST(cast_u32_f32, cast_u32_f32_strided, int32_t, float)
-CAST(cast_f16_f32, cast_f16_f32_strided, half, float)
-CAST(cast_f32_f16, cast_f32_f16_strided, float, half)

 #if __METAL_VERSION__ >= 310
 #endif
--- a/candle-metal-kernels/src/indexing.metal
+++ b/candle-metal-kernels/src/indexing.metal
@ -16,16 +16,16 @@ kernel void NAME( \
    if (gid >= dst_size) { \
        return; \
    } \
-    const size_t id_i = (gid / right_size) % ids_size; \
-    const INDEX_TYPENAME input_i = min(input_ids[id_i], (INDEX_TYPENAME)(src_dim_size - 1)); \
+    const size_t id_i = gid / right_size / left_size; \
    const size_t right_rank_i = gid % right_size; \
-    const size_t left_rank_i = gid / right_size / ids_size; \
+    const size_t left_rank_i = gid % left_size; \
    /* \
    // Force prevent out of bounds indexing \
    // since there doesn't seem to be a good way to force crash \
    // No need to check for zero we're only allowing unsized. \
    */ \
-    const size_t src_i = left_rank_i * src_dim_size * right_size + input_i * right_size + right_rank_i; \
+    const INDEX_TYPENAME input_i = min(input_ids[id_i], (INDEX_TYPENAME)(src_dim_size - 1)); \
+    const size_t src_i = ((input_i * right_size) + right_rank_i) * left_size + left_rank_i; \
    output[gid] = input[src_i]; \
 }

@ -75,7 +75,6 @@ kernel void FN_NAME( \


 INDEX_OP(is_u32_f32, uint, float)
-INDEX_OP(is_u32_f16, uint, half)


 #if __METAL_VERSION__ >= 310
--- a/candle-metal-kernels/src/lib.rs
+++ b/candle-metal-kernels/src/lib.rs
@ -1,7 +1,7 @@
 #![allow(clippy::too_many_arguments)]
 use metal::{
-    Buffer, CommandBufferRef, CompileOptions, ComputeCommandEncoderRef, ComputePipelineState,
-    Device, Function, Library, MTLSize,
+    Buffer, CommandBufferRef, CompileOptions, ComputeCommandEncoderRef, ComputePipelineDescriptor,
+    ComputePipelineState, Device, Function, Library, MTLSize,
 };
 use std::collections::HashMap;
 use std::ffi::c_void;
@ -60,8 +60,8 @@ impl<T> EncoderParam for &[T] {
    fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) {
        encoder.set_bytes(
            position,
-            core::mem::size_of_val(data) as u64,
-            data.as_ptr() as *const c_void,
+            (core::mem::size_of::<T>() * data.len()) as u64,
+            data.as_ptr() as *const T as *const c_void,
        );
    }
 }
@ -112,7 +112,13 @@ macro_rules! ops{
    ($($name:ident),+) => {

        pub mod contiguous {
-        pub struct Kernel(pub &'static str);
+        #[derive(Clone, Copy)]
+        pub struct Kernel(pub(crate) &'static str);
+        impl std::fmt::Display for Kernel {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+        }
        $(
        pub mod $name {
            use super::Kernel;
@ -121,17 +127,16 @@ macro_rules! ops{
            pub const BFLOAT: Kernel = Kernel(concat!(stringify!($name), "_bfloat"));
        }
        )+
-            pub mod copy {
-                use super::Kernel;
-                pub const FLOAT: Kernel = Kernel("copy_float");
-                pub const HALF: Kernel = Kernel("copy_half");
-                pub const BFLOAT: Kernel = Kernel("copy_bfloat");
-                pub const U32: Kernel = Kernel("copy_u32");
-            }
        }

        pub mod strided {
-        pub struct Kernel(pub &'static str);
+        #[derive(Clone, Copy)]
+        pub struct Kernel(pub(crate) &'static str);
+        impl std::fmt::Display for Kernel {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+        }
        $(
        pub mod $name {
            use super::Kernel;
@ -140,19 +145,12 @@ macro_rules! ops{
            pub const BFLOAT: Kernel = Kernel(concat!(stringify!($name), "_bfloat_strided"));
        }
        )+
-            pub mod copy {
-                use super::Kernel;
-                pub const FLOAT: Kernel = Kernel("copy_float_strided");
-                pub const HALF: Kernel = Kernel("copy_half_strided");
-                pub const BFLOAT: Kernel = Kernel("copy_bfloat_strided");
-                pub const U32: Kernel = Kernel("copy_u32_strided");
-            }
        }
    };
 }

 pub mod unary {
-    ops!(cos, sin, exp, sqr, sqrt, neg, log, gelu, ceil, floor, round, erf, gelu_erf);
+    ops!(cos, sin, exp, sqr, sqrt, neg, copy);
 }
 pub mod binary {
    ops!(add, sub, mul, div);
@ -172,12 +170,8 @@ pub enum MetalKernelError {
    LockError(String),
    #[error("Error while loading library: {0}")]
    LoadLibraryError(String),
-    #[error("Error while loading function: {0:?}")]
+    #[error("Error while loading function: {0}")]
    LoadFunctionError(String),
-    #[error("Failed to create compute function")]
-    FailedToCreateComputeFunction,
-    #[error("Failed to create pipeline")]
-    FailedToCreatePipeline(String),
 }

 impl<T> From<std::sync::PoisonError<T>> for MetalKernelError {
@ -188,22 +182,19 @@ impl<T> From<std::sync::PoisonError<T>> for MetalKernelError {

 type KernelMap<T> = HashMap<&'static str, T>;
 type Libraries = HashMap<Source, Library>;
-type Pipelines = KernelMap<ComputePipelineState>;
+type Functions = KernelMap<Function>;

 #[derive(Debug, Default)]
 pub struct Kernels {
    libraries: RwLock<Libraries>,
-    pipelines: RwLock<Pipelines>,
+    funcs: RwLock<Functions>,
 }

 impl Kernels {
    pub fn new() -> Self {
        let libraries = RwLock::new(Libraries::new());
-        let pipelines = RwLock::new(Pipelines::new());
-        Self {
-            libraries,
-            pipelines,
-        }
+        let funcs = RwLock::new(Functions::new());
+        Self { libraries, funcs }
    }

    // pub fn init(device: &Device) -> Result<Self, MetalKernelError> {
@ -250,43 +241,22 @@ impl Kernels {
        }
    }

-    fn load_function(
+    pub fn load_function(
        &self,
        device: &Device,
        source: Source,
        name: &'static str,
    ) -> Result<Function, MetalKernelError> {
-        let func = self
-            .load_library(device, source)?
-            .get_function(name, None)
-            .map_err(|e| MetalKernelError::LoadFunctionError(e.to_string()))?;
-        Ok(func)
-        // let mut funcs = self.funcs.write()?;
-        // if let Some(func) = funcs.get(name) {
-        //     Ok(func.clone())
-        // } else {
-        //     funcs.insert(name, func.clone());
-        //     Ok(func)
-        // }
-    }
-
-    pub fn load_pipeline(
-        &self,
-        device: &Device,
-        source: Source,
-        name: &'static str,
-    ) -> Result<ComputePipelineState, MetalKernelError> {
-        let mut pipelines = self.pipelines.write()?;
-        if let Some(pipeline) = pipelines.get(name) {
-            Ok(pipeline.clone())
+        let mut funcs = self.funcs.write()?;
+        if let Some(func) = funcs.get(name) {
+            Ok(func.clone())
        } else {
-            let func = self.load_function(device, source, name)?;
-            let pipeline = device
-                .new_compute_pipeline_state_with_function(&func)
-                .map_err(|e| MetalKernelError::FailedToCreatePipeline(e.to_string()))?;
-            pipelines.insert(name, pipeline.clone());
-
-            Ok(pipeline)
+            let func = self
+                .load_library(device, source)?
+                .get_function(name, None)
+                .map_err(|e| MetalKernelError::LoadFunctionError(e.to_string()))?;
+            funcs.insert(name, func.clone());
+            Ok(func)
        }
    }
 }
@ -300,7 +270,18 @@ pub fn call_unary_contiguous(
    input: &Buffer,
    output: &mut Buffer,
 ) -> Result<(), MetalKernelError> {
-    let pipeline = kernels.load_pipeline(device, Source::Unary, kernel_name.0)?;
+    // println!("Kernel {:?}", kernel_name.0);
+    // assert_eq!(input.length(), output.length());
+    let func = kernels.load_function(device, Source::Unary, kernel_name.0)?;
+    let pipeline_state_descriptor = ComputePipelineDescriptor::new();
+    pipeline_state_descriptor.set_compute_function(Some(&func));
+
+    let pipeline = device
+        .new_compute_pipeline_state_with_function(
+            pipeline_state_descriptor.compute_function().unwrap(),
+        )
+        .unwrap();
+
    let encoder = command_buffer.new_compute_command_encoder();
    encoder.set_compute_pipeline_state(&pipeline);

@ -323,7 +304,15 @@ pub fn call_unary_strided(
    output: &mut Buffer,
    output_offset: usize,
 ) -> Result<(), MetalKernelError> {
-    let pipeline = kernels.load_pipeline(device, Source::Unary, name.0)?;
+    let func = kernels.load_function(device, Source::Unary, name.0)?;
+    let pipeline_state_descriptor = ComputePipelineDescriptor::new();
+    pipeline_state_descriptor.set_compute_function(Some(&func));
+
+    let pipeline = device
+        .new_compute_pipeline_state_with_function(
+            pipeline_state_descriptor.compute_function().unwrap(),
+        )
+        .unwrap();

    let num_dims: usize = shape.len();
    let encoder = command_buffer.new_compute_command_encoder();
@ -360,7 +349,17 @@ pub fn call_binary_contiguous(
    right: &Buffer,
    output: &mut Buffer,
 ) -> Result<(), MetalKernelError> {
-    let pipeline = kernels.load_pipeline(device, Source::Binary, kernel_name.0)?;
+    // println!("Kernel {:?}", kernel_name.0);
+    // assert_eq!(input.length(), output.length());
+    let func = kernels.load_function(device, Source::Binary, kernel_name.0)?;
+    let pipeline_state_descriptor = ComputePipelineDescriptor::new();
+    pipeline_state_descriptor.set_compute_function(Some(&func));
+
+    let pipeline = device
+        .new_compute_pipeline_state_with_function(
+            pipeline_state_descriptor.compute_function().unwrap(),
+        )
+        .unwrap();

    let encoder = command_buffer.new_compute_command_encoder();
    encoder.set_compute_pipeline_state(&pipeline);
@ -388,7 +387,15 @@ pub fn call_binary_strided(
    right_offset: usize,
    output: &mut Buffer,
 ) -> Result<(), MetalKernelError> {
-    let pipeline = kernels.load_pipeline(device, Source::Binary, name.0)?;
+    let func = kernels.load_function(device, Source::Binary, name.0)?;
+    let pipeline_state_descriptor = ComputePipelineDescriptor::new();
+    pipeline_state_descriptor.set_compute_function(Some(&func));
+
+    let pipeline = device
+        .new_compute_pipeline_state_with_function(
+            pipeline_state_descriptor.compute_function().unwrap(),
+        )
+        .unwrap();

    let num_dims: usize = shape.len();
    let encoder = command_buffer.new_compute_command_encoder();
@ -427,7 +434,17 @@ pub fn call_cast_contiguous(
    input: &Buffer,
    output: &mut Buffer,
 ) -> Result<(), MetalKernelError> {
-    let pipeline = kernels.load_pipeline(device, Source::Cast, kernel_name)?;
+    // println!("Kernel {:?}", kernel_name.0);
+    // assert_eq!(input.length(), output.length());
+    let func = kernels.load_function(device, Source::Cast, kernel_name)?;
+    let pipeline_state_descriptor = ComputePipelineDescriptor::new();
+    pipeline_state_descriptor.set_compute_function(Some(&func));
+
+    let pipeline = device
+        .new_compute_pipeline_state_with_function(
+            pipeline_state_descriptor.compute_function().unwrap(),
+        )
+        .unwrap();

    let encoder = command_buffer.new_compute_command_encoder();
    encoder.set_compute_pipeline_state(&pipeline);
@ -441,38 +458,6 @@ pub fn call_cast_contiguous(
    Ok(())
 }

-pub fn call_cast_strided(
-    device: &Device,
-    command_buffer: &CommandBufferRef,
-    kernels: &Kernels,
-    kernel_name: &'static str,
-    shape: &[usize],
-    input: &Buffer,
-    input_strides: &[usize],
-    input_offset: usize,
-    output: &mut Buffer,
-) -> Result<(), MetalKernelError> {
-    // println!("Kernel {:?}", kernel_name.0);
-    // assert_eq!(input.length(), output.length());
-    let pipeline = kernels.load_pipeline(device, Source::Cast, kernel_name)?;
-
-    let encoder = command_buffer.new_compute_command_encoder();
-    encoder.set_compute_pipeline_state(&pipeline);
-
-    let length: usize = shape.iter().product();
-
-    set_params!(
-        encoder,
-        (length, shape, input_strides, (input, input_offset), output)
-    );
-
-    let (thread_group_count, thread_group_size) = linear_split(&pipeline, length);
-
-    encoder.dispatch_thread_groups(thread_group_count, thread_group_size);
-    encoder.end_encoding();
-    Ok(())
-}
-
 pub fn call_reduce_contiguous(
    device: &Device,
    command_buffer: &CommandBufferRef,
@ -483,7 +468,16 @@ pub fn call_reduce_contiguous(
    input: &Buffer,
    output: &mut Buffer,
 ) -> Result<(), MetalKernelError> {
-    let pipeline = kernels.load_pipeline(device, Source::Reduce, kernel_name)?;
+    let func = kernels.load_function(device, Source::Reduce, kernel_name)?;
+    let pipeline_state_descriptor = ComputePipelineDescriptor::new();
+    pipeline_state_descriptor.set_compute_function(Some(&func));
+
+    let pipeline = device
+        .new_compute_pipeline_state_with_function(
+            pipeline_state_descriptor.compute_function().unwrap(),
+        )
+        .unwrap();
+
    let elements_to_sum = length / out_length;

    let encoder = command_buffer.new_compute_command_encoder();
@ -524,7 +518,16 @@ pub fn call_last_softmax(
    input: &Buffer,
    output: &mut Buffer,
 ) -> Result<(), MetalKernelError> {
-    let pipeline = kernels.load_pipeline(device, Source::Reduce, kernel_name)?;
+    let func = kernels.load_function(device, Source::Reduce, kernel_name)?;
+    let pipeline_state_descriptor = ComputePipelineDescriptor::new();
+    pipeline_state_descriptor.set_compute_function(Some(&func));
+
+    let pipeline = device
+        .new_compute_pipeline_state_with_function(
+            pipeline_state_descriptor.compute_function().unwrap(),
+        )
+        .unwrap();
+
    let encoder = command_buffer.new_compute_command_encoder();
    encoder.set_compute_pipeline_state(&pipeline);

@ -560,14 +563,21 @@ pub fn call_affine(
    device: &Device,
    command_buffer: &CommandBufferRef,
    kernels: &Kernels,
-    name: &'static str,
    size: usize,
    input: &Buffer,
    output: &mut Buffer,
    mul: f32,
    add: f32,
 ) -> Result<(), MetalKernelError> {
-    let pipeline = kernels.load_pipeline(device, Source::Affine, name)?;
+    let func = kernels.load_function(device, Source::Affine, "affine_float")?;
+    let pipeline_state_descriptor = ComputePipelineDescriptor::new();
+    pipeline_state_descriptor.set_compute_function(Some(&func));
+
+    let pipeline = device
+        .new_compute_pipeline_state_with_function(
+            pipeline_state_descriptor.compute_function().unwrap(),
+        )
+        .unwrap();

    let encoder = command_buffer.new_compute_command_encoder();
    encoder.set_compute_pipeline_state(&pipeline);
@ -580,45 +590,6 @@ pub fn call_affine(
    Ok(())
 }

-pub fn call_affine_strided(
-    device: &Device,
-    command_buffer: &CommandBufferRef,
-    kernels: &Kernels,
-    name: &'static str,
-    shape: &[usize],
-    input: &Buffer,
-    input_stride: &[usize],
-    input_offset: usize,
-    output: &mut Buffer,
-    mul: f32,
-    add: f32,
-) -> Result<(), MetalKernelError> {
-    let pipeline = kernels.load_pipeline(device, Source::Affine, name)?;
-    let size: usize = shape.iter().product();
-
-    let encoder = command_buffer.new_compute_command_encoder();
-    encoder.set_compute_pipeline_state(&pipeline);
-
-    set_params!(
-        encoder,
-        (
-            size,
-            shape.len(),
-            shape,
-            input_stride,
-            mul,
-            add,
-            (input, input_offset),
-            output
-        )
-    );
-
-    let (thread_group_count, thread_group_size) = linear_split(&pipeline, size);
-    encoder.dispatch_thread_groups(thread_group_count, thread_group_size);
-    encoder.end_encoding();
-    Ok(())
-}
-
 pub fn call_where_cond_strided(
    device: &Device,
    command_buffer: &CommandBufferRef,
@ -633,7 +604,15 @@ pub fn call_where_cond_strided(
    (right_stride, right_offset): (&[usize], usize),
    output: &mut Buffer,
 ) -> Result<(), MetalKernelError> {
-    let pipeline = kernels.load_pipeline(device, Source::Ternary, name)?;
+    let func = kernels.load_function(device, Source::Ternary, name)?;
+    let pipeline_state_descriptor = ComputePipelineDescriptor::new();
+    pipeline_state_descriptor.set_compute_function(Some(&func));
+
+    let pipeline = device
+        .new_compute_pipeline_state_with_function(
+            pipeline_state_descriptor.compute_function().unwrap(),
+        )
+        .unwrap();

    let encoder = command_buffer.new_compute_command_encoder();
    encoder.set_compute_pipeline_state(&pipeline);
@ -681,7 +660,10 @@ pub fn call_index_select(
    let src_dim_size = shape[dim];
    let dst_el = ids_size * left_size * right_size;

-    let pipeline = kernels.load_pipeline(device, Source::Indexing, name)?;
+    let func = kernels.load_function(device, Source::Indexing, name)?;
+    let pipeline = device
+        .new_compute_pipeline_state_with_function(&func)
+        .unwrap();

    let encoder = command_buffer.new_compute_command_encoder();

@ -980,7 +962,6 @@ mod tests {
            &device,
            command_buffer,
            &kernels,
-            "affine_float",
            size,
            &input,
            &mut output,
@ -994,43 +975,6 @@ mod tests {
        output.read_to_vec::<T>(v.len())
    }

-    fn run_affine_strided<T: Clone>(
-        v: &[T],
-        shape: &[usize],
-        strides: &[usize],
-        mul: f64,
-        add: f64,
-    ) -> Vec<T> {
-        let device = device();
-        let kernels = Kernels::new();
-        let command_queue = device.new_command_queue();
-        let command_buffer = command_queue.new_command_buffer();
-
-        let input = new_buffer(&device, v);
-        let mut output = new_buffer(&device, v);
-
-        let size = v.len();
-
-        call_affine_strided(
-            &device,
-            command_buffer,
-            &kernels,
-            "affine_float",
-            shape,
-            &input,
-            strides,
-            0,
-            &mut output,
-            mul as f32,
-            add as f32,
-        )
-        .unwrap();
-        command_buffer.commit();
-        command_buffer.wait_until_completed();
-
-        output.read_to_vec::<T>(v.len())
-    }
-
    #[test]
    fn affine() {
        let input = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
@ -1046,16 +990,6 @@ mod tests {
        assert_eq!(result, vec![2.6; 40_000]);
    }

-    // #[test]
-    // fn affine_strided() {
-    //     let input = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
-    //     let mul = 1.5;
-    //     let add = 1.1;
-    //     let result = run_affine_(&input, mul, add);
-    //     assert_eq!(result, vec![2.6, 4.1, 5.6, 7.1, 8.6, 10.1, 11.6, 13.1]);
-
-    // }
-
    #[test]
    fn index_select() {
        let embedding = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
@ -1074,10 +1008,7 @@ mod tests {
            result,
            vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 1.0f32, 2.0, 3.0, 4.0, 5.0]
        );
-    }

-    #[test]
-    fn index_select_dim1() {
        let embedding = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
        let shape = [5, 2];
        let ids = [0u32, 1, 0];
@ -1085,7 +1016,7 @@ mod tests {
        let result = run_index_select(&embedding, &shape, &ids, dim);
        assert_eq!(
            result,
-            vec![1.0f32, 2.0, 1.0, 3.0, 4.0, 3.0, 5.0, 6.0, 5.0, 7.0, 8.0f32, 7.0, 9.0, 10.0, 9.0]
+            vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 1.0f32, 2.0, 3.0, 4.0, 5.0]
        );
    }

@ -1133,7 +1064,6 @@ mod tests {
        let device = Device::system_default().expect("no device found");

        let options = CompileOptions::new();
-        options.set_fast_math_enabled(true);
        let library = device.new_library_with_source(INDEXING, &options).unwrap();

        let left = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
--- a/candle-metal-kernels/src/unary.metal
+++ b/candle-metal-kernels/src/unary.metal
@ -1,7 +1,4 @@
 #include <metal_stdlib>
-#include <metal_math>
-#
-using namespace metal;

 METAL_FUNC uint get_strided_index(
    uint idx,
@ -20,39 +17,10 @@ METAL_FUNC uint get_strided_index(

 template <typename T> METAL_FUNC T sqr(T in){ return in * in; }
 template <typename T> METAL_FUNC T neg(T in){ return -in; }
-template <typename T> METAL_FUNC T erf(T in){
-    float x = (float) in;
-    // constants
-    float a1 =  0.254829592;
-    float a2 = -0.284496736;
-    float a3 =  1.421413741;
-    float a4 = -1.453152027;
-    float a5 =  1.061405429;
-    float p  =  0.3275911;
-
-    // Save the sign of x
-    int sign = 1;
-    if (x < 0)
-        sign = -1;
-    x = fabs(x);
-
-    // A&S formula 7.1.26
-    float t = 1.0/(1.0 + p*x);
-    float y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-x*x);
-
-    return T(sign*y);
-}
 template <typename T> METAL_FUNC T id(T in){ return in; }
-template <typename T> METAL_FUNC T gelu_erf(T x){ return T(x * (1 + erf(x * M_SQRT1_2_F)) / 2); }
-template <typename T> METAL_FUNC T gelu(T x){
-    T x_sq = x * x;
-    T x_cube = x_sq * x;
-    T alpha = x + static_cast<T>(0.044715) * x_cube;
-    T beta =  (static_cast<T>(M_2_SQRTPI_F * M_SQRT1_2_F) * alpha);
-    return static_cast<T>(0.5) * x * (static_cast<T>(1.0) + T(tanh(beta)));
-}


+using namespace metal;

 #define UNARY(FN, TYPENAME, FN_NAME, FN_NAME_STRIDED) \
 kernel void FN_NAME( \
@ -95,17 +63,8 @@ UNARY_OP(sqr)
 UNARY_OP(sqrt)
 UNARY_OP(neg)
 UNARY_OP(exp)
-UNARY_OP(log)
-UNARY_OP(gelu)
-UNARY_OP(ceil)
-UNARY_OP(floor)
-UNARY_OP(round)
-UNARY_OP(gelu_erf)
-UNARY_OP(erf)
 UNARY(id, float, copy_float, copy_float_strided)
 UNARY(id, half, copy_half, copy_half_strided)
-UNARY(id, uint8_t, copy_u8, copy_u8_strided)
-UNARY(id, uint32_t, copy_u32, copy_u32_strided)

 #if __METAL_VERSION__ >= 310
 BFLOAT_UNARY_OP(cos)
@ -114,13 +73,6 @@ BFLOAT_UNARY_OP(sqr)
 BFLOAT_UNARY_OP(sqrt)
 BFLOAT_UNARY_OP(neg)
 BFLOAT_UNARY_OP(exp)
-BFLOAT_UNARY_OP(log)
-BFLOAT_UNARY_OP(gelu)
-BFLOAT_UNARY_OP(ceil)
-BFLOAT_UNARY_OP(floor)
-BFLOAT_UNARY_OP(round)
-BFLOAT_UNARY_OP(gelu_erf)
-BFLOAT_UNARY_OP(erf)

 UNARY(id, bfloat, copy_bfloat, copy_bfloat_strided)
 #endif
--- a/candle-nn/src/activation.rs
+++ b/candle-nn/src/activation.rs
@ -6,6 +6,7 @@ use serde::Deserialize;
 pub enum Activation {
    #[default]
    Gelu,
+    #[serde(rename = "gated-gelu")]
    NewGelu,
    Relu,
    Relu2,
--- a/candle-nn/src/embedding.rs
+++ b/candle-nn/src/embedding.rs
@ -9,6 +9,7 @@ pub struct Embedding {

 impl Embedding {
    pub fn new(embeddings: Tensor, hidden_size: usize) -> Self {
+        // todo!("Embedding {embeddings}");
        Self {
            embeddings,
            hidden_size,
--- a/candle-transformers/Cargo.toml
+++ b/candle-transformers/Cargo.toml
@ -21,7 +21,6 @@ rand = { workspace = true }
 rayon = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
-serde_plain = { workspace = true }
 tracing = { workspace = true }
 wav = { workspace = true }

--- a/candle-transformers/src/models/llama2_c.rs
+++ b/candle-transformers/src/models/llama2_c.rs
@ -156,6 +156,7 @@ impl CausalSelfAttention {
        let x = x.reshape((b_sz, seq_len, h, n_embd / 2, 2))?;
        let x0 = x.narrow(D::Minus1, 0, 1)?;
        let x1 = x.narrow(D::Minus1, 1, 1)?;
+        todo!("X {x1}");
        let dst0 = (x0.broadcast_mul(&cos)? - x1.broadcast_mul(&sin)?)?;
        let dst1 = (x0.broadcast_mul(&sin)? + x1.broadcast_mul(&cos)?)?;
        let rope = Tensor::cat(&[&dst0, &dst1], D::Minus1)?.reshape((b_sz, seq_len, h, n_embd))?;
@ -173,6 +174,7 @@ impl CausalSelfAttention {
        let mut v = v.reshape((b_sz, seq_len, self.n_key_value_head, self.head_dim))?;

        let q = self.apply_rotary_emb(&q, index_pos)?;
+        todo!("X {q}");
        let mut k = self.apply_rotary_emb(&k, index_pos)?;

        if self.cache.use_kv_cache {
@ -295,6 +297,7 @@ impl Block {
        let residual = x;
        let x = self.rms_1.forward(x)?;
        let x = (self.attn.forward(&x, index_pos, block_idx)? + residual)?;
+        todo!("---X {}", x);
        let residual = &x;
        let x = (self.mlp.forward(&self.rms_2.forward(&x)?)? + residual)?;
        Ok(x)
@ -327,6 +330,7 @@ impl Llama {
    pub fn forward(&self, x: &Tensor, index_pos: usize) -> Result<Tensor> {
        let (_b_sz, _seq_len) = x.dims2()?;
        let mut x = self.wte.forward(x)?;
+        //println!("Embeddings {}", self.wte.embeddings());
        for (block_idx, block) in self.blocks.iter().enumerate() {
            x = block.forward(&x, index_pos, block_idx)?;
        }
--- a/candle-transformers/src/models/mod.rs
+++ b/candle-transformers/src/models/mod.rs
@ -29,10 +29,8 @@ pub mod segment_anything;
 pub mod stable_diffusion;
 pub mod stable_lm;
 pub mod t5;
-pub mod trocr;
 pub mod vgg;
 pub mod vit;
 pub mod whisper;
 pub mod with_tracing;
 pub mod wuerstchen;
-pub mod yi;
--- a/candle-transformers/src/models/quantized_t5.rs
+++ b/candle-transformers/src/models/quantized_t5.rs
@ -1,7 +1,6 @@
 // T5 Text Model, quantized version
 // https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py

-use crate::models::t5::{deserialize_feed_forward_proj_activation, ActivationWithOptionalGating};
 use crate::models::with_tracing::QMatMul;
 use crate::quantized_nn::Embedding;
 pub use crate::quantized_var_builder::VarBuilder;
@ -55,8 +54,8 @@ pub struct Config {
    dropout_rate: f64,
    layer_norm_epsilon: f64,
    initializer_factor: f64,
-    #[serde(default, deserialize_with = "deserialize_feed_forward_proj_activation")]
-    pub feed_forward_proj: ActivationWithOptionalGating,
+    #[serde(default)]
+    feed_forward_proj: Activation,
    #[serde(default = "default_tie_word_embeddings")]
    tie_word_embeddings: bool,
    #[serde(default = "default_is_decoder")]
@ -84,10 +83,7 @@ impl Default for Config {
            dropout_rate: 0.1,
            layer_norm_epsilon: 1e-6,
            initializer_factor: 1.0,
-            feed_forward_proj: ActivationWithOptionalGating {
-                gated: false,
-                activation: Activation::Relu,
-            },
+            feed_forward_proj: Activation::Relu,
            tie_word_embeddings: true,
            is_decoder: false,
            is_encoder_decoder: true,
@ -180,7 +176,7 @@ impl T5DenseGatedActDense {
            wi_0,
            wi_1,
            wo,
-            act: cfg.feed_forward_proj.activation,
+            act: Activation::NewGelu,
            span: tracing::span!(tracing::Level::TRACE, "dense-gated-act-dense"),
        })
    }
@ -209,7 +205,7 @@ impl T5LayerFF {
    fn load(vb: VarBuilder, cfg: &Config) -> Result<Self> {
        let layer_norm =
            T5LayerNorm::load(cfg.d_model, cfg.layer_norm_epsilon, vb.pp("layer_norm"))?;
-        let (dense_act, gated_dense_act) = if cfg.feed_forward_proj.gated {
+        let (dense_act, gated_dense_act) = if cfg.feed_forward_proj == Activation::NewGelu {
            (
                None,
                Some(T5DenseGatedActDense::load(vb.pp("DenseReluDense"), cfg)?),
--- a/candle-transformers/src/models/t5.rs
+++ b/candle-transformers/src/models/t5.rs
@ -37,37 +37,6 @@ fn masked_fill(on_false: &Tensor, mask: &Tensor, on_true: f32) -> Result<Tensor>
    Ok(m)
 }

-#[derive(Debug, Deserialize, Default, Clone, PartialEq)]
-pub struct ActivationWithOptionalGating {
-    pub gated: bool,
-    pub activation: candle_nn::Activation,
-}
-
-pub fn deserialize_feed_forward_proj_activation<'de, D>(
-    deserializer: D,
-) -> std::result::Result<ActivationWithOptionalGating, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-{
-    match String::deserialize(deserializer)?.as_str() {
-        "gated-gelu" => Ok(ActivationWithOptionalGating {
-            gated: true,
-            activation: candle_nn::Activation::NewGelu,
-        }),
-        "gated-silu" => Ok(ActivationWithOptionalGating {
-            gated: true,
-            activation: candle_nn::Activation::Silu,
-        }),
-        buf => {
-            let activation = serde_plain::from_str(buf).map_err(serde::de::Error::custom)?;
-            Ok(ActivationWithOptionalGating {
-                gated: false,
-                activation,
-            })
-        }
-    }
-}
-
 #[derive(Debug, Clone, PartialEq, Deserialize)]
 pub struct Config {
    vocab_size: usize,
@ -83,8 +52,8 @@ pub struct Config {
    dropout_rate: f64,
    layer_norm_epsilon: f64,
    initializer_factor: f64,
-    #[serde(default, deserialize_with = "deserialize_feed_forward_proj_activation")]
-    feed_forward_proj: ActivationWithOptionalGating,
+    #[serde(default)]
+    feed_forward_proj: Activation,
    #[serde(default = "default_tie_word_embeddings")]
    tie_word_embeddings: bool,
    #[serde(default = "default_is_decoder")]
@ -112,10 +81,7 @@ impl Default for Config {
            dropout_rate: 0.1,
            layer_norm_epsilon: 1e-6,
            initializer_factor: 1.0,
-            feed_forward_proj: ActivationWithOptionalGating {
-                gated: false,
-                activation: Activation::Relu,
-            },
+            feed_forward_proj: Activation::Relu,
            tie_word_embeddings: true,
            is_decoder: false,
            is_encoder_decoder: true,
@ -136,10 +102,7 @@ impl Config {
            d_model: 768,
            dropout_rate: 0.1,
            eos_token_id: 1,
-            feed_forward_proj: ActivationWithOptionalGating {
-                gated: false,
-                activation: Activation::Relu,
-            },
+            feed_forward_proj: Activation::Relu,
            tie_word_embeddings: true,
            initializer_factor: 1.0,
            is_decoder: false,
@ -239,7 +202,7 @@ impl T5DenseGatedActDense {
            wi_0,
            wi_1,
            wo,
-            act: cfg.feed_forward_proj.activation,
+            act: Activation::NewGelu,
            span: tracing::span!(tracing::Level::TRACE, "dense-gated-act-dense"),
        })
    }
@ -268,7 +231,7 @@ impl T5LayerFF {
    fn load(vb: VarBuilder, cfg: &Config) -> Result<Self> {
        let layer_norm =
            T5LayerNorm::load(cfg.d_model, cfg.layer_norm_epsilon, vb.pp("layer_norm"))?;
-        let (dense_act, gated_dense_act) = if cfg.feed_forward_proj.gated {
+        let (dense_act, gated_dense_act) = if cfg.feed_forward_proj == Activation::NewGelu {
            (
                None,
                Some(T5DenseGatedActDense::load(vb.pp("DenseReluDense"), cfg)?),
@ -462,7 +425,7 @@ impl T5Attention {
                                            self.relative_attention_max_distance as f32
                                                / max_exact as f32,
                                        ) * (num_buckets - max_exact) as f32;
-                                        u32::min(max_exact + b as u32, num_buckets - 1)
+                                        max_exact + b as u32
                                    }
                                })
                                .collect::<Vec<u32>>()
--- a/candle-transformers/src/models/trocr.rs
+++ b/candle-transformers/src/models/trocr.rs
@ -1,434 +0,0 @@
-use crate::models::vit::{Config, Embeddings, Encoder};
-use candle::{Result, Tensor};
-use candle_nn::{
-    embedding, layer_norm, linear_no_bias, Embedding, LayerNorm, Linear, Module, VarBuilder,
-};
-use serde::Deserialize;
-
-#[derive(Debug, Clone, PartialEq, Deserialize)]
-pub struct TrOCRConfig {
-    pub vocab_size: usize,
-    pub d_model: usize,
-    pub hidden_size: usize,
-    pub decoder_layers: usize,
-    pub decoder_attention_heads: usize,
-    pub decoder_ffn_dim: usize,
-    pub activation_function: candle_nn::Activation,
-    pub max_position_embeddings: usize,
-    pub dropout: f64,
-    pub attention_dropout: f64,
-    pub activation_dropout: f64,
-    pub decoder_start_token_id: u32,
-    pub init_std: f64,
-    pub decoder_layerdrop: f64,
-    pub use_cache: bool,
-    pub scale_embedding: bool,
-    pub use_learned_position_embeddings: bool,
-    pub layernorm_embedding: bool,
-    pub pad_token_id: usize,
-    pub bos_token_id: usize,
-    pub eos_token_id: u32,
-    pub num_attention_heads: usize,
-    pub decoder_vocab_size: Option<usize>,
-}
-
-impl Default for TrOCRConfig {
-    fn default() -> Self {
-        Self {
-            vocab_size: 50265,
-            d_model: 1024,
-            hidden_size: 768,
-            decoder_layers: 12,
-            decoder_attention_heads: 16,
-            decoder_ffn_dim: 4096,
-            activation_function: candle_nn::Activation::Gelu,
-            max_position_embeddings: 512,
-            dropout: 0.1,
-            attention_dropout: 0.0,
-            activation_dropout: 0.0,
-            decoder_start_token_id: 2,
-            init_std: 0.02,
-            decoder_layerdrop: 0.0,
-            use_cache: true,
-            scale_embedding: false,
-            use_learned_position_embeddings: true,
-            layernorm_embedding: true,
-            pad_token_id: 1,
-            bos_token_id: 0,
-            eos_token_id: 2,
-            num_attention_heads: 12,
-            decoder_vocab_size: Some(50265),
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-struct TrOCRLearnedPositionalEmbedding {
-    offset: usize,
-    weights: Embedding,
-}
-
-impl TrOCRLearnedPositionalEmbedding {
-    fn load(vb: VarBuilder, cfg: &TrOCRConfig) -> Result<Self> {
-        let offset: usize = 2;
-        let num_embeddings = cfg.max_position_embeddings;
-        let embedding_dim = cfg.d_model;
-        let weights = embedding(num_embeddings + offset, embedding_dim, vb)?;
-
-        Ok(Self { offset, weights })
-    }
-
-    fn forward(&mut self, input_ids: &Tensor, past_key_values_length: u32) -> Result<Tensor> {
-        let (b_sz, seq_len) = input_ids.dims2()?;
-
-        let mut positions = Tensor::arange(
-            past_key_values_length,
-            seq_len as u32 + past_key_values_length,
-            input_ids.device(),
-        )?
-        .expand((b_sz, seq_len))?;
-
-        positions =
-            positions.broadcast_add(&Tensor::new(self.offset as u32, input_ids.device())?)?;
-        self.weights.forward(&positions)
-    }
-}
-
-#[derive(Debug, Clone)]
-struct TrOCRAttention {
-    head_dim: usize,
-    num_heads: usize,
-    is_decoder: bool,
-    scaling: f64,
-    k_proj: Linear,
-    v_proj: Linear,
-    q_proj: Linear,
-    out_proj: Linear,
-    kv_cache: Option<(Tensor, Tensor)>,
-}
-
-impl TrOCRAttention {
-    fn load(
-        vb: VarBuilder,
-        cfg: &TrOCRConfig,
-        kdim: Option<usize>,
-        vdim: Option<usize>,
-    ) -> Result<Self> {
-        let embed_dim = cfg.d_model;
-        let num_heads = cfg.decoder_attention_heads;
-        let head_dim = embed_dim / num_heads;
-        let kdim = kdim.unwrap_or(embed_dim);
-        let vdim = vdim.unwrap_or(embed_dim);
-
-        let k_proj = linear_no_bias(kdim, embed_dim, vb.pp("k_proj"))?;
-        let v_proj = linear_no_bias(vdim, embed_dim, vb.pp("v_proj"))?;
-        let q_proj = linear_no_bias(embed_dim, embed_dim, vb.pp("q_proj"))?;
-
-        let out_proj = linear_no_bias(embed_dim, embed_dim, vb.pp("out_proj"))?;
-        Ok(Self {
-            head_dim,
-            num_heads,
-            is_decoder: true,
-            scaling: 1. / (head_dim as f64).sqrt(),
-            k_proj,
-            v_proj,
-            q_proj,
-            out_proj,
-            kv_cache: None,
-        })
-    }
-
-    fn _shape(&self, tensor: &Tensor, bsz: usize) -> Result<Tensor> {
-        tensor
-            .reshape((bsz, (), self.num_heads, self.head_dim))?
-            .transpose(1, 2)?
-            .contiguous()
-    }
-
-    fn forward(
-        &mut self,
-        xs: &Tensor,
-        kv_states: Option<&Tensor>,
-        attn_mask: Option<&Tensor>,
-    ) -> Result<Tensor> {
-        let (b_sz, tgt_len, _) = xs.dims3()?;
-        let query_states = (xs.apply(&self.q_proj)? * self.scaling)?;
-        let (key_states, value_states) = match kv_states {
-            None => {
-                let key_states = self._shape(&xs.apply(&self.k_proj)?, b_sz)?;
-                let value_states = self._shape(&xs.apply(&self.v_proj)?, b_sz)?;
-                if self.is_decoder {
-                    let kv_states = match &self.kv_cache {
-                        None => (key_states, value_states),
-                        Some((p_key_states, p_value_states)) => {
-                            let key_states = Tensor::cat(&[p_key_states, &key_states], 2)?;
-                            let value_states = Tensor::cat(&[p_value_states, &value_states], 2)?;
-                            (key_states, value_states)
-                        }
-                    };
-                    self.kv_cache = Some(kv_states.clone());
-                    kv_states
-                } else {
-                    (key_states, value_states)
-                }
-            }
-            Some(kv_states) => {
-                let key_states = self._shape(&kv_states.apply(&self.k_proj)?, b_sz)?;
-                let value_states = self._shape(&kv_states.apply(&self.v_proj)?, b_sz)?;
-                (key_states, value_states)
-            }
-        };
-        let proj_shape = (b_sz * self.num_heads, (), self.head_dim);
-        let query_states = self._shape(&query_states, b_sz)?.reshape(proj_shape)?;
-        let key_states = key_states.reshape(proj_shape)?;
-        let value_states = value_states.reshape(proj_shape)?;
-        let attn_weights = query_states.matmul(&key_states.transpose(1, 2)?)?;
-        let attn_weights = match attn_mask {
-            None => attn_weights,
-            Some(attn_mask) => attn_weights.broadcast_add(attn_mask)?,
-        };
-        let attn_probs = candle_nn::ops::softmax_last_dim(&attn_weights)?;
-        let attn_output = attn_probs.matmul(&value_states)?;
-        attn_output
-            .reshape((b_sz, self.num_heads, tgt_len, self.head_dim))?
-            .transpose(1, 2)?
-            .reshape((b_sz, tgt_len, self.head_dim * self.num_heads))?
-            .apply(&self.out_proj)
-    }
-}
-
-#[derive(Debug, Clone)]
-struct TrOCRDecoderLayer {
-    self_attn: TrOCRAttention,
-    activation_fn: candle_nn::Activation,
-    self_attn_layer_norm: LayerNorm,
-    encoder_attn: TrOCRAttention,
-    encoder_attn_layer_norm: LayerNorm,
-    fc1: Linear,
-    fc2: Linear,
-    final_layer_norm: LayerNorm,
-}
-
-impl TrOCRDecoderLayer {
-    fn load(vb: VarBuilder, cfg: &TrOCRConfig) -> Result<Self> {
-        let embed_dim = cfg.d_model;
-        let self_attn = TrOCRAttention::load(vb.pp("self_attn"), cfg, None, None)?;
-        let self_attn_layer_norm = layer_norm(embed_dim, 1e-5, vb.pp("self_attn_layer_norm"))?;
-        let encoder_attn = TrOCRAttention::load(
-            vb.pp("encoder_attn"),
-            cfg,
-            Some(cfg.hidden_size),
-            Some(cfg.hidden_size),
-        )?;
-        let encoder_attn_layer_norm =
-            layer_norm(embed_dim, 1e-5, vb.pp("encoder_attn_layer_norm"))?;
-        let fc1 = linear_no_bias(embed_dim, cfg.decoder_ffn_dim, vb.pp("fc1"))?;
-        let fc2 = linear_no_bias(cfg.decoder_ffn_dim, embed_dim, vb.pp("fc2"))?;
-        let final_layer_norm = layer_norm(embed_dim, 1e-5, vb.pp("final_layer_norm"))?;
-        let activation_fn = candle_nn::Activation::Gelu;
-
-        Ok(Self {
-            self_attn,
-            activation_fn,
-            self_attn_layer_norm,
-            encoder_attn,
-            encoder_attn_layer_norm,
-            fc1,
-            fc2,
-            final_layer_norm,
-        })
-    }
-
-    fn forward(
-        &mut self,
-        xs: &Tensor,
-        attention_mask: &Tensor,
-        encoder_hidden_states: Option<&Tensor>,
-    ) -> Result<Tensor> {
-        let residual = xs.clone();
-        let xs = self.self_attn.forward(xs, None, Some(attention_mask))?;
-        let xs = (xs + residual)?;
-        let mut xs = self.self_attn_layer_norm.forward(&xs)?;
-
-        if let Some(encoder_hidden_states) = &encoder_hidden_states {
-            let residual = xs.clone();
-            let encoder_attention_mask = attention_mask.clone(); // TODO
-            xs = self.encoder_attn.forward(
-                &xs,
-                Some(encoder_hidden_states),
-                Some(&encoder_attention_mask),
-            )?;
-            xs = (xs + residual)?;
-            xs = self.encoder_attn_layer_norm.forward(&xs)?
-        }
-
-        let residual = xs.clone();
-        let xs = self.fc1.forward(&xs)?;
-        let xs = self.activation_fn.forward(&xs)?;
-        let xs = self.fc2.forward(&xs)?;
-        let xs = (xs + residual)?;
-        let xs = self.final_layer_norm.forward(&xs)?;
-
-        Ok(xs)
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct TrOCRDecoder {
-    layers: Vec<TrOCRDecoderLayer>,
-    embed_scale: Option<f64>,
-    embed_tokens: Embedding,
-    embed_positions: TrOCRLearnedPositionalEmbedding,
-}
-
-impl TrOCRDecoder {
-    fn new(cfg: &TrOCRConfig, vb: VarBuilder) -> Result<Self> {
-        let vb = vb.pp("decoder.model.decoder");
-
-        let embed_tokens = embedding(cfg.vocab_size, cfg.d_model, vb.pp("embed_tokens"))?;
-        let embed_positions = TrOCRLearnedPositionalEmbedding::load(vb.pp("embed_positions"), cfg)?;
-        let mut layers = Vec::with_capacity(cfg.decoder_layers);
-        let vb_l = vb.pp("layers");
-        for idx in 0..cfg.decoder_layers {
-            let layer = TrOCRDecoderLayer::load(vb_l.pp(idx), cfg)?;
-            layers.push(layer)
-        }
-        let embed_scale = if cfg.scale_embedding {
-            Some((cfg.d_model as f64).sqrt())
-        } else {
-            None
-        };
-
-        Ok(Self {
-            layers,
-            embed_scale,
-            embed_tokens,
-            embed_positions,
-        })
-    }
-
-    pub fn forward(
-        &mut self,
-        xs: &Tensor,
-        encoder_xs: Option<&Tensor>,
-        past_kv_len: usize,
-        attn_mask: &Tensor,
-    ) -> Result<Tensor> {
-        let embed_pos = self.embed_positions.forward(xs, past_kv_len as u32)?;
-        let xs = xs.apply(&self.embed_tokens)?;
-
-        let xs = match self.embed_scale {
-            None => xs,
-            Some(scale) => (xs * scale)?,
-        };
-
-        let mut xs = xs.broadcast_add(&embed_pos)?;
-
-        for layer in self.layers.iter_mut() {
-            xs = layer.forward(&xs, attn_mask, encoder_xs)?;
-        }
-        Ok(xs)
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct TrOCREncoder {
-    embeddings: Embeddings,
-    encoder: Encoder,
-    layernorm: LayerNorm,
-}
-
-impl TrOCREncoder {
-    pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
-        let vb_v = vb.pp("encoder");
-
-        let embeddings = Embeddings::new(cfg, false, vb_v.pp("embeddings"))?;
-
-        let encoder = Encoder::new(cfg, vb_v.pp("encoder"))?;
-        let layernorm = layer_norm(cfg.hidden_size, cfg.layer_norm_eps, vb_v.pp("layernorm"))?;
-
-        Ok(Self {
-            embeddings,
-            encoder,
-            layernorm,
-        })
-    }
-
-    pub fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        let embedding_output = self.embeddings.forward(xs, None, false)?;
-        let encoder_outputs = self.encoder.forward(&embedding_output)?;
-
-        self.layernorm.forward(&encoder_outputs)
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct TrOCRForCausalLM {
-    decoder: TrOCRDecoder,
-    output_projection: Linear,
-}
-
-impl TrOCRForCausalLM {
-    pub fn new(decoder_cfg: &TrOCRConfig, vb: VarBuilder) -> Result<Self> {
-        let decoder = TrOCRDecoder::new(decoder_cfg, vb.clone())?;
-        let output_projection =
-            candle_nn::Linear::new(decoder.embed_tokens.embeddings().clone(), None);
-        Ok(Self {
-            decoder,
-            output_projection,
-        })
-    }
-
-    pub fn forward(
-        &mut self,
-        xs: &Tensor,
-        encoder_xs: Option<&Tensor>,
-        past_kv_len: usize,
-        attn_mask: &Tensor,
-    ) -> Result<Tensor> {
-        let xs = self
-            .decoder
-            .forward(xs, encoder_xs, past_kv_len, attn_mask)?;
-        let xs = xs.apply(&self.output_projection)?;
-
-        Ok(xs)
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct TrOCRModel {
-    encoder: TrOCREncoder,
-    decoder: TrOCRForCausalLM,
-}
-
-impl TrOCRModel {
-    pub fn new(encoder_cfg: &Config, decoder_cfg: &TrOCRConfig, vb: VarBuilder) -> Result<Self> {
-        let encoder = TrOCREncoder::new(encoder_cfg, vb.clone())?;
-        let decoder = TrOCRForCausalLM::new(decoder_cfg, vb)?;
-        Ok(Self { encoder, decoder })
-    }
-
-    pub fn encoder(&mut self) -> &mut TrOCREncoder {
-        &mut self.encoder
-    }
-
-    pub fn decoder(&mut self) -> &mut TrOCRForCausalLM {
-        &mut self.decoder
-    }
-
-    pub fn decode(
-        &mut self,
-        xs: &Tensor,
-        encoder_xs: &Tensor,
-        past_kv_len: usize,
-    ) -> Result<Tensor> {
-        let seq_len = xs.dim(1)?;
-        let mask: Vec<_> = (0..seq_len)
-            .flat_map(|i| (0..seq_len).map(move |j| if j > i { f32::NEG_INFINITY } else { 0f32 }))
-            .collect();
-        let mask = Tensor::from_vec(mask, (seq_len, seq_len), xs.device())?;
-
-        self.decoder
-            .forward(xs, Some(encoder_xs), past_kv_len, &mask)
-    }
-}
--- a/candle-transformers/src/models/vit.rs
+++ b/candle-transformers/src/models/vit.rs
@ -6,16 +6,16 @@ use candle_nn::{layer_norm, LayerNorm, VarBuilder};
 // https://github.com/huggingface/transformers/blob/main/src/transformers/models/vit/configuration_vit.py
 #[derive(Debug, Clone)]
 pub struct Config {
-    pub hidden_size: usize,
-    pub num_hidden_layers: usize,
-    pub num_attention_heads: usize,
-    pub intermediate_size: usize,
-    pub hidden_act: candle_nn::Activation,
-    pub layer_norm_eps: f64,
-    pub image_size: usize,
-    pub patch_size: usize,
-    pub num_channels: usize,
-    pub qkv_bias: bool,
+    hidden_size: usize,
+    num_hidden_layers: usize,
+    num_attention_heads: usize,
+    intermediate_size: usize,
+    hidden_act: candle_nn::Activation,
+    layer_norm_eps: f64,
+    image_size: usize,
+    patch_size: usize,
+    num_channels: usize,
+    qkv_bias: bool,
 }

 impl Config {
@ -34,21 +34,6 @@ impl Config {
            qkv_bias: true,
        }
    }
-
-    pub fn microsoft_trocr_base_handwritten() -> Self {
-        Self {
-            hidden_size: 768,
-            num_hidden_layers: 12,
-            num_attention_heads: 12,
-            intermediate_size: 3072,
-            hidden_act: candle_nn::Activation::Gelu,
-            layer_norm_eps: 1e-12,
-            image_size: 384,
-            patch_size: 16,
-            num_channels: 3,
-            qkv_bias: false,
-        }
-    }
 }

 #[derive(Debug, Clone)]
@ -91,7 +76,7 @@ impl Module for PatchEmbeddings {
 }

 #[derive(Debug, Clone)]
-pub struct Embeddings {
+struct Embeddings {
    cls_token: Tensor,
    mask_token: Option<Tensor>,
    patch_embeddings: PatchEmbeddings,
@ -100,7 +85,7 @@ pub struct Embeddings {
 }

 impl Embeddings {
-    pub fn new(cfg: &Config, use_mask_token: bool, vb: VarBuilder) -> Result<Self> {
+    fn new(cfg: &Config, use_mask_token: bool, vb: VarBuilder) -> Result<Self> {
        let hidden_size = cfg.hidden_size;
        let cls_token = vb.get((1, 1, hidden_size), "cls_token")?;
        let mask_token = if use_mask_token {
@ -130,7 +115,7 @@ impl Embeddings {
        todo!()
    }

-    pub fn forward(
+    fn forward(
        &self,
        pixel_values: &Tensor,
        bool_masked_pos: Option<&Tensor>,
@ -339,12 +324,12 @@ impl Module for Layer {
 }

 #[derive(Debug, Clone)]
-pub struct Encoder {
+struct Encoder {
    layers: Vec<Layer>,
 }

 impl Encoder {
-    pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
+    fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
        let vb = vb.pp("layer");
        let mut layers = Vec::with_capacity(cfg.num_hidden_layers);
        for i in 0..cfg.num_hidden_layers {
--- a/candle-transformers/src/models/yi.rs
+++ b/candle-transformers/src/models/yi.rs
@ -1,377 +0,0 @@
-/// https://huggingface.co/01-ai/Yi-6B/blob/main/modeling_yi.py
-use crate::models::with_tracing::{linear_no_bias, Linear};
-use candle::{DType, Device, Module, Result, Tensor, D};
-use candle_nn::{Activation, VarBuilder};
-use std::sync::Arc;
-
-#[derive(Debug, Clone, PartialEq)]
-pub struct Config {
-    pub(crate) vocab_size: usize,
-    pub(crate) hidden_size: usize,
-    pub(crate) intermediate_size: usize,
-    pub(crate) num_hidden_layers: usize,
-    pub(crate) num_attention_heads: usize,
-    pub(crate) num_key_value_heads: usize,
-    pub(crate) hidden_act: Activation,
-    pub(crate) max_position_embeddings: usize,
-    pub(crate) rms_norm_eps: f64,
-    pub(crate) rope_theta: f64,
-}
-
-impl Config {
-    pub fn config_6b() -> Self {
-        Self {
-            vocab_size: 64000,
-            hidden_size: 4096,
-            intermediate_size: 11008,
-            num_hidden_layers: 32,
-            num_attention_heads: 32,
-            num_key_value_heads: 4,
-            hidden_act: Activation::Silu,
-            max_position_embeddings: 4096,
-            rms_norm_eps: 1e-5,
-            rope_theta: 5_000_000.,
-        }
-    }
-
-    pub fn config_34b() -> Self {
-        Self {
-            vocab_size: 64000,
-            hidden_size: 7168,
-            intermediate_size: 20480,
-            num_hidden_layers: 60,
-            num_attention_heads: 56,
-            num_key_value_heads: 8,
-            hidden_act: Activation::Silu,
-            max_position_embeddings: 4096,
-            rms_norm_eps: 1e-5,
-            rope_theta: 5_000_000.,
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-struct RmsNorm {
-    inner: candle_nn::RmsNorm,
-    span: tracing::Span,
-}
-
-impl RmsNorm {
-    fn new(size: usize, eps: f64, vb: VarBuilder) -> Result<Self> {
-        let span = tracing::span!(tracing::Level::TRACE, "rms-norm");
-        let inner = candle_nn::rms_norm(size, eps, vb)?;
-        Ok(Self { inner, span })
-    }
-}
-
-impl Module for RmsNorm {
-    fn forward(&self, x: &Tensor) -> Result<Tensor> {
-        let _enter = self.span.enter();
-        self.inner.forward(x)
-    }
-}
-
-#[derive(Debug, Clone)]
-struct RotaryEmbedding {
-    sin: Tensor,
-    cos: Tensor,
-}
-
-fn rotate_half(xs: &Tensor) -> Result<Tensor> {
-    let last_dim = xs.dim(D::Minus1)?;
-    let xs1 = xs.narrow(D::Minus1, 0, last_dim / 2)?;
-    let xs2 = xs.narrow(D::Minus1, last_dim / 2, last_dim - last_dim / 2)?;
-    Tensor::cat(&[&xs2.neg()?, &xs1], D::Minus1)
-}
-
-impl RotaryEmbedding {
-    fn new(dtype: DType, cfg: &Config, dev: &Device) -> Result<Self> {
-        let dim = cfg.hidden_size / cfg.num_attention_heads;
-        let max_seq_len = cfg.max_position_embeddings;
-        let inv_freq: Vec<_> = (0..dim)
-            .step_by(2)
-            .map(|i| 1f32 / 10000f32.powf(i as f32 / dim as f32))
-            .collect();
-        let inv_freq_len = inv_freq.len();
-        let inv_freq = Tensor::from_vec(inv_freq, (1, inv_freq_len), dev)?.to_dtype(dtype)?;
-        let t = Tensor::arange(0u32, max_seq_len as u32, dev)?
-            .to_dtype(dtype)?
-            .reshape((max_seq_len, 1))?;
-        let freqs = t.matmul(&inv_freq)?;
-        let freqs = Tensor::cat(&[&freqs, &freqs], D::Minus1)?;
-        Ok(Self {
-            sin: freqs.sin()?,
-            cos: freqs.cos()?,
-        })
-    }
-
-    fn apply_rotary_emb_qkv(
-        &self,
-        q: &Tensor,
-        k: &Tensor,
-        seqlen_offset: usize,
-    ) -> Result<(Tensor, Tensor)> {
-        let (_b_sz, _h, seq_len, _n_embd) = q.dims4()?;
-        let cos = self.cos.narrow(0, seqlen_offset, seq_len)?;
-        let sin = self.sin.narrow(0, seqlen_offset, seq_len)?;
-        let cos = cos.unsqueeze(0)?.unsqueeze(0)?; // (1, 1, seq_len, dim)
-        let sin = sin.unsqueeze(0)?.unsqueeze(0)?; // (1, 1, seq_len, dim)
-        let q_embed = (q.broadcast_mul(&cos)? + rotate_half(q)?.broadcast_mul(&sin))?;
-        let k_embed = (k.broadcast_mul(&cos)? + rotate_half(k)?.broadcast_mul(&sin))?;
-        Ok((q_embed, k_embed))
-    }
-}
-
-#[derive(Debug, Clone)]
-#[allow(clippy::upper_case_acronyms)]
-struct MLP {
-    gate_proj: Linear,
-    up_proj: Linear,
-    down_proj: Linear,
-    act_fn: Activation,
-}
-
-impl MLP {
-    fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
-        let hidden_sz = cfg.hidden_size;
-        let intermediate_sz = cfg.intermediate_size;
-        let gate_proj = linear_no_bias(hidden_sz, intermediate_sz, vb.pp("gate_proj"))?;
-        let up_proj = linear_no_bias(hidden_sz, intermediate_sz, vb.pp("up_proj"))?;
-        let down_proj = linear_no_bias(intermediate_sz, hidden_sz, vb.pp("down_proj"))?;
-        Ok(Self {
-            gate_proj,
-            up_proj,
-            down_proj,
-            act_fn: cfg.hidden_act,
-        })
-    }
-}
-
-impl Module for MLP {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        let lhs = xs.apply(&self.gate_proj)?.apply(&self.act_fn)?;
-        let rhs = xs.apply(&self.up_proj)?;
-        (lhs * rhs)?.apply(&self.down_proj)
-    }
-}
-
-#[derive(Debug, Clone)]
-struct Attention {
-    q_proj: Linear,
-    k_proj: Linear,
-    v_proj: Linear,
-    o_proj: Linear,
-    num_heads: usize,
-    num_kv_heads: usize,
-    num_kv_groups: usize,
-    head_dim: usize,
-    hidden_size: usize,
-    rotary_emb: Arc<RotaryEmbedding>,
-    kv_cache: Option<(Tensor, Tensor)>,
-}
-
-impl Attention {
-    fn new(rotary_emb: Arc<RotaryEmbedding>, cfg: &Config, vb: VarBuilder) -> Result<Self> {
-        let hidden_sz = cfg.hidden_size;
-        let num_heads = cfg.num_attention_heads;
-        let num_kv_heads = cfg.num_key_value_heads;
-        let num_kv_groups = num_heads / num_kv_heads;
-        let head_dim = hidden_sz / num_heads;
-        let q_proj = linear_no_bias(hidden_sz, num_heads * head_dim, vb.pp("q_proj"))?;
-        let k_proj = linear_no_bias(hidden_sz, num_kv_heads * head_dim, vb.pp("k_proj"))?;
-        let v_proj = linear_no_bias(hidden_sz, num_kv_heads * head_dim, vb.pp("v_proj"))?;
-        let o_proj = linear_no_bias(num_heads * head_dim, hidden_sz, vb.pp("o_proj"))?;
-        Ok(Self {
-            q_proj,
-            k_proj,
-            v_proj,
-            o_proj,
-            num_heads,
-            num_kv_heads,
-            num_kv_groups,
-            head_dim,
-            hidden_size: hidden_sz,
-            rotary_emb,
-            kv_cache: None,
-        })
-    }
-
-    fn repeat_kv(&self, xs: Tensor) -> Result<Tensor> {
-        let n_rep = self.num_kv_groups;
-        if n_rep == 1 {
-            Ok(xs)
-        } else {
-            let (b_sz, num_kv_heads, seq_len, head_dim) = xs.dims4()?;
-            xs.unsqueeze(2)?
-                .expand((b_sz, num_kv_heads, n_rep, seq_len, head_dim))?
-                .reshape((b_sz, num_kv_heads * n_rep, seq_len, head_dim))
-        }
-    }
-
-    fn forward(
-        &mut self,
-        xs: &Tensor,
-        attention_mask: Option<&Tensor>,
-        seqlen_offset: usize,
-    ) -> Result<Tensor> {
-        let (b_sz, q_len, _) = xs.dims3()?;
-
-        let query_states = self.q_proj.forward(xs)?;
-        let key_states = self.k_proj.forward(xs)?;
-        let value_states = self.v_proj.forward(xs)?;
-
-        let query_states = query_states
-            .reshape((b_sz, q_len, self.num_heads, self.head_dim))?
-            .transpose(1, 2)?;
-        let key_states = key_states
-            .reshape((b_sz, q_len, self.num_kv_heads, self.head_dim))?
-            .transpose(1, 2)?;
-        let value_states = value_states
-            .reshape((b_sz, q_len, self.num_kv_heads, self.head_dim))?
-            .transpose(1, 2)?;
-
-        let (query_states, key_states) =
-            self.rotary_emb
-                .apply_rotary_emb_qkv(&query_states, &key_states, seqlen_offset)?;
-
-        let (key_states, value_states) = match &self.kv_cache {
-            None => (key_states, value_states),
-            Some((prev_k, prev_v)) => {
-                let key_states = Tensor::cat(&[prev_k, &key_states], 2)?;
-                let value_states = Tensor::cat(&[prev_v, &value_states], 2)?;
-                (key_states, value_states)
-            }
-        };
-        self.kv_cache = Some((key_states.clone(), value_states.clone()));
-
-        let key_states = self.repeat_kv(key_states)?;
-        let value_states = self.repeat_kv(value_states)?;
-
-        let attn_output = {
-            let scale = 1f64 / f64::sqrt(self.head_dim as f64);
-            let attn_weights = (query_states.matmul(&key_states.transpose(2, 3)?)? * scale)?;
-
-            let attn_weights = match attention_mask {
-                None => attn_weights,
-                Some(mask) => attn_weights.broadcast_add(mask)?,
-            };
-            let attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?;
-            attn_weights.matmul(&value_states)?
-        };
-        attn_output
-            .transpose(1, 2)?
-            .reshape((b_sz, q_len, self.hidden_size))?
-            .apply(&self.o_proj)
-    }
-}
-
-#[derive(Debug, Clone)]
-struct DecoderLayer {
-    self_attn: Attention,
-    mlp: MLP,
-    ln1: RmsNorm,
-    ln2: RmsNorm,
-}
-
-impl DecoderLayer {
-    fn new(rotary_emb: Arc<RotaryEmbedding>, cfg: &Config, vb: VarBuilder) -> Result<Self> {
-        let self_attn = Attention::new(rotary_emb, cfg, vb.pp("self_attn"))?;
-        let mlp = MLP::new(cfg, vb.pp("mlp"))?;
-        let ln1 = RmsNorm::new(cfg.hidden_size, cfg.rms_norm_eps, vb.pp("ln1"))?;
-        let ln2 = RmsNorm::new(cfg.hidden_size, cfg.rms_norm_eps, vb.pp("ln2"))?;
-        Ok(Self {
-            self_attn,
-            mlp,
-            ln1,
-            ln2,
-        })
-    }
-
-    fn forward(
-        &mut self,
-        xs: &Tensor,
-        attention_mask: Option<&Tensor>,
-        seqlen_offset: usize,
-    ) -> Result<Tensor> {
-        let residual = xs;
-        let xs = self.ln1.forward(xs)?;
-        let xs = self.self_attn.forward(&xs, attention_mask, seqlen_offset)?;
-        let xs = (xs + residual)?;
-        let residual = &xs;
-        let xs = xs.apply(&self.ln2)?.apply(&self.mlp)?;
-        residual + xs
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct Model {
-    embed_tokens: candle_nn::Embedding,
-    layers: Vec<DecoderLayer>,
-    norm: RmsNorm,
-    lm_head: Linear,
-    device: Device,
-    dtype: DType,
-}
-
-impl Model {
-    pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
-        let vb_m = vb.pp("model");
-        let embed_tokens =
-            candle_nn::embedding(cfg.vocab_size, cfg.hidden_size, vb_m.pp("embed_tokens"))?;
-        let rotary_emb = Arc::new(RotaryEmbedding::new(vb.dtype(), cfg, vb_m.device())?);
-        let mut layers = Vec::with_capacity(cfg.num_hidden_layers);
-        let vb_l = vb_m.pp("layers");
-        for layer_idx in 0..cfg.num_hidden_layers {
-            let layer = DecoderLayer::new(rotary_emb.clone(), cfg, vb_l.pp(layer_idx))?;
-            layers.push(layer)
-        }
-        let norm = RmsNorm::new(cfg.hidden_size, cfg.rms_norm_eps, vb_m.pp("norm"))?;
-        let lm_head = linear_no_bias(cfg.hidden_size, cfg.vocab_size, vb.pp("lm_head"))?;
-        Ok(Self {
-            embed_tokens,
-            layers,
-            norm,
-            lm_head,
-            device: vb.device().clone(),
-            dtype: vb.dtype(),
-        })
-    }
-
-    fn prepare_decoder_attention_mask(
-        &self,
-        b_size: usize,
-        tgt_len: usize,
-        seqlen_offset: usize,
-    ) -> Result<Tensor> {
-        // Sliding window mask?
-        let mask: Vec<_> = (0..tgt_len)
-            .flat_map(|i| (0..tgt_len).map(move |j| if i < j { f32::NEG_INFINITY } else { 0. }))
-            .collect();
-        let mask = Tensor::from_slice(&mask, (tgt_len, tgt_len), &self.device)?;
-        let mask = if seqlen_offset > 0 {
-            let mask0 = Tensor::zeros((tgt_len, seqlen_offset), DType::F32, &self.device)?;
-            Tensor::cat(&[&mask0, &mask], D::Minus1)?
-        } else {
-            mask
-        };
-        mask.expand((b_size, 1, tgt_len, tgt_len + seqlen_offset))?
-            .to_dtype(self.dtype)
-    }
-
-    pub fn forward(&mut self, input_ids: &Tensor, seqlen_offset: usize) -> Result<Tensor> {
-        let (b_size, seq_len) = input_ids.dims2()?;
-        let attention_mask = if seq_len <= 1 {
-            None
-        } else {
-            let mask = self.prepare_decoder_attention_mask(b_size, seq_len, seqlen_offset)?;
-            Some(mask)
-        };
-        let mut xs = self.embed_tokens.forward(input_ids)?;
-        for layer in self.layers.iter_mut() {
-            xs = layer.forward(&xs, attention_mask.as_ref(), seqlen_offset)?
-        }
-        xs.narrow(1, seq_len - 1, 1)?
-            .apply(&self.norm)?
-            .apply(&self.lm_head)
-    }
-}
Author	SHA1	Message	Date
Nicolas Patry	d9c1f7e201	Fixed matmul (display still broken without casting back to CPU first? )	2023-11-10 20:09:25 +01:00
Nicolas Patry	315ba4cf0c	Tmp state.	2023-11-10 15:35:46 +01:00
Nicolas Patry	915f0e5b69	Fixing the kernels + launches to make them faster. Cool work by @ivarflakstad Co-authored-by: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com>	2023-11-10 11:14:51 +01:00
Nicolas Patry	9975f2b239	Adding indexing. Co-authored-by: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com>	2023-11-10 02:18:14 +01:00
Nicolas Patry	d7cc660c68	Refactor to simplify our lives for settings the params in the encoder.	2023-11-10 01:24:49 +01:00
Nicolas Patry	c54ed0ab48	Adding the actual backend	2023-11-09 19:53:14 +01:00
Nicolas Patry	af5e77f409	Remove tracing.	2023-11-09 19:41:08 +01:00
Nicolas Patry	8cf39d27ce	Metal part 1 - Scaffolding for metal.	2023-11-09 19:30:59 +01:00