Compare commits


1 Commit

Author    SHA1          Message                                          Date
          2a890a5e57    Fix the tokenizer initialization for marian.     2023-10-29 21:13:14 +01:00
41 changed files with 148 additions and 3125 deletions

View File

@ -55,8 +55,6 @@ tracing-subscriber = "0.3.7"
wav = "1.0.0"
yoke = { version = "0.7.2", features = ["derive"] }
zip = { version = "0.6.6", default-features = false }
# metal = { git = "https://github.com/ivarflakstad/metal-rs.git", features = ["mps"] }
metal = { path = "../metal-rs", features = ["mps"] }
[profile.release-with-debug]
inherits = "release"

View File

@ -103,8 +103,6 @@ We also provide some command line based examples using state of the art models
evaluation, segmentation).
- [BLIP](./candle-examples/examples/blip/): image to text model, can be used to
generate captions for an image.
- [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation
model, generates the translated text from the input text.
Run them using commands like:
```
@ -176,8 +174,6 @@ If you have an addition to this list, please submit a pull request.
- Wurstchen v2.
- Image to text.
- BLIP.
- Text to text.
- Marian MT (Machine Translation).
- Computer Vision Models.
- DINOv2, ConvMixer, EfficientNet, ResNet, ViT.
- yolo-v3, yolo-v8.

View File

@ -13,8 +13,6 @@ readme = "README.md"
accelerate-src = { workspace = true, optional = true }
byteorder = { workspace = true }
candle-kernels = { path = "../candle-kernels", version = "0.3.0", optional = true }
candle-metal-kernels = { path = "../candle-metal-kernels", version = "0.3.0", optional = true }
metal = { workspace = true, optional = true}
cudarc = { workspace = true, optional = true }
gemm = { workspace = true }
half = { workspace = true }
@ -30,7 +28,6 @@ safetensors = { workspace = true }
thiserror = { workspace = true }
yoke = { workspace = true }
zip = { workspace = true }
tracing = { workspace = true }
[dev-dependencies]
anyhow = { workspace = true }
@ -42,4 +39,3 @@ cuda = ["cudarc", "dep:candle-kernels"]
cudnn = ["cuda", "cudarc/cudnn"]
mkl = ["dep:libc", "dep:intel-mkl-src"]
accelerate = ["dep:libc", "dep:accelerate-src"]
metal = ["dep:candle-metal-kernels", "dep:metal"]

View File

@ -1,6 +1,6 @@
use crate::backend::BackendDevice;
use crate::cpu_backend::CpuDevice;
use crate::{bail, CpuStorage, DType, Result, Shape, Storage, WithDType};
use crate::{CpuStorage, DType, Result, Shape, Storage, WithDType};
/// A `DeviceLocation` represents a physical device whereas multiple `Device`
/// can live on the same location (typically for cuda devices).
@ -8,14 +8,12 @@ use crate::{bail, CpuStorage, DType, Result, Shape, Storage, WithDType};
pub enum DeviceLocation {
Cpu,
Cuda { gpu_id: usize },
Metal,
}
#[derive(Debug, Clone)]
pub enum Device {
Cpu,
Cuda(crate::CudaDevice),
Metal(crate::MetalDevice),
}
pub trait NdArray {
@ -105,14 +103,14 @@ impl<S: WithDType, const N1: usize, const N2: usize, const N3: usize, const N4:
impl<S: NdArray> NdArray for Vec<S> {
fn shape(&self) -> Result<Shape> {
if self.is_empty() {
bail!("empty array")
crate::bail!("empty array")
}
let shape0 = self[0].shape()?;
let n = self.len();
for v in self.iter() {
let shape = v.shape()?;
if shape != shape0 {
bail!("two elements have different shapes {shape:?} {shape0:?}")
crate::bail!("two elements have different shapes {shape:?} {shape0:?}")
}
}
Ok(Shape::from([[n].as_slice(), shape0.dims()].concat()))
@ -130,15 +128,10 @@ impl Device {
Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?))
}
pub fn new_metal(ordinal: usize) -> Result<Self> {
Ok(Self::Metal(crate::MetalDevice::new(ordinal)?))
}
pub fn set_seed(&self, seed: u64) -> Result<()> {
match self {
Self::Cpu => CpuDevice.set_seed(seed),
Self::Cpu => crate::cpu_backend::CpuDevice.set_seed(seed),
Self::Cuda(c) => c.set_seed(seed),
Self::Metal(m) => m.set_seed(seed),
}
}
@ -154,16 +147,21 @@ impl Device {
match self {
Self::Cpu => DeviceLocation::Cpu,
Self::Cuda(device) => device.location(),
Device::Metal(device) => device.location(),
}
}
pub fn is_cpu(&self) -> bool {
matches!(self, Self::Cpu)
match self {
Self::Cpu => true,
Self::Cuda(_) => false,
}
}
pub fn is_cuda(&self) -> bool {
matches!(self, Self::Cuda(_))
match self {
Self::Cpu => false,
Self::Cuda(_) => true,
}
}
pub fn cuda_if_available(ordinal: usize) -> Result<Self> {
@ -190,11 +188,6 @@ impl Device {
let storage = device.rand_uniform(shape, dtype, lo, up)?;
Ok(Storage::Cuda(storage))
}
Device::Metal(_device) => {
// let storage = device.rand_uniform(shape, dtype, lo, up)?;
// Ok(Storage::Metal(storage))
bail!("Metal rand_uniform not implemented")
}
}
}
@ -223,10 +216,6 @@ impl Device {
let storage = device.rand_normal(shape, dtype, mean, std)?;
Ok(Storage::Cuda(storage))
}
Device::Metal(device) => {
let storage = device.rand_normal(shape, dtype, mean, std)?;
Ok(Storage::Metal(storage))
}
}
}
@ -249,10 +238,6 @@ impl Device {
let storage = device.ones_impl(shape, dtype)?;
Ok(Storage::Cuda(storage))
}
Device::Metal(device) => {
let storage = device.ones_impl(shape, dtype)?;
Ok(Storage::Metal(storage))
}
}
}
@ -266,10 +251,6 @@ impl Device {
let storage = device.zeros_impl(shape, dtype)?;
Ok(Storage::Cuda(storage))
}
Device::Metal(device) => {
let storage = device.zeros_impl(shape, dtype)?;
Ok(Storage::Metal(storage))
}
}
}
@ -281,11 +262,6 @@ impl Device {
let storage = device.storage_from_cpu_storage(&storage)?;
Ok(Storage::Cuda(storage))
}
Device::Metal(device) => {
let storage = array.to_cpu_storage();
let storage = device.storage_from_cpu_storage(&storage)?;
Ok(Storage::Metal(storage))
}
}
}
@ -297,11 +273,6 @@ impl Device {
let storage = device.storage_from_cpu_storage(&storage)?;
Ok(Storage::Cuda(storage))
}
Device::Metal(device) => {
let storage = S::to_cpu_storage_owned(data);
let storage = device.storage_from_cpu_storage(&storage)?;
Ok(Storage::Metal(storage))
}
}
}
}
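
To illustrate the doc comment above (several `Device` handles may share one `DeviceLocation`), here is a hedged sketch against the two remaining variants; the crate path is assumed:

```rust
use candle_core::{Device, DeviceLocation};

// Hypothetical helper: render a device's location. `location()` and the
// two variants are taken from the diff above.
fn describe(device: &Device) -> String {
    match device.location() {
        DeviceLocation::Cpu => "cpu".to_string(),
        DeviceLocation::Cuda { gpu_id } => format!("cuda:{gpu_id}"),
    }
}
```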

View File

@ -14,7 +14,6 @@ impl Tensor {
crate::DeviceLocation::Cuda { gpu_id } => {
format!(", cuda:{}", gpu_id)
}
_ => todo!(),
};
write!(f, "Tensor[")?;
@ -477,7 +476,6 @@ impl std::fmt::Display for Tensor {
crate::DeviceLocation::Cuda { gpu_id } => {
format!(", cuda:{}", gpu_id)
}
crate::DeviceLocation::Metal => todo!(),
};
write!(

View File

@ -1,201 +0,0 @@
#![allow(dead_code)]
use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
use crate::{CpuStorage, DType, Error, Layout, Result, Shape};
#[derive(Debug, Clone)]
pub struct MetalDevice;
#[derive(Debug)]
pub struct MetalStorage;
macro_rules! fail {
() => {
unimplemented!("metal support has not been enabled, add `metal` feature to enable.")
};
}
impl crate::backend::BackendStorage for MetalStorage {
type Device = MetalDevice;
fn try_clone(&self, _: &Layout) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn dtype(&self) -> DType {
fail!()
}
fn device(&self) -> &Self::Device {
fail!()
}
fn to_cpu_storage(&self) -> Result<CpuStorage> {
Err(Error::NotCompiledWithMetalSupport)
}
fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn powf(&self, _: &Layout, _: f64) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn elu(&self, _: &Layout, _: f64) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn to_dtype(&self, _: &Layout, _: DType) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn where_cond(&self, _: &Layout, _: &Self, _: &Layout, _: &Self, _: &Layout) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn conv1d(
&self,
_: &Layout,
_: &Self,
_: &Layout,
_: &crate::conv::ParamsConv1D,
) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn conv2d(
&self,
_: &Layout,
_: &Self,
_: &Layout,
_: &crate::conv::ParamsConv2D,
) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn conv_transpose2d(
&self,
_l: &Layout,
_kernel: &Self,
_kernel_l: &Layout,
_params: &crate::conv::ParamsConvTranspose2D,
) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn scatter_add(
&self,
_: &Layout,
_: &Self,
_: &Layout,
_: &Self,
_: &Layout,
_: usize,
) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn index_add(
&self,
_: &Layout,
_: &Self,
_: &Layout,
_: &Self,
_: &Layout,
_: usize,
) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn matmul(
&self,
_: &Self,
_: (usize, usize, usize, usize),
_: &Layout,
_: &Layout,
) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()> {
Err(Error::NotCompiledWithMetalSupport)
}
fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
}
impl crate::backend::BackendDevice for MetalDevice {
type Storage = MetalStorage;
fn new(_: usize) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
fn set_seed(&self, _: u64) -> Result<()> {
Err(Error::NotCompiledWithMetalSupport)
}
fn location(&self) -> crate::DeviceLocation {
fail!()
}
fn same_device(&self, _: &Self) -> bool {
fail!()
}
fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
Err(Error::NotCompiledWithMetalSupport)
}
fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
Err(Error::NotCompiledWithMetalSupport)
}
fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage> {
Err(Error::NotCompiledWithMetalSupport)
}
fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
Err(Error::NotCompiledWithMetalSupport)
}
fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
Err(Error::NotCompiledWithMetalSupport)
}
}

View File

@ -152,9 +152,6 @@ pub enum Error {
#[error("the candle crate has not been built with cuda support")]
NotCompiledWithCudaSupport,
#[error("the candle crate has not been built with metal support")]
NotCompiledWithMetalSupport,
#[error("cannot find tensor {path}")]
CannotFindTensor { path: String },
@ -162,9 +159,6 @@ pub enum Error {
#[error(transparent)]
Cuda(Box<dyn std::error::Error + Send + Sync>),
#[error("Metal error {0}")]
Metal(String),
#[error(transparent)]
TryFromIntError(#[from] core::num::TryFromIntError),

View File

@ -52,10 +52,6 @@ mod dummy_cuda_backend;
pub mod error;
mod indexer;
pub mod layout;
#[cfg(feature = "metal")]
pub mod metal_backend;
#[cfg(feature = "accelerate")]
mod metal_backend;
#[cfg(feature = "mkl")]
mod mkl;
pub mod npy;
@ -91,12 +87,6 @@ pub use cuda_backend::{CudaDevice, CudaStorage};
#[cfg(not(feature = "cuda"))]
pub use dummy_cuda_backend::{CudaDevice, CudaStorage};
#[cfg(feature = "metal")]
pub use metal_backend::{MetalDevice, MetalStorage};
#[cfg(not(feature = "metal"))]
pub use dummy_metal_backend::{MetalDevice, MetalStorage};
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;

View File

@ -1,474 +0,0 @@
use crate::backend::{BackendDevice, BackendStorage};
use crate::bail;
use crate::conv::{ParamsConv1D, ParamsConv2D, ParamsConvTranspose2D};
use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
use crate::{CpuStorage, DType, Layout, Result, Shape};
use candle_metal_kernels;
use core::mem;
use half::{bf16, f16};
use metal;
use metal::mps::matrix::{Matrix, MatrixDescriptor, MatrixMultiplication};
use metal::mps::{Float32, MPSDataType};
use metal::MTLResourceOptions;
/// Metal related errors
#[derive(thiserror::Error, Debug)]
pub enum MetalError {
#[error("metal error")]
Metal,
}
#[derive(Clone)]
pub struct MetalDevice {
device: metal::Device,
_command_queue: metal::CommandQueue,
command_buffer: metal::CommandBuffer,
}
impl std::fmt::Debug for MetalDevice {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "MetalDevice({:?})", self.device.registry_id())
}
}
impl std::ops::Deref for MetalDevice {
type Target = metal::DeviceRef;
fn deref(&self) -> &Self::Target {
&self.device
}
}
impl MetalDevice {
pub fn metal_device(&self) -> &metal::DeviceRef {
self.device.as_ref()
}
pub fn id(&self) -> u64 {
self.registry_id()
}
}
#[derive(Debug, Clone)]
pub struct MetalStorage {
buffer: metal::Buffer,
device: MetalDevice,
dtype: DType,
}
impl BackendStorage for MetalStorage {
type Device = MetalDevice;
fn try_clone(&self, _: &Layout) -> Result<Self> {
Ok(self.clone())
}
fn dtype(&self) -> DType {
self.dtype
}
fn device(&self) -> &Self::Device {
&self.device
}
fn to_cpu_storage(&self) -> Result<CpuStorage> {
match self.dtype {
DType::F32 => {
// self.buffer.read_to_vec(self.buffer.length() as usize / 4);
let mut buffer = vec![0.0; 32000];
buffer[0] = 1.0;
Ok(CpuStorage::F32(buffer))
}
dtype => todo!("Unsupported dtype {dtype:?}"),
}
}
fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self> {
println!("TODO Affine");
Ok(self.clone())
// todo!()
}
fn powf(&self, _: &Layout, _: f64) -> Result<Self> {
todo!()
}
fn elu(&self, _: &Layout, _: f64) -> Result<Self> {
todo!()
}
fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Self> {
println!("TODO reduce_op");
Ok(self.clone())
// todo!()
}
fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
todo!()
}
fn to_dtype(&self, layout: &Layout, dtype: DType) -> Result<Self> {
todo!("Implement {:?} {layout:?} - {dtype:?}", self.dtype)
}
fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self> {
// todo!()
// TODO
println!("TODO {:?}", B::NAME);
Ok(self.clone())
}
fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
println!("TODO Binary {:?}", B::NAME);
Ok(self.clone())
// todo!()
}
fn where_cond(&self, _: &Layout, rhs: &Self, _: &Layout, _: &Self, _: &Layout) -> Result<Self> {
println!("TODO where_cond");
Ok(rhs.clone())
// todo!()
}
fn conv1d(
&self,
_l: &Layout,
_kernel: &Self,
_kernel_l: &Layout,
_params: &ParamsConv1D,
) -> Result<Self> {
todo!()
}
fn conv2d(
&self,
_l: &Layout,
_kernel: &Self,
_kernel_l: &Layout,
_params: &ParamsConv2D,
) -> Result<Self> {
todo!()
}
fn conv_transpose2d(
&self,
_l: &Layout,
_kernel: &Self,
_kernel_l: &Layout,
_params: &ParamsConvTranspose2D,
) -> Result<Self> {
todo!()
}
fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
todo!()
}
fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
todo!()
}
fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
todo!()
}
fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self> {
todo!()
}
fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self> {
todo!()
}
fn scatter_add(
&self,
_: &Layout,
_: &Self,
_: &Layout,
_: &Self,
_: &Layout,
_: usize,
) -> Result<Self> {
todo!()
}
fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> Result<Self> {
println!("TODO Index select");
Ok(self.clone())
// todo!()
}
fn index_add(
&self,
_: &Layout,
_: &Self,
_: &Layout,
_: &Self,
_: &Layout,
_: usize,
) -> Result<Self> {
todo!()
}
fn matmul(
&self,
rhs: &Self,
(b, m, n, k): (usize, usize, usize, usize),
lhs_l: &Layout,
rhs_l: &Layout,
) -> Result<Self> {
let transpose_left = false;
let transpose_right = false;
let alpha = 1.0;
let beta = 0.0;
self.matmul_generic(
rhs,
(b, m, n, k),
lhs_l,
rhs_l,
transpose_left,
transpose_right,
alpha,
beta,
)
}
fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()> {
println!("TODO Copy strided");
Ok(())
}
}
impl MetalStorage {
pub(crate) fn matmul_t(
&self,
rhs: &Self,
(b, m, n, k): (usize, usize, usize, usize),
lhs_l: &Layout,
rhs_l: &Layout,
) -> Result<Self> {
let transpose_left = false;
let transpose_right = true;
let alpha = 1.0;
let beta = 0.0;
self.matmul_generic(
rhs,
(b, m, n, k),
lhs_l,
rhs_l,
transpose_left,
transpose_right,
alpha,
beta,
)
}
pub(crate) fn matmul_generic(
&self,
rhs: &Self,
(b, m, n, k): (usize, usize, usize, usize),
lhs_l: &Layout,
rhs_l: &Layout,
transpose_left: bool,
transpose_right: bool,
alpha: f64,
beta: f64,
) -> Result<Self> {
let elem_count = b * m * n;
match (self.dtype, rhs.dtype) {
(DType::F32, DType::F32) => {
let span = tracing::span!(tracing::Level::TRACE, "metal alloc matmul");
let _enter = span.enter();
let out_buffer = self.device.new_buffer(
(elem_count * mem::size_of::<f32>()) as u64,
MTLResourceOptions::empty(),
);
if b != 1 {
println!("TODO implement batched matmul for B={b}");
// bail!("Didn't implemented strided matmul yet");
return Ok(Self {
buffer: out_buffer,
device: self.device.clone(),
dtype: self.dtype(),
});
}
if !lhs_l.is_contiguous() || !rhs_l.is_contiguous() {
println!("Didn't implemented non contiguous matmul yet {:?} {:?}", lhs_l.is_contiguous(), rhs_l.is_contiguous());
return Ok(Self {
buffer: out_buffer,
device: self.device.clone(),
dtype: self.dtype(),
});
}
return Ok(Self {
buffer: out_buffer,
device: self.device.clone(),
dtype: self.dtype(),
});
let m: u64 = m.try_into().expect("usize should fit u64");
let n: u64 = n.try_into().expect("usize should fit u64");
let k: u64 = k.try_into().expect("usize should fit u64");
// Create descriptors
let left_descriptor =
MatrixDescriptor::init_single(m, k, k * Float32::SIZE, Float32::TYPE_ID);
let right_descriptor =
MatrixDescriptor::init_single(k, n, n * Float32::SIZE, Float32::TYPE_ID);
let result_descriptor =
MatrixDescriptor::init_single(m, n, n * Float32::SIZE, Float32::TYPE_ID);
println!("lhs {:?} {m} {k}", self.buffer.length());
println!("rhs {:?} {k} {n}", rhs.buffer.length());
println!("out {:?} {m} {n}", out_buffer.length());
// Create matrix objects
let left_matrix =
Matrix::init_with_buffer_descriptor(&self.buffer, &left_descriptor)
.expect("Failed to create left matrix");
let right_matrix =
Matrix::init_with_buffer_descriptor(&rhs.buffer, &right_descriptor)
.expect("Failed to create right matrix");
let result_matrix =
Matrix::init_with_buffer_descriptor(&out_buffer, &result_descriptor)
.expect("Failed to create result matrix");
println!("lhs {:?}", lhs_l.shape());
// Create kernel
let matrix_multiplication = MatrixMultiplication::init(
&self.device,
transpose_left,
transpose_right,
m,
n,
k,
alpha,
beta,
)
.expect("Failed to create matrix multiplication kernel");
// Encode kernel to command buffer
matrix_multiplication.encode_to_command_buffer(
&self.device.command_buffer,
&left_matrix,
&right_matrix,
&result_matrix,
);
Ok(Self {
buffer: out_buffer,
device: self.device.clone(),
dtype: self.dtype(),
})
}
_ => todo!("Unimplemented matmul for this pair"),
}
}
}
impl MetalDevice {
pub fn flush(&mut self) {
self.command_buffer.commit();
self.command_buffer.wait_until_completed();
self.command_buffer = self._command_queue.new_owned_command_buffer();
}
}
impl BackendDevice for MetalDevice {
type Storage = MetalStorage;
fn new(ordinal: usize) -> Result<Self> {
let device = metal::Device::all().swap_remove(ordinal);
let _command_queue = device.new_command_queue();
let command_buffer = _command_queue.new_owned_command_buffer();
Ok(Self {
device,
_command_queue,
command_buffer,
})
}
fn set_seed(&self, _seed: u64) -> Result<()> {
todo!("set_seed")
}
fn location(&self) -> crate::DeviceLocation {
crate::DeviceLocation::Metal
}
fn same_device(&self, rhs: &Self) -> bool {
self.device.registry_id() == rhs.device.registry_id()
}
fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<MetalStorage> {
// TODO Is there a faster way ?
let cpu_storage = crate::cpu_backend::CpuDevice.zeros_impl(shape, dtype)?;
self.storage_from_cpu_storage(&cpu_storage)
}
fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result<Self::Storage> {
// TODO Is there a faster way ?
let cpu_storage = crate::cpu_backend::CpuDevice.ones_impl(shape, dtype)?;
self.storage_from_cpu_storage(&cpu_storage)
}
fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<Self::Storage> {
let option = metal::MTLResourceOptions::CPUCacheModeDefaultCache;
let span = tracing::span!(tracing::Level::TRACE, "metal alloc");
let _enter = span.enter();
let buffer = self.device.new_buffer(4, option);
// let buffer = match storage {
// CpuStorage::U8(storage) => self.device.new_buffer_with_data(
// storage.as_ptr() as *const core::ffi::c_void,
// (storage.len() * mem::size_of::<u8>()) as u64,
// option,
// ),
// CpuStorage::U32(storage) => self.device.new_buffer_with_data(
// storage.as_ptr() as *const core::ffi::c_void,
// (storage.len() * mem::size_of::<u32>()) as u64,
// option,
// ),
// CpuStorage::I64(storage) => self.device.new_buffer_with_data(
// storage.as_ptr() as *const core::ffi::c_void,
// (storage.len() * mem::size_of::<i64>()) as u64,
// option,
// ),
// CpuStorage::BF16(storage) => self.device.new_buffer_with_data(
// storage.as_ptr() as *const core::ffi::c_void,
// (storage.len() * mem::size_of::<bf16>()) as u64,
// option,
// ),
// CpuStorage::F16(storage) => self.device.new_buffer_with_data(
// storage.as_ptr() as *const core::ffi::c_void,
// (storage.len() * mem::size_of::<f16>()) as u64,
// option,
// ),
// CpuStorage::F32(storage) => self.device.new_buffer_with_data(
// storage.as_ptr() as *const core::ffi::c_void,
// (storage.len() * mem::size_of::<f32>()) as u64,
// option,
// ),
// CpuStorage::F64(storage) => self.device.new_buffer_with_data(
// storage.as_ptr() as *const core::ffi::c_void,
// (storage.len() * mem::size_of::<f64>()) as u64,
// option,
// ),
// };
Ok(Self::Storage {
buffer,
device: self.clone(),
dtype: storage.dtype(),
})
}
fn rand_uniform(&self, shape: &Shape, dtype: DType, mean: f64, stddev: f64) -> Result<Self::Storage> {
// TODO is there a better way ?
let cpu_storage = crate::cpu_backend::CpuDevice.rand_uniform(shape, dtype, mean, stddev)?;
self.storage_from_cpu_storage(&cpu_storage)
}
fn rand_normal(&self, shape: &Shape, dtype: DType, mean: f64, stddev: f64) -> Result<Self::Storage> {
// TODO is there a better way ?
let cpu_storage = crate::cpu_backend::CpuDevice.rand_normal(shape, dtype, mean, stddev)?;
self.storage_from_cpu_storage(&cpu_storage)
}
}

View File

@ -1,5 +1,5 @@
#![allow(clippy::redundant_closure_call)]
use crate::{CpuStorage, CudaStorage, Layout, MetalStorage, Result, Shape, Tensor};
use crate::{CpuStorage, CudaStorage, Layout, Result, Shape, Tensor};
use half::{bf16, f16};
use num_traits::float::Float;
@ -174,18 +174,6 @@ pub trait CustomOp1 {
))
}
/// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
/// offsets etc so the associated layout should be used to access it.
fn metal_fwd(
&self,
_storage: &MetalStorage,
_layout: &Layout,
) -> Result<(MetalStorage, Shape)> {
Err(crate::Error::Metal(
format!("no metal implementation for {}", self.name()).into(),
))
}
/// This function takes as argument the argument `arg` used in the forward pass, the result
/// produced by the forward operation `res` and the gradient of the result `grad_res`.
/// The function should return the gradient of the argument.
@ -221,20 +209,6 @@ pub trait CustomOp2 {
))
}
/// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
/// offsets etc so the associated layout should be used to access it.
fn metal_fwd(
&self,
_: &MetalStorage,
_: &Layout,
_: &MetalStorage,
_: &Layout,
) -> Result<(MetalStorage, Shape)> {
Err(crate::Error::Metal(
format!("no metal implementation for {}", self.name()).into(),
))
}
fn bwd(
&self,
_arg1: &Tensor,
@ -277,22 +251,6 @@ pub trait CustomOp3 {
))
}
/// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
/// offsets etc so the associated layout should be used to access it.
fn metal_fwd(
&self,
_: &MetalStorage,
_: &Layout,
_: &MetalStorage,
_: &Layout,
_: &MetalStorage,
_: &Layout,
) -> Result<(MetalStorage, Shape)> {
Err(crate::Error::Metal(
format!("no metal implementation for {}", self.name()).into(),
))
}
fn bwd(
&self,
_arg1: &Tensor,

View File

@ -1,7 +1,7 @@
//! Support for the GGML file format.
use super::{k_quants, GgmlDType};
use crate::{Device, Result};
use crate::Result;
use byteorder::{LittleEndian, ReadBytesExt};
use std::collections::HashMap;
@ -121,12 +121,11 @@ fn from_raw_data<T: super::GgmlType + Send + Sync + 'static>(
raw_data: &[u8],
size_in_bytes: usize,
dims: Vec<usize>,
device: &Device,
) -> Result<super::QTensor> {
let raw_data_ptr = raw_data.as_ptr();
let n_blocks = size_in_bytes / std::mem::size_of::<T>();
let data = unsafe { std::slice::from_raw_parts(raw_data_ptr as *const T, n_blocks) };
super::QTensor::new(data.to_vec(), dims, device)
super::QTensor::new(data.to_vec(), dims)
}
/// Creates a [Tensor] from a raw GGML tensor.
@ -134,7 +133,6 @@ pub fn qtensor_from_ggml(
ggml_dtype: GgmlDType,
raw_data: &[u8],
dims: Vec<usize>,
device: &Device,
) -> Result<super::QTensor> {
let tensor_elems = dims.iter().product::<usize>();
let blck_size = ggml_dtype.blck_size();
@ -146,38 +144,18 @@ pub fn qtensor_from_ggml(
let size_in_bytes = tensor_elems / blck_size * ggml_dtype.type_size();
match ggml_dtype {
GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims, device),
GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims, device),
GgmlDType::Q4_0 => {
from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q4_1 => {
from_raw_data::<k_quants::BlockQ4_1>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q5_0 => {
from_raw_data::<k_quants::BlockQ5_0>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q5_1 => {
from_raw_data::<k_quants::BlockQ5_1>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q8_0 => {
from_raw_data::<k_quants::BlockQ8_0>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q2K => {
from_raw_data::<k_quants::BlockQ2K>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q3K => {
from_raw_data::<k_quants::BlockQ3K>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q4K => {
from_raw_data::<k_quants::BlockQ4K>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q5K => {
from_raw_data::<k_quants::BlockQ5K>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::Q6K => {
from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims, device)
}
GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims),
GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims),
GgmlDType::Q4_0 => from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims),
GgmlDType::Q4_1 => from_raw_data::<k_quants::BlockQ4_1>(raw_data, size_in_bytes, dims),
GgmlDType::Q5_0 => from_raw_data::<k_quants::BlockQ5_0>(raw_data, size_in_bytes, dims),
GgmlDType::Q5_1 => from_raw_data::<k_quants::BlockQ5_1>(raw_data, size_in_bytes, dims),
GgmlDType::Q8_0 => from_raw_data::<k_quants::BlockQ8_0>(raw_data, size_in_bytes, dims),
GgmlDType::Q2K => from_raw_data::<k_quants::BlockQ2K>(raw_data, size_in_bytes, dims),
GgmlDType::Q3K => from_raw_data::<k_quants::BlockQ3K>(raw_data, size_in_bytes, dims),
GgmlDType::Q4K => from_raw_data::<k_quants::BlockQ4K>(raw_data, size_in_bytes, dims),
GgmlDType::Q5K => from_raw_data::<k_quants::BlockQ5K>(raw_data, size_in_bytes, dims),
GgmlDType::Q6K => from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims),
_ => crate::bail!("quantized type {ggml_dtype:?} is not supported yet"),
}
}
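
To make the size computation above concrete: in the GGML format, Q4_0 packs 32 elements into an 18-byte block (a 2-byte f16 scale plus 16 bytes of 4-bit quants). These constants come from the GGML spec, not from this diff; a worked sketch:

```rust
// Worked instance of `size_in_bytes = tensor_elems / blck_size * type_size`.
let tensor_elems: usize = 4096;
let blck_size: usize = 32; // elements per Q4_0 block
let type_size: usize = 18; // bytes per Q4_0 block
assert_eq!(tensor_elems % blck_size, 0, "shape must align with block size");
let size_in_bytes = tensor_elems / blck_size * type_size;
assert_eq!(size_in_bytes, 2304);
```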
@ -185,7 +163,6 @@ pub fn qtensor_from_ggml(
fn read_one_tensor<R: std::io::Seek + std::io::Read>(
reader: &mut R,
magic: VersionedMagic,
device: &Device,
) -> Result<(String, super::QTensor)> {
let n_dims = reader.read_u32::<LittleEndian>()?;
let name_len = reader.read_u32::<LittleEndian>()?;
@ -210,7 +187,7 @@ fn read_one_tensor<R: std::io::Seek + std::io::Read>(
// TODO: Mmap version to avoid copying the data around?
let mut raw_data = vec![0u8; size_in_bytes];
reader.read_exact(&mut raw_data)?;
match qtensor_from_ggml(ggml_dtype, &raw_data, dims, device) {
match qtensor_from_ggml(ggml_dtype, &raw_data, dims) {
Ok(tensor) => Ok((name, tensor)),
Err(e) => crate::bail!("Error creating tensor {name}: {e}"),
}
@ -224,10 +201,7 @@ pub struct Content {
}
impl Content {
pub fn read<R: std::io::Seek + std::io::Read>(
reader: &mut R,
device: &Device,
) -> Result<Content> {
pub fn read<R: std::io::Seek + std::io::Read>(reader: &mut R) -> Result<Content> {
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.cpp#L505
let last_position = reader.seek(std::io::SeekFrom::End(0))?;
reader.seek(std::io::SeekFrom::Start(0))?;
@ -237,7 +211,7 @@ impl Content {
let mut tensors = HashMap::new();
while reader.stream_position()? != last_position {
let (name, tensor) = read_one_tensor(reader, magic, device)?;
let (name, tensor) = read_one_tensor(reader, magic)?;
tensors.insert(name, tensor);
}
Ok(Self {

View File

@ -3,7 +3,7 @@
//! Spec: https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md
use super::{GgmlDType, QTensor};
use crate::{Device, Result};
use crate::Result;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::collections::HashMap;
@ -57,7 +57,6 @@ impl TensorInfo {
&self,
reader: &mut R,
tensor_data_offset: u64,
device: &Device,
) -> Result<QTensor> {
let tensor_elems = self.shape.elem_count();
let blck_size = self.ggml_dtype.blck_size();
@ -70,12 +69,7 @@ impl TensorInfo {
let mut raw_data = vec![0u8; size_in_bytes];
reader.seek(std::io::SeekFrom::Start(tensor_data_offset + self.offset))?;
reader.read_exact(&mut raw_data)?;
super::ggml_file::qtensor_from_ggml(
self.ggml_dtype,
&raw_data,
self.shape.dims().to_vec(),
device,
)
super::ggml_file::qtensor_from_ggml(self.ggml_dtype, &raw_data, self.shape.dims().to_vec())
}
}
@ -456,13 +450,12 @@ impl Content {
&self,
reader: &mut R,
name: &str,
device: &Device,
) -> Result<QTensor> {
let tensor_info = match self.tensor_infos.get(name) {
Some(tensor_info) => tensor_info,
None => crate::bail!("cannot find tensor-info for {name}"),
};
tensor_info.read(reader, self.tensor_data_offset, device)
tensor_info.read(reader, self.tensor_data_offset)
}
}

View File

@ -14,7 +14,6 @@ pub mod utils;
pub use k_quants::GgmlType;
pub struct QTensor {
device: Device,
data: Box<dyn QuantizedType>,
shape: Shape,
}
@ -171,20 +170,17 @@ impl QTensor {
pub fn new<S: Into<Shape>, T: k_quants::GgmlType + Send + Sync + 'static>(
data: Vec<T>,
shape: S,
device: &Device,
) -> Result<Self> {
let shape = shape.into();
check_shape::<T>(&shape)?;
Ok(Self {
data: Box::new(data),
shape,
device: device.clone(),
})
}
pub fn quantize<T: k_quants::GgmlType + Send + Sync + 'static>(src: &Tensor) -> Result<Self> {
let shape = src.shape();
let device = src.device();
check_shape::<T>(shape)?;
let src = src
.to_dtype(crate::DType::F32)?
@ -201,7 +197,6 @@ impl QTensor {
Ok(Self {
data: Box::new(data),
shape: shape.clone(),
device: device.clone(),
})
}
@ -217,12 +212,7 @@ impl QTensor {
&self.shape
}
pub fn device(&self) -> &Device {
&self.device
}
pub fn dequantize(&self, device: &Device) -> Result<Tensor> {
// TODO Skip the CPU part on metal
let mut f32_data = vec![0f32; self.shape.elem_count()];
self.data.to_float(&mut f32_data)?;
Tensor::from_vec(f32_data, &self.shape, device)
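
A hedged usage sketch of the device-free `QTensor` API after this change (module paths assumed): quantize on the CPU, then dequantize onto a target device.

```rust
use candle_core::quantized::{k_quants, QTensor};
use candle_core::{Device, Result, Tensor};

// Round-trip an f32 tensor through 8-bit quantized blocks.
fn roundtrip(src: &Tensor, device: &Device) -> Result<Tensor> {
    let q = QTensor::quantize::<k_quants::BlockQ8_0>(src)?;
    q.dequantize(device)
}
```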
@ -315,49 +305,6 @@ impl crate::CustomOp1 for QTensor {
)?;
Ok((crate::CpuStorage::F32(dst_storage), dst_shape))
}
fn metal_fwd(
&self,
storage: &crate::MetalStorage,
layout: &crate::Layout,
) -> Result<(crate::MetalStorage, Shape)> {
println!("TODO qmatmul");
if !layout.is_contiguous() {
crate::bail!("input tensor is not contiguous {layout:?}")
}
let src_shape = layout.shape();
// self is transposed so n is first then k.
let (n, k) = self.shape.dims2()?;
if src_shape.rank() < 2 {
crate::bail!("input tensor has only one dimension {layout:?}")
}
let mut dst_shape = src_shape.dims().to_vec();
let last_k = dst_shape.pop().unwrap();
if last_k != k {
crate::bail!("input tensor {layout:?} incompatible with {:?}", self.shape)
}
dst_shape.push(n);
let dst_shape = Shape::from(dst_shape);
// let storage = storage.as_slice::<f32>()?;
// let storage =
// &storage[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
let dst_storage = vec![0f32; dst_shape.elem_count()];
// self.matmul_t(
// (dst_shape.elem_count() / n, k, n),
// storage,
// &mut dst_storage,
// )?;
let cpu_storage = crate::CpuStorage::F32(dst_storage);
use crate::backend::{BackendDevice, BackendStorage};
if let Device::Metal(device) = &self.device {
Ok((
device.storage_from_cpu_storage(&cpu_storage)?,
dst_shape,
))
} else {
crate::bail!("qtensor not on metal device")
}
}
}
impl QMatMul {

View File

@ -203,7 +203,7 @@ impl Shape {
/// Check whether the two shapes are compatible for broadcast, and if it is the case return the
/// broadcasted shape. This is to be used for binary pointwise ops.
pub fn broadcast_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<Shape> {
pub(crate) fn broadcast_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<Shape> {
let lhs = self;
let lhs_dims = lhs.dims();
let rhs_dims = rhs.dims();
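
The broadcast rule the doc comment refers to can be stated standalone: align the two shapes from the right, and each dimension pair must be equal or contain a 1. A hedged sketch (not the crate's implementation):

```rust
// Returns the broadcast shape, or None when the shapes are incompatible.
fn broadcast_dims(lhs: &[usize], rhs: &[usize]) -> Option<Vec<usize>> {
    let n = lhs.len().max(rhs.len());
    let mut out = Vec::with_capacity(n);
    for i in 0..n {
        // Missing leading dimensions behave like 1.
        let l = if i < lhs.len() { lhs[lhs.len() - 1 - i] } else { 1 };
        let r = if i < rhs.len() { rhs[rhs.len() - 1 - i] } else { 1 };
        match (l, r) {
            (l, r) if l == r => out.push(l),
            (1, r) => out.push(r),
            (l, 1) => out.push(l),
            _ => return None, // e.g. 3 vs 2 in the same position
        }
    }
    out.reverse();
    Some(out)
}

// broadcast_dims(&[2, 1, 4], &[3, 4]) == Some(vec![2, 3, 4])
```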

View File

@ -1,6 +1,6 @@
use crate::backend::BackendStorage;
use crate::op::{self, CmpOp, CustomOp1, CustomOp2, CustomOp3, ReduceOp};
use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage, Result, Shape};
use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, Result, Shape};
// We do not want to implement Clone on Storage as cloning may fail because of
// out of memory. Instead try_clone should be used.
@ -8,7 +8,6 @@ use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage,
pub enum Storage {
Cpu(CpuStorage),
Cuda(CudaStorage),
Metal(MetalStorage),
}
impl Storage {
@ -19,10 +18,6 @@ impl Storage {
let storage = storage.try_clone(layout)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.try_clone(layout)?;
Ok(Self::Metal(storage))
}
}
}
@ -30,7 +25,6 @@ impl Storage {
match self {
Self::Cpu(_) => Device::Cpu,
Self::Cuda(storage) => Device::Cuda(storage.device().clone()),
Self::Metal(storage) => Device::Metal(storage.device().clone()),
}
}
@ -38,7 +32,6 @@ impl Storage {
match self {
Self::Cpu(storage) => storage.dtype(),
Self::Cuda(storage) => storage.dtype(),
Self::Metal(storage) => storage.dtype(),
}
}
@ -72,10 +65,6 @@ impl Storage {
let storage = storage.affine(layout, mul, add)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.affine(layout, mul, add)?;
Ok(Self::Metal(storage))
}
}
}
@ -89,10 +78,6 @@ impl Storage {
let storage = storage.powf(layout, alpha)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.powf(layout, alpha)?;
Ok(Self::Metal(storage))
}
}
}
@ -106,10 +91,6 @@ impl Storage {
let storage = storage.elu(layout, alpha)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.elu(layout, alpha)?;
Ok(Self::Metal(storage))
}
}
}
@ -131,10 +112,6 @@ impl Storage {
let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(lhs), Self::Metal(rhs)) => {
let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
Ok(Self::Metal(storage))
}
(lhs, rhs) => {
// Should not happen because of the same device check above but we're defensive
// anyway.
@ -158,10 +135,6 @@ impl Storage {
let storage = storage.reduce_op(op, layout, s)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.reduce_op(op, layout, s)?;
Ok(Self::Metal(storage))
}
}
}
@ -175,10 +148,6 @@ impl Storage {
let storage = storage.to_dtype(layout, dtype)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.to_dtype(layout, dtype)?;
Ok(Self::Metal(storage))
}
}
}
@ -192,10 +161,6 @@ impl Storage {
let (storage, shape) = c.cuda_fwd(storage, l)?;
Ok((Self::Cuda(storage), shape))
}
Self::Metal(storage) => {
let (storage, shape) = c.metal_fwd(storage, l)?;
Ok((Self::Metal(storage), shape))
}
}
}
@ -216,10 +181,6 @@ impl Storage {
let (s, shape) = c.cuda_fwd(s1, l1, s2, l2)?;
Ok((Self::Cuda(s), shape))
}
(Self::Metal(s1), Self::Metal(s2)) => {
let (s, shape) = c.metal_fwd(s1, l1, s2, l2)?;
Ok((Self::Metal(s), shape))
}
_ => unreachable!(),
}
}
@ -244,10 +205,6 @@ impl Storage {
let (s, shape) = c.cuda_fwd(s1, l1, s2, l2, s3, l3)?;
Ok((Self::Cuda(s), shape))
}
(Self::Metal(s1), Self::Metal(s2), Self::Metal(s3)) => {
let (s, shape) = c.metal_fwd(s1, l1, s2, l2, s3, l3)?;
Ok((Self::Metal(s), shape))
}
_ => unreachable!(),
}
}
@ -262,10 +219,6 @@ impl Storage {
let storage = storage.unary_impl::<B>(layout)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.unary_impl::<B>(layout)?;
Ok(Self::Metal(storage))
}
}
}
@ -286,10 +239,6 @@ impl Storage {
let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(lhs), Self::Metal(rhs)) => {
let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
Ok(Self::Metal(storage))
}
(lhs, rhs) => {
// Should not happen because of the same device check above but we're defensive
// anyway.
@ -321,10 +270,6 @@ impl Storage {
let s = inp.conv1d(l, kernel, kernel_l, params)?;
Ok(Self::Cuda(s))
}
(Storage::Metal(inp), Storage::Metal(kernel)) => {
let s = inp.conv1d(l, kernel, kernel_l, params)?;
Ok(Self::Metal(s))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
@ -352,10 +297,6 @@ impl Storage {
let s = inp.conv2d(l, kernel, kernel_l, params)?;
Ok(Self::Cuda(s))
}
(Storage::Metal(inp), Storage::Metal(kernel)) => {
let s = inp.conv2d(l, kernel, kernel_l, params)?;
Ok(Self::Metal(s))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
@ -383,10 +324,6 @@ impl Storage {
let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
Ok(Self::Cuda(s))
}
(Storage::Metal(inp), Storage::Metal(kernel)) => {
let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
Ok(Self::Metal(s))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
@ -411,10 +348,6 @@ impl Storage {
let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
Ok(Self::Metal(storage))
}
}
}
@ -433,10 +366,6 @@ impl Storage {
let storage = storage.max_pool2d(layout, kernel_size, stride)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.max_pool2d(layout, kernel_size, stride)?;
Ok(Self::Metal(storage))
}
}
}
@ -450,10 +379,6 @@ impl Storage {
let storage = storage.upsample_nearest1d(layout, sz)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.upsample_nearest1d(layout, sz)?;
Ok(Self::Metal(storage))
}
}
}
@ -467,10 +392,6 @@ impl Storage {
let storage = storage.upsample_nearest2d(layout, h, w)?;
Ok(Self::Cuda(storage))
}
Self::Metal(storage) => {
let storage = storage.upsample_nearest2d(layout, h, w)?;
Ok(Self::Metal(storage))
}
}
}
@ -494,10 +415,6 @@ impl Storage {
let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(cond), Self::Metal(t), Self::Metal(f)) => {
let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
Ok(Self::Metal(storage))
}
(_, lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
@ -524,10 +441,6 @@ impl Storage {
let storage = s.gather(l, indexes, indexes_l, d)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(s), Self::Metal(indexes)) => {
let storage = s.gather(l, indexes, indexes_l, d)?;
Ok(Self::Metal(storage))
}
_ => unreachable!(),
}
}
@ -552,10 +465,6 @@ impl Storage {
let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
Ok(Self::Metal(storage))
}
_ => unreachable!(),
}
}
@ -580,10 +489,6 @@ impl Storage {
let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
Ok(Self::Metal(storage))
}
_ => unreachable!(),
}
}
@ -605,10 +510,6 @@ impl Storage {
let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(lhs), Self::Metal(rhs)) => {
let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
Ok(Self::Metal(storage))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
@ -636,10 +537,6 @@ impl Storage {
let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
Ok(Self::Cuda(storage))
}
(Self::Metal(lhs), Self::Metal(rhs)) => {
let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
Ok(Self::Metal(storage))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
@ -659,9 +556,6 @@ impl Storage {
match (self, dst) {
(Self::Cpu(src), Self::Cpu(dst)) => src.copy_strided_src(dst, dst_offset, src_l),
(Self::Cuda(src), Self::Cuda(dst)) => Ok(src.copy_strided_src(dst, dst_offset, src_l)?),
(Self::Metal(src), Self::Metal(dst)) => {
Ok(src.copy_strided_src(dst, dst_offset, src_l)?)
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),

View File

@ -6,7 +6,7 @@ use crate::op::{
};
use crate::scalar::TensorOrScalar;
use crate::shape::{Dim, Dims};
use crate::{bail, storage::Storage, DType, Device, Error, Layout, Result, Shape};
use crate::{storage::Storage, DType, Device, Error, Layout, Result, Shape};
use std::sync::{Arc, RwLock};
/// Unique identifier for tensors.
@ -385,21 +385,11 @@ impl Tensor {
step: D,
device: &Device,
) -> Result<Self> {
if D::is_zero(&step) {
crate::bail!("step cannot be zero")
}
let mut data = vec![];
let mut current = start;
if step >= D::zero() {
while current < end {
data.push(current);
current += step;
}
} else {
while current > end {
data.push(current);
current += step;
}
while current < end {
data.push(current);
current += step;
}
let len = data.len();
Self::from_vec_impl(data, len, device, false)
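
Note that the simplified loop above only behaves for positive steps: with `start >= end` it returns an empty tensor, and with `step <= 0` and `start < end` it would never terminate (the removed zero-step guard and descending branch handled those cases). A standalone sketch of the resulting semantics:

```rust
// Hypothetical scalar version of the simplified arange_step logic above.
fn arange_step(start: f64, end: f64, step: f64) -> Vec<f64> {
    let mut data = Vec::new();
    let mut current = start;
    while current < end {
        data.push(current);
        current += step;
    }
    data
}

// arange_step(0.0, 5.0, 2.0) -> [0.0, 2.0, 4.0]
// arange_step(5.0, 0.0, -1.0) -> [] (descending ranges are no longer produced)
```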
@ -523,7 +513,6 @@ impl Tensor {
match &*self.storage() {
Storage::Cpu(cpu_storage) => from_cpu_storage(cpu_storage),
Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
}
}
@ -1449,7 +1438,6 @@ impl Tensor {
match &*self.storage() {
Storage::Cpu(storage) => from_cpu_storage(storage),
Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
}
}
@ -1480,7 +1468,6 @@ impl Tensor {
match &*self.storage() {
Storage::Cpu(storage) => from_cpu_storage(storage),
Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
}
}
@ -1521,7 +1508,6 @@ impl Tensor {
match &*self.storage() {
Storage::Cpu(storage) => from_cpu_storage(storage),
Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
}
}
@ -1841,9 +1827,6 @@ impl Tensor {
Storage::Cuda(cuda.storage_from_cpu_storage(&cpu_storage)?)
}
(Storage::Cpu(storage), Device::Cpu) => Storage::Cpu(storage.clone()),
_ => {
bail!("not implemented yet")
}
};
let op = BackpropOp::new1(self, Op::ToDevice);
let tensor_ = Tensor_ {

View File

@ -23,10 +23,6 @@ pub fn cuda_is_available() -> bool {
cfg!(feature = "cuda")
}
pub fn metal_is_available() -> bool {
cfg!(feature = "metal")
}
pub fn with_avx() -> bool {
cfg!(target_feature = "avx")
}

View File

@ -29,26 +29,7 @@ fn ones(device: &Device) -> Result<()> {
Tensor::ones((2, 3), DType::F64, device)?.to_vec2::<f64>()?,
[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]],
);
Ok(())
}
fn arange(device: &Device) -> Result<()> {
assert_eq!(
Tensor::arange(0u8, 5u8, device)?.to_vec1::<u8>()?,
[0, 1, 2, 3, 4],
);
assert_eq!(
Tensor::arange_step(0u8, 5u8, 2, device)?.to_vec1::<u8>()?,
[0, 2, 4],
);
assert_eq!(
Tensor::arange_step(0u8, 5u8, 3, device)?.to_vec1::<u8>()?,
[0, 3],
);
assert_eq!(
Tensor::arange_step(5i64, 0i64, -1, device)?.to_vec1::<i64>()?,
[5, 4, 3, 2, 1],
);
Ok(())
}
@ -1056,7 +1037,6 @@ fn randn(device: &Device) -> Result<()> {
test_device!(zeros, zeros_cpu, zeros_gpu);
test_device!(ones, ones_cpu, ones_gpu);
test_device!(arange, arange_cpu, arange_gpu);
test_device!(add_mul, add_mul_cpu, add_mul_gpu);
test_device!(tensor_2d, tensor_2d_cpu, tensor_2d_gpu);
test_device!(narrow, narrow_cpu, narrow_gpu);

View File

@ -51,7 +51,6 @@ anyhow = { workspace = true }
default = []
accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
metal = ["candle/metal", "candle-nn/metal", "candle-transformers/metal"]
cudnn = ["candle/cudnn"]
flash-attn = ["cuda", "candle-transformers/flash-attn", "dep:candle-flash-attn"]
mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl", "candle-transformers/mkl"]

View File

@ -149,6 +149,6 @@ pub fn main() -> anyhow::Result<()> {
if let Some(rest) = tokenizer.decode_rest().map_err(E::msg)? {
print!("{rest}");
}
println!();
Ok(())
}

View File

@ -1,38 +0,0 @@
# candle-marian-mt
`marian-mt` is a neural machine translation model. In this example it is used to
translate text from French to English. See the associated [model
card](https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-fr-en) for details on
the model itself.
## Running an example
```bash
cargo run --example marian-mt --release -- \
--text "Demain, dès l'aube, à l'heure où blanchit la campagne, Je partirai. Vois-tu, je sais que tu m'attends. J'irai par la forêt, j'irai par la montagne. Je ne puis demeurer loin de toi plus longtemps."
```
```
<NIL> Tomorrow, at dawn, at the time when the country is whitening, I will go. See,
I know you are waiting for me. I will go through the forest, I will go through the
mountain. I cannot stay far from you any longer.</s>
```
## Generating the tokenizer.json files
You can use the following script to generate the `tokenizer.json` config files
from the hf-hub repos. This requires the `tokenizers` and `sentencepiece`
packages to be installed, and uses the `convert_slow_tokenizer.py` script from
this directory.
```python
from convert_slow_tokenizer import MarianConverter
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en", use_fast=False)
fast_tokenizer = MarianConverter(tokenizer, index=0).converted()
fast_tokenizer.save("tokenizer-marian-base-fr.json")
fast_tokenizer = MarianConverter(tokenizer, index=1).converted()
fast_tokenizer.save("tokenizer-marian-base-en.json")
```

File diff suppressed because it is too large.

View File

@ -5,7 +5,7 @@ extern crate intel_mkl_src;
extern crate accelerate_src;
use anyhow::Error as E;
use clap::{Parser, ValueEnum};
use clap::Parser;
use candle::{DType, Tensor};
use candle_examples::token_output_stream::TokenOutputStream;
@ -14,27 +14,14 @@ use candle_transformers::models::marian;
use tokenizers::Tokenizer;
#[derive(Clone, Debug, Copy, ValueEnum)]
enum Which {
Base,
Big,
}
// TODO: Maybe add support for the conditional prompt.
#[derive(Parser)]
struct Args {
#[arg(long)]
model: Option<String>,
model: String,
#[arg(long)]
tokenizer: Option<String>,
#[arg(long)]
tokenizer_dec: Option<String>,
/// Choose the variant of the model to run.
#[arg(long, default_value = "big")]
which: Which,
tokenizer: String,
/// Run on CPU rather than on GPU.
#[arg(long)]
@ -49,104 +36,62 @@ struct Args {
text: String,
}
const SEP_TOKEN_ID: u32 = 102;
pub fn main() -> anyhow::Result<()> {
use hf_hub::api::sync::Api;
let args = Args::parse();
let config = match args.which {
Which::Base => marian::Config::opus_mt_fr_en(),
Which::Big => marian::Config::opus_mt_tc_big_fr_en(),
};
let tokenizer = {
let tokenizer = match args.tokenizer {
Some(tokenizer) => std::path::PathBuf::from(tokenizer),
None => {
let name = match args.which {
Which::Base => "tokenizer-marian-base-fr.json",
Which::Big => "tokenizer-marian-fr.json",
};
Api::new()?
.model("lmz/candle-marian".to_string())
.get(name)?
}
};
Tokenizer::from_file(&tokenizer).map_err(E::msg)?
};
let tokenizer_dec = {
let tokenizer = match args.tokenizer_dec {
Some(tokenizer) => std::path::PathBuf::from(tokenizer),
None => {
let name = match args.which {
Which::Base => "tokenizer-marian-base-en.json",
Which::Big => "tokenizer-marian-en.json",
};
Api::new()?
.model("lmz/candle-marian".to_string())
.get(name)?
}
};
Tokenizer::from_file(&tokenizer).map_err(E::msg)?
};
let mut tokenizer_dec = TokenOutputStream::new(tokenizer_dec);
let config = marian::Config::opus_mt_tc_big_fr_en();
let device = candle_examples::device(args.cpu)?;
let vb = {
let model = match args.model {
Some(model) => std::path::PathBuf::from(model),
None => match args.which {
Which::Base => Api::new()?
.repo(hf_hub::Repo::with_revision(
"Helsinki-NLP/opus-mt-fr-en".to_string(),
hf_hub::RepoType::Model,
"refs/pr/4".to_string(),
))
.get("model.safetensors")?,
Which::Big => Api::new()?
.model("Helsinki-NLP/opus-mt-tc-big-fr-en".to_string())
.get("model.safetensors")?,
},
};
unsafe { VarBuilder::from_mmaped_safetensors(&[&model], DType::F32, &device)? }
};
let mut model = marian::MTModel::new(&config, vb)?;
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[&args.model], DType::F32, &device)? };
let model = marian::MTModel::new(&config, vb)?;
let vocab = std::fs::read_to_string(args.tokenizer)?;
let vocab = serde_json::from_str(&vocab)?;
let tokenizer = tokenizers::models::wordpiece::WordPieceBuilder::new()
.unk_token("<unk>".to_string())
.vocab(vocab)
.build()
.map_err(E::msg)?;
let tokenizer = Tokenizer::new(tokenizer);
let mut tokenizer_dec = TokenOutputStream::new(tokenizer.clone());
let mut logits_processor =
candle_transformers::generation::LogitsProcessor::new(1337, None, None);
let encoder_xs = {
let mut tokens = tokenizer
let tokens = tokenizer
.encode(args.text, true)
.map_err(E::msg)?
.get_ids()
.to_vec();
tokens.push(config.eos_token_id);
let tokens = Tensor::new(tokens.as_slice(), &device)?.unsqueeze(0)?;
model.encoder().forward(&tokens, 0)?
};
let mut token_ids = vec![config.decoder_start_token_id];
let mut token_ids = vec![30522u32];
for index in 0..1000 {
let context_size = if index >= 1 { 1 } else { token_ids.len() };
// TODO: Add a kv cache.
let context_size = if index >= 1000 { 1 } else { token_ids.len() };
let start_pos = token_ids.len().saturating_sub(context_size);
let input_ids = Tensor::new(&token_ids[start_pos..], &device)?.unsqueeze(0)?;
let logits = model.decode(&input_ids, &encoder_xs, start_pos)?;
let logits = model.decode(&input_ids, &encoder_xs)?;
let logits = logits.squeeze(0)?;
let logits = logits.get(logits.dim(0)? - 1)?;
let token = logits_processor.sample(&logits)?;
if token == SEP_TOKEN_ID {
break;
}
token_ids.push(token);
if let Some(t) = tokenizer_dec.next_token(token)? {
use std::io::Write;
print!("{t}");
std::io::stdout().flush()?;
}
if token == config.eos_token_id || token == config.forced_eos_token_id {
break;
}
}
if let Some(rest) = tokenizer_dec.decode_rest().map_err(E::msg)? {
print!("{rest}");
}
println!();
Ok(())
}
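
The fix builds the tokenizer directly from a vocabulary file instead of fetching a converted `tokenizer.json`. A self-contained sketch of the same pattern (the file path and helper name are hypothetical):

```rust
use std::collections::HashMap;

use tokenizers::models::wordpiece::WordPieceBuilder;
use tokenizers::Tokenizer;

// Load a {"token": id} JSON vocabulary and wrap it in a WordPiece model,
// mirroring the initialization in the example above.
fn tokenizer_from_vocab(path: &str) -> anyhow::Result<Tokenizer> {
    let vocab: HashMap<String, u32> = serde_json::from_str(&std::fs::read_to_string(path)?)?;
    let model = WordPieceBuilder::new()
        .unk_token("<unk>".to_string())
        .vocab(vocab)
        .build()
        .map_err(anyhow::Error::msg)?;
    Ok(Tokenizer::new(model))
}
```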

View File

@ -9,7 +9,7 @@ use std::io::Write;
use tokenizers::Tokenizer;
use candle::quantized::{ggml_file, gguf_file};
use candle::{Tensor};
use candle::{Device, Tensor};
use candle_transformers::generation::LogitsProcessor;
use candle_transformers::models::quantized_llama as model;
@ -232,7 +232,6 @@ fn main() -> anyhow::Result<()> {
use tracing_subscriber::prelude::*;
let args = Args::parse();
let mut device = candle_examples::device(false)?;
let temperature = if args.temperature == 0. {
None
} else {
@ -277,10 +276,10 @@ fn main() -> anyhow::Result<()> {
&format_size(total_size_in_bytes),
start.elapsed().as_secs_f32(),
);
ModelWeights::from_gguf(model, &mut file, &device)?
ModelWeights::from_gguf(model, &mut file)?
}
Some("ggml" | "bin") | Some(_) | None => {
let model = ggml_file::Content::read(&mut file, &device)?;
let model = ggml_file::Content::read(&mut file)?;
let mut total_size_in_bytes = 0;
for (_, tensor) in model.tensors.iter() {
let elem_count = tensor.shape().elem_count();
@ -308,7 +307,7 @@ fn main() -> anyhow::Result<()> {
| Which::L70b
| Which::L70bChat => 8,
};
ModelWeights::from_ggml(model, args.gqa.unwrap_or(default_gqa), &device)?
ModelWeights::from_ggml(model, args.gqa.unwrap_or(default_gqa))?
}
};
println!("model built");
@ -367,11 +366,9 @@ fn main() -> anyhow::Result<()> {
let start_prompt_processing = std::time::Instant::now();
let mut next_token = {
let input = Tensor::new(prompt_tokens.as_slice(), &device)?.unsqueeze(0)?;
let input = Tensor::new(prompt_tokens.as_slice(), &Device::Cpu)?.unsqueeze(0)?;
let logits = model.forward(&input, 0)?;
let logits = logits.squeeze(0)?;
// TODO Remove this once implementation is finished.
let logits = logits.ones_like()?;
logits_processor.sample(&logits)?
};
let prompt_dt = start_prompt_processing.elapsed();
@ -382,24 +379,19 @@ fn main() -> anyhow::Result<()> {
let start_post_prompt = std::time::Instant::now();
for index in 0..to_sample {
let input = Tensor::new(&[next_token], &device)?.unsqueeze(0)?;
let input = Tensor::new(&[next_token], &Device::Cpu)?.unsqueeze(0)?;
let logits = model.forward(&input, prompt_tokens.len() + index)?;
if let candle::Device::Metal(device) = &mut device{
device.flush()
}
let logits = logits.squeeze(0)?;
// let logits = if args.repeat_penalty == 1. {
// logits
// } else {
// let start_at = all_tokens.len().saturating_sub(args.repeat_last_n);
// candle_transformers::utils::apply_repeat_penalty(
// &logits,
// args.repeat_penalty,
// &all_tokens[start_at..],
// )?
// };
// TODO Remove this once implementation is finished.
let logits = logits.ones_like()?;
let logits = if args.repeat_penalty == 1. {
logits
} else {
let start_at = all_tokens.len().saturating_sub(args.repeat_last_n);
candle_transformers::utils::apply_repeat_penalty(
&logits,
args.repeat_penalty,
&all_tokens[start_at..],
)?
};
next_token = logits_processor.sample(&logits)?;
all_tokens.push(next_token);
print_token(next_token, &tokenizer);

View File

@ -2,30 +2,17 @@ pub mod coco_classes;
pub mod imagenet;
pub mod token_output_stream;
use candle::utils::{cuda_is_available, metal_is_available};
use candle::{Device, Result, Tensor};
pub fn device(cpu: bool) -> Result<Device> {
if cpu {
Ok(Device::Cpu)
} else {
if cuda_is_available() {
Ok(Device::new_cuda(0)?)
} else if metal_is_available() {
Ok(Device::new_metal(0)?)
} else {
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
{
println!("Running on CPU, to run on GPU(metal), build this example with `--features metal`");
}
#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
{
println!(
"Running on CPU, to run on GPU, build this example with `--features cuda`"
);
}
Ok(Device::Cpu)
let device = Device::cuda_if_available(0)?;
if !device.is_cuda() {
println!("Running on CPU, to run on GPU, build this example with `--features cuda`");
}
Ok(device)
}
}
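
A hypothetical call site for the simplified helper above; the tensor op is only illustrative:

```rust
fn run(cpu_only: bool) -> candle::Result<()> {
    // Picks cuda:0 when available (and compiled in), otherwise falls back to CPU.
    let device = candle_examples::device(cpu_only)?;
    let xs = candle::Tensor::zeros((2, 3), candle::DType::F32, &device)?;
    println!("{xs}");
    Ok(())
}
```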

View File

@ -1,12 +0,0 @@
[package]
name = "candle-metal-kernels"
version.workspace = true
edition.workspace = true
description.workspace = true
repository.workspace = true
keywords.workspace = true
categories.workspace = true
license.workspace = true
[dependencies]
metal = { workspace = true }

View File

@ -1,3 +0,0 @@
# candle-metal-kernels
This crate contains Metal kernels used from candle.

View File

@ -1 +0,0 @@

View File

@ -28,5 +28,4 @@ clap = { workspace = true }
default = []
accelerate = ["dep:accelerate-src", "candle/accelerate"]
cuda = ["candle/cuda"]
metal = ["candle/metal"]
mkl = ["dep:intel-mkl-src", "candle/mkl"]

View File

@ -13,7 +13,6 @@ pub enum Activation {
Relu6,
Silu,
Sigmoid,
Swish,
Elu(f64),
LeakyRelu(f64),
}
@ -29,7 +28,6 @@ impl super::Module for Activation {
Self::Relu6 => xs.clamp(0f32, 6f32),
Self::Silu => crate::ops::silu(xs),
Self::Sigmoid => crate::ops::sigmoid(xs),
Self::Swish => xs * crate::ops::sigmoid(xs)?,
&Self::Elu(alpha) => xs.elu(alpha),
&Self::LeakyRelu(negative_slope) => crate::ops::leaky_relu(xs, negative_slope),
}
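Note that the removed `Swish` arm computes `xs * sigmoid(xs)`, which is exactly what the remaining `Silu` arm computes via `crate::ops::silu`, so configs that previously used `Swish` can map to `Silu` instead.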

View File

@ -190,16 +190,6 @@ impl candle::CustomOp1 for SoftmaxLastDim {
device: dev.clone(),
};
Ok((dst, layout.shape().clone()))
}
#[cfg(feature = "metal")]
fn metal_fwd(
&self,
storage: &candle::MetalStorage,
layout: &Layout,
) -> Result<(candle::MetalStorage, Shape)> {
println!("TODO softmax-last-dim");
Ok((storage.clone(), layout.shape().clone()))
}
}

View File

@ -53,39 +53,3 @@ class Tensor:
Return a slice of a tensor.
"""
pass
def __eq__(self, rhs: Union["Tensor", "Scalar"]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass
def __ne__(self, rhs: Union["Tensor", "Scalar"]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass
def __lt__(self, rhs: Union["Tensor", "Scalar"]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass
def __le__(self, rhs: Union["Tensor", "Scalar"]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass
def __gt__(self, rhs: Union["Tensor", "Scalar"]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass
def __ge__(self, rhs: Union["Tensor", "Scalar"]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass

View File

@ -124,46 +124,16 @@ class Tensor:
Add a scalar to a tensor or two tensors together.
"""
pass
def __eq__(self, rhs: Union[Tensor, Scalar]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass
def __ge__(self, rhs: Union[Tensor, Scalar]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass
def __getitem__(self, index: Union[Index, Tensor, Sequence[Index]]) -> "Tensor":
"""
Return a slice of a tensor.
"""
pass
def __gt__(self, rhs: Union[Tensor, Scalar]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass
def __le__(self, rhs: Union[Tensor, Scalar]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass
def __lt__(self, rhs: Union[Tensor, Scalar]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass
def __mul__(self, rhs: Union[Tensor, Scalar]) -> "Tensor":
"""
Multiply a tensor by a scalar or one tensor by another.
"""
pass
def __ne__(self, rhs: Union[Tensor, Scalar]) -> "Tensor":
"""
Compare a tensor with a scalar or one tensor with another.
"""
pass
def __radd__(self, rhs: Union[Tensor, Scalar]) -> "Tensor":
"""
Add a scalar to a tensor or two tensors together.
@ -189,11 +159,6 @@ class Tensor:
Divide a tensor by a scalar or one tensor by another.
"""
pass
def abs(self) -> Tensor:
"""
Performs the `abs` operation on the tensor.
"""
pass
def argmax_keepdim(self, dim: int) -> Tensor:
"""
Returns the indices of the maximum value(s) across the selected dimension.
@ -343,12 +308,6 @@ class Tensor:
ranges from `start` to `start + len`.
"""
pass
@property
def nelement(self) -> int:
"""
Gets the tensor's element count.
"""
pass
def powf(self, p: float) -> Tensor:
"""
Performs the `pow` operation on the tensor with the given exponent.

View File

@ -1,70 +0,0 @@
import candle
from candle import Tensor
_UNSIGNED_DTYPES = set([str(candle.u8), str(candle.u32)])
def _assert_tensor_metadata(
actual: Tensor,
expected: Tensor,
check_device: bool = True,
check_dtype: bool = True,
check_layout: bool = True,
check_stride: bool = False,
):
if check_device:
assert actual.device == expected.device, f"Device mismatch: {actual.device} != {expected.device}"
if check_dtype:
assert str(actual.dtype) == str(expected.dtype), f"Dtype mismatch: {actual.dtype} != {expected.dtype}"
if check_layout:
assert actual.shape == expected.shape, f"Shape mismatch: {actual.shape} != {expected.shape}"
if check_stride:
assert actual.stride == expected.stride, f"Stride mismatch: {actual.stride} != {expected.stride}"
def assert_equal(
actual: Tensor,
expected: Tensor,
check_device: bool = True,
check_dtype: bool = True,
check_layout: bool = True,
check_stride: bool = False,
):
"""
Asserts that two tensors are exact equals.
"""
_assert_tensor_metadata(actual, expected, check_device, check_dtype, check_layout, check_stride)
assert (actual - expected).abs().sum_all().values() == 0, f"Tensors mismatch: {actual} != {expected}"
def assert_almost_equal(
actual: Tensor,
expected: Tensor,
rtol=1e-05,
atol=1e-08,
check_device: bool = True,
check_dtype: bool = True,
check_layout: bool = True,
check_stride: bool = False,
):
"""
Asserts that two tensors are almost equal by performing an element-wise comparison with a tolerance.
Computes: |actual - expected| ≤ atol + rtol × |expected|
"""
_assert_tensor_metadata(actual, expected, check_device, check_dtype, check_layout, check_stride)
# Guard against unsigned wrap-around when subtracting u32 and u8 tensors
if str(actual.dtype) in _UNSIGNED_DTYPES or str(expected.dtype) in _UNSIGNED_DTYPES:
actual = actual.to(candle.i64)
expected = expected.to(candle.i64)
diff = (actual - expected).abs()
threshold = (expected.abs().to_dtype(candle.f32) * rtol + atol).to(expected)
assert (diff <= threshold).sum_all().values() == actual.nelement, "Difference between tensors was too great"
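A quick numeric instance of the tolerance rule above, sketched in Rust with illustrative values:
// |1.0005 - 1.0| = 0.0005, which is <= 1e-8 + 1e-3 * |1.0|, so this passes.
let (actual, expected, rtol, atol) = (1.0005f32, 1.0f32, 1e-3f32, 1e-8f32);
assert!((actual - expected).abs() <= atol + rtol * expected.abs());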

View File

@ -1,11 +1,8 @@
#![allow(clippy::redundant_closure_call)]
use pyo3::exceptions::{PyTypeError, PyValueError};
use pyo3::prelude::*;
use pyo3::pyclass::CompareOp;
use pyo3::types::{IntoPyDict, PyDict, PyTuple};
use pyo3::ToPyObject;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::os::raw::c_long;
use std::sync::Arc;
@ -81,7 +78,6 @@ impl PyDevice {
match device {
Device::Cpu => Self::Cpu,
Device::Cuda(_) => Self::Cuda,
Device::Metal(_) => unimplemented!(),
}
}
@ -136,10 +132,9 @@ macro_rules! pydtype {
}
};
}
pydtype!(i64, |v| v);
pydtype!(u8, |v| v);
pydtype!(u32, |v| v);
pydtype!(i64, |v| v);
pydtype!(f16, f32::from);
pydtype!(bf16, f32::from);
pydtype!(f32, |v| v);
@ -322,13 +317,6 @@ impl PyTensor {
PyTuple::new(py, self.0.dims()).to_object(py)
}
#[getter]
/// Gets the tensor's element count.
/// &RETURNS&: int
fn nelement(&self) -> usize {
self.0.elem_count()
}
#[getter]
/// Gets the tensor's strides.
/// &RETURNS&: Tuple[int]
@ -365,12 +353,6 @@ impl PyTensor {
self.__repr__()
}
/// Performs the `abs` operation on the tensor.
/// &RETURNS&: Tensor
fn abs(&self) -> PyResult<Self> {
Ok(PyTensor(self.0.abs().map_err(wrap_err)?))
}
/// Performs the `sin` operation on the tensor.
/// &RETURNS&: Tensor
fn sin(&self) -> PyResult<Self> {
@ -688,58 +670,6 @@ impl PyTensor {
};
Ok(Self(tensor))
}
/// Rich-compare two tensors.
/// &RETURNS&: Tensor
fn __richcmp__(&self, rhs: &PyAny, op: CompareOp) -> PyResult<Self> {
let compare = |lhs: &Tensor, rhs: &Tensor| {
let t = match op {
CompareOp::Eq => lhs.eq(rhs),
CompareOp::Ne => lhs.ne(rhs),
CompareOp::Lt => lhs.lt(rhs),
CompareOp::Le => lhs.le(rhs),
CompareOp::Gt => lhs.gt(rhs),
CompareOp::Ge => lhs.ge(rhs),
};
Ok(PyTensor(t.map_err(wrap_err)?))
};
if let Ok(rhs) = rhs.extract::<PyTensor>() {
if self.0.shape() == rhs.0.shape() {
compare(&self.0, &rhs.0)
} else {
// We broadcast manually here because `candle.cmp` does not support automatic broadcasting
let broadcast_shape = self
.0
.shape()
.broadcast_shape_binary_op(rhs.0.shape(), "cmp")
.map_err(wrap_err)?;
let broadcasted_lhs = self.0.broadcast_as(&broadcast_shape).map_err(wrap_err)?;
let broadcasted_rhs = rhs.0.broadcast_as(&broadcast_shape).map_err(wrap_err)?;
compare(&broadcasted_lhs, &broadcasted_rhs)
}
} else if let Ok(rhs) = rhs.extract::<f64>() {
let scalar_tensor = Tensor::new(rhs, self.0.device())
.map_err(wrap_err)?
.to_dtype(self.0.dtype())
.map_err(wrap_err)?
.broadcast_as(self.0.shape())
.map_err(wrap_err)?;
compare(&self.0, &scalar_tensor)
} else {
return Err(PyTypeError::new_err("unsupported rhs for __richcmp__"));
}
}
fn __hash__(&self) -> u64 {
// we have overridden __richcmp__ => pyo3 wants us to also override __hash__
// we simply hash the address of the tensor
let mut hasher = DefaultHasher::new();
let pointer = &self.0 as *const Tensor;
let address = pointer as usize;
address.hash(&mut hasher);
hasher.finish()
}
#[pyo3(signature=(*shape), text_signature = "(self, *shape:Shape)")]
/// Reshapes the tensor to the given shape.
@ -1573,7 +1503,7 @@ fn candle(py: Python<'_>, m: &PyModule) -> PyResult<()> {
m.add_class::<PyDType>()?;
m.add("u8", PyDType(DType::U8))?;
m.add("u32", PyDType(DType::U32))?;
m.add("i64", PyDType(DType::I64))?;
m.add("i16", PyDType(DType::I64))?;
m.add("bf16", PyDType(DType::BF16))?;
m.add("f16", PyDType(DType::F16))?;
m.add("f32", PyDType(DType::F32))?;

View File

@ -1,33 +0,0 @@
import candle
from candle import Tensor
from candle.testing import assert_equal, assert_almost_equal
import pytest
@pytest.mark.parametrize("dtype", [candle.f32, candle.f64, candle.f16, candle.u32, candle.u8, candle.i64])
def test_assert_equal_asserts_correctly(dtype: candle.DType):
a = Tensor([1, 2, 3]).to(dtype)
b = Tensor([1, 2, 3]).to(dtype)
assert_equal(a, b)
with pytest.raises(AssertionError):
assert_equal(a, b + 1)
@pytest.mark.parametrize("dtype", [candle.f32, candle.f64, candle.f16, candle.u32, candle.u8, candle.i64])
def test_assert_almost_equal_asserts_correctly(dtype: candle.DType):
a = Tensor([1, 2, 3]).to(dtype)
b = Tensor([1, 2, 3]).to(dtype)
assert_almost_equal(a, b)
with pytest.raises(AssertionError):
assert_almost_equal(a, b + 1)
assert_almost_equal(a, b + 1, atol=20)
assert_almost_equal(a, b + 1, rtol=20)
with pytest.raises(AssertionError):
assert_almost_equal(a, b + 1, atol=0.9)
with pytest.raises(AssertionError):
assert_almost_equal(a, b + 1, rtol=0.1)

View File

@ -1,7 +1,6 @@
import candle
from candle import Tensor
from candle.utils import cuda_is_available
from candle.testing import assert_equal
import pytest
@ -78,78 +77,6 @@ def test_tensor_can_be_scliced_3d():
assert t[..., 0:2].values() == [[[1, 2], [5, 6]], [[9, 10], [13, 14]]]
def assert_bool(t: Tensor, expected: bool):
assert t.shape == ()
assert str(t.dtype) == str(candle.u8)
assert bool(t.values()) == expected
def test_tensor_supports_equality_opperations_with_scalars():
t = Tensor(42.0)
assert_bool(t == 42.0, True)
assert_bool(t == 43.0, False)
assert_bool(t != 42.0, False)
assert_bool(t != 43.0, True)
assert_bool(t > 41.0, True)
assert_bool(t > 42.0, False)
assert_bool(t >= 41.0, True)
assert_bool(t >= 42.0, True)
assert_bool(t < 43.0, True)
assert_bool(t < 42.0, False)
assert_bool(t <= 43.0, True)
assert_bool(t <= 42.0, True)
def test_tensor_supports_equality_opperations_with_tensors():
t = Tensor(42.0)
same = Tensor(42.0)
other = Tensor(43.0)
assert_bool(t == same, True)
assert_bool(t == other, False)
assert_bool(t != same, False)
assert_bool(t != other, True)
assert_bool(t > same, False)
assert_bool(t > other, False)
assert_bool(t >= same, True)
assert_bool(t >= other, False)
assert_bool(t < same, False)
assert_bool(t < other, True)
assert_bool(t <= same, True)
assert_bool(t <= other, True)
def test_tensor_equality_opperations_can_broadcast():
# Create a decoder attention mask as a test case
# e.g.
# [[1,0,0]
# [1,1,0]
# [1,1,1]]
mask_cond = candle.Tensor([0, 1, 2])
mask = mask_cond < (mask_cond + 1).reshape((3, 1))
assert mask.shape == (3, 3)
assert_equal(mask, Tensor([[1, 0, 0], [1, 1, 0], [1, 1, 1]]).to_dtype(candle.u8))
def test_tensor_can_be_hashed():
t = Tensor(42.0)
other = Tensor(42.0)
# Hash should represent a unique tensor
assert hash(t) != hash(other)
assert hash(t) == hash(t)
def test_tensor_can_be_expanded_with_none():
t = candle.rand((12, 12))

View File

@ -28,6 +28,5 @@ wav = { workspace = true }
default = []
accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate"]
cuda = ["candle/cuda", "candle-nn/cuda"]
metal = ["candle/metal", "candle-nn/metal"]
flash-attn = ["cuda", "dep:candle-flash-attn"]
mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl"]

View File

@ -18,11 +18,11 @@ pub struct Config {
pub is_encoder_decoder: bool,
pub activation_function: candle_nn::Activation,
pub d_model: usize,
pub decoder_start_token_id: u32,
pub decoder_start_token_id: usize,
pub scale_embedding: bool,
pub pad_token_id: u32,
pub eos_token_id: u32,
pub forced_eos_token_id: u32,
pub pad_token_id: usize,
pub eos_token_id: usize,
pub forced_eos_token_id: usize,
pub share_encoder_decoder_embeddings: bool,
}
@ -51,31 +51,6 @@ impl Config {
vocab_size: 53017,
}
}
// https://huggingface.co/Helsinki-NLP/opus-mt-fr-en/blob/main/config.json
pub fn opus_mt_fr_en() -> Self {
Self {
activation_function: candle_nn::Activation::Swish,
d_model: 512,
decoder_attention_heads: 8,
decoder_ffn_dim: 2048,
decoder_layers: 6,
decoder_start_token_id: 59513,
decoder_vocab_size: Some(59514),
encoder_attention_heads: 8,
encoder_ffn_dim: 2048,
encoder_layers: 6,
eos_token_id: 0,
forced_eos_token_id: 0,
is_encoder_decoder: true,
max_position_embeddings: 512,
pad_token_id: 59513,
scale_embedding: true,
share_encoder_decoder_embeddings: true,
use_cache: true,
vocab_size: 59514,
}
}
}
#[derive(Debug, Clone)]
@ -126,8 +101,6 @@ struct Attention {
scaling: f64,
num_heads: usize,
head_dim: usize,
kv_cache: Option<(Tensor, Tensor)>,
is_decoder: bool,
}
impl Attention {
@ -152,8 +125,6 @@ impl Attention {
scaling,
num_heads,
head_dim,
kv_cache: None,
is_decoder,
})
}
@ -164,12 +135,7 @@ impl Attention {
.contiguous()
}
fn forward(
&mut self,
xs: &Tensor,
kv_states: Option<&Tensor>,
attn_mask: Option<&Tensor>,
) -> Result<Tensor> {
fn forward(&self, xs: &Tensor, kv_states: Option<&Tensor>) -> Result<Tensor> {
let is_cross_attn = kv_states.is_some();
let (b_sz, tgt_len, _) = xs.dims3()?;
let query_states = (xs.apply(&self.q_proj)? * self.scaling)?;
@ -177,20 +143,7 @@ impl Attention {
None => {
let key_states = self._shape(&xs.apply(&self.k_proj)?, b_sz)?;
let value_states = self._shape(&xs.apply(&self.v_proj)?, b_sz)?;
if self.is_decoder {
let kv_states = match &self.kv_cache {
None => (key_states, value_states),
Some((p_key_states, p_value_states)) => {
let key_states = Tensor::cat(&[p_key_states, &key_states], 2)?;
let value_states = Tensor::cat(&[p_value_states, &value_states], 2)?;
(key_states, value_states)
}
};
self.kv_cache = Some(kv_states.clone());
kv_states
} else {
(key_states, value_states)
}
(key_states, value_states)
}
Some(kv_states) => {
let key_states = self._shape(&kv_states.apply(&self.k_proj)?, b_sz)?;
@ -203,10 +156,7 @@ impl Attention {
let key_states = key_states.reshape(proj_shape)?;
let value_states = value_states.reshape(proj_shape)?;
let attn_weights = query_states.matmul(&key_states.transpose(1, 2)?)?;
let attn_weights = match attn_mask {
None => attn_weights,
Some(attn_mask) => attn_weights.broadcast_add(attn_mask)?,
};
// todo: attn_mask
let attn_probs = candle_nn::ops::softmax_last_dim(&attn_weights)?;
let attn_output = attn_probs.matmul(&value_states)?;
attn_output
@ -215,10 +165,6 @@ impl Attention {
.reshape((b_sz, tgt_len, self.head_dim * self.num_heads))?
.apply(&self.out_proj)
}
fn reset_kv_cache(&mut self) {
self.kv_cache = None
}
}
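For reference, the cache removed above grew keys and values along dim 2, the sequence axis of the (batch, num_heads, seq_len, head_dim) layout. A standalone sketch of that update (`update_kv_cache` is an illustrative name):
use candle::{Result, Tensor};

fn update_kv_cache(
    cache: &mut Option<(Tensor, Tensor)>,
    k: Tensor,
    v: Tensor,
) -> Result<(Tensor, Tensor)> {
    // Concatenate past and current states along the sequence axis (dim 2).
    let (k, v) = match cache {
        None => (k, v),
        Some((pk, pv)) => (Tensor::cat(&[&*pk, &k], 2)?, Tensor::cat(&[&*pv, &v], 2)?),
    };
    *cache = Some((k.clone(), v.clone()));
    Ok((k, v))
}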
#[derive(Debug, Clone)]
@ -248,10 +194,10 @@ impl EncoderLayer {
})
}
fn forward(&mut self, xs: &Tensor) -> Result<Tensor> {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
let residual = xs;
let xs = (self.self_attn.forward(xs, None, None)? + residual)?
.apply(&self.self_attn_layer_norm)?;
let xs =
(self.self_attn.forward(xs, None)? + residual)?.apply(&self.self_attn_layer_norm)?;
let residual = &xs;
let xs = xs
.apply(&self.fc1)?
@ -278,8 +224,7 @@ impl DecoderLayer {
let self_attn = Attention::new(cfg, true, vb.pp("self_attn"))?;
let self_attn_layer_norm = layer_norm(cfg.d_model, 1e-5, vb.pp("self_attn_layer_norm"))?;
let encoder_attn = Attention::new(cfg, true, vb.pp("encoder_attn"))?;
let encoder_attn_layer_norm =
layer_norm(cfg.d_model, 1e-5, vb.pp("encoder_attn_layer_norm"))?;
let encoder_attn_layer_norm = layer_norm(cfg.d_model, 1e-5, vb.pp("self_attn_layer_norm"))?;
let fc1 = linear(cfg.d_model, cfg.decoder_ffn_dim, vb.pp("fc1"))?;
let fc2 = linear(cfg.decoder_ffn_dim, cfg.d_model, vb.pp("fc2"))?;
let final_layer_norm = layer_norm(cfg.d_model, 1e-5, vb.pp("final_layer_norm"))?;
@ -295,21 +240,16 @@ impl DecoderLayer {
})
}
fn forward(
&mut self,
xs: &Tensor,
encoder_xs: Option<&Tensor>,
attn_mask: &Tensor,
) -> Result<Tensor> {
fn forward(&self, xs: &Tensor, encoder_xs: Option<&Tensor>) -> Result<Tensor> {
let residual = xs;
let xs = (self.self_attn.forward(xs, None, Some(attn_mask))? + residual)?
.apply(&self.self_attn_layer_norm)?;
let xs =
(self.self_attn.forward(xs, None)? + residual)?.apply(&self.self_attn_layer_norm)?;
let xs = match encoder_xs {
None => xs,
Some(encoder_xs) => {
let residual = &xs;
let xs = self.encoder_attn.forward(&xs, Some(encoder_xs), None)?;
(residual + xs)?.apply(&self.encoder_attn_layer_norm)?
let xs = self.encoder_attn.forward(&xs, Some(encoder_xs))?;
(residual + xs)?.apply(&self.self_attn_layer_norm)?
}
};
let residual = &xs;
@ -317,8 +257,7 @@ impl DecoderLayer {
.apply(&self.fc1)?
.apply(&self.activation_fn)?
.apply(&self.fc2)?;
let xs = (xs + residual)?.apply(&self.final_layer_norm)?;
Ok(xs)
(xs + residual)?.apply(&self.final_layer_norm)
}
}
@ -352,7 +291,7 @@ impl Encoder {
})
}
pub fn forward(&mut self, xs: &Tensor, past_kv_len: usize) -> Result<Tensor> {
pub fn forward(&self, xs: &Tensor, past_kv_len: usize) -> Result<Tensor> {
let xs = xs.apply(&self.embed_tokens)?;
let xs = match self.embed_scale {
None => xs,
@ -363,7 +302,7 @@ impl Encoder {
.forward(&xs, past_kv_len)?
.unsqueeze(0)?;
let mut xs = xs.broadcast_add(&embed_pos)?;
for layer in self.layers.iter_mut() {
for layer in self.layers.iter() {
xs = layer.forward(&xs)?
}
Ok(xs)
@ -401,11 +340,10 @@ impl Decoder {
}
pub fn forward(
&mut self,
&self,
xs: &Tensor,
encoder_xs: Option<&Tensor>,
past_kv_len: usize,
attn_mask: &Tensor,
) -> Result<Tensor> {
let xs = xs.apply(&self.embed_tokens)?;
let xs = match self.embed_scale {
@ -417,8 +355,8 @@ impl Decoder {
.forward(&xs, past_kv_len)?
.unsqueeze(0)?;
let mut xs = xs.broadcast_add(&embed_pos)?;
for layer in self.layers.iter_mut() {
xs = layer.forward(&xs, encoder_xs, attn_mask)?;
for layer in self.layers.iter() {
xs = layer.forward(&xs, encoder_xs)?
}
Ok(xs)
}
@ -447,7 +385,6 @@ impl Model {
#[derive(Debug, Clone)]
pub struct MTModel {
model: Model,
lm_head: Linear,
final_logits_bias: Tensor,
}
@ -456,37 +393,21 @@ impl MTModel {
let target_vocab_size = cfg.decoder_vocab_size.unwrap_or(cfg.vocab_size);
let final_logits_bias = vb.get((1, target_vocab_size), "final_logits_bias")?;
let model = Model::new(cfg, vb.pp("model"))?;
let lm_head = Linear::from_weights(model.shared.embeddings().clone(), None);
Ok(Self {
model,
lm_head,
final_logits_bias,
})
}
pub fn encoder(&mut self) -> &mut Encoder {
&mut self.model.encoder
pub fn encoder(&self) -> &Encoder {
&self.model.encoder
}
pub fn decoder(&mut self) -> &mut Decoder {
&mut self.model.decoder
pub fn decoder(&self) -> &Decoder {
&self.model.decoder
}
pub fn decode(
&mut self,
xs: &Tensor,
encoder_xs: &Tensor,
past_kv_len: usize,
) -> Result<Tensor> {
let seq_len = xs.dim(1)?;
let mask: Vec<_> = (0..seq_len)
.flat_map(|i| (0..seq_len).map(move |j| if j > i { f32::NEG_INFINITY } else { 0f32 }))
.collect();
let mask = Tensor::from_vec(mask, (seq_len, seq_len), xs.device())?;
self.model
.decoder
.forward(xs, Some(encoder_xs), past_kv_len, &mask)?
.apply(&self.lm_head)?
.broadcast_add(&self.final_logits_bias)
pub fn decode(&self, xs: &Tensor, encoder_xs: &Tensor) -> Result<Tensor> {
self.model.decoder.forward(xs, Some(encoder_xs), 0)
}
}
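For seq_len = 3, the additive causal mask built in the removed decode is

0    -inf -inf
0    0    -inf
0    0    0

which, broadcast-added to the attention scores, prevents each target position from attending to later positions.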

View File

@ -2,7 +2,7 @@ use std::collections::HashMap;
use candle::quantized::QTensor;
use candle::quantized::{ggml_file, gguf_file};
use candle::{Device, IndexOp, Result, Tensor, D};
use candle::{DType, Device, IndexOp, Result, Tensor, D};
use candle_nn::{Embedding, Module};
pub const MAX_SEQ_LEN: usize = 4096;
@ -16,7 +16,7 @@ struct RmsNorm {
impl RmsNorm {
fn new(scale: QTensor, eps: f32) -> Result<Self> {
let span = tracing::span!(tracing::Level::TRACE, "rms-norm");
let scale = scale.dequantize(scale.device())?;
let scale = scale.dequantize(&Device::Cpu)?;
let inner = candle_nn::LayerNorm::rms_norm(scale, eps as f64);
Ok(Self { inner, span })
}
@ -79,8 +79,6 @@ fn masked_fill(on_false: &Tensor, mask: &Tensor, on_true: f32) -> Result<Tensor>
impl LayerWeights {
fn apply_rotary_emb(&self, x: &Tensor, index_pos: usize) -> Result<Tensor> {
let _enter = self.span_rot.enter();
let span = tracing::span!(tracing::Level::TRACE, "attn-rot-cos");
let _enter = span.enter();
let (b_sz, n_head, seq_len, n_embd) = x.dims4()?;
let cos = self
.cos
@ -90,37 +88,21 @@ impl LayerWeights {
.sin
.narrow(0, index_pos, seq_len)?
.reshape((seq_len, n_embd / 2, 1))?;
drop(_enter);
let span = tracing::span!(tracing::Level::TRACE, "attn-rot-broad");
let _enter = span.enter();
let cos = cos.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?;
let sin = sin.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?;
drop(_enter);
// This mimics the llama.cpp behavior.
// https://github.com/ggerganov/llama.cpp/blob/1f0bccb27929e261744c979bc75114955da49e98/ggml.c#L12104-L12105
// The x0 and x1 values are interleaved on the n_embd (= head_dim) dimension.
// The resulting y0 and y1 are also interleaved with:
// y0 = x0*cos - x1*sin
// y1 = x0*sin + x1*cos
let span = tracing::span!(tracing::Level::TRACE, "attn-rot-reshape");
let _enter = span.enter();
let x = x.reshape((b_sz, n_head, seq_len, n_embd / 2, 2))?;
let x0 = x.narrow(D::Minus1, 0, 1)?;
let x1 = x.narrow(D::Minus1, 1, 1)?;
drop(_enter);
let span = tracing::span!(tracing::Level::TRACE, "attn-rot-broad-mul");
let _enter = span.enter();
let y0 = (x0.broadcast_mul(&cos)? - x1.broadcast_mul(&sin)?)?;
let y1 = (x0.broadcast_mul(&sin)? + x1.broadcast_mul(&cos)?)?;
drop(_enter);
let span = tracing::span!(tracing::Level::TRACE, "attn-rot-cat");
let _enter = span.enter();
let rope = Tensor::cat(&[y0, y1], D::Minus1)?;
drop(_enter);
let span = tracing::span!(tracing::Level::TRACE, "attn-rot-flatten");
let _enter = span.enter();
let rope = rope.flatten_from(D::Minus2)?;
drop(_enter);
Ok(rope)
}
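A scalar sketch of the interleaved rotation described in the comment above (`rotate_pair` is an illustrative name, not candle API):
// One adjacent (x0, x1) pair on the head_dim axis, rotated by the angle
// whose cosine/sine were gathered for this position.
fn rotate_pair(x0: f32, x1: f32, cos: f32, sin: f32) -> (f32, f32) {
    (x0 * cos - x1 * sin, x0 * sin + x1 * cos)
}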
@ -130,7 +112,6 @@ impl LayerWeights {
let q = self.attention_wq.forward(x)?;
let k = self.attention_wk.forward(x)?;
let v = self.attention_wv.forward(x)?;
// println!("Q {:?} K {:?} V {:?}", q.dtype(), k.dtype(), v.dtype());
let q = q
.reshape((b_sz, seq_len, self.n_head, self.head_dim))?
@ -164,12 +145,9 @@ impl LayerWeights {
let v = self.repeat_kv(v)?;
let att = (q.matmul(&k.t()?)? / (self.head_dim as f64).sqrt())?;
// println!("att {:?}", att.dtype());
let mask = mask.broadcast_as(att.shape())?;
// println!("mask {:?}", mask.dtype());
let att = masked_fill(&att, &mask, f32::NEG_INFINITY)?;
let att = candle_nn::ops::softmax_last_dim(&att)?;
// println!("att {:?} v {:?}", att.dtype(), v.dtype());
// Convert to contiguous as matmul doesn't support strided value tensors for now.
let y = att.matmul(&v.contiguous()?)?;
let y = y.transpose(1, 2)?.reshape(&[b_sz, seq_len, n_embd])?;
@ -203,37 +181,28 @@ pub struct ModelWeights {
span_output: tracing::Span,
}
fn precomput_freqs_cis(
head_dim: usize,
freq_base: f32,
device: &Device,
) -> Result<(Tensor, Tensor)> {
fn precomput_freqs_cis(head_dim: usize, freq_base: f32) -> Result<(Tensor, Tensor)> {
let theta: Vec<_> = (0..head_dim)
.step_by(2)
.map(|i| 1f32 / freq_base.powf(i as f32 / head_dim as f32))
.collect();
let theta = Tensor::new(theta.as_slice(), device)?;
let range: Vec<f32> = (0..MAX_SEQ_LEN).map(|r| r as f32).collect();
let idx_theta = Tensor::new(range.as_slice(), device)?
let theta = Tensor::new(theta.as_slice(), &Device::Cpu)?;
let idx_theta = Tensor::arange(0, MAX_SEQ_LEN as u32, &Device::Cpu)?
.to_dtype(DType::F32)?
.reshape((MAX_SEQ_LEN, 1))?
.matmul(&theta.reshape((1, theta.elem_count()))?)?;
// TODO This change avoids allocating on Metal and then casting since allocating directly on
// CPU as f32 seems just as fast
// let idx_theta = Tensor::arange(0, MAX_SEQ_LEN as u32, device)?
// .to_dtype(DType::F32)?
// .reshape((MAX_SEQ_LEN, 1))?
// .matmul(&theta.reshape((1, theta.elem_count()))?)?;
let cos = idx_theta.cos()?;
let sin = idx_theta.sin()?;
Ok((cos, sin))
}
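Pulled out as a standalone function, the frequency schedule computed above is theta_i = 1 / freq_base^(i / head_dim) for even i (`rope_frequencies` is an illustrative name):
fn rope_frequencies(head_dim: usize, freq_base: f32) -> Vec<f32> {
    // Matches the `theta` computation in precomput_freqs_cis above.
    (0..head_dim)
        .step_by(2)
        .map(|i| 1f32 / freq_base.powf(i as f32 / head_dim as f32))
        .collect()
}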
impl ModelWeights {
pub fn from_ggml(mut ct: ggml_file::Content, gqa: usize, device: &Device) -> Result<Self> {
pub fn from_ggml(mut ct: ggml_file::Content, gqa: usize) -> Result<Self> {
let cpu = &Device::Cpu;
let head_dim = (ct.hparams.n_embd / ct.hparams.n_head) as usize;
let (cos, sin) = precomput_freqs_cis(head_dim, 10000., device)?;
let (cos, sin) = precomput_freqs_cis(head_dim, 10000.)?;
let tok_embeddings = ct.remove("tok_embeddings.weight")?;
let tok_embeddings = tok_embeddings.dequantize(device)?;
let tok_embeddings = tok_embeddings.dequantize(cpu)?;
let norm = RmsNorm::new(ct.remove("norm.weight")?, 1e-5)?;
let output = ct.remove("output.weight")?;
let mut layers = Vec::with_capacity(ct.hparams.n_layer as usize);
@ -288,8 +257,8 @@ impl ModelWeights {
pub fn from_gguf<R: std::io::Seek + std::io::Read>(
ct: gguf_file::Content,
reader: &mut R,
device: &Device,
) -> Result<Self> {
let cpu = &Device::Cpu;
let md_get = |s: &str| match ct.metadata.get(s) {
None => candle::bail!("cannot find {s} in metadata"),
Some(v) => Ok(v),
@ -307,31 +276,24 @@ impl ModelWeights {
let rope_freq_base = md_get("llama.rope.freq_base")
.and_then(|m| m.to_f32())
.unwrap_or(10000f32);
let (cos, sin) = precomput_freqs_cis(rope_dim, rope_freq_base, device)?;
let (cos, sin) = precomput_freqs_cis(rope_dim, rope_freq_base)?;
let tok_embeddings = ct.tensor(reader, "token_embd.weight", device)?;
let tok_embeddings = tok_embeddings.dequantize(device)?;
let norm = RmsNorm::new(
ct.tensor(reader, "output_norm.weight", device)?,
rms_norm_eps,
)?;
let output = ct.tensor(reader, "output.weight", device)?;
let tok_embeddings = ct.tensor(reader, "token_embd.weight")?;
let tok_embeddings = tok_embeddings.dequantize(cpu)?;
let norm = RmsNorm::new(ct.tensor(reader, "output_norm.weight")?, rms_norm_eps)?;
let output = ct.tensor(reader, "output.weight")?;
let mut layers = Vec::with_capacity(block_count);
for layer_idx in 0..block_count {
let prefix = format!("blk.{layer_idx}");
let attention_wq = ct.tensor(reader, &format!("{prefix}.attn_q.weight"), device)?;
let attention_wk = ct.tensor(reader, &format!("{prefix}.attn_k.weight"), device)?;
let attention_wv = ct.tensor(reader, &format!("{prefix}.attn_v.weight"), device)?;
let attention_wo =
ct.tensor(reader, &format!("{prefix}.attn_output.weight"), device)?;
let feed_forward_w1 =
ct.tensor(reader, &format!("{prefix}.ffn_gate.weight"), device)?;
let feed_forward_w2 =
ct.tensor(reader, &format!("{prefix}.ffn_down.weight"), device)?;
let feed_forward_w3 = ct.tensor(reader, &format!("{prefix}.ffn_up.weight"), device)?;
let attention_norm =
ct.tensor(reader, &format!("{prefix}.attn_norm.weight"), device)?;
let ffn_norm = ct.tensor(reader, &format!("{prefix}.ffn_norm.weight"), device)?;
let attention_wq = ct.tensor(reader, &format!("{prefix}.attn_q.weight"))?;
let attention_wk = ct.tensor(reader, &format!("{prefix}.attn_k.weight"))?;
let attention_wv = ct.tensor(reader, &format!("{prefix}.attn_v.weight"))?;
let attention_wo = ct.tensor(reader, &format!("{prefix}.attn_output.weight"))?;
let feed_forward_w1 = ct.tensor(reader, &format!("{prefix}.ffn_gate.weight"))?;
let feed_forward_w2 = ct.tensor(reader, &format!("{prefix}.ffn_down.weight"))?;
let feed_forward_w3 = ct.tensor(reader, &format!("{prefix}.ffn_up.weight"))?;
let attention_norm = ct.tensor(reader, &format!("{prefix}.attn_norm.weight"))?;
let ffn_norm = ct.tensor(reader, &format!("{prefix}.ffn_norm.weight"))?;
let span_attn = tracing::span!(tracing::Level::TRACE, "attn");
let span_rot = tracing::span!(tracing::Level::TRACE, "attn-rot");
let span_mlp = tracing::span!(tracing::Level::TRACE, "attn-mlp");
@ -369,14 +331,14 @@ impl ModelWeights {
})
}
fn mask(&mut self, t: usize, device: &Device) -> Result<Tensor> {
fn mask(&mut self, t: usize) -> Result<Tensor> {
if let Some(mask) = self.masks.get(&t) {
Ok(mask.clone())
} else {
let mask: Vec<_> = (0..t)
.flat_map(|i| (0..t).map(move |j| u8::from(j > i)))
.collect();
let mask = Tensor::from_slice(&mask, (t, t), device)?;
let mask = Tensor::from_slice(&mask, (t, t), &Device::Cpu)?;
self.masks.insert(t, mask.clone());
Ok(mask)
}
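As a concrete check of the construction above, a standalone version (`causal_mask` is an illustrative name) and its t = 3 output:
fn causal_mask(t: usize) -> Vec<u8> {
    (0..t)
        .flat_map(|i| (0..t).map(move |j| u8::from(j > i)))
        .collect()
}
// causal_mask(3) == [0, 1, 1,  0, 0, 1,  0, 0, 0]; the 1s mark the future
// positions (j > i) that masked_fill later replaces with NEG_INFINITY.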
@ -384,7 +346,7 @@ impl ModelWeights {
pub fn forward(&mut self, x: &Tensor, index_pos: usize) -> Result<Tensor> {
let (_b_sz, seq_len) = x.dims2()?;
let mask = self.mask(seq_len, x.device())?;
let mask = self.mask(seq_len)?;
let _enter = self.span.enter();
let mut layer_in = self.tok_embeddings.forward(x)?;
for layer in self.layers.iter_mut() {

View File

@ -10,12 +10,12 @@ pub struct VarBuilder {
}
impl VarBuilder {
pub fn from_gguf<P: AsRef<std::path::Path>>(p: P, device: &Device) -> Result<Self> {
pub fn from_gguf<P: AsRef<std::path::Path>>(p: P) -> Result<Self> {
let mut file = std::fs::File::open(p)?;
let content = candle::quantized::gguf_file::Content::read(&mut file)?;
let mut data = std::collections::HashMap::new();
for tensor_name in content.tensor_infos.keys() {
let tensor = content.tensor(&mut file, tensor_name, device)?;
let tensor = content.tensor(&mut file, tensor_name)?;
data.insert(tensor_name.to_string(), Arc::new(tensor));
}
Ok(Self {
@ -25,12 +25,12 @@ impl VarBuilder {
})
}
pub fn from_gguf_buffer(buffer: &[u8], device: &Device) -> Result<Self> {
pub fn from_gguf_buffer(buffer: &[u8]) -> Result<Self> {
let mut cursor = std::io::Cursor::new(buffer);
let content = candle::quantized::gguf_file::Content::read(&mut cursor)?;
let mut data = std::collections::HashMap::new();
for tensor_name in content.tensor_infos.keys() {
let tensor = content.tensor(&mut cursor, tensor_name, device)?;
let tensor = content.tensor(&mut cursor, tensor_name)?;
data.insert(tensor_name.to_string(), Arc::new(tensor));
}
Ok(Self {