llama2-c wasm fix.

Lazy detach. (#1242)
Remove the unused pragma for marian. (#1236)
2025-06-17 11:08:52 +00:00 · 2023-11-02 10:31:47 +01:00 · 2023-11-02 07:33:48 +00:00 · 2023-11-01 20:04:52 +00:00 · 2023-11-01 18:21:36 +00:00 · 2023-11-01 14:37:52 +00:00
35 changed files with 239 additions and 1270 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -55,8 +55,6 @@ tracing-subscriber = "0.3.7"
 wav = "1.0.0"
 yoke = { version = "0.7.2", features = ["derive"] }
 zip = { version = "0.6.6", default-features = false }
-# metal = { git = "https://github.com/ivarflakstad/metal-rs.git", features = ["mps"] }
-metal = { path = "../metal-rs", features = ["mps"] }

 [profile.release-with-debug]
 inherits = "release"
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@@ -13,8 +13,6 @@ readme = "README.md"
 accelerate-src = { workspace = true, optional = true }
 byteorder = { workspace = true }
 candle-kernels = { path = "../candle-kernels", version = "0.3.0", optional = true }
-candle-metal-kernels = { path = "../candle-metal-kernels", version = "0.3.0", optional = true }
-metal = { workspace = true, optional = true}
 cudarc = { workspace = true, optional = true }
 gemm = { workspace = true }
 half = { workspace = true }
@@ -30,7 +28,6 @@ safetensors = { workspace = true }
 thiserror = { workspace = true }
 yoke = { workspace = true }
 zip = { workspace = true }
-tracing = { workspace = true }

 [dev-dependencies]
 anyhow = { workspace = true }
@@ -42,4 +39,3 @@ cuda = ["cudarc", "dep:candle-kernels"]
 cudnn = ["cuda", "cudarc/cudnn"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
 accelerate = ["dep:libc", "dep:accelerate-src"]
-metal = ["dep:candle-metal-kernels", "dep:metal"]
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@@ -1,6 +1,6 @@
 use crate::backend::BackendDevice;
 use crate::cpu_backend::CpuDevice;
-use crate::{bail, CpuStorage, DType, Result, Shape, Storage, WithDType};
+use crate::{CpuStorage, DType, Result, Shape, Storage, WithDType};

 /// A `DeviceLocation` represents a physical device whereas multiple `Device`
 /// can live on the same location (typically for cuda devices).
@@ -8,14 +8,12 @@ use crate::{bail, CpuStorage, DType, Result, Shape, Storage, WithDType};
 pub enum DeviceLocation {
    Cpu,
    Cuda { gpu_id: usize },
-    Metal,
 }

 #[derive(Debug, Clone)]
 pub enum Device {
    Cpu,
    Cuda(crate::CudaDevice),
-    Metal(crate::MetalDevice),
 }

 pub trait NdArray {
@@ -105,14 +103,14 @@ impl<S: WithDType, const N1: usize, const N2: usize, const N3: usize, const N4:
 impl<S: NdArray> NdArray for Vec<S> {
    fn shape(&self) -> Result<Shape> {
        if self.is_empty() {
-            bail!("empty array")
+            crate::bail!("empty array")
        }
        let shape0 = self[0].shape()?;
        let n = self.len();
        for v in self.iter() {
            let shape = v.shape()?;
            if shape != shape0 {
-                bail!("two elements have different shapes {shape:?} {shape0:?}")
+                crate::bail!("two elements have different shapes {shape:?} {shape0:?}")
            }
        }
        Ok(Shape::from([[n].as_slice(), shape0.dims()].concat()))
@@ -130,15 +128,10 @@ impl Device {
        Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?))
    }

-    pub fn new_metal(ordinal: usize) -> Result<Self> {
-        Ok(Self::Metal(crate::MetalDevice::new(ordinal)?))
-    }
-
    pub fn set_seed(&self, seed: u64) -> Result<()> {
        match self {
-            Self::Cpu => CpuDevice.set_seed(seed),
+            Self::Cpu => crate::cpu_backend::CpuDevice.set_seed(seed),
            Self::Cuda(c) => c.set_seed(seed),
-            Self::Metal(m) => m.set_seed(seed),
        }
    }

@@ -154,16 +147,21 @@ impl Device {
        match self {
            Self::Cpu => DeviceLocation::Cpu,
            Self::Cuda(device) => device.location(),
-            Device::Metal(device) => device.location(),
        }
    }

    pub fn is_cpu(&self) -> bool {
-        matches!(self, Self::Cpu)
+        match self {
+            Self::Cpu => true,
+            Self::Cuda(_) => false,
+        }
    }

    pub fn is_cuda(&self) -> bool {
-        matches!(self, Self::Cuda(_))
+        match self {
+            Self::Cpu => false,
+            Self::Cuda(_) => true,
+        }
    }

    pub fn cuda_if_available(ordinal: usize) -> Result<Self> {
@@ -187,13 +185,14 @@ impl Device {
                Ok(Storage::Cpu(storage))
            }
            Device::Cuda(device) => {
-                let storage = device.rand_uniform(shape, dtype, lo, up)?;
-                Ok(Storage::Cuda(storage))
-            }
-            Device::Metal(_device) => {
-                // let storage = device.rand_uniform(shape, dtype, lo, up)?;
-                // Ok(Storage::Metal(storage))
-                bail!("Metal rand_uniform not implemented")
+                // TODO: Remove the special case if we start supporting generating f16/bf16 directly.
+                if dtype == DType::F16 || dtype == DType::BF16 {
+                    let storage = device.rand_uniform(shape, DType::F32, lo, up)?;
+                    Storage::Cuda(storage).to_dtype(&crate::Layout::contiguous(shape), dtype)
+                } else {
+                    let storage = device.rand_uniform(shape, dtype, lo, up)?;
+                    Ok(Storage::Cuda(storage))
+                }
            }
        }
    }
@@ -220,12 +219,14 @@ impl Device {
                Ok(Storage::Cpu(storage))
            }
            Device::Cuda(device) => {
-                let storage = device.rand_normal(shape, dtype, mean, std)?;
-                Ok(Storage::Cuda(storage))
-            }
-            Device::Metal(device) => {
-                let storage = device.rand_normal(shape, dtype, mean, std)?;
-                Ok(Storage::Metal(storage))
+                // TODO: Remove the special case if we start supporting generating f16/bf16 directly.
+                if dtype == DType::F16 || dtype == DType::BF16 {
+                    let storage = device.rand_normal(shape, DType::F32, mean, std)?;
+                    Storage::Cuda(storage).to_dtype(&crate::Layout::contiguous(shape), dtype)
+                } else {
+                    let storage = device.rand_normal(shape, dtype, mean, std)?;
+                    Ok(Storage::Cuda(storage))
+                }
            }
        }
    }
@@ -249,10 +250,6 @@ impl Device {
                let storage = device.ones_impl(shape, dtype)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = device.ones_impl(shape, dtype)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }

@@ -266,10 +263,6 @@ impl Device {
                let storage = device.zeros_impl(shape, dtype)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = device.zeros_impl(shape, dtype)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }

@@ -281,11 +274,6 @@ impl Device {
                let storage = device.storage_from_cpu_storage(&storage)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = array.to_cpu_storage();
-                let storage = device.storage_from_cpu_storage(&storage)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }

@@ -297,11 +285,6 @@ impl Device {
                let storage = device.storage_from_cpu_storage(&storage)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = S::to_cpu_storage_owned(data);
-                let storage = device.storage_from_cpu_storage(&storage)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }
 }
--- a/candle-core/src/display.rs
+++ b/candle-core/src/display.rs
@@ -14,7 +14,6 @@ impl Tensor {
            crate::DeviceLocation::Cuda { gpu_id } => {
                format!(", cuda:{}", gpu_id)
            }
-            _ => todo!(),
        };

        write!(f, "Tensor[")?;
@@ -477,7 +476,6 @@ impl std::fmt::Display for Tensor {
            crate::DeviceLocation::Cuda { gpu_id } => {
                format!(", cuda:{}", gpu_id)
            }
-            crate::DeviceLocation::Metal => todo!(),
        };

        write!(
--- a/candle-core/src/dummy_metal_backend.rs
+++ b/candle-core/src/dummy_metal_backend.rs
@@ -1,201 +0,0 @@
-#![allow(dead_code)]
-use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
-use crate::{CpuStorage, DType, Error, Layout, Result, Shape};
-
-#[derive(Debug, Clone)]
-pub struct MetalDevice;
-
-#[derive(Debug)]
-pub struct MetalStorage;
-
-macro_rules! fail {
-    () => {
-        unimplemented!("metal support has not been enabled, add `metal` feature to enable.")
-    };
-}
-
-impl crate::backend::BackendStorage for MetalStorage {
-    type Device = MetalDevice;
-
-    fn try_clone(&self, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn dtype(&self) -> DType {
-        fail!()
-    }
-
-    fn device(&self) -> &Self::Device {
-        fail!()
-    }
-
-    fn to_cpu_storage(&self) -> Result<CpuStorage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn powf(&self, _: &Layout, _: f64) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn elu(&self, _: &Layout, _: f64) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn to_dtype(&self, _: &Layout, _: DType) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn where_cond(&self, _: &Layout, _: &Self, _: &Layout, _: &Self, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn conv1d(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &crate::conv::ParamsConv1D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn conv2d(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &crate::conv::ParamsConv2D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn conv_transpose2d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &crate::conv::ParamsConvTranspose2D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-    fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn scatter_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn index_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn matmul(
-        &self,
-        _: &Self,
-        _: (usize, usize, usize, usize),
-        _: &Layout,
-        _: &Layout,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-}
-
-impl crate::backend::BackendDevice for MetalDevice {
-    type Storage = MetalStorage;
-    fn new(_: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn set_seed(&self, _: u64) -> Result<()> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn location(&self) -> crate::DeviceLocation {
-        fail!()
-    }
-
-    fn same_device(&self, _: &Self) -> bool {
-        fail!()
-    }
-
-    fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-}
--- a/candle-core/src/error.rs
+++ b/candle-core/src/error.rs
@@ -152,9 +152,6 @@ pub enum Error {
    #[error("the candle crate has not been built with cuda support")]
    NotCompiledWithCudaSupport,

-    #[error("the candle crate has not been built with metal support")]
-    NotCompiledWithMetalSupport,
-
    #[error("cannot find tensor {path}")]
    CannotFindTensor { path: String },

@@ -162,9 +159,6 @@ pub enum Error {
    #[error(transparent)]
    Cuda(Box<dyn std::error::Error + Send + Sync>),

-    #[error("Metal error {0}")]
-    Metal(String),
-
    #[error(transparent)]
    TryFromIntError(#[from] core::num::TryFromIntError),

--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@@ -52,10 +52,6 @@ mod dummy_cuda_backend;
 pub mod error;
 mod indexer;
 pub mod layout;
-#[cfg(feature = "metal")]
-pub mod metal_backend;
-#[cfg(feature = "accelerate")]
-mod metal_backend;
 #[cfg(feature = "mkl")]
 mod mkl;
 pub mod npy;
@@ -91,12 +87,6 @@ pub use cuda_backend::{CudaDevice, CudaStorage};
 #[cfg(not(feature = "cuda"))]
 pub use dummy_cuda_backend::{CudaDevice, CudaStorage};

-#[cfg(feature = "metal")]
-pub use metal_backend::{MetalDevice, MetalStorage};
-
-#[cfg(not(feature = "metal"))]
-pub use dummy_metal_backend::{MetalDevice, MetalStorage};
-
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;

--- a/candle-core/src/metal_backend.rs
+++ b/candle-core/src/metal_backend.rs
@@ -1,474 +0,0 @@
-use crate::backend::{BackendDevice, BackendStorage};
-use crate::bail;
-use crate::conv::{ParamsConv1D, ParamsConv2D, ParamsConvTranspose2D};
-use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
-use crate::{CpuStorage, DType, Layout, Result, Shape};
-use candle_metal_kernels;
-use core::mem;
-use half::{bf16, f16};
-use metal;
-use metal::mps::matrix::{Matrix, MatrixDescriptor, MatrixMultiplication};
-use metal::mps::{Float32, MPSDataType};
-use metal::MTLResourceOptions;
-
-/// Metal related errors
-#[derive(thiserror::Error, Debug)]
-pub enum MetalError {
-    #[error("metal error")]
-    Metal,
-}
-
-#[derive(Clone)]
-pub struct MetalDevice {
-    device: metal::Device,
-    _command_queue: metal::CommandQueue,
-    command_buffer: metal::CommandBuffer,
-}
-
-impl std::fmt::Debug for MetalDevice {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "MetalDevice({:?})", self.device.registry_id())
-    }
-}
-
-impl std::ops::Deref for MetalDevice {
-    type Target = metal::DeviceRef;
-
-    fn deref(&self) -> &Self::Target {
-        &self.device
-    }
-}
-
-impl MetalDevice {
-    pub fn metal_device(&self) -> &metal::DeviceRef {
-        self.device.as_ref()
-    }
-
-    pub fn id(&self) -> u64 {
-        self.registry_id()
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct MetalStorage {
-    buffer: metal::Buffer,
-    device: MetalDevice,
-    dtype: DType,
-}
-
-impl BackendStorage for MetalStorage {
-    type Device = MetalDevice;
-
-    fn try_clone(&self, _: &Layout) -> Result<Self> {
-        Ok(self.clone())
-    }
-
-    fn dtype(&self) -> DType {
-        self.dtype
-    }
-
-    fn device(&self) -> &Self::Device {
-        &self.device
-    }
-
-    fn to_cpu_storage(&self) -> Result<CpuStorage> {
-        match self.dtype{
-            DType::F32 => {
-// self.buffer.read_to_vec(self.buffer.length() as usize / 4);
-                let mut buffer = vec![0.0; 32000];
-buffer[0] = 1.0;
-                Ok(CpuStorage::F32(buffer))},
-            dtype => todo!("Unsupported dtype {dtype:?}")
-        }
-    }
-
-    fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self> {
-        println!("TODO Affine");
-        Ok(self.clone())
-        // todo!()
-    }
-
-    fn powf(&self, _: &Layout, _: f64) -> Result<Self> {
-        todo!()
-    }
-
-    fn elu(&self, _: &Layout, _: f64) -> Result<Self> {
-        todo!()
-    }
-
-    fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Self> {
-        println!("TODO reduce_op");
-        Ok(self.clone())
-        // todo!()
-    }
-
-    fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
-        todo!()
-    }
-
-    fn to_dtype(&self, layout: &Layout, dtype: DType) -> Result<Self> {
-        todo!("Implement {:?} {layout:?}  - {dtype:?}", self.dtype)
-    }
-
-    fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self> {
-        // todo!()
-        // TODO
-        println!("TODO {:?}", B::NAME);
-        Ok(self.clone())
-    }
-
-    fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
-        println!("TODO Binary {:?}", B::NAME);
-        Ok(self.clone())
-        // todo!()
-    }
-
-    fn where_cond(&self, _: &Layout, rhs: &Self, _: &Layout, _: &Self, _: &Layout) -> Result<Self> {
-        println!("TODO where_cond");
-        Ok(rhs.clone())
-        // todo!()
-    }
-
-    fn conv1d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &ParamsConv1D,
-    ) -> Result<Self> {
-        todo!()
-    }
-
-    fn conv2d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &ParamsConv2D,
-    ) -> Result<Self> {
-        todo!()
-    }
-
-    fn conv_transpose2d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &ParamsConvTranspose2D,
-    ) -> Result<Self> {
-        todo!()
-    }
-
-    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
-        todo!()
-    }
-
-    fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
-        todo!()
-    }
-
-    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
-        todo!()
-    }
-
-    fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self> {
-        todo!()
-    }
-
-    fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self> {
-        todo!()
-    }
-
-    fn scatter_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self> {
-        todo!()
-    }
-
-    fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> Result<Self> {
-        println!("TODO Index select");
-        Ok(self.clone())
-        // todo!()
-    }
-
-    fn index_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self> {
-        todo!()
-    }
-
-    fn matmul(
-        &self,
-        rhs: &Self,
-        (b, m, n, k): (usize, usize, usize, usize),
-        lhs_l: &Layout,
-        rhs_l: &Layout,
-    ) -> Result<Self> {
-        let transpose_left = false;
-        let transpose_right = false;
-        let alpha = 1.0;
-        let beta = 0.0;
-        self.matmul_generic(
-            rhs,
-            (b, m, n, k),
-            lhs_l,
-            rhs_l,
-            transpose_left,
-            transpose_right,
-            alpha,
-            beta,
-        )
-    }
-
-    fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()> {
-        println!("TODO Copy strided");
-        Ok(())
-    }
-}
-
-impl MetalStorage {
-    pub(crate) fn matmul_t(
-        &self,
-        rhs: &Self,
-        (b, m, n, k): (usize, usize, usize, usize),
-        lhs_l: &Layout,
-        rhs_l: &Layout,
-    ) -> Result<Self> {
-        let transpose_left = false;
-        let transpose_right = true;
-        let alpha = 1.0;
-        let beta = 0.0;
-        self.matmul_generic(
-            rhs,
-            (b, m, n, k),
-            lhs_l,
-            rhs_l,
-            transpose_left,
-            transpose_right,
-            alpha,
-            beta,
-        )
-    }
-    pub(crate) fn matmul_generic(
-        &self,
-        rhs: &Self,
-        (b, m, n, k): (usize, usize, usize, usize),
-        lhs_l: &Layout,
-        rhs_l: &Layout,
-        transpose_left: bool,
-        transpose_right: bool,
-        alpha: f64,
-        beta: f64,
-    ) -> Result<Self> {
-        let elem_count = b * m * n;
-        match (self.dtype, rhs.dtype) {
-            (DType::F32, DType::F32) => {
-            let span= tracing::span!(tracing::Level::TRACE, "metal alloc matmul");
-            let _enter = span.enter();
-
-                let out_buffer = self.device.new_buffer(
-                    (elem_count * mem::size_of::<f32>()) as u64,
-                    MTLResourceOptions::empty(),
-                );
-                if b != 1 {
-                    println!("TODO implement batched matmul for B={b}");
-                    // bail!("Didn't implemented strided matmul yet");
-                    return Ok(Self {
-                        buffer: out_buffer,
-                        device: self.device.clone(),
-                        dtype: self.dtype(),
-                    });
-                }
-                if !lhs_l.is_contiguous() || !rhs_l.is_contiguous() {
-                    println!("Didn't implemented non contiguous matmul yet {:?} {:?}", lhs_l.is_contiguous(), rhs_l.is_contiguous());
-                    return Ok(Self {
-                        buffer: out_buffer,
-                        device: self.device.clone(),
-                        dtype: self.dtype(),
-                    });
-                }
-                return Ok(Self {
-                    buffer: out_buffer,
-                    device: self.device.clone(),
-                    dtype: self.dtype(),
-                });
-                let m: u64 = m.try_into().expect("usize should fit u64");
-                let n: u64 = n.try_into().expect("usize should fit u64");
-                let k: u64 = k.try_into().expect("usize should fit u64");
-                // Create descriptors
-                let left_descriptor =
-                    MatrixDescriptor::init_single(m, k, k * Float32::SIZE, Float32::TYPE_ID);
-                let right_descriptor =
-                    MatrixDescriptor::init_single(k, n, n * Float32::SIZE, Float32::TYPE_ID);
-                let result_descriptor =
-                    MatrixDescriptor::init_single(m, n, n * Float32::SIZE, Float32::TYPE_ID);
-
-                println!("lhs {:?} {m} {k}", self.buffer.length());
-                println!("rhs {:?} {k} {n}", rhs.buffer.length());
-                println!("out {:?} {m} {n}", out_buffer.length());
-                // Create matrix objects
-                let left_matrix =
-                    Matrix::init_with_buffer_descriptor(&self.buffer, &left_descriptor)
-                        .expect("Failed to create left matrix");
-                let right_matrix =
-                    Matrix::init_with_buffer_descriptor(&rhs.buffer, &right_descriptor)
-                        .expect("Failed to create left matrix");
-
-                let result_matrix =
-                    Matrix::init_with_buffer_descriptor(&out_buffer, &result_descriptor)
-                        .expect("Failed to create left matrix");
-
-                println!("lhs {:?}", lhs_l.shape());
-
-                // Create kernel
-                let matrix_multiplication = MatrixMultiplication::init(
-                    &self.device,
-                    transpose_left,
-                    transpose_right,
-                    m,
-                    n,
-                    k,
-                    alpha,
-                    beta,
-                )
-                .expect("Failed to create matrix multiplication kernel");
-
-                // Encode kernel to command buffer
-                matrix_multiplication.encode_to_command_buffer(
-                    &self.device.command_buffer,
-                    &left_matrix,
-                    &right_matrix,
-                    &result_matrix,
-                );
-                Ok(Self {
-                    buffer: out_buffer,
-                    device: self.device.clone(),
-                    dtype: self.dtype(),
-                })
-            }
-            _ => todo!("Unimplemented matmul for this pair"),
-        }
-    }
-}
-
-impl MetalDevice{
-    pub fn flush(&mut self){
-        self.command_buffer.commit();
-        self.command_buffer.wait_until_completed();
-        self.command_buffer = self._command_queue.new_owned_command_buffer();
-    }
-
-}
-
-impl BackendDevice for MetalDevice {
-    type Storage = MetalStorage;
-
-    fn new(ordinal: usize) -> Result<Self> {
-        let device = metal::Device::all().swap_remove(ordinal);
-        let _command_queue = device.new_command_queue();
-        let command_buffer = _command_queue.new_owned_command_buffer();
-        Ok(Self {
-            device,
-            _command_queue,
-            command_buffer,
-        })
-    }
-
-    fn set_seed(&self, _seed: u64) -> Result<()> {
-        todo!("set_seed")
-    }
-
-    fn location(&self) -> crate::DeviceLocation {
-        crate::DeviceLocation::Metal
-    }
-
-    fn same_device(&self, rhs: &Self) -> bool {
-        self.device.registry_id() == rhs.device.registry_id()
-    }
-
-    fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<MetalStorage> {
-        // TODO Is there a faster way ?
-        let cpu_storage = crate::cpu_backend::CpuDevice.zeros_impl(shape, dtype)?;
-        self.storage_from_cpu_storage(&cpu_storage)
-    }
-
-    fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result<Self::Storage> {
-        // TODO Is there a faster way ?
-        let cpu_storage = crate::cpu_backend::CpuDevice.ones_impl(shape, dtype)?;
-        self.storage_from_cpu_storage(&cpu_storage)
-    }
-
-    fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<Self::Storage> {
-        let option = metal::MTLResourceOptions::CPUCacheModeDefaultCache;
-        let span= tracing::span!(tracing::Level::TRACE, "metal alloc");
-        let _enter = span.enter();
-
-        let buffer = self.device.new_buffer(4, option);
-        // let buffer = match storage {
-        //     CpuStorage::U8(storage) => self.device.new_buffer_with_data(
-        //         storage.as_ptr() as *const core::ffi::c_void,
-        //         (storage.len() * mem::size_of::<u8>()) as u64,
-        //         option,
-        //     ),
-        //     CpuStorage::U32(storage) => self.device.new_buffer_with_data(
-        //         storage.as_ptr() as *const core::ffi::c_void,
-        //         (storage.len() * mem::size_of::<u32>()) as u64,
-        //         option,
-        //     ),
-        //     CpuStorage::I64(storage) => self.device.new_buffer_with_data(
-        //         storage.as_ptr() as *const core::ffi::c_void,
-        //         (storage.len() * mem::size_of::<i64>()) as u64,
-        //         option,
-        //     ),
-        //     CpuStorage::BF16(storage) => self.device.new_buffer_with_data(
-        //         storage.as_ptr() as *const core::ffi::c_void,
-        //         (storage.len() * mem::size_of::<bf16>()) as u64,
-        //         option,
-        //     ),
-        //     CpuStorage::F16(storage) => self.device.new_buffer_with_data(
-        //         storage.as_ptr() as *const core::ffi::c_void,
-        //         (storage.len() * mem::size_of::<f16>()) as u64,
-        //         option,
-        //     ),
-        //     CpuStorage::F32(storage) => self.device.new_buffer_with_data(
-        //         storage.as_ptr() as *const core::ffi::c_void,
-        //         (storage.len() * mem::size_of::<f32>()) as u64,
-        //         option,
-        //     ),
-        //     CpuStorage::F64(storage) => self.device.new_buffer_with_data(
-        //         storage.as_ptr() as *const core::ffi::c_void,
-        //         (storage.len() * mem::size_of::<f64>()) as u64,
-        //         option,
-        //     ),
-        // };
-        Ok(Self::Storage {
-            buffer,
-            device: self.clone(),
-            dtype: storage.dtype(),
-        })
-    }
-
-    fn rand_uniform(&self, shape: &Shape, dtype: DType, mean: f64, stddev: f64) -> Result<Self::Storage> {
-        // TODO is there a better way ?
-        let cpu_storage = crate::cpu_backend::CpuDevice.rand_uniform(shape, dtype, mean, stddev)?;
-        self.storage_from_cpu_storage(&cpu_storage)
-    }
-
-    fn rand_normal(&self, shape: &Shape, dtype: DType, mean: f64, stddev: f64) -> Result<Self::Storage> {
-        // TODO is there a better way ?
-        let cpu_storage = crate::cpu_backend::CpuDevice.rand_normal(shape, dtype, mean, stddev)?;
-        self.storage_from_cpu_storage(&cpu_storage)
-    }
-}
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@@ -1,5 +1,5 @@
 #![allow(clippy::redundant_closure_call)]
-use crate::{CpuStorage, CudaStorage, Layout, MetalStorage, Result, Shape, Tensor};
+use crate::{CpuStorage, CudaStorage, Layout, Result, Shape, Tensor};
 use half::{bf16, f16};
 use num_traits::float::Float;

@@ -174,18 +174,6 @@ pub trait CustomOp1 {
        ))
    }

-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _storage: &MetalStorage,
-        _layout: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
    /// This function takes as argument the argument `arg` used in the forward pass, the result
    /// produced by the forward operation `res` and the gradient of the result `grad_res`.
    /// The function should return the gradient of the argument.
@ -221,20 +209,6 @@ pub trait CustomOp2 {
        ))
    }

-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
    fn bwd(
        &self,
        _arg1: &Tensor,
@ -277,22 +251,6 @@ pub trait CustomOp3 {
        ))
    }

-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
    fn bwd(
        &self,
        _arg1: &Tensor,
@ -962,6 +920,10 @@ impl BackpropOp {
        };
        Self(op)
    }
+
+    pub(crate) fn is_none(&self) -> bool {
+        self.0.is_none()
+    }
 }

 impl std::ops::Deref for BackpropOp {
--- a/candle-core/src/quantized/ggml_file.rs
+++ b/candle-core/src/quantized/ggml_file.rs
@ -1,7 +1,7 @@
 //! Support for the GGML file format.

 use super::{k_quants, GgmlDType};
-use crate::{Device, Result};
+use crate::Result;
 use byteorder::{LittleEndian, ReadBytesExt};
 use std::collections::HashMap;

@ -121,12 +121,11 @@ fn from_raw_data<T: super::GgmlType + Send + Sync + 'static>(
    raw_data: &[u8],
    size_in_bytes: usize,
    dims: Vec<usize>,
-    device: &Device,
 ) -> Result<super::QTensor> {
    let raw_data_ptr = raw_data.as_ptr();
    let n_blocks = size_in_bytes / std::mem::size_of::<T>();
    let data = unsafe { std::slice::from_raw_parts(raw_data_ptr as *const T, n_blocks) };
-    super::QTensor::new(data.to_vec(), dims, device)
+    super::QTensor::new(data.to_vec(), dims)
 }

 /// Creates a [Tensor] from a raw GGML tensor.
@ -134,7 +133,6 @@ pub fn qtensor_from_ggml(
    ggml_dtype: GgmlDType,
    raw_data: &[u8],
    dims: Vec<usize>,
-    device: &Device,
 ) -> Result<super::QTensor> {
    let tensor_elems = dims.iter().product::<usize>();
    let blck_size = ggml_dtype.blck_size();
@ -146,38 +144,18 @@ pub fn qtensor_from_ggml(
    let size_in_bytes = tensor_elems / blck_size * ggml_dtype.type_size();

    match ggml_dtype {
-        GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims, device),
-        GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims, device),
-        GgmlDType::Q4_0 => {
-            from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q4_1 => {
-            from_raw_data::<k_quants::BlockQ4_1>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5_0 => {
-            from_raw_data::<k_quants::BlockQ5_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5_1 => {
-            from_raw_data::<k_quants::BlockQ5_1>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q8_0 => {
-            from_raw_data::<k_quants::BlockQ8_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q2K => {
-            from_raw_data::<k_quants::BlockQ2K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q3K => {
-            from_raw_data::<k_quants::BlockQ3K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q4K => {
-            from_raw_data::<k_quants::BlockQ4K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5K => {
-            from_raw_data::<k_quants::BlockQ5K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q6K => {
-            from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims, device)
-        }
+        GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims),
+        GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4_0 => from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4_1 => from_raw_data::<k_quants::BlockQ4_1>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5_0 => from_raw_data::<k_quants::BlockQ5_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5_1 => from_raw_data::<k_quants::BlockQ5_1>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q8_0 => from_raw_data::<k_quants::BlockQ8_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q2K => from_raw_data::<k_quants::BlockQ2K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q3K => from_raw_data::<k_quants::BlockQ3K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4K => from_raw_data::<k_quants::BlockQ4K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5K => from_raw_data::<k_quants::BlockQ5K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q6K => from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims),
        _ => crate::bail!("quantized type {ggml_dtype:?} is not supported yet"),
    }
 }
@ -185,7 +163,6 @@ pub fn qtensor_from_ggml(
 fn read_one_tensor<R: std::io::Seek + std::io::Read>(
    reader: &mut R,
    magic: VersionedMagic,
-    device: &Device,
 ) -> Result<(String, super::QTensor)> {
    let n_dims = reader.read_u32::<LittleEndian>()?;
    let name_len = reader.read_u32::<LittleEndian>()?;
@ -210,7 +187,7 @@ fn read_one_tensor<R: std::io::Seek + std::io::Read>(
    // TODO: Mmap version to avoid copying the data around?
    let mut raw_data = vec![0u8; size_in_bytes];
    reader.read_exact(&mut raw_data)?;
-    match qtensor_from_ggml(ggml_dtype, &raw_data, dims, device) {
+    match qtensor_from_ggml(ggml_dtype, &raw_data, dims) {
        Ok(tensor) => Ok((name, tensor)),
        Err(e) => crate::bail!("Error creating tensor {name}: {e}"),
    }
@ -224,10 +201,7 @@ pub struct Content {
 }

 impl Content {
-    pub fn read<R: std::io::Seek + std::io::Read>(
-        reader: &mut R,
-        device: &Device,
-    ) -> Result<Content> {
+    pub fn read<R: std::io::Seek + std::io::Read>(reader: &mut R) -> Result<Content> {
        // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.cpp#L505
        let last_position = reader.seek(std::io::SeekFrom::End(0))?;
        reader.seek(std::io::SeekFrom::Start(0))?;
@ -237,7 +211,7 @@ impl Content {
        let mut tensors = HashMap::new();

        while reader.stream_position()? != last_position {
-            let (name, tensor) = read_one_tensor(reader, magic, device)?;
+            let (name, tensor) = read_one_tensor(reader, magic)?;
            tensors.insert(name, tensor);
        }
        Ok(Self {
--- a/candle-core/src/quantized/gguf_file.rs
+++ b/candle-core/src/quantized/gguf_file.rs
@ -3,7 +3,7 @@
 //! Spec: https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md

 use super::{GgmlDType, QTensor};
-use crate::{Device, Result};
+use crate::Result;
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use std::collections::HashMap;

@ -57,7 +57,6 @@ impl TensorInfo {
        &self,
        reader: &mut R,
        tensor_data_offset: u64,
-        device: &Device,
    ) -> Result<QTensor> {
        let tensor_elems = self.shape.elem_count();
        let blck_size = self.ggml_dtype.blck_size();
@ -70,12 +69,7 @@ impl TensorInfo {
        let mut raw_data = vec![0u8; size_in_bytes];
        reader.seek(std::io::SeekFrom::Start(tensor_data_offset + self.offset))?;
        reader.read_exact(&mut raw_data)?;
-        super::ggml_file::qtensor_from_ggml(
-            self.ggml_dtype,
-            &raw_data,
-            self.shape.dims().to_vec(),
-            device,
-        )
+        super::ggml_file::qtensor_from_ggml(self.ggml_dtype, &raw_data, self.shape.dims().to_vec())
    }
 }

@ -456,13 +450,12 @@ impl Content {
        &self,
        reader: &mut R,
        name: &str,
-        device: &Device,
    ) -> Result<QTensor> {
        let tensor_info = match self.tensor_infos.get(name) {
            Some(tensor_info) => tensor_info,
            None => crate::bail!("cannot find tensor-infor for {name}"),
        };
-        tensor_info.read(reader, self.tensor_data_offset, device)
+        tensor_info.read(reader, self.tensor_data_offset)
    }
 }

--- a/candle-core/src/quantized/mod.rs
+++ b/candle-core/src/quantized/mod.rs
@ -14,7 +14,6 @@ pub mod utils;
 pub use k_quants::GgmlType;

 pub struct QTensor {
-    device: Device,
    data: Box<dyn QuantizedType>,
    shape: Shape,
 }
@ -171,20 +170,17 @@ impl QTensor {
    pub fn new<S: Into<Shape>, T: k_quants::GgmlType + Send + Sync + 'static>(
        data: Vec<T>,
        shape: S,
-        device: &Device,
    ) -> Result<Self> {
        let shape = shape.into();
        check_shape::<T>(&shape)?;
        Ok(Self {
            data: Box::new(data),
            shape,
-            device: device.clone(),
        })
    }

    pub fn quantize<T: k_quants::GgmlType + Send + Sync + 'static>(src: &Tensor) -> Result<Self> {
        let shape = src.shape();
-        let device = src.device();
        check_shape::<T>(shape)?;
        let src = src
            .to_dtype(crate::DType::F32)?
@ -201,7 +197,6 @@ impl QTensor {
        Ok(Self {
            data: Box::new(data),
            shape: shape.clone(),
-            device: device.clone(),
        })
    }

@ -217,12 +212,7 @@ impl QTensor {
        &self.shape
    }

-    pub fn device(&self) -> &Device {
-        &self.device
-    }
-
    pub fn dequantize(&self, device: &Device) -> Result<Tensor> {
-        // TODO Skip the CPU part on metal
        let mut f32_data = vec![0f32; self.shape.elem_count()];
        self.data.to_float(&mut f32_data)?;
        Tensor::from_vec(f32_data, &self.shape, device)
@ -315,49 +305,6 @@ impl crate::CustomOp1 for QTensor {
        )?;
        Ok((crate::CpuStorage::F32(dst_storage), dst_shape))
    }
-
-    fn metal_fwd(
-        &self,
-        storage: &crate::MetalStorage,
-        layout: &crate::Layout,
-    ) -> Result<(crate::MetalStorage, Shape)> {
-        println!("TODO qmatmul");
-        if !layout.is_contiguous() {
-            crate::bail!("input tensor is not contiguous {layout:?}")
-        }
-        let src_shape = layout.shape();
-        // self is transposed so n is first then k.
-        let (n, k) = self.shape.dims2()?;
-        if src_shape.rank() < 2 {
-            crate::bail!("input tensor has only one dimension {layout:?}")
-        }
-        let mut dst_shape = src_shape.dims().to_vec();
-        let last_k = dst_shape.pop().unwrap();
-        if last_k != k {
-            crate::bail!("input tensor {layout:?} incompatible with {:?}", self.shape)
-        }
-        dst_shape.push(n);
-        let dst_shape = Shape::from(dst_shape);
-        // let storage = storage.as_slice::<f32>()?;
-        // let storage =
-        //     &storage[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
-        let dst_storage = vec![0f32; dst_shape.elem_count()];
-        // self.matmul_t(
-        //     (dst_shape.elem_count() / n, k, n),
-        //     storage,
-        //     &mut dst_storage,
-        // )?;
-        let cpu_storage = crate::CpuStorage::F32(dst_storage);
-        use crate::backend::{BackendDevice, BackendStorage};
-        if let Device::Metal(device) = &self.device{
-        Ok((
-            device.storage_from_cpu_storage(&cpu_storage)?,
-            dst_shape,
-        ))
-        }else{
-            crate::bail!("qtensor not on metal device")
-        }
-    }
 }

 impl QMatMul {
--- a/candle-core/src/storage.rs
+++ b/candle-core/src/storage.rs
@ -1,6 +1,6 @@
 use crate::backend::BackendStorage;
 use crate::op::{self, CmpOp, CustomOp1, CustomOp2, CustomOp3, ReduceOp};
-use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage, Result, Shape};
+use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, Result, Shape};

 // We do not want to implement Clone on Storage as cloning may fail because of
 // out of memory. Instead try_clone should be used.
@ -8,7 +8,6 @@ use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage,
 pub enum Storage {
    Cpu(CpuStorage),
    Cuda(CudaStorage),
-    Metal(MetalStorage),
 }

 impl Storage {
@ -19,10 +18,6 @@ impl Storage {
                let storage = storage.try_clone(layout)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.try_clone(layout)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -30,7 +25,6 @@ impl Storage {
        match self {
            Self::Cpu(_) => Device::Cpu,
            Self::Cuda(storage) => Device::Cuda(storage.device().clone()),
-            Self::Metal(storage) => Device::Metal(storage.device().clone()),
        }
    }

@ -38,7 +32,6 @@ impl Storage {
        match self {
            Self::Cpu(storage) => storage.dtype(),
            Self::Cuda(storage) => storage.dtype(),
-            Self::Metal(storage) => storage.dtype(),
        }
    }

@ -72,10 +65,6 @@ impl Storage {
                let storage = storage.affine(layout, mul, add)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.affine(layout, mul, add)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -89,10 +78,6 @@ impl Storage {
                let storage = storage.powf(layout, alpha)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.powf(layout, alpha)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -106,10 +91,6 @@ impl Storage {
                let storage = storage.elu(layout, alpha)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.elu(layout, alpha)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -131,10 +112,6 @@ impl Storage {
                let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => {
                // Should not happen because of the same device check above but we're defensive
                // anyway.
@ -158,10 +135,6 @@ impl Storage {
                let storage = storage.reduce_op(op, layout, s)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.reduce_op(op, layout, s)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -175,10 +148,6 @@ impl Storage {
                let storage = storage.to_dtype(layout, dtype)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.to_dtype(layout, dtype)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -192,10 +161,6 @@ impl Storage {
                let (storage, shape) = c.cuda_fwd(storage, l)?;
                Ok((Self::Cuda(storage), shape))
            }
-            Self::Metal(storage) => {
-                let (storage, shape) = c.metal_fwd(storage, l)?;
-                Ok((Self::Metal(storage), shape))
-            }
        }
    }

@ -216,10 +181,6 @@ impl Storage {
                let (s, shape) = c.cuda_fwd(s1, l1, s2, l2)?;
                Ok((Self::Cuda(s), shape))
            }
-            (Self::Metal(s1), Self::Metal(s2)) => {
-                let (s, shape) = c.metal_fwd(s1, l1, s2, l2)?;
-                Ok((Self::Metal(s), shape))
-            }
            _ => unreachable!(),
        }
    }
@ -244,10 +205,6 @@ impl Storage {
                let (s, shape) = c.cuda_fwd(s1, l1, s2, l2, s3, l3)?;
                Ok((Self::Cuda(s), shape))
            }
-            (Self::Metal(s1), Self::Metal(s2), Self::Metal(s3)) => {
-                let (s, shape) = c.metal_fwd(s1, l1, s2, l2, s3, l3)?;
-                Ok((Self::Metal(s), shape))
-            }
            _ => unreachable!(),
        }
    }
@ -262,10 +219,6 @@ impl Storage {
                let storage = storage.unary_impl::<B>(layout)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.unary_impl::<B>(layout)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -286,10 +239,6 @@ impl Storage {
                let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => {
                // Should not happen because of the same device check above but we're defensive
                // anyway.
@ -321,10 +270,6 @@ impl Storage {
                let s = inp.conv1d(l, kernel, kernel_l, params)?;
                Ok(Self::Cuda(s))
            }
-            (Storage::Metal(inp), Storage::Metal(kernel)) => {
-                let s = inp.conv1d(l, kernel, kernel_l, params)?;
-                Ok(Self::Metal(s))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -352,10 +297,6 @@ impl Storage {
                let s = inp.conv2d(l, kernel, kernel_l, params)?;
                Ok(Self::Cuda(s))
            }
-            (Storage::Metal(inp), Storage::Metal(kernel)) => {
-                let s = inp.conv2d(l, kernel, kernel_l, params)?;
-                Ok(Self::Metal(s))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -383,10 +324,6 @@ impl Storage {
                let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
                Ok(Self::Cuda(s))
            }
-            (Storage::Metal(inp), Storage::Metal(kernel)) => {
-                let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
-                Ok(Self::Metal(s))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -411,10 +348,6 @@ impl Storage {
                let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -433,10 +366,6 @@ impl Storage {
                let storage = storage.max_pool2d(layout, kernel_size, stride)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.max_pool2d(layout, kernel_size, stride)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -450,10 +379,6 @@ impl Storage {
                let storage = storage.upsample_nearest1d(layout, sz)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.upsample_nearest1d(layout, sz)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -467,10 +392,6 @@ impl Storage {
                let storage = storage.upsample_nearest2d(layout, h, w)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.upsample_nearest2d(layout, h, w)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -494,10 +415,6 @@ impl Storage {
                let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(cond), Self::Metal(t), Self::Metal(f)) => {
-                let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
-                Ok(Self::Metal(storage))
-            }
            (_, lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -524,10 +441,6 @@ impl Storage {
                let storage = s.gather(l, indexes, indexes_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(s), Self::Metal(indexes)) => {
-                let storage = s.gather(l, indexes, indexes_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            _ => unreachable!(),
        }
    }
@ -552,10 +465,6 @@ impl Storage {
                let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
-                let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            _ => unreachable!(),
        }
    }
@ -580,10 +489,6 @@ impl Storage {
                let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
-                let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            _ => unreachable!(),
        }
    }
@ -605,10 +510,6 @@ impl Storage {
                let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -636,10 +537,6 @@ impl Storage {
                let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -659,9 +556,6 @@ impl Storage {
        match (self, dst) {
            (Self::Cpu(src), Self::Cpu(dst)) => src.copy_strided_src(dst, dst_offset, src_l),
            (Self::Cuda(src), Self::Cuda(dst)) => Ok(src.copy_strided_src(dst, dst_offset, src_l)?),
-            (Self::Metal(src), Self::Metal(dst)) => {
-                Ok(src.copy_strided_src(dst, dst_offset, src_l)?)
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
--- a/candle-core/src/tensor.rs
+++ b/candle-core/src/tensor.rs
@ -6,7 +6,7 @@ use crate::op::{
 };
 use crate::scalar::TensorOrScalar;
 use crate::shape::{Dim, Dims};
-use crate::{bail, storage::Storage, DType, Device, Error, Layout, Result, Shape};
+use crate::{storage::Storage, DType, Device, Error, Layout, Result, Shape};
 use std::sync::{Arc, RwLock};

 /// Unique identifier for tensors.
@ -523,7 +523,6 @@ impl Tensor {
        match &*self.storage() {
            Storage::Cpu(cpu_storage) => from_cpu_storage(cpu_storage),
            Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
-            Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
        }
    }

@ -1449,7 +1448,6 @@ impl Tensor {
        match &*self.storage() {
            Storage::Cpu(storage) => from_cpu_storage(storage),
            Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
-            Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
        }
    }

@ -1480,7 +1478,6 @@ impl Tensor {
        match &*self.storage() {
            Storage::Cpu(storage) => from_cpu_storage(storage),
            Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
-            Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
        }
    }

@ -1521,7 +1518,6 @@ impl Tensor {
        match &*self.storage() {
            Storage::Cpu(storage) => from_cpu_storage(storage),
            Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
-            Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
        }
    }

@ -1811,17 +1807,23 @@ impl Tensor {

    /// Returns a new tensor detached from the current graph, gradient are not propagated through
    /// this new node. The storage of this tensor is shared with the initial tensor.
+    ///
+    /// If the tensor is already detached from the computation graph, the same tensor is returned.
    pub fn detach(&self) -> Result<Tensor> {
-        let tensor_ = Tensor_ {
-            id: TensorId::new(),
-            storage: self.storage.clone(),
-            layout: self.layout.clone(),
-            op: BackpropOp::none(),
-            is_variable: false,
-            dtype: self.dtype,
-            device: self.device.clone(),
-        };
-        Ok(Tensor(Arc::new(tensor_)))
+        if self.op.is_none() && !self.is_variable {
+            Ok(self.clone())
+        } else {
+            let tensor_ = Tensor_ {
+                id: TensorId::new(),
+                storage: self.storage.clone(),
+                layout: self.layout.clone(),
+                op: BackpropOp::none(),
+                is_variable: false,
+                dtype: self.dtype,
+                device: self.device.clone(),
+            };
+            Ok(Tensor(Arc::new(tensor_)))
+        }
    }

    /// If the target device is the same as the tensor device, only a shallow copy is performed.
@ -1841,9 +1843,6 @@ impl Tensor {
                    Storage::Cuda(cuda.storage_from_cpu_storage(&cpu_storage)?)
                }
                (Storage::Cpu(storage), Device::Cpu) => Storage::Cpu(storage.clone()),
-                _ => {
-                    bail!("not implemented yet")
-                }
            };
            let op = BackpropOp::new1(self, Op::ToDevice);
            let tensor_ = Tensor_ {
--- a/candle-core/src/utils.rs
+++ b/candle-core/src/utils.rs
@ -23,10 +23,6 @@ pub fn cuda_is_available() -> bool {
    cfg!(feature = "cuda")
 }

-pub fn metal_is_available() -> bool {
-    cfg!(feature = "metal")
-}
-
 pub fn with_avx() -> bool {
    cfg!(target_feature = "avx")
 }
--- a/candle-examples/Cargo.toml
+++ b/candle-examples/Cargo.toml
@ -51,7 +51,6 @@ anyhow = { workspace = true }
 default = []
 accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
 cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
-metal = ["candle/metal", "candle-nn/metal", "candle-transformers/metal"]
 cudnn = ["candle/cudnn"]
 flash-attn = ["cuda", "candle-transformers/flash-attn", "dep:candle-flash-attn"]
 mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl", "candle-transformers/mkl"]
--- a/candle-examples/examples/quantized/main.rs
+++ b/candle-examples/examples/quantized/main.rs
@ -9,7 +9,7 @@ use std::io::Write;
 use tokenizers::Tokenizer;

 use candle::quantized::{ggml_file, gguf_file};
-use candle::{Tensor};
+use candle::{Device, Tensor};
 use candle_transformers::generation::LogitsProcessor;

 use candle_transformers::models::quantized_llama as model;
@ -232,7 +232,6 @@ fn main() -> anyhow::Result<()> {
    use tracing_subscriber::prelude::*;

    let args = Args::parse();
-    let mut device = candle_examples::device(false)?;
    let temperature = if args.temperature == 0. {
        None
    } else {
@ -277,10 +276,10 @@ fn main() -> anyhow::Result<()> {
                &format_size(total_size_in_bytes),
                start.elapsed().as_secs_f32(),
            );
-            ModelWeights::from_gguf(model, &mut file, &device)?
+            ModelWeights::from_gguf(model, &mut file)?
        }
        Some("ggml" | "bin") | Some(_) | None => {
-            let model = ggml_file::Content::read(&mut file, &device)?;
+            let model = ggml_file::Content::read(&mut file)?;
            let mut total_size_in_bytes = 0;
            for (_, tensor) in model.tensors.iter() {
                let elem_count = tensor.shape().elem_count();
@ -308,7 +307,7 @@ fn main() -> anyhow::Result<()> {
                | Which::L70b
                | Which::L70bChat => 8,
            };
-            ModelWeights::from_ggml(model, args.gqa.unwrap_or(default_gqa), &device)?
+            ModelWeights::from_ggml(model, args.gqa.unwrap_or(default_gqa))?
        }
    };
    println!("model built");
@ -367,11 +366,9 @@ fn main() -> anyhow::Result<()> {

        let start_prompt_processing = std::time::Instant::now();
        let mut next_token = {
-            let input = Tensor::new(prompt_tokens.as_slice(), &device)?.unsqueeze(0)?;
+            let input = Tensor::new(prompt_tokens.as_slice(), &Device::Cpu)?.unsqueeze(0)?;
            let logits = model.forward(&input, 0)?;
            let logits = logits.squeeze(0)?;
-            // TODO Remove this once implementation is finished.
-            let logits = logits.ones_like()?;
            logits_processor.sample(&logits)?
        };
        let prompt_dt = start_prompt_processing.elapsed();
@ -382,24 +379,19 @@ fn main() -> anyhow::Result<()> {

        let start_post_prompt = std::time::Instant::now();
        for index in 0..to_sample {
-            let input = Tensor::new(&[next_token], &device)?.unsqueeze(0)?;
+            let input = Tensor::new(&[next_token], &Device::Cpu)?.unsqueeze(0)?;
            let logits = model.forward(&input, prompt_tokens.len() + index)?;
-            if let candle::Device::Metal(device) = &mut device{
-                device.flush()
-            }
            let logits = logits.squeeze(0)?;
-            // let logits = if args.repeat_penalty == 1. {
-            //     logits
-            // } else {
-            //     let start_at = all_tokens.len().saturating_sub(args.repeat_last_n);
-            //     candle_transformers::utils::apply_repeat_penalty(
-            //         &logits,
-            //         args.repeat_penalty,
-            //         &all_tokens[start_at..],
-            //     )?
-            // };
-            // TODO Remove this once implementation is finished.
-            let logits = logits.ones_like()?;
+            let logits = if args.repeat_penalty == 1. {
+                logits
+            } else {
+                let start_at = all_tokens.len().saturating_sub(args.repeat_last_n);
+                candle_transformers::utils::apply_repeat_penalty(
+                    &logits,
+                    args.repeat_penalty,
+                    &all_tokens[start_at..],
+                )?
+            };
            next_token = logits_processor.sample(&logits)?;
            all_tokens.push(next_token);
            print_token(next_token, &tokenizer);
--- a/candle-examples/src/lib.rs
+++ b/candle-examples/src/lib.rs
@@ -2,30 +2,17 @@ pub mod coco_classes;
 pub mod imagenet;
 pub mod token_output_stream;

-use candle::utils::{cuda_is_available, metal_is_available};
 use candle::{Device, Result, Tensor};

 pub fn device(cpu: bool) -> Result<Device> {
    if cpu {
        Ok(Device::Cpu)
    } else {
-        if cuda_is_available() {
-            Ok(Device::new_cuda(0)?)
-        } else if metal_is_available() {
-            Ok(Device::new_metal(0)?)
-        } else {
-            #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-            {
-                println!("Running on CPU, to run on GPU(metal), build this example with `--features metal`");
-            }
-            #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
-            {
-                println!(
-                    "Running on CPU, to run on GPU, build this example with `--features cuda`"
-                );
-            }
-            Ok(Device::Cpu)
+        let device = Device::cuda_if_available(0)?;
+        if !device.is_cuda() {
+            println!("Running on CPU, to run on GPU, build this example with `--features cuda`");
        }
+        Ok(device)
    }
 }

--- a/candle-metal-kernels/Cargo.toml
+++ b/candle-metal-kernels/Cargo.toml
@@ -1,12 +0,0 @@
-[package]
-name = "candle-metal-kernels"
-version.workspace = true
-edition.workspace = true
-description.workspace = true
-repository.workspace = true
-keywords.workspace = true
-categories.workspace = true
-license.workspace = true
-
-[dependencies]
-metal = { workspace = true }
--- a/candle-metal-kernels/README.md
+++ b/candle-metal-kernels/README.md
@@ -1,3 +0,0 @@
-# candle-metal-kernels
-
-This crate contains Metal kernels used from candle.
--- a/candle-metal-kernels/src/lib.rs
+++ b/candle-metal-kernels/src/lib.rs
@@ -1 +0,0 @@
-
--- a/candle-nn/Cargo.toml
+++ b/candle-nn/Cargo.toml
@@ -28,5 +28,4 @@ clap = { workspace = true }
 default = []
 accelerate = ["dep:accelerate-src", "candle/accelerate"]
 cuda = ["candle/cuda"]
-metal = ["candle/metal"]
 mkl = ["dep:intel-mkl-src", "candle/mkl"]
--- a/candle-nn/src/ops.rs
+++ b/candle-nn/src/ops.rs
@@ -190,16 +190,6 @@ impl candle::CustomOp1 for SoftmaxLastDim {
            device: dev.clone(),
        };
        Ok((dst, layout.shape().clone()))
-    }    
-
-    #[cfg(feature = "metal")]
-    fn metal_fwd(
-        &self,
-        storage: &candle::MetalStorage,
-        layout: &Layout,
-    ) -> Result<(candle::MetalStorage, Shape)> {
-        println!("TODO softmax-last-dim");
-        Ok((storage.clone(), layout.shape().clone()))
    }
 }

--- a/candle-pyo3/src/lib.rs
+++ b/candle-pyo3/src/lib.rs
@@ -81,7 +81,6 @@ impl PyDevice {
        match device {
            Device::Cpu => Self::Cpu,
            Device::Cuda(_) => Self::Cuda,
-            Device::Metal(_) => unimplemented!(),
        }
    }

--- a/candle-transformers/Cargo.toml
+++ b/candle-transformers/Cargo.toml
@@ -28,6 +28,5 @@ wav = { workspace = true }
 default = []
 accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate"]
 cuda = ["candle/cuda", "candle-nn/cuda"]
-metal = ["candle/metal", "candle-nn/metal"]
 flash-attn = ["cuda", "dep:candle-flash-attn"]
 mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl"]
--- a/candle-transformers/src/models/bert.rs
+++ b/candle-transformers/src/models/bert.rs
@@ -1,3 +1,4 @@
+use super::with_tracing::{linear, Linear};
 use candle::{DType, Device, Result, Tensor};
 use candle_nn::{Embedding, Module, VarBuilder};
 use serde::Deserialize;
@@ -32,35 +33,6 @@ impl HiddenActLayer {
    }
 }

-#[derive(Debug)]
-pub struct Linear {
-    weight: Tensor,
-    bias: Option<Tensor>,
-    span: tracing::Span,
-}
-
-impl Linear {
-    pub fn new(weight: Tensor, bias: Option<Tensor>) -> Self {
-        let span = tracing::span!(tracing::Level::TRACE, "linear");
-        Self { weight, bias, span }
-    }
-}
-
-impl Module for Linear {
-    fn forward(&self, x: &Tensor) -> candle::Result<Tensor> {
-        let _enter = self.span.enter();
-        let w = match x.dims() {
-            &[bsize, _, _] => self.weight.broadcast_left(bsize)?.t()?,
-            _ => self.weight.t()?,
-        };
-        let x = x.matmul(&w)?;
-        match &self.bias {
-            None => Ok(x),
-            Some(bias) => x.broadcast_add(bias),
-        }
-    }
-}
-
 #[derive(Debug)]
 pub struct LayerNorm {
    weight: Tensor,
@@ -184,12 +156,6 @@ fn embedding(vocab_size: usize, hidden_size: usize, vb: VarBuilder) -> Result<Em
    Ok(Embedding::new(embeddings, hidden_size))
 }

-fn linear(size1: usize, size2: usize, vb: VarBuilder) -> Result<Linear> {
-    let weight = vb.get((size2, size1), "weight")?;
-    let bias = vb.get(size2, "bias")?;
-    Ok(Linear::new(weight, Some(bias)))
-}
-
 struct Dropout {
    #[allow(dead_code)]
    pr: f64,
--- a/candle-transformers/src/models/llama.rs
+++ b/candle-transformers/src/models/llama.rs
@@ -1,3 +1,4 @@
+use super::with_tracing::{linear_no_bias as linear, Linear};
 use candle::{DType, Device, IndexOp, Result, Tensor, D};
 use candle_nn::{Embedding, Module, VarBuilder};
 use serde::Deserialize;
@@ -81,21 +82,6 @@ impl Config {
    }
 }

-// We wrap the `Linear` layer here to add some tracing so that it's easier to profile the resulting
-// model.
-#[derive(Debug)]
-pub struct Linear {
-    inner: candle_nn::Linear,
-    span: tracing::Span,
-}
-
-impl Linear {
-    fn forward(&self, x: &Tensor) -> Result<Tensor> {
-        let _enter = self.span.enter();
-        self.inner.forward(x)
-    }
-}
-
 #[derive(Clone)]
 pub struct Cache {
    masks: Arc<Mutex<HashMap<usize, Tensor>>>,
@@ -150,12 +136,6 @@ impl Cache {
    }
 }

-fn linear(size1: usize, size2: usize, vb: VarBuilder) -> Result<Linear> {
-    let span = tracing::span!(tracing::Level::TRACE, "linear");
-    let inner = candle_nn::linear_no_bias(size1, size2, vb)?;
-    Ok(Linear { inner, span })
-}
-
 fn embedding(cfg: &Config, vb: VarBuilder) -> Result<Embedding> {
    let embeddings = vb.get((cfg.vocab_size, cfg.hidden_size), "weight")?;
    Ok(Embedding::new(embeddings, cfg.hidden_size))
--- a/candle-transformers/src/models/marian.rs
+++ b/candle-transformers/src/models/marian.rs
@@ -1,6 +1,5 @@
-#![allow(unused)]
-use super::with_tracing::{linear, linear_no_bias, Embedding, Linear};
-use candle::{Module, Result, Tensor};
+use super::with_tracing::{linear, Embedding, Linear};
+use candle::{Result, Tensor};
 use candle_nn::{layer_norm, LayerNorm, VarBuilder};

 #[derive(Debug, Clone)]
@@ -170,7 +169,6 @@ impl Attention {
        kv_states: Option<&Tensor>,
        attn_mask: Option<&Tensor>,
    ) -> Result<Tensor> {
-        let is_cross_attn = kv_states.is_some();
        let (b_sz, tgt_len, _) = xs.dims3()?;
        let query_states = (xs.apply(&self.q_proj)? * self.scaling)?;
        let (key_states, value_states) = match kv_states {
@@ -259,6 +257,10 @@ impl EncoderLayer {
            .apply(&self.fc2)?;
        (xs + residual)?.apply(&self.final_layer_norm)
    }
+
+    fn reset_kv_cache(&mut self) {
+        self.self_attn.reset_kv_cache()
+    }
 }

 #[derive(Debug, Clone)]
@@ -320,6 +322,11 @@ impl DecoderLayer {
        let xs = (xs + residual)?.apply(&self.final_layer_norm)?;
        Ok(xs)
    }
+
+    fn reset_kv_cache(&mut self) {
+        self.self_attn.reset_kv_cache();
+        self.encoder_attn.reset_kv_cache()
+    }
 }

 #[derive(Debug, Clone)]
@@ -368,6 +375,12 @@ impl Encoder {
        }
        Ok(xs)
    }
+
+    pub fn reset_kv_cache(&mut self) {
+        for layer in self.layers.iter_mut() {
+            layer.reset_kv_cache()
+        }
+    }
 }

 #[derive(Debug, Clone)]
@@ -422,6 +435,12 @@ impl Decoder {
        }
        Ok(xs)
    }
+
+    pub fn reset_kv_cache(&mut self) {
+        for layer in self.layers.iter_mut() {
+            layer.reset_kv_cache()
+        }
+    }
 }

 #[derive(Debug, Clone)]
@@ -442,6 +461,11 @@ impl Model {
            decoder,
        })
    }
+
+    fn reset_kv_cache(&mut self) {
+        self.encoder.reset_kv_cache();
+        self.decoder.reset_kv_cache();
+    }
 }

 #[derive(Debug, Clone)]
@@ -489,4 +513,8 @@ impl MTModel {
            .apply(&self.lm_head)?
            .broadcast_add(&self.final_logits_bias)
    }
+
+    pub fn reset_kv_cache(&mut self) {
+        self.model.reset_kv_cache();
+    }
 }
--- a/candle-transformers/src/models/quantized_llama.rs
+++ b/candle-transformers/src/models/quantized_llama.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;

 use candle::quantized::QTensor;
 use candle::quantized::{ggml_file, gguf_file};
-use candle::{Device, IndexOp, Result, Tensor, D};
+use candle::{DType, Device, IndexOp, Result, Tensor, D};
 use candle_nn::{Embedding, Module};

 pub const MAX_SEQ_LEN: usize = 4096;
@@ -16,7 +16,7 @@ struct RmsNorm {
 impl RmsNorm {
    fn new(scale: QTensor, eps: f32) -> Result<Self> {
        let span = tracing::span!(tracing::Level::TRACE, "rms-norm");
-        let scale = scale.dequantize(scale.device())?;
+        let scale = scale.dequantize(&Device::Cpu)?;
        let inner = candle_nn::LayerNorm::rms_norm(scale, eps as f64);
        Ok(Self { inner, span })
    }
@@ -79,8 +79,6 @@ fn masked_fill(on_false: &Tensor, mask: &Tensor, on_true: f32) -> Result<Tensor>
 impl LayerWeights {
    fn apply_rotary_emb(&self, x: &Tensor, index_pos: usize) -> Result<Tensor> {
        let _enter = self.span_rot.enter();
-        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-cos");
-        let _enter = span.enter();
        let (b_sz, n_head, seq_len, n_embd) = x.dims4()?;
        let cos = self
            .cos
@@ -90,37 +88,21 @@ impl LayerWeights {
            .sin
            .narrow(0, index_pos, seq_len)?
            .reshape((seq_len, n_embd / 2, 1))?;
-        drop(_enter);
-        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-broad");
-        let _enter = span.enter();
        let cos = cos.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?;
        let sin = sin.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?;
-        drop(_enter);
        // This mimics the llama.cpp behavior.
        // https://github.com/ggerganov/llama.cpp/blob/1f0bccb27929e261744c979bc75114955da49e98/ggml.c#L12104-L12105
        // The x0 and x1 value are interleaved on the n_embd (= head_dim) dimension.
        // The resulting y0 and y1 are also interleaved with:
        //   y0 = x0*cos - x1*sin
        //   y1 = x0*sin + x1*cos
-        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-reshape");
-        let _enter = span.enter();
        let x = x.reshape((b_sz, n_head, seq_len, n_embd / 2, 2))?;
        let x0 = x.narrow(D::Minus1, 0, 1)?;
        let x1 = x.narrow(D::Minus1, 1, 1)?;
-        drop(_enter);
-        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-broad-mul");
-        let _enter = span.enter();
        let y0 = (x0.broadcast_mul(&cos)? - x1.broadcast_mul(&sin)?)?;
        let y1 = (x0.broadcast_mul(&sin)? + x1.broadcast_mul(&cos)?)?;
-        drop(_enter);
-        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-cat");
-        let _enter = span.enter();
        let rope = Tensor::cat(&[y0, y1], D::Minus1)?;
-        drop(_enter);
-        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-flatten");
-        let _enter = span.enter();
        let rope = rope.flatten_from(D::Minus2)?;
-        drop(_enter);
        Ok(rope)
    }

@@ -130,7 +112,6 @@ impl LayerWeights {
        let q = self.attention_wq.forward(x)?;
        let k = self.attention_wk.forward(x)?;
        let v = self.attention_wv.forward(x)?;
-        // println!("Q {:?} K {:?} V {:?}", q.dtype(), k.dtype(), v.dtype());

        let q = q
            .reshape((b_sz, seq_len, self.n_head, self.head_dim))?
@@ -164,12 +145,9 @@ impl LayerWeights {
        let v = self.repeat_kv(v)?;

        let att = (q.matmul(&k.t()?)? / (self.head_dim as f64).sqrt())?;
-        // println!("att {:?}", att.dtype());
        let mask = mask.broadcast_as(att.shape())?;
-        // println!("mask {:?}", mask.dtype());
        let att = masked_fill(&att, &mask, f32::NEG_INFINITY)?;
        let att = candle_nn::ops::softmax_last_dim(&att)?;
-        // println!("att {:?} v {:?}", att.dtype(), v.dtype());
        // Convert to contiguous as matmul doesn't support strided vs for now.
        let y = att.matmul(&v.contiguous()?)?;
        let y = y.transpose(1, 2)?.reshape(&[b_sz, seq_len, n_embd])?;
@@ -203,37 +181,28 @@ pub struct ModelWeights {
    span_output: tracing::Span,
 }

-fn precomput_freqs_cis(
-    head_dim: usize,
-    freq_base: f32,
-    device: &Device,
-) -> Result<(Tensor, Tensor)> {
+fn precomput_freqs_cis(head_dim: usize, freq_base: f32) -> Result<(Tensor, Tensor)> {
    let theta: Vec<_> = (0..head_dim)
        .step_by(2)
        .map(|i| 1f32 / freq_base.powf(i as f32 / head_dim as f32))
        .collect();
-    let theta = Tensor::new(theta.as_slice(), device)?;
-    let range: Vec<f32> = (0..MAX_SEQ_LEN).map(|r| r as f32).collect();
-    let idx_theta = Tensor::new(range.as_slice(), device)?
+    let theta = Tensor::new(theta.as_slice(), &Device::Cpu)?;
+    let idx_theta = Tensor::arange(0, MAX_SEQ_LEN as u32, &Device::Cpu)?
+        .to_dtype(DType::F32)?
        .reshape((MAX_SEQ_LEN, 1))?
        .matmul(&theta.reshape((1, theta.elem_count()))?)?;
-    // TODO This change avoids allocating on Metal and then casting since allocating directly on
-    // CPU as f32 seems just as fast
-    // let idx_theta = Tensor::arange(0, MAX_SEQ_LEN as u32, device)?
-    //     .to_dtype(DType::F32)?
-    //     .reshape((MAX_SEQ_LEN, 1))?
-    //     .matmul(&theta.reshape((1, theta.elem_count()))?)?;
    let cos = idx_theta.cos()?;
    let sin = idx_theta.sin()?;
    Ok((cos, sin))
 }

 impl ModelWeights {
-    pub fn from_ggml(mut ct: ggml_file::Content, gqa: usize, device: &Device) -> Result<Self> {
+    pub fn from_ggml(mut ct: ggml_file::Content, gqa: usize) -> Result<Self> {
+        let cpu = &Device::Cpu;
        let head_dim = (ct.hparams.n_embd / ct.hparams.n_head) as usize;
-        let (cos, sin) = precomput_freqs_cis(head_dim, 10000., device)?;
+        let (cos, sin) = precomput_freqs_cis(head_dim, 10000.)?;
        let tok_embeddings = ct.remove("tok_embeddings.weight")?;
-        let tok_embeddings = tok_embeddings.dequantize(device)?;
+        let tok_embeddings = tok_embeddings.dequantize(cpu)?;
        let norm = RmsNorm::new(ct.remove("norm.weight")?, 1e-5)?;
        let output = ct.remove("output.weight")?;
        let mut layers = Vec::with_capacity(ct.hparams.n_layer as usize);
@@ -288,8 +257,8 @@ impl ModelWeights {
    pub fn from_gguf<R: std::io::Seek + std::io::Read>(
        ct: gguf_file::Content,
        reader: &mut R,
-        device: &Device,
    ) -> Result<Self> {
+        let cpu = &Device::Cpu;
        let md_get = |s: &str| match ct.metadata.get(s) {
            None => candle::bail!("cannot find {s} in metadata"),
            Some(v) => Ok(v),
@@ -307,31 +276,24 @@ impl ModelWeights {
        let rope_freq_base = md_get("llama.rope.freq_base")
            .and_then(|m| m.to_f32())
            .unwrap_or(10000f32);
-        let (cos, sin) = precomput_freqs_cis(rope_dim, rope_freq_base, device)?;
+        let (cos, sin) = precomput_freqs_cis(rope_dim, rope_freq_base)?;

-        let tok_embeddings = ct.tensor(reader, "token_embd.weight", device)?;
-        let tok_embeddings = tok_embeddings.dequantize(device)?;
-        let norm = RmsNorm::new(
-            ct.tensor(reader, "output_norm.weight", device)?,
-            rms_norm_eps,
-        )?;
-        let output = ct.tensor(reader, "output.weight", device)?;
+        let tok_embeddings = ct.tensor(reader, "token_embd.weight")?;
+        let tok_embeddings = tok_embeddings.dequantize(cpu)?;
+        let norm = RmsNorm::new(ct.tensor(reader, "output_norm.weight")?, rms_norm_eps)?;
+        let output = ct.tensor(reader, "output.weight")?;
        let mut layers = Vec::with_capacity(block_count);
        for layer_idx in 0..block_count {
            let prefix = format!("blk.{layer_idx}");
-            let attention_wq = ct.tensor(reader, &format!("{prefix}.attn_q.weight"), device)?;
-            let attention_wk = ct.tensor(reader, &format!("{prefix}.attn_k.weight"), device)?;
-            let attention_wv = ct.tensor(reader, &format!("{prefix}.attn_v.weight"), device)?;
-            let attention_wo =
-                ct.tensor(reader, &format!("{prefix}.attn_output.weight"), device)?;
-            let feed_forward_w1 =
-                ct.tensor(reader, &format!("{prefix}.ffn_gate.weight"), device)?;
-            let feed_forward_w2 =
-                ct.tensor(reader, &format!("{prefix}.ffn_down.weight"), device)?;
-            let feed_forward_w3 = ct.tensor(reader, &format!("{prefix}.ffn_up.weight"), device)?;
-            let attention_norm =
-                ct.tensor(reader, &format!("{prefix}.attn_norm.weight"), device)?;
-            let ffn_norm = ct.tensor(reader, &format!("{prefix}.ffn_norm.weight"), device)?;
+            let attention_wq = ct.tensor(reader, &format!("{prefix}.attn_q.weight"))?;
+            let attention_wk = ct.tensor(reader, &format!("{prefix}.attn_k.weight"))?;
+            let attention_wv = ct.tensor(reader, &format!("{prefix}.attn_v.weight"))?;
+            let attention_wo = ct.tensor(reader, &format!("{prefix}.attn_output.weight"))?;
+            let feed_forward_w1 = ct.tensor(reader, &format!("{prefix}.ffn_gate.weight"))?;
+            let feed_forward_w2 = ct.tensor(reader, &format!("{prefix}.ffn_down.weight"))?;
+            let feed_forward_w3 = ct.tensor(reader, &format!("{prefix}.ffn_up.weight"))?;
+            let attention_norm = ct.tensor(reader, &format!("{prefix}.attn_norm.weight"))?;
+            let ffn_norm = ct.tensor(reader, &format!("{prefix}.ffn_norm.weight"))?;
            let span_attn = tracing::span!(tracing::Level::TRACE, "attn");
            let span_rot = tracing::span!(tracing::Level::TRACE, "attn-rot");
            let span_mlp = tracing::span!(tracing::Level::TRACE, "attn-mlp");
@@ -369,14 +331,14 @@ impl ModelWeights {
        })
    }

-    fn mask(&mut self, t: usize, device: &Device) -> Result<Tensor> {
+    fn mask(&mut self, t: usize) -> Result<Tensor> {
        if let Some(mask) = self.masks.get(&t) {
            Ok(mask.clone())
        } else {
            let mask: Vec<_> = (0..t)
                .flat_map(|i| (0..t).map(move |j| u8::from(j > i)))
                .collect();
-            let mask = Tensor::from_slice(&mask, (t, t), device)?;
+            let mask = Tensor::from_slice(&mask, (t, t), &Device::Cpu)?;
            self.masks.insert(t, mask.clone());
            Ok(mask)
        }
@@ -384,7 +346,7 @@ impl ModelWeights {

    pub fn forward(&mut self, x: &Tensor, index_pos: usize) -> Result<Tensor> {
        let (_b_sz, seq_len) = x.dims2()?;
-        let mask = self.mask(seq_len, x.device())?;
+        let mask = self.mask(seq_len)?;
        let _enter = self.span.enter();
        let mut layer_in = self.tok_embeddings.forward(x)?;
        for layer in self.layers.iter_mut() {
--- a/candle-transformers/src/models/segment_anything/mod.rs
+++ b/candle-transformers/src/models/segment_anything/mod.rs
@@ -1,3 +1,4 @@
+pub use crate::models::with_tracing::Linear;
 use candle::{Result, Tensor};
 use candle_nn::{Module, VarBuilder};

@@ -9,13 +10,11 @@ pub mod tiny_vit;
 pub mod transformer;

 pub fn linear(vb: VarBuilder, in_dim: usize, out_dim: usize, bias: bool) -> Result<Linear> {
-    let inner = if bias {
-        candle_nn::linear(in_dim, out_dim, vb)?
+    if bias {
+        crate::models::with_tracing::linear(in_dim, out_dim, vb)
    } else {
-        candle_nn::linear_no_bias(in_dim, out_dim, vb)?
-    };
-    let span = tracing::span!(tracing::Level::TRACE, "linear");
-    Ok(Linear { inner, span })
+        crate::models::with_tracing::linear_no_bias(in_dim, out_dim, vb)
+    }
 }

 #[derive(Debug)]
@@ -85,16 +84,3 @@ impl Module for MlpBlock {
            .apply(&self.lin2)
    }
 }
-
-#[derive(Debug)]
-pub struct Linear {
-    inner: candle_nn::Linear,
-    span: tracing::Span,
-}
-
-impl Module for Linear {
-    fn forward(&self, x: &Tensor) -> Result<Tensor> {
-        let _enter = self.span.enter();
-        self.inner.forward(x)
-    }
-}
--- a/candle-transformers/src/models/stable_diffusion/clip.rs
+++ b/candle-transformers/src/models/stable_diffusion/clip.rs
@@ -102,6 +102,14 @@ impl Config {
        }
    }

+    pub fn ssd1b() -> Self {
+        Self::sdxl()
+    }
+
+    pub fn ssd1b2() -> Self {
+        Self::sdxl2()
+    }
+
    // https://huggingface.co/warp-ai/wuerstchen/blob/main/text_encoder/config.json
    pub fn wuerstchen() -> Self {
        Self {
--- a/candle-transformers/src/models/stable_diffusion/mod.rs
+++ b/candle-transformers/src/models/stable_diffusion/mod.rs
@@ -249,6 +249,71 @@ impl StableDiffusionConfig {
        )
    }

+    pub fn ssd1b(
+        sliced_attention_size: Option<usize>,
+        height: Option<usize>,
+        width: Option<usize>,
+    ) -> Self {
+        let bc = |out_channels, use_cross_attn, attention_head_dim| unet_2d::BlockConfig {
+            out_channels,
+            use_cross_attn,
+            attention_head_dim,
+        };
+        // https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/unet/config.json
+        let unet = unet_2d::UNet2DConditionModelConfig {
+            blocks: vec![
+                bc(320, None, 5),
+                bc(640, Some(2), 10),
+                bc(1280, Some(10), 20),
+            ],
+            center_input_sample: false,
+            cross_attention_dim: 2048,
+            downsample_padding: 1,
+            flip_sin_to_cos: true,
+            freq_shift: 0.,
+            layers_per_block: 2,
+            mid_block_scale_factor: 1.,
+            norm_eps: 1e-5,
+            norm_num_groups: 32,
+            sliced_attention_size,
+            use_linear_projection: true,
+        };
+        // https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/vae/config.json
+        let autoencoder = vae::AutoEncoderKLConfig {
+            block_out_channels: vec![128, 256, 512, 512],
+            layers_per_block: 2,
+            latent_channels: 4,
+            norm_num_groups: 32,
+        };
+        let scheduler = ddim::DDIMSchedulerConfig {
+            ..Default::default()
+        };
+
+        let height = if let Some(height) = height {
+            assert_eq!(height % 8, 0, "height has to be divisible by 8");
+            height
+        } else {
+            1024
+        };
+
+        let width = if let Some(width) = width {
+            assert_eq!(width % 8, 0, "width has to be divisible by 8");
+            width
+        } else {
+            1024
+        };
+
+        Self {
+            width,
+            height,
+            clip: clip::Config::ssd1b(),
+            clip2: Some(clip::Config::ssd1b2()),
+            autoencoder,
+            scheduler,
+            unet,
+        }
+    }
+
    pub fn build_vae<P: AsRef<std::path::Path>>(
        &self,
        vae_weights: P,
--- a/candle-transformers/src/models/whisper/model.rs
+++ b/candle-transformers/src/models/whisper/model.rs
@@ -1,4 +1,5 @@
 use super::Config;
+use crate::models::with_tracing::{linear, linear_no_bias, Linear};
 use candle::{Device, IndexOp, Result, Tensor, D};
 use candle_nn::{Conv1d, Conv1dConfig, Embedding, LayerNorm, Module, VarBuilder};

@@ -6,33 +7,6 @@ fn embedding(vocab_size: usize, hidden_size: usize, vb: VarBuilder) -> Result<Em
    let embeddings = vb.get((vocab_size, hidden_size), "weight")?;
    Ok(Embedding::new(embeddings, hidden_size))
 }
-//
-// We wrap the `Linear` layer here to add some tracing so that it's easier to profile the resulting
-// model.
-#[derive(Debug, Clone)]
-pub struct Linear {
-    inner: candle_nn::Linear,
-    span: tracing::Span,
-}
-
-impl Linear {
-    fn forward(&self, x: &Tensor) -> Result<Tensor> {
-        let _enter = self.span.enter();
-        self.inner.forward(x)
-    }
-}
-
-fn linear(size1: usize, size2: usize, vb: VarBuilder) -> Result<Linear> {
-    let span = tracing::span!(tracing::Level::TRACE, "linear");
-    let inner = candle_nn::linear(size1, size2, vb)?;
-    Ok(Linear { inner, span })
-}
-
-fn linear_no_bias(size1: usize, size2: usize, vb: VarBuilder) -> Result<Linear> {
-    let span = tracing::span!(tracing::Level::TRACE, "linear");
-    let inner = candle_nn::linear_no_bias(size1, size2, vb)?;
-    Ok(Linear { inner, span })
-}

 fn conv1d(
    in_channels: usize,
--- a/candle-transformers/src/quantized_var_builder.rs
+++ b/candle-transformers/src/quantized_var_builder.rs
@@ -10,12 +10,12 @@ pub struct VarBuilder {
 }

 impl VarBuilder {
-    pub fn from_gguf<P: AsRef<std::path::Path>>(p: P, device: &Device) -> Result<Self> {
+    pub fn from_gguf<P: AsRef<std::path::Path>>(p: P) -> Result<Self> {
        let mut file = std::fs::File::open(p)?;
        let content = candle::quantized::gguf_file::Content::read(&mut file)?;
        let mut data = std::collections::HashMap::new();
        for tensor_name in content.tensor_infos.keys() {
-            let tensor = content.tensor(&mut file, tensor_name, device)?;
+            let tensor = content.tensor(&mut file, tensor_name)?;
            data.insert(tensor_name.to_string(), Arc::new(tensor));
        }
        Ok(Self {
@@ -25,12 +25,12 @@ impl VarBuilder {
        })
    }

-    pub fn from_gguf_buffer(buffer: &[u8], device: &Device) -> Result<Self> {
+    pub fn from_gguf_buffer(buffer: &[u8]) -> Result<Self> {
        let mut cursor = std::io::Cursor::new(buffer);
        let content = candle::quantized::gguf_file::Content::read(&mut cursor)?;
        let mut data = std::collections::HashMap::new();
        for tensor_name in content.tensor_infos.keys() {
-            let tensor = content.tensor(&mut cursor, tensor_name, device)?;
+            let tensor = content.tensor(&mut cursor, tensor_name)?;
            data.insert(tensor_name.to_string(), Arc::new(tensor));
        }
        Ok(Self {
--- a/candle-wasm-examples/llama2-c/src/model.rs
+++ b/candle-wasm-examples/llama2-c/src/model.rs
@@ -1,5 +1,7 @@
 use candle::{DType, Device, IndexOp, Result, Tensor, D};
-use candle_nn::{embedding, linear, rms_norm, Embedding, Linear, Module, RmsNorm, VarBuilder};
+use candle_nn::{
+    embedding, linear_no_bias as linear, rms_norm, Embedding, Linear, Module, RmsNorm, VarBuilder,
+};
 use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
Author	SHA1	Message	Date
Laurent	b97463098c	llama2-c wasm fix.	2023-11-02 10:31:47 +01:00
Laurent Mazare	fbd69f952c	Lazy detach. (#1242)	2023-11-02 07:33:48 +00:00
Laurent Mazare	6c990a33ea	Remove the unused pragma for marian. (#1236)	2023-11-01 20:04:52 +00:00
Laurent Mazare	1704f1b3ae	Consolidate the with-tracing usage. (#1234)	2023-11-01 18:21:36 +00:00
Laurent Mazare	693fad511c	Preliminary support for ssd1b. (#1233)	2023-11-01 14:37:52 +00:00
Laurent Mazare	36fb84f038	Add a hack for generating random uniform/normal for f16/bf16. (#1228)	2023-10-31 20:27:59 +00:00