Enable the test for meshgrid + fix the implementation.

2025-06-17 02:58:50 +00:00 · 2023-10-25 13:38:42 +01:00
189 changed files with 759 additions and 17857 deletions
--- a/.github/workflows/ci_cuda.yaml
+++ b/.github/workflows/ci_cuda.yaml
@ -59,7 +59,7 @@ jobs:
      - name: Install Rust Stable
        run: curl https://sh.rustup.rs -sSf | sh -s -- -y
      - uses: Swatinem/rust-cache@v2
-      - run: apt-get update -y && apt-get install libssl-dev protobuf-compiler -y
+      - run: apt-get update -y && apt-get install libssl-dev -y
      - name: Test (cuda)
        run: PATH=$PATH:/usr/local/cuda-11.8/bin/ /root/.cargo/bin/cargo test --features cuda
  stop-runner:
--- a/.github/workflows/maturin.yml
+++ b/.github/workflows/maturin.yml
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@ -39,12 +39,6 @@ jobs:
          path: ~/.cargo/registry
          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}

-      - name: Install Protoc
-        uses: arduino/setup-protoc@v2
-        with:
-            version: "25.0"
-            repo-token: ${{ secrets.GITHUB_TOKEN }}
-
      - name: Install
        working-directory: ./candle-pyo3
        run: |
@ -52,7 +46,7 @@ jobs:
          source .env/bin/activate
          pip install -U pip
          pip install pytest maturin black
-          python -m maturin develop -r --features onnx
+          python -m maturin develop -r

      - name: Check style
        working-directory: ./candle-pyo3
--- a/Cargo.toml
+++ b/Cargo.toml
@ -7,19 +7,20 @@ members = [
    "candle-nn",
    "candle-pyo3",
    "candle-transformers",
-    "candle-wasm-examples/*",
+    "candle-wasm-examples/llama2-c",
+    "candle-wasm-examples/segment-anything",
+    "candle-wasm-examples/whisper",
+    "candle-wasm-examples/yolo",
+    "candle-wasm-examples/bert",
+    "candle-wasm-examples/phi",
+    "candle-wasm-examples/t5",
    "candle-wasm-tests",
 ]
-exclude = [
-   "candle-flash-attn",
-   "candle-kernels",
-   "candle-metal-kernels",
-   "candle-onnx",
-]
+exclude = ["candle-flash-attn", "candle-kernels"]
 resolver = "2"

 [workspace.package]
-version = "0.3.1"
+version = "0.3.0"
 edition = "2021"
 description = "Minimalist ML framework."
 repository = "https://github.com/huggingface/candle"
@ -51,7 +52,6 @@ rayon = "1.7.0"
 rusttype = { version = "0.9", default-features = false }
 safetensors = "0.3.1"
 serde = { version = "1.0.171", features = ["derive"] }
-serde_plain = "1.0.2"
 serde_json = "1.0.99"
 thiserror = "1"
 tokenizers = { version = "0.13.4", default-features = false }
@ -61,7 +61,6 @@ tracing-subscriber = "0.3.7"
 wav = "1.0.0"
 yoke = { version = "0.7.2", features = ["derive"] }
 zip = { version = "0.6.6", default-features = false }
-metal = { version = "0.27.0", features = ["mps"]}

 [profile.release-with-debug]
 inherits = "release"
--- a/README.md
+++ b/README.md
@ -51,12 +51,11 @@ For more advanced examples, please have a look at the following section.
 These online demos run entirely in your browser:
 - [yolo](https://huggingface.co/spaces/lmz/candle-yolo): pose estimation and
  object recognition.
- [whisper](https://huggingface.co/spaces/lmz/candle-whisper): speech recognition.
+- [whisper](https://huggingface.co/spaces/lmz/candle-whisper): speech recognition.
 - [LLaMA2](https://huggingface.co/spaces/lmz/candle-llama2): text generation.
 - [T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm): text generation.
 - [Phi-v1.5](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm): text generation.
 - [Segment Anything Model](https://huggingface.co/spaces/radames/candle-segment-anything-wasm): Image segmentation.
- [BLIP](https://huggingface.co/spaces/radames/Candle-BLIP-Image-Captioning): image captioning.

 We also provide a some command line based examples using state of the art models:

@ -69,8 +68,6 @@ We also provide a some command line based examples using state of the art models
  performance larger than all publicly available 13b models as of 2023-09-28.
 - [StarCoder](./candle-examples/examples/bigcode/): LLM specialized to code generation.
 - [Replit-code-v1.5](./candle-examples/examples/replit-code/): a 3.3b LLM specialized for code completion.
- [Yi-6B / Yi-34B](./candle-examples/examples/yi/): two bilingual
-  (English/Chinese) general LLMs with 6b and 34b parameters.
 - [Quantized LLaMA](./candle-examples/examples/quantized/): quantized version of
  the LLaMA model using the same quantization techniques as
  [llama.cpp](https://github.com/ggerganov/llama.cpp).
@ -98,15 +95,12 @@ We also provide a some command line based examples using state of the art models
 <img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/segment-anything/assets/sam_merged.jpg" width="200">

 - [Whisper](./candle-examples/examples/whisper/): speech recognition model.
- [T5](./candle-examples/examples/t5), [Bert](./candle-examples/examples/bert/),
-  [JinaBert](./candle-examples/examples/jina-bert/) : useful for sentence embeddings.
+- [T5](./candle-examples/examples/t5), [Bert](./candle-examples/examples/bert/): useful for sentence embeddings.
 - [DINOv2](./candle-examples/examples/dinov2/): computer vision model trained
  using self-supervision (can be used for imagenet classification, depth
  evaluation, segmentation).
 - [BLIP](./candle-examples/examples/blip/): image to text model, can be used to
  generate captions for an image.
- [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation
-  model, generates the translated text from the input text.

 Run them using commands like:
 ```
@ -139,17 +133,10 @@ And then head over to
 <!--- ANCHOR: useful_libraries --->

 ## Useful External Resources
- [`candle-tutorial`](https://github.com/ToluClassics/candle-tutorial): A
+- [`candle-tutorial`](https://github.com/ToluClassics/candle-tutorial): a
  very detailed tutorial showing how to convert a PyTorch model to Candle.
- [`candle-lora`](https://github.com/EricLBuehler/candle-lora): Efficient and ergonomic LoRA implemenation for Candle. `candle-lora` has      
-  out-of-the-box LoRA support for many models from Candle, which can be found [here](https://github.com/EricLBuehler/candle-lora/tree/master/candle-lora-transformers/examples).
- [`optimisers`](https://github.com/KGrewal1/optimisers): A collection of optimisers
-  including SGD with momentum, AdaGrad, AdaDelta, AdaMax, NAdam, RAdam, and RMSprop.
- [`candle-vllm`](https://github.com/EricLBuehler/candle-vllm): Efficient platform for inference and
-  serving local LLMs including an OpenAI compatible API server.
- [`candle-ext`](https://github.com/mokeyish/candle-ext): An extension library to Candle that provides PyTorch functions not currently available in Candle.
- [`kalosm`](https://github.com/floneum/floneum/tree/master/interfaces/kalosm): A multi-modal meta-framework in Rust for interfacing with local pre-trained models with support for controlled generation, custom samplers, in-memory vector databases, audio transcription, and more.
- [`candle-sampling`](https://github.com/EricLBuehler/candle-sampling): Sampling techniques for Candle.
+- [`candle-lora`](https://github.com/EricLBuehler/candle-lora): a LoRA implementation
+  that conforms to the official `peft` implementation.

 If you have an addition to this list, please submit a pull request.

@ -175,16 +162,8 @@ If you have an addition to this list, please submit a pull request.
        - Mistral 7b v0.1.
        - StableLM-3B-4E1T.
        - Replit-code-v1.5-3B.
+        - T5.
        - Bert.
-        - Yi-6B and Yi-34B.
-    - Quantized LLMs.
-        - Llama 7b, 13b, 70b, as well as the chat and code variants.
-        - Mistral 7b, and 7b instruct.
-        - Zephyr 7b a and b (Mistral based).
-        - OpenChat 3.5 (Mistral based).
-    - Text to text.
-        - T5 and its variants: FlanT5, UL2, MADLAD400 (translation), CoEdit (Grammar correction).
-        - Marian MT (Machine Translation).
    - Whisper (multi-lingual support).
    - Text to image.
        - Stable Diffusion v1.5, v2.1, XL v1.0.
@ -231,7 +210,6 @@ Cheatsheet:
 - [candle-datasets](./candle-datasets/): Datasets and data loaders.
 - [candle-transformers](./candle-transformers): transformers-related utilities.
 - [candle-flash-attn](./candle-flash-attn): Flash attention v2 layer.
- [candle-onnx](./candle-onnx/): ONNX model evaluation.

 ## FAQ

--- a/candle-book/Cargo.toml
+++ b/candle-book/Cargo.toml
@ -11,11 +11,11 @@ readme = "README.md"

 [dependencies]
 accelerate-src = { workspace = true, optional = true }
-candle = { path = "../candle-core", version = "0.3.1", package = "candle-core" }
-candle-datasets = { path = "../candle-datasets", version = "0.3.1" }
-candle-nn = { path = "../candle-nn", version = "0.3.1" }
-candle-transformers = { path = "../candle-transformers", version = "0.3.1" }
-candle-flash-attn = { path = "../candle-flash-attn", version = "0.3.1", optional = true }
+candle = { path = "../candle-core", version = "0.3.0", package = "candle-core" }
+candle-datasets = { path = "../candle-datasets", version = "0.3.0" }
+candle-nn = { path = "../candle-nn", version = "0.3.0" }
+candle-transformers = { path = "../candle-transformers", version = "0.3.0" }
+candle-flash-attn = { path = "../candle-flash-attn", version = "0.3.0", optional = true }
 safetensors = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@ -12,9 +12,7 @@ readme = "README.md"
 [dependencies]
 accelerate-src = { workspace = true, optional = true }
 byteorder = { workspace = true }
-candle-kernels = { path = "../candle-kernels", version = "0.3.1", optional = true }
-candle-metal-kernels = { path = "../candle-metal-kernels", version = "0.3.1", optional = true }
-metal = { workspace = true, optional = true}
+candle-kernels = { path = "../candle-kernels", version = "0.3.0", optional = true }
 cudarc = { workspace = true, optional = true }
 gemm = { workspace = true }
 half = { workspace = true }
@ -41,4 +39,3 @@ cuda = ["cudarc", "dep:candle-kernels"]
 cudnn = ["cuda", "cudarc/cudnn"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
 accelerate = ["dep:libc", "dep:accelerate-src"]
-metal = ["dep:metal", "dep:candle-metal-kernels"]
--- a/candle-core/examples/basics.rs
+++ b/candle-core/examples/basics.rs
@ -8,10 +8,11 @@ use anyhow::Result;
 use candle_core::{Device, Tensor};

 fn main() -> Result<()> {
-    let a = Tensor::new(&[[0.0f32, 1.0, 2.0], [3.0, 4.0, 5.0]], &Device::Cpu)?;
-    let b = Tensor::new(&[[88.0f32, 99.0]], &Device::Cpu)?;
-    let new_a = a.slice_scatter(&b, 1, 2)?;
-    assert_eq!(a.to_vec2::<f32>()?, [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
-    assert_eq!(new_a.to_vec2::<f32>()?, [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
+    let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?;
+    let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?;
+    let start = std::time::Instant::now();
+    let res = inp.conv2d(&w, 0, 1, 1, 1)?;
+    println!("{:?}", start.elapsed());
+    println!("{res:?}");
    Ok(())
 }
--- a/candle-core/src/backend.rs
+++ b/candle-core/src/backend.rs
@ -39,14 +39,6 @@ pub trait BackendStorage: Sized {
        _params: &crate::conv::ParamsConv1D,
    ) -> Result<Self>;

-    fn conv_transpose1d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self>;
-
    fn conv2d(
        &self,
        _l: &Layout,
--- a/candle-core/src/backprop.rs
+++ b/candle-core/src/backprop.rs
@ -15,17 +15,6 @@ fn broadcast_back(arg: &Tensor, node: &Tensor, reduced_dims: &[usize]) -> Result
    }
 }

-thread_local! {
-    static CANDLE_GRAD_DO_NOT_DETACH: bool = {
-        match std::env::var("CANDLE_GRAD_DO_NOT_DETACH") {
-            Ok(s) => {
-                !s.is_empty() && s != "0"
-            },
-            Err(_) => false,
-        }
-    }
-}
-
 impl Tensor {
    /// Return all the nodes that lead to this value in a topologically sorted vec, the first
    /// elements having dependencies on the latter ones, e.g. the first element if any is the
@ -68,11 +57,6 @@ impl Tensor {
                        kernel: rhs,
                        ..
                    }
-                    | Op::ConvTranspose1D {
-                        arg: lhs,
-                        kernel: rhs,
-                        ..
-                    }
                    | Op::Conv2D {
                        arg: lhs,
                        kernel: rhs,
@ -166,16 +150,10 @@ impl Tensor {
            if node.is_variable() {
                continue;
            }
-            let grad = grads
-                .remove(node)
-                .expect("candle internal error - grad not populated");
-            // https://github.com/huggingface/candle/issues/1241
-            // Ideally, we would make these operations in place where possible to ensure that we
-            // do not have to allocate too often. Here we just call `.detach` to avoid computing
-            // the backprop graph of the backprop itself. This would be an issue for second order
-            // derivatives but these are out of scope at the moment.
-            let do_not_detach = CANDLE_GRAD_DO_NOT_DETACH.with(|b| *b);
-            let grad = if do_not_detach { grad } else { grad.detach()? };
+            let grad = grads.remove(node).unwrap();
+            // TODO: We should perform all these operations in place (or at least not track the
+            // whole graph). The only drawback would be if we wanted to support grad of grad but
+            // this is out of scope.
            if let Some(op) = node.op() {
                match op {
                    Op::Binary(lhs, rhs, BinaryOp::Add) => {
@ -230,44 +208,7 @@ impl Tensor {
                        let f_grad = pred.where_cond(&zeros, &grad)?;
                        *f_sum_grad = f_sum_grad.add(&f_grad)?;
                    }
-                    Op::Conv1D {
-                        arg,
-                        kernel,
-                        padding,
-                        stride,
-                        dilation,
-                    } => {
-                        // The output height for conv_transpose1d is:
-                        // (l_in - 1) * stride - 2 * padding + dilation * (k_size - 1) + out_padding + 1
-                        let grad_l_in = grad.dim(2)?;
-                        let k_size = kernel.dim(2)?;
-                        let out_size =
-                            (grad_l_in - 1) * stride + dilation * (k_size - 1) + 1 - 2 * padding;
-                        let out_padding = arg.dim(2)? - out_size;
-                        let grad_arg = grad.conv_transpose1d(
-                            kernel,
-                            *padding,
-                            out_padding,
-                            *stride,
-                            *dilation,
-                        )?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad_arg)?;
-
-                        let grad_kernel = arg
-                            .transpose(0, 1)?
-                            .conv1d(&grad.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
-                            .transpose(0, 1)?;
-                        let sum_grad = grads.or_insert(kernel)?;
-                        let (_, _, k0) = kernel.dims3()?;
-                        let (_, _, g_k0) = grad_kernel.dims3()?;
-                        let grad_kernel = if g_k0 != k0 {
-                            grad_kernel.narrow(2, 0, k0)?
-                        } else {
-                            grad_kernel
-                        };
-                        *sum_grad = sum_grad.add(&grad_kernel)?;
-                    }
+                    Op::Conv1D { .. } => Err(Error::BackwardNotSupported { op: "conv1d" })?,
                    Op::Conv2D {
                        arg,
                        kernel,
@ -297,18 +238,8 @@ impl Tensor {
                            .conv2d(&grad.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
                            .transpose(0, 1)?;
                        let sum_grad = grads.or_insert(kernel)?;
-                        let (_, _, k0, k1) = kernel.dims4()?;
-                        let (_, _, g_k0, g_k1) = grad_kernel.dims4()?;
-                        let grad_kernel = if g_k0 != k0 || g_k1 != k1 {
-                            grad_kernel.narrow(2, 0, k0)?.narrow(3, 0, k1)?
-                        } else {
-                            grad_kernel
-                        };
                        *sum_grad = sum_grad.add(&grad_kernel)?;
                    }
-                    Op::ConvTranspose1D { .. } => Err(Error::BackwardNotSupported {
-                        op: "conv-transpose1d",
-                    })?,
                    Op::ConvTranspose2D { .. } => Err(Error::BackwardNotSupported {
                        op: "conv-transpose2d",
                    })?,
@ -549,38 +480,16 @@ impl Tensor {
                            + 0.5)?;
                        *sum_grad = sum_grad.add(&(&grad * gelu_grad)?)?
                    }
-                    Op::Unary(arg, UnaryOp::Erf) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        // d/dx erf(x) = 2/sqrt(pi) * e^(-x^2)
-                        let erf_grad =
-                            (2. / std::f64::consts::PI.sqrt()) * (arg.sqr()?.neg()?).exp()?;
-                        *sum_grad = sum_grad.add(&(&grad * erf_grad)?)?
-                    }
-                    Op::Unary(arg, UnaryOp::GeluErf) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        // d/dx gelu_erf(x) = 0.5 + 0.398942 e^(-x^2/2) x + 0.5 erf(x/sqrt(2))
-                        let neg_half_square = (arg.sqr()?.neg()? / 2.)?;
-                        let scaled_exp_arg = (0.398942 * neg_half_square.exp()? * arg)?;
-                        let arg_scaled_sqrt = (arg / 2f64.sqrt())?;
-                        let erf_scaled_sqrt = (0.5 * arg_scaled_sqrt.erf()?)?;
-                        let gelu_erf_grad = (0.5 + scaled_exp_arg + erf_scaled_sqrt)?;
-                        *sum_grad = sum_grad.add(&(&grad * gelu_erf_grad)?)?;
+                    Op::Unary(_, UnaryOp::Erf) => Err(Error::BackwardNotSupported { op: "erf" })?,
+                    Op::Unary(_, UnaryOp::GeluErf) => {
+                        Err(Error::BackwardNotSupported { op: "gelu-erf" })?
                    }
                    Op::Unary(arg, UnaryOp::Relu) => {
                        let sum_grad = grads.or_insert(arg)?;
                        let relu_grad = arg.ge(&arg.zeros_like()?)?.to_dtype(arg.dtype())?;
                        *sum_grad = sum_grad.add(&(&grad * relu_grad)?)?
                    }
-                    Op::Elu(arg, alpha) => {
-                        // d/dx elu(x) = 1 for x > 0, alpha * e^x for x <= 0
-                        let sum_grad = grads.or_insert(arg)?;
-                        let zeros = arg.zeros_like()?;
-                        let positive_mask = arg.gt(&zeros)?.to_dtype(arg.dtype())?;
-                        let negative_mask = arg.le(&zeros)?.to_dtype(arg.dtype())?;
-                        let negative_exp_mask = ((negative_mask * arg.exp())? * *alpha)?;
-                        let combined_mask = (positive_mask + negative_exp_mask)?;
-                        *sum_grad = sum_grad.add(&(grad * combined_mask)?)?
-                    }
+                    Op::Elu(..) => Err(Error::BackwardNotSupported { op: "elu" })?,
                    Op::Powf(arg, e) => {
                        let arg_grad = (&(grad * arg.powf(e - 1.)?)? * *e)?;
                        let sum_grad = grads.or_insert(arg)?;
--- a/candle-core/src/conv.rs
+++ b/candle-core/src/conv.rs
@ -25,33 +25,6 @@ impl ParamsConv1D {
    }
 }

-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct ParamsConvTranspose1D {
-    pub(crate) b_size: usize,
-    pub(crate) l_in: usize,
-    pub(crate) c_out: usize,
-    pub(crate) c_in: usize,
-    pub(crate) k_size: usize,
-    pub(crate) padding: usize,
-    pub(crate) output_padding: usize,
-    pub(crate) stride: usize,
-    pub(crate) dilation: usize,
-}
-
-impl ParamsConvTranspose1D {
-    pub(crate) fn l_out(&self) -> usize {
-        (self.l_in - 1) * self.stride - 2 * self.padding
-            + self.dilation * (self.k_size - 1)
-            + self.output_padding
-            + 1
-    }
-
-    pub(crate) fn out_dims(&self) -> Vec<usize> {
-        let l_out = self.l_out();
-        vec![self.b_size, self.c_out, l_out]
-    }
-}
-
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum CudnnFwdAlgo {
    ImplicitGemm,
@ -187,49 +160,6 @@ impl Tensor {
        }
    }

-    /// Applies a 1D transposed convolution over the input tensor.
-    pub fn conv_transpose1d(
-        &self,
-        kernel: &Self,
-        padding: usize,
-        output_padding: usize,
-        stride: usize,
-        dilation: usize,
-    ) -> Result<Self> {
-        let (b_size, c_in, l_in) = self.dims3()?;
-        let (c_in_k, c_out, k_size) = kernel.dims3()?;
-        if c_in != c_in_k {
-            crate::bail!("in_channel mismatch between input ({c_in}) and kernel ({c_in_k})")
-        }
-        let params = ParamsConvTranspose1D {
-            b_size,
-            l_in,
-            k_size,
-            c_out,
-            c_in,
-            padding,
-            output_padding,
-            stride,
-            dilation,
-        };
-        let storage = self.storage().conv_transpose1d(
-            self.layout(),
-            &kernel.storage(),
-            kernel.layout(),
-            &params,
-        )?;
-        let op = BackpropOp::new2(self, kernel, |arg, kernel| Op::ConvTranspose1D {
-            arg,
-            kernel,
-            padding: params.padding,
-            output_padding: params.output_padding,
-            stride: params.stride,
-            dilation: params.dilation,
-        });
-        let out_dims = params.out_dims();
-        Ok(crate::tensor::from_storage(storage, out_dims, op, false))
-    }
-
    fn conv2d_single_group(&self, kernel: &Self, params: &ParamsConv2D) -> Result<Self> {
        let storage =
            self.storage()
--- a/candle-core/src/cpu_backend.rs
+++ b/candle-core/src/cpu_backend.rs
@ -804,11 +804,11 @@ impl<'a, I: IntDType> Map1 for Gather<'a, I> {
    fn f<T: WithDType>(&self, src: &[T], src_l: &Layout) -> Result<Vec<T>> {
        let ids = match self.ids_l.contiguous_offsets() {
            Some((a, b)) => &self.ids[a..b],
-            None => Err(Error::RequiresContiguous { op: "gather" }.bt())?,
+            None => Err(Error::RequiresContiguous { op: "gather" })?,
        };
        let src = match src_l.contiguous_offsets() {
            Some((a, b)) => &src[a..b],
-            None => Err(Error::RequiresContiguous { op: "gather" }.bt())?,
+            None => Err(Error::RequiresContiguous { op: "gather" })?,
        };
        let dim = self.dim;
        let ids_dims = self.ids_l.dims();
@ -857,7 +857,7 @@ impl<'a, I: IntDType> Map1 for IndexSelect<'a, I> {
    fn f<T: WithDType>(&self, src: &[T], layout: &Layout) -> Result<Vec<T>> {
        let src = match layout.contiguous_offsets() {
            Some((a, b)) => &src[a..b],
-            None => Err(Error::RequiresContiguous { op: "index-select" }.bt())?,
+            None => Err(Error::RequiresContiguous { op: "index-select" })?,
        };
        let dim = self.dim;
        let n_ids = match self.ids_l.dims() {
@ -913,7 +913,7 @@ impl<'a, I: IntDType> Map2 for ScatterAdd<'a, I> {
        let mut dst = vec![T::zero(); dst_len];
        copy_strided_src_(v1, &mut dst, 0, l1);
        let src = match src_l.contiguous_offsets() {
-            None => Err(Error::RequiresContiguous { op: "scatter-add" }.bt())?,
+            None => Err(Error::RequiresContiguous { op: "scatter-add" })?,
            Some((o1, o2)) => &src[o1..o2],
        };

@ -929,7 +929,7 @@ impl<'a, I: IntDType> Map2 for ScatterAdd<'a, I> {

        let ids = match self.ids_l.contiguous_offsets() {
            Some((a, b)) => &self.ids[a..b],
-            None => Err(Error::RequiresContiguous { op: "gather" }.bt())?,
+            None => Err(Error::RequiresContiguous { op: "gather" })?,
        };
        for left_i in 0..ids_left_len {
            let start_ids_idx = left_i * ids_right_len * ids_dim_len;
@ -971,7 +971,7 @@ impl<'a, I: IntDType> Map2 for IndexAdd<'a, I> {
        let mut dst = vec![T::zero(); dst_len];
        copy_strided_src_(v1, &mut dst, 0, l1);
        let src = match src_l.contiguous_offsets() {
-            None => Err(Error::RequiresContiguous { op: "index-add" }.bt())?,
+            None => Err(Error::RequiresContiguous { op: "index-add" })?,
            Some((o1, o2)) => &src[o1..o2],
        };
        let dim = self.dim;
@ -1256,74 +1256,6 @@ impl Map1 for Im2Col {
    }
 }

-struct ConvTranspose1D<'a>(&'a crate::conv::ParamsConvTranspose1D);
-
-impl<'a> Map2 for ConvTranspose1D<'a> {
-    const OP: &'static str = "conv_transpose1d";
-    fn f<T: WithDType>(&self, inp: &[T], inp_l: &Layout, k: &[T], k_l: &Layout) -> Result<Vec<T>> {
-        let p = self.0;
-        let inp = &inp[inp_l.start_offset()..];
-        let (inp_s0, inp_s1, inp_s2) = crate::shape::dims3(inp_l.stride())?;
-        let (k_s0, k_s1, k_s2) = crate::shape::dims3(k_l.stride())?;
-        let l_out = p.l_out();
-
-        // Output shape: [b_size, c_out, l_out].
-        let dst_elems = p.c_out * l_out * p.b_size;
-        let dst = vec![T::zero(); dst_elems];
-        let dst_s0 = p.c_out * l_out;
-        let dst_s1 = l_out;
-        let dst_s2 = 1;
-
-        // TODO: Avoid making this copy if `inp` already has the appropriate layout.
-        let mut inp_cont = vec![T::zero(); p.b_size * p.c_in * p.l_in];
-        let cont_s0 = p.l_in * p.c_in;
-        let cont_s1 = p.c_in;
-        for b_idx in 0..p.b_size {
-            for l_idx in 0..p.l_in {
-                for c_idx in 0..p.c_in {
-                    let src_idx = b_idx * inp_s0 + c_idx * inp_s1 + l_idx * inp_s2;
-                    let dst_idx = b_idx * cont_s0 + l_idx * cont_s1 + c_idx;
-                    inp_cont[dst_idx] = inp[src_idx]
-                }
-            }
-        }
-
-        for k_idx in 0..p.k_size {
-            (0..p.c_out).into_par_iter().for_each(|dst_c_idx| {
-                let k_cont = (0..p.c_in)
-                    .map(|c_in_idx| k[c_in_idx * k_s0 + dst_c_idx * k_s1 + k_idx * k_s2])
-                    .collect::<Vec<_>>();
-                for b_idx in 0..p.b_size {
-                    for l_idx in 0..p.l_in {
-                        let out_idx = l_idx * p.stride + k_idx * p.dilation;
-                        if out_idx < p.padding {
-                            continue;
-                        }
-                        let out_idx = out_idx - p.padding;
-                        if out_idx < l_out {
-                            let inp_cont = &inp_cont[b_idx * cont_s0 + l_idx * cont_s1..];
-                            let dst_idx = b_idx * dst_s0 + out_idx * dst_s2 + dst_c_idx * dst_s1;
-                            let mut d = T::zero();
-                            unsafe {
-                                T::vec_dot(inp_cont.as_ptr(), k_cont.as_ptr(), &mut d, p.c_in)
-                            }
-                            let dst_p = dst.as_ptr();
-                            // Safety: dst_idx are uniques per dst_c_idx which is used to
-                            // parallelise the different tasks so no two threads can try to
-                            // write at the same location.
-                            unsafe {
-                                let ptr = dst_p.add(dst_idx) as *mut T;
-                                *ptr += d
-                            }
-                        }
-                    }
-                }
-            })
-        }
-        Ok(dst)
-    }
-}
-
 struct Conv2D<'a>(&'a crate::conv::ParamsConv2D);

 impl<'a> Map2 for Conv2D<'a> {
@ -2503,16 +2435,6 @@ impl BackendStorage for CpuStorage {
        Ok(res_t)
    }

-    fn conv_transpose1d(
-        &self,
-        l: &Layout,
-        kernel: &Self,
-        kernel_l: &Layout,
-        params: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self> {
-        ConvTranspose1D(params).map(self, l, kernel, kernel_l)
-    }
-
    fn conv2d(
        &self,
        l: &Layout,
@ -2617,25 +2539,25 @@ impl BackendStorage for CpuStorage {
            Self::U8(ids) => {
                let ids = match ids_l.contiguous_offsets() {
                    Some((a, b)) => &ids[a..b],
-                    None => Err(Error::RequiresContiguous { op: "index-add" }.bt())?,
+                    None => Err(Error::RequiresContiguous { op: "index-add" })?,
                };
                IndexAdd { ids, dim }.map(self, l, src, src_l)
            }
            Self::U32(ids) => {
                let ids = match ids_l.contiguous_offsets() {
                    Some((a, b)) => &ids[a..b],
-                    None => Err(Error::RequiresContiguous { op: "index-add" }.bt())?,
+                    None => Err(Error::RequiresContiguous { op: "index-add" })?,
                };
                IndexAdd { ids, dim }.map(self, l, src, src_l)
            }
            Self::I64(ids) => {
                let ids = match ids_l.contiguous_offsets() {
                    Some((a, b)) => &ids[a..b],
-                    None => Err(Error::RequiresContiguous { op: "index-add" }.bt())?,
+                    None => Err(Error::RequiresContiguous { op: "index-add" })?,
                };
                IndexAdd { ids, dim }.map(self, l, src, src_l)
            }
-            _ => Err(Error::UnsupportedDTypeForOp(self.dtype(), "index-add").bt()),
+            _ => Err(Error::UnsupportedDTypeForOp(self.dtype(), "index-add")),
        }
    }

--- a/candle-core/src/cuda_backend.rs
+++ b/candle-core/src/cuda_backend.rs
@ -1808,16 +1808,6 @@ impl BackendStorage for CudaStorage {
        Ok(res_t)
    }

-    fn conv_transpose1d(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self> {
-        todo!()
-    }
-
    #[cfg(not(feature = "cudnn"))]
    fn conv2d(
        &self,
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@ -8,14 +8,12 @@ use crate::{CpuStorage, DType, Result, Shape, Storage, WithDType};
 pub enum DeviceLocation {
    Cpu,
    Cuda { gpu_id: usize },
-    Metal { gpu_id: usize },
 }

 #[derive(Debug, Clone)]
 pub enum Device {
    Cpu,
    Cuda(crate::CudaDevice),
-    Metal(crate::MetalDevice),
 }

 pub trait NdArray {
@ -130,15 +128,10 @@ impl Device {
        Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?))
    }

-    pub fn new_metal(ordinal: usize) -> Result<Self> {
-        Ok(Self::Metal(crate::MetalDevice::new(ordinal)?))
-    }
-
    pub fn set_seed(&self, seed: u64) -> Result<()> {
        match self {
-            Self::Cpu => CpuDevice.set_seed(seed),
+            Self::Cpu => crate::cpu_backend::CpuDevice.set_seed(seed),
            Self::Cuda(c) => c.set_seed(seed),
-            Self::Metal(m) => m.set_seed(seed),
        }
    }

@ -146,7 +139,6 @@ impl Device {
        match (self, rhs) {
            (Self::Cpu, Self::Cpu) => true,
            (Self::Cuda(lhs), Self::Cuda(rhs)) => lhs.same_device(rhs),
-            (Self::Metal(lhs), Self::Metal(rhs)) => lhs.same_device(rhs),
            _ => false,
        }
    }
@ -155,20 +147,21 @@ impl Device {
        match self {
            Self::Cpu => DeviceLocation::Cpu,
            Self::Cuda(device) => device.location(),
-            Device::Metal(device) => device.location(),
        }
    }

    pub fn is_cpu(&self) -> bool {
-        matches!(self, Self::Cpu)
+        match self {
+            Self::Cpu => true,
+            Self::Cuda(_) => false,
+        }
    }

    pub fn is_cuda(&self) -> bool {
-        matches!(self, Self::Cuda(_))
-    }
-
-    pub fn is_metal(&self) -> bool {
-        matches!(self, Self::Metal(_))
+        match self {
+            Self::Cpu => false,
+            Self::Cuda(_) => true,
+        }
    }

    pub fn cuda_if_available(ordinal: usize) -> Result<Self> {
@ -192,18 +185,8 @@ impl Device {
                Ok(Storage::Cpu(storage))
            }
            Device::Cuda(device) => {
-                // TODO: Remove the special case if we start supporting generating f16/bf16 directly.
-                if dtype == DType::F16 || dtype == DType::BF16 {
-                    let storage = device.rand_uniform(shape, DType::F32, lo, up)?;
-                    Storage::Cuda(storage).to_dtype(&crate::Layout::contiguous(shape), dtype)
-                } else {
-                    let storage = device.rand_uniform(shape, dtype, lo, up)?;
-                    Ok(Storage::Cuda(storage))
-                }
-            }
-            Device::Metal(device) => {
                let storage = device.rand_uniform(shape, dtype, lo, up)?;
-                Ok(Storage::Metal(storage))
+                Ok(Storage::Cuda(storage))
            }
        }
    }
@ -230,18 +213,8 @@ impl Device {
                Ok(Storage::Cpu(storage))
            }
            Device::Cuda(device) => {
-                // TODO: Remove the special case if we start supporting generating f16/bf16 directly.
-                if dtype == DType::F16 || dtype == DType::BF16 {
-                    let storage = device.rand_normal(shape, DType::F32, mean, std)?;
-                    Storage::Cuda(storage).to_dtype(&crate::Layout::contiguous(shape), dtype)
-                } else {
-                    let storage = device.rand_normal(shape, dtype, mean, std)?;
-                    Ok(Storage::Cuda(storage))
-                }
-            }
-            Device::Metal(device) => {
                let storage = device.rand_normal(shape, dtype, mean, std)?;
-                Ok(Storage::Metal(storage))
+                Ok(Storage::Cuda(storage))
            }
        }
    }
@ -265,10 +238,6 @@ impl Device {
                let storage = device.ones_impl(shape, dtype)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = device.ones_impl(shape, dtype)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }

@ -282,10 +251,6 @@ impl Device {
                let storage = device.zeros_impl(shape, dtype)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = device.zeros_impl(shape, dtype)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }

@ -297,11 +262,6 @@ impl Device {
                let storage = device.storage_from_cpu_storage(&storage)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = array.to_cpu_storage();
-                let storage = device.storage_from_cpu_storage(&storage)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }

@ -313,11 +273,6 @@ impl Device {
                let storage = device.storage_from_cpu_storage(&storage)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = S::to_cpu_storage_owned(data);
-                let storage = device.storage_from_cpu_storage(&storage)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }
 }
--- a/candle-core/src/display.rs
+++ b/candle-core/src/display.rs
@ -14,9 +14,6 @@ impl Tensor {
            crate::DeviceLocation::Cuda { gpu_id } => {
                format!(", cuda:{}", gpu_id)
            }
-            crate::DeviceLocation::Metal { gpu_id } => {
-                format!(", metal:{}", gpu_id)
-            }
        };

        write!(f, "Tensor[")?;
@ -479,9 +476,6 @@ impl std::fmt::Display for Tensor {
            crate::DeviceLocation::Cuda { gpu_id } => {
                format!(", cuda:{}", gpu_id)
            }
-            crate::DeviceLocation::Metal { gpu_id } => {
-                format!(", metal:{}", gpu_id)
-            }
        };

        write!(
--- a/candle-core/src/dummy_cuda_backend.rs
+++ b/candle-core/src/dummy_cuda_backend.rs
@ -79,16 +79,6 @@ impl crate::backend::BackendStorage for CudaStorage {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn conv_transpose1d(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn conv2d(
        &self,
        _: &Layout,
--- a/candle-core/src/dummy_metal_backend.rs
+++ b/candle-core/src/dummy_metal_backend.rs
@ -1,223 +0,0 @@
-#![allow(dead_code)]
-use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
-use crate::{CpuStorage, DType, Error, Layout, Result, Shape};
-
-#[derive(Debug, Clone)]
-pub struct MetalDevice;
-
-#[derive(Debug)]
-pub struct MetalStorage;
-
-#[derive(thiserror::Error, Debug)]
-pub enum MetalError {
-    #[error("{0}")]
-    Message(String),
-}
-
-impl From<String> for MetalError {
-    fn from(e: String) -> Self {
-        MetalError::Message(e)
-    }
-}
-
-macro_rules! fail {
-    () => {
-        unimplemented!("metal support has not been enabled, add `metal` feature to enable.")
-    };
-}
-
-impl crate::backend::BackendStorage for MetalStorage {
-    type Device = MetalDevice;
-
-    fn try_clone(&self, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn dtype(&self) -> DType {
-        fail!()
-    }
-
-    fn device(&self) -> &Self::Device {
-        fail!()
-    }
-
-    fn to_cpu_storage(&self) -> Result<CpuStorage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn powf(&self, _: &Layout, _: f64) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn elu(&self, _: &Layout, _: f64) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn to_dtype(&self, _: &Layout, _: DType) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn where_cond(&self, _: &Layout, _: &Self, _: &Layout, _: &Self, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn conv1d(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &crate::conv::ParamsConv1D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn conv_transpose1d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn conv2d(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &crate::conv::ParamsConv2D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn conv_transpose2d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &crate::conv::ParamsConvTranspose2D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-    fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn scatter_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn index_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn matmul(
-        &self,
-        _: &Self,
-        _: (usize, usize, usize, usize),
-        _: &Layout,
-        _: &Layout,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-}
-
-impl crate::backend::BackendDevice for MetalDevice {
-    type Storage = MetalStorage;
-    fn new(_: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn set_seed(&self, _: u64) -> Result<()> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn location(&self) -> crate::DeviceLocation {
-        fail!()
-    }
-
-    fn same_device(&self, _: &Self) -> bool {
-        fail!()
-    }
-
-    fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-}
--- a/candle-core/src/error.rs
+++ b/candle-core/src/error.rs
@ -1,4 +1,4 @@
-use crate::{DType, DeviceLocation, Layout, MetalError, Shape};
+use crate::{DType, DeviceLocation, Layout, Shape};

 #[derive(Debug, Clone)]
 pub struct MatMulUnexpectedStriding {
@ -152,9 +152,6 @@ pub enum Error {
    #[error("the candle crate has not been built with cuda support")]
    NotCompiledWithCudaSupport,

-    #[error("the candle crate has not been built with metal support")]
-    NotCompiledWithMetalSupport,
-
    #[error("cannot find tensor {path}")]
    CannotFindTensor { path: String },

@ -162,9 +159,6 @@ pub enum Error {
    #[error(transparent)]
    Cuda(Box<dyn std::error::Error + Send + Sync>),

-    #[error("Metal error {0}")]
-    Metal(#[from] MetalError),
-
    #[error(transparent)]
    TryFromIntError(#[from] core::num::TryFromIntError),

--- a/candle-core/src/indexer.rs
+++ b/candle-core/src/indexer.rs
@ -104,31 +104,37 @@ impl From<&Tensor> for TensorIndexer {
    }
 }

-trait RB: RangeBounds<usize> {}
-impl RB for Range<usize> {}
-impl RB for RangeFrom<usize> {}
-impl RB for RangeFull {}
-impl RB for RangeInclusive<usize> {}
-impl RB for RangeTo<usize> {}
-impl RB for RangeToInclusive<usize> {}
+macro_rules! impl_from_range {
+    ($range_type:ty) => {
+        impl From<$range_type> for TensorIndexer {
+            fn from(range: $range_type) -> Self {
+                use std::ops::Bound::*;

-impl<T: RB> From<T> for TensorIndexer {
-    fn from(range: T) -> Self {
-        use std::ops::Bound::*;
-        let start = match range.start_bound() {
-            Included(idx) => Included(*idx),
-            Excluded(idx) => Excluded(*idx),
-            Unbounded => Unbounded,
-        };
-        let end = match range.end_bound() {
-            Included(idx) => Included(*idx),
-            Excluded(idx) => Excluded(*idx),
-            Unbounded => Unbounded,
-        };
-        TensorIndexer::Narrow(start, end)
-    }
+                let start = match range.start_bound() {
+                    Included(idx) => Included(*idx),
+                    Excluded(idx) => Excluded(*idx),
+                    Unbounded => Unbounded,
+                };
+
+                let end = match range.end_bound() {
+                    Included(idx) => Included(*idx),
+                    Excluded(idx) => Excluded(*idx),
+                    Unbounded => Unbounded,
+                };
+
+                TensorIndexer::Narrow(start, end)
+            }
+        }
+    };
 }

+impl_from_range!(Range<usize>);
+impl_from_range!(RangeFrom<usize>);
+impl_from_range!(RangeFull);
+impl_from_range!(RangeInclusive<usize>);
+impl_from_range!(RangeTo<usize>);
+impl_from_range!(RangeToInclusive<usize>);
+
 /// Trait used to implement multiple signatures for ease of use of the slicing
 /// of a tensor
 pub trait IndexOp<T> {
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@ -49,12 +49,9 @@ mod device;
 pub mod display;
 mod dtype;
 mod dummy_cuda_backend;
-mod dummy_metal_backend;
 pub mod error;
 mod indexer;
 pub mod layout;
-#[cfg(feature = "metal")]
-pub mod metal_backend;
 #[cfg(feature = "mkl")]
 mod mkl;
 pub mod npy;
@ -90,12 +87,6 @@ pub use cuda_backend::{CudaDevice, CudaStorage};
 #[cfg(not(feature = "cuda"))]
 pub use dummy_cuda_backend::{CudaDevice, CudaStorage};

-#[cfg(feature = "metal")]
-pub use metal_backend::{MetalDevice, MetalError, MetalStorage};
-
-#[cfg(not(feature = "metal"))]
-pub use dummy_metal_backend::{MetalDevice, MetalError, MetalStorage};
-
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;

@ -123,20 +114,14 @@ pub trait Module {
    fn forward(&self, xs: &Tensor) -> Result<Tensor>;
 }

+impl Module for quantized::QMatMul {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        self.forward(xs)
+    }
+}
+
 impl<T: Fn(&Tensor) -> Result<Tensor>> Module for T {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        self(xs)
    }
 }
-
-// A trait defining a module with forward method using a single tensor argument and a flag to
-// separate the training and evaluation behaviors.
-pub trait ModuleT {
-    fn forward_t(&self, xs: &Tensor, train: bool) -> Result<Tensor>;
-}
-
-impl<M: Module> ModuleT for M {
-    fn forward_t(&self, xs: &Tensor, _train: bool) -> Result<Tensor> {
-        self.forward(xs)
-    }
-}
--- a/candle-core/src/metal_backend.rs
+++ b/candle-core/src/metal_backend.rs
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@ -1,5 +1,5 @@
 #![allow(clippy::redundant_closure_call)]
-use crate::{CpuStorage, CudaStorage, Layout, MetalStorage, Result, Shape, Tensor};
+use crate::{CpuStorage, CudaStorage, Layout, Result, Shape, Tensor};
 use half::{bf16, f16};
 use num_traits::float::Float;

@ -90,16 +90,6 @@ pub enum Op {
        dilation: usize,
    },

-    #[allow(dead_code)]
-    ConvTranspose1D {
-        arg: Tensor,
-        kernel: Tensor,
-        padding: usize,
-        output_padding: usize,
-        stride: usize,
-        dilation: usize,
-    },
-
    #[allow(dead_code)]
    Conv2D {
        arg: Tensor,
@ -184,18 +174,6 @@ pub trait CustomOp1 {
        ))
    }

-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _storage: &MetalStorage,
-        _layout: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
    /// This function takes as argument the argument `arg` used in the forward pass, the result
    /// produced by the forward operation `res` and the gradient of the result `grad_res`.
    /// The function should return the gradient of the argument.
@ -231,20 +209,6 @@ pub trait CustomOp2 {
        ))
    }

-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
    fn bwd(
        &self,
        _arg1: &Tensor,
@ -287,22 +251,6 @@ pub trait CustomOp3 {
        ))
    }

-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
    fn bwd(
        &self,
        _arg1: &Tensor,
@ -588,13 +536,13 @@ unary_op!(Log, "log", v, v.ln(), vs_ln, vd_ln);
 unary_op!(Sin, "sin", v, v.sin(), vs_sin, vd_sin);
 unary_op!(Cos, "cos", v, v.cos(), vs_cos, vd_cos);
 unary_op!(Tanh, "tanh", v, v.tanh(), vs_tanh, vd_tanh);
+unary_op!(Abs, "abs", v, v.abs());
 unary_op!(Neg, "neg", v, -v);
 unary_op!(Recip, "recip", v, v.recip());
 unary_op!(Sqr, "sqr", v, v * v, vs_sqr, vd_sqr);
 unary_op!(Sqrt, "sqrt", v, v.sqrt(), vs_sqrt, vd_sqrt);

-/// Tanh based approximation of the `gelu` operation
-/// GeluErf is the more precise one.
+/// `gelu` operation
 /// <https://en.wikipedia.org/wiki/Activation_function#Comparison_of_activation_functions>
 impl UnaryOpT for Gelu {
    const NAME: &'static str = "gelu";
@ -684,8 +632,6 @@ impl UnaryOpT for Gelu {
    }
 }

-/// `erf` operation
-/// <https://en.wikipedia.org/wiki/Error_function>
 impl UnaryOpT for Erf {
    const NAME: &'static str = "erf";
    const KERNEL: &'static str = "uerf";
@ -720,40 +666,6 @@ impl UnaryOpT for Erf {
    }
 }

-impl UnaryOpT for Abs {
-    const NAME: &'static str = "abs";
-    const KERNEL: &'static str = "uabs";
-    const V: Self = Abs;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v.abs()
-    }
-}
-
 impl UnaryOpT for Ceil {
    const NAME: &'static str = "ceil";
    const KERNEL: &'static str = "uceil";
@ -975,10 +887,6 @@ impl BackpropOp {
        };
        Self(op)
    }
-
-    pub(crate) fn is_none(&self) -> bool {
-        self.0.is_none()
-    }
 }

 impl std::ops::Deref for BackpropOp {
--- a/candle-core/src/quantized/avx.rs
+++ b/candle-core/src/quantized/avx.rs
@ -50,9 +50,14 @@ pub(crate) unsafe fn mul_sum_i8_pairs_float(x: __m256i, y: __m256i) -> __m256 {
 #[inline(always)]
 pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
    let qk = QK8_0;
+    let nb = n / qk;
    if n % QK8_0 != 0 {
        crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
    }
+    if nb % 2 != 0 {
+        crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
+    }
+
    unsafe {
        let mut acc = _mm256_setzero_ps();
        for (x, y) in xs.iter().zip(ys.iter()) {
--- a/candle-core/src/quantized/gguf_file.rs
+++ b/candle-core/src/quantized/gguf_file.rs
@ -29,7 +29,6 @@ impl TryFrom<u32> for Magic {
 pub enum VersionedMagic {
    GgufV1,
    GgufV2,
-    GgufV3,
 }

 impl VersionedMagic {
@ -40,7 +39,6 @@ impl VersionedMagic {
        let versioned_magic = match (magic, version) {
            (Magic::Gguf, 1) => Self::GgufV1,
            (Magic::Gguf, 2) => Self::GgufV2,
-            (Magic::Gguf, 3) => Self::GgufV3,
            _ => crate::bail!("ggml: unsupported magic/version {magic:?}/{version}"),
        };
        Ok(versioned_magic)
@ -86,9 +84,7 @@ pub struct Content {
 fn read_string<R: std::io::Read>(reader: &mut R, magic: &VersionedMagic) -> Result<String> {
    let len = match magic {
        VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-        VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-            reader.read_u64::<LittleEndian>()? as usize
-        }
+        VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
    };
    let mut v = vec![0u8; len];
    reader.read_exact(&mut v)?;
@ -288,9 +284,7 @@ impl Value {
                let value_type = ValueType::from_u32(value_type)?;
                let len = match magic {
                    VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-                    VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-                        reader.read_u64::<LittleEndian>()? as usize
-                    }
+                    VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
                };
                let mut vs = Vec::with_capacity(len);
                for _ in 0..len {
@ -387,15 +381,11 @@ impl Content {

        let tensor_count = match magic {
            VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-            VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-                reader.read_u64::<LittleEndian>()? as usize
-            }
+            VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
        };
        let metadata_kv_count = match magic {
            VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-            VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-                reader.read_u64::<LittleEndian>()? as usize
-            }
+            VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
        };

        let mut metadata = HashMap::new();
@ -417,7 +407,7 @@ impl Content {
                    reader.read_u32_into::<LittleEndian>(&mut dimensions)?;
                    dimensions.into_iter().map(|c| c as usize).collect()
                }
-                VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
+                VersionedMagic::GgufV2 => {
                    let mut dimensions = vec![0; n_dimensions as usize];
                    reader.read_u64_into::<LittleEndian>(&mut dimensions)?;
                    dimensions.into_iter().map(|c| c as usize).collect()
--- a/candle-core/src/quantized/k_quants.rs
+++ b/candle-core/src/quantized/k_quants.rs
@ -236,9 +236,14 @@ impl GgmlType for BlockQ4_0 {

    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        let qk = QK8_0;
+        let nb = n / qk;
        if n % QK8_0 != 0 {
            crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
        }
+        if nb % 2 != 0 {
+            crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
+        }
+
        // Generic implementation.
        let mut sumf = 0f32;
        for (xs, ys) in xs.iter().zip(ys.iter()) {
--- a/candle-core/src/quantized/mod.rs
+++ b/candle-core/src/quantized/mod.rs
@ -307,8 +307,8 @@ impl crate::CustomOp1 for QTensor {
    }
 }

-impl crate::Module for QMatMul {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+impl QMatMul {
+    pub fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        match self {
            Self::QTensor(t) => xs.apply_op1_no_bwd(t.as_ref()),
            Self::Tensor(w) => {
--- a/candle-core/src/quantized/neon.rs
+++ b/candle-core/src/quantized/neon.rs
@ -19,29 +19,42 @@ pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) ->
    if n % QK8_0 != 0 {
        crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
    }
+    if nb % 2 != 0 {
+        crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
+    }

    unsafe {
        let mut sumv0 = vdupq_n_f32(0.0f32);
-        for i in 0..nb {
+        let mut sumv1 = vdupq_n_f32(0.0f32);
+        for i in (0..nb).step_by(2) {
            let x0 = &xs[i];
+            let x1 = &xs[i + 1];
            let y0 = &ys[i];
+            let y1 = &ys[i + 1];

            let m4b = vdupq_n_u8(0x0F);
            let s8b = vdupq_n_s8(0x8);

            let v0_0 = vld1q_u8(x0.qs.as_ptr());
+            let v0_1 = vld1q_u8(x1.qs.as_ptr());

            // 4-bit -> 8-bit
            let v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
            let v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+            let v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
+            let v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));

            // sub 8
            let v0_0ls = vsubq_s8(v0_0l, s8b);
            let v0_0hs = vsubq_s8(v0_0h, s8b);
+            let v0_1ls = vsubq_s8(v0_1l, s8b);
+            let v0_1hs = vsubq_s8(v0_1h, s8b);

            // load y
            let v1_0l = vld1q_s8(y0.qs.as_ptr());
            let v1_0h = vld1q_s8(y0.qs.as_ptr().add(16));
+            let v1_1l = vld1q_s8(y1.qs.as_ptr());
+            let v1_1h = vld1q_s8(y1.qs.as_ptr().add(16));

            // TODO: Support dotprod when it's available outside of nightly.
            let pl0l = vmull_s8(vget_low_s8(v0_0ls), vget_low_s8(v1_0l));
@ -49,16 +62,28 @@ pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) ->
            let ph0l = vmull_s8(vget_low_s8(v0_0hs), vget_low_s8(v1_0h));
            let ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));

+            let pl1l = vmull_s8(vget_low_s8(v0_1ls), vget_low_s8(v1_1l));
+            let pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
+            let ph1l = vmull_s8(vget_low_s8(v0_1hs), vget_low_s8(v1_1h));
+            let ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
+
            let pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
            let ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
+            let pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
+            let ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));

            sumv0 = vmlaq_n_f32(
                sumv0,
                vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
                x0.d.to_f32() * y0.d.to_f32(),
            );
+            sumv1 = vmlaq_n_f32(
+                sumv1,
+                vcvtq_f32_s32(vaddq_s32(pl1, ph1)),
+                x1.d.to_f32() * y1.d.to_f32(),
+            );
        }
-        Ok(vaddvq_f32(sumv0))
+        Ok(vaddvq_f32(sumv0) + vaddvq_f32(sumv1))
    }
 }

@ -69,18 +94,28 @@ pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) ->
        crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
    }
    let nb = n / QK8_0;
+    if nb % 2 != 0 {
+        crate::bail!("vec_dot_q8_0_q8_0: {nb} is not even")
+    }
    unsafe {
        let mut sumv0 = vdupq_n_f32(0.0f32);
-        for i in 0..nb {
+        let mut sumv1 = vdupq_n_f32(0.0f32);
+        for i in (0..nb).step_by(2) {
            let x0 = &xs[i];
+            let x1 = &xs[i + 1];
            let y0 = &ys[i];
+            let y1 = &ys[i + 1];

            let x0_0 = vld1q_s8(x0.qs.as_ptr());
            let x0_1 = vld1q_s8(x0.qs.as_ptr().add(16));
+            let x1_0 = vld1q_s8(x1.qs.as_ptr());
+            let x1_1 = vld1q_s8(x1.qs.as_ptr().add(16));

            // load y
            let y0_0 = vld1q_s8(y0.qs.as_ptr());
            let y0_1 = vld1q_s8(y0.qs.as_ptr().add(16));
+            let y1_0 = vld1q_s8(y1.qs.as_ptr());
+            let y1_1 = vld1q_s8(y1.qs.as_ptr().add(16));

            // TODO dotprod once this is the intrinsics are.
            let p0_0 = vmull_s8(vget_low_s8(x0_0), vget_low_s8(y0_0));
@ -88,16 +123,28 @@ pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) ->
            let p0_2 = vmull_s8(vget_low_s8(x0_1), vget_low_s8(y0_1));
            let p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));

+            let p1_0 = vmull_s8(vget_low_s8(x1_0), vget_low_s8(y1_0));
+            let p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
+            let p1_2 = vmull_s8(vget_low_s8(x1_1), vget_low_s8(y1_1));
+            let p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
+
            let p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
            let p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
+            let p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
+            let p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));

            sumv0 = vmlaq_n_f32(
                sumv0,
                vcvtq_f32_s32(vaddq_s32(p0, p1)),
                x0.d.to_f32() * y0.d.to_f32(),
            );
+            sumv1 = vmlaq_n_f32(
+                sumv1,
+                vcvtq_f32_s32(vaddq_s32(p2, p3)),
+                x1.d.to_f32() * y1.d.to_f32(),
+            );
        }
-        Ok(vaddvq_f32(sumv0))
+        Ok(vaddvq_f32(sumv0) + vaddvq_f32(sumv1))
    }
 }

--- a/candle-core/src/quantized/simd128.rs
+++ b/candle-core/src/quantized/simd128.rs
@ -11,6 +11,10 @@ pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) ->
    if n % QK8_0 != 0 {
        crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
    }
+    let nb = n / QK8_0;
+    if nb % 2 != 0 {
+        crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
+    }
    unsafe {
        let mut acc = f32x4_splat(0.0f32);
        for (x, y) in xs.iter().zip(ys.iter()) {
@ -57,6 +61,10 @@ pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) ->
    if n % QK8_0 != 0 {
        crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
    }
+    let nb = n / QK8_0;
+    if nb % 2 != 0 {
+        crate::bail!("vec_dot_q8_0_q8_0: {nb} is not even")
+    }
    unsafe {
        let mut acc = f32x4_splat(0.0f32);
        for (x, y) in xs.iter().zip(ys.iter()) {
--- a/candle-core/src/shape.rs
+++ b/candle-core/src/shape.rs
@ -203,7 +203,7 @@ impl Shape {

    /// Check whether the two shapes are compatible for broadcast, and if it is the case return the
    /// broadcasted shape. This is to be used for binary pointwise ops.
-    pub fn broadcast_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<Shape> {
+    pub(crate) fn broadcast_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<Shape> {
        let lhs = self;
        let lhs_dims = lhs.dims();
        let rhs_dims = rhs.dims();
--- a/candle-core/src/storage.rs
+++ b/candle-core/src/storage.rs
@ -1,6 +1,6 @@
 use crate::backend::BackendStorage;
 use crate::op::{self, CmpOp, CustomOp1, CustomOp2, CustomOp3, ReduceOp};
-use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage, Result, Shape};
+use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, Result, Shape};

 // We do not want to implement Clone on Storage as cloning may fail because of
 // out of memory. Instead try_clone should be used.
@ -8,7 +8,6 @@ use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage,
 pub enum Storage {
    Cpu(CpuStorage),
    Cuda(CudaStorage),
-    Metal(MetalStorage),
 }

 impl Storage {
@ -19,10 +18,6 @@ impl Storage {
                let storage = storage.try_clone(layout)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.try_clone(layout)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -30,7 +25,6 @@ impl Storage {
        match self {
            Self::Cpu(_) => Device::Cpu,
            Self::Cuda(storage) => Device::Cuda(storage.device().clone()),
-            Self::Metal(storage) => Device::Metal(storage.device().clone()),
        }
    }

@ -38,7 +32,6 @@ impl Storage {
        match self {
            Self::Cpu(storage) => storage.dtype(),
            Self::Cuda(storage) => storage.dtype(),
-            Self::Metal(storage) => storage.dtype(),
        }
    }

@ -72,10 +65,6 @@ impl Storage {
                let storage = storage.affine(layout, mul, add)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.affine(layout, mul, add)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -89,10 +78,6 @@ impl Storage {
                let storage = storage.powf(layout, alpha)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.powf(layout, alpha)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -106,10 +91,6 @@ impl Storage {
                let storage = storage.elu(layout, alpha)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.elu(layout, alpha)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -131,10 +112,6 @@ impl Storage {
                let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => {
                // Should not happen because of the same device check above but we're defensive
                // anyway.
@ -158,10 +135,6 @@ impl Storage {
                let storage = storage.reduce_op(op, layout, s)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.reduce_op(op, layout, s)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -175,10 +148,6 @@ impl Storage {
                let storage = storage.to_dtype(layout, dtype)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.to_dtype(layout, dtype)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -192,10 +161,6 @@ impl Storage {
                let (storage, shape) = c.cuda_fwd(storage, l)?;
                Ok((Self::Cuda(storage), shape))
            }
-            Self::Metal(storage) => {
-                let (storage, shape) = c.metal_fwd(storage, l)?;
-                Ok((Self::Metal(storage), shape))
-            }
        }
    }

@ -216,10 +181,6 @@ impl Storage {
                let (s, shape) = c.cuda_fwd(s1, l1, s2, l2)?;
                Ok((Self::Cuda(s), shape))
            }
-            (Self::Metal(s1), Self::Metal(s2)) => {
-                let (s, shape) = c.metal_fwd(s1, l1, s2, l2)?;
-                Ok((Self::Metal(s), shape))
-            }
            _ => unreachable!(),
        }
    }
@ -244,10 +205,6 @@ impl Storage {
                let (s, shape) = c.cuda_fwd(s1, l1, s2, l2, s3, l3)?;
                Ok((Self::Cuda(s), shape))
            }
-            (Self::Metal(s1), Self::Metal(s2), Self::Metal(s3)) => {
-                let (s, shape) = c.metal_fwd(s1, l1, s2, l2, s3, l3)?;
-                Ok((Self::Metal(s), shape))
-            }
            _ => unreachable!(),
        }
    }
@ -262,10 +219,6 @@ impl Storage {
                let storage = storage.unary_impl::<B>(layout)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.unary_impl::<B>(layout)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -286,10 +239,6 @@ impl Storage {
                let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => {
                // Should not happen because of the same device check above but we're defensive
                // anyway.
@ -321,10 +270,6 @@ impl Storage {
                let s = inp.conv1d(l, kernel, kernel_l, params)?;
                Ok(Self::Cuda(s))
            }
-            (Storage::Metal(inp), Storage::Metal(kernel)) => {
-                let s = inp.conv1d(l, kernel, kernel_l, params)?;
-                Ok(Self::Metal(s))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -334,33 +279,6 @@ impl Storage {
        }
    }

-    pub(crate) fn conv_transpose1d(
-        &self,
-        l: &Layout,
-        kernel: &Self,
-        kernel_l: &Layout,
-        params: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self> {
-        self.same_device(kernel, "conv-transpose1d")?;
-        self.same_dtype(kernel, "conv-transpose1d")?;
-        match (self, &kernel) {
-            (Storage::Cpu(inp), Storage::Cpu(kernel)) => {
-                let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
-                Ok(Self::Cpu(s))
-            }
-            (Storage::Cuda(inp), Storage::Cuda(kernel)) => {
-                let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
-                Ok(Self::Cuda(s))
-            }
-            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
-                lhs: lhs.device().location(),
-                rhs: rhs.device().location(),
-                op: "conv-transpose1d",
-            }
-            .bt()),
-        }
-    }
-
    pub(crate) fn conv2d(
        &self,
        l: &Layout,
@ -379,10 +297,6 @@ impl Storage {
                let s = inp.conv2d(l, kernel, kernel_l, params)?;
                Ok(Self::Cuda(s))
            }
-            (Storage::Metal(inp), Storage::Metal(kernel)) => {
-                let s = inp.conv2d(l, kernel, kernel_l, params)?;
-                Ok(Self::Metal(s))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -410,10 +324,6 @@ impl Storage {
                let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
                Ok(Self::Cuda(s))
            }
-            (Storage::Metal(inp), Storage::Metal(kernel)) => {
-                let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
-                Ok(Self::Metal(s))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -438,10 +348,6 @@ impl Storage {
                let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -460,10 +366,6 @@ impl Storage {
                let storage = storage.max_pool2d(layout, kernel_size, stride)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.max_pool2d(layout, kernel_size, stride)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -477,10 +379,6 @@ impl Storage {
                let storage = storage.upsample_nearest1d(layout, sz)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.upsample_nearest1d(layout, sz)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -494,10 +392,6 @@ impl Storage {
                let storage = storage.upsample_nearest2d(layout, h, w)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.upsample_nearest2d(layout, h, w)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -521,10 +415,6 @@ impl Storage {
                let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(cond), Self::Metal(t), Self::Metal(f)) => {
-                let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
-                Ok(Self::Metal(storage))
-            }
            (_, lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -551,10 +441,6 @@ impl Storage {
                let storage = s.gather(l, indexes, indexes_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(s), Self::Metal(indexes)) => {
-                let storage = s.gather(l, indexes, indexes_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            _ => unreachable!(),
        }
    }
@ -579,10 +465,6 @@ impl Storage {
                let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
-                let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            _ => unreachable!(),
        }
    }
@ -607,10 +489,6 @@ impl Storage {
                let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
-                let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            _ => unreachable!(),
        }
    }
@ -632,10 +510,6 @@ impl Storage {
                let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -663,10 +537,6 @@ impl Storage {
                let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -686,9 +556,6 @@ impl Storage {
        match (self, dst) {
            (Self::Cpu(src), Self::Cpu(dst)) => src.copy_strided_src(dst, dst_offset, src_l),
            (Self::Cuda(src), Self::Cuda(dst)) => Ok(src.copy_strided_src(dst, dst_offset, src_l)?),
-            (Self::Metal(src), Self::Metal(dst)) => {
-                Ok(src.copy_strided_src(dst, dst_offset, src_l)?)
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
--- a/candle-core/src/tensor.rs
+++ b/candle-core/src/tensor.rs
@ -6,7 +6,7 @@ use crate::op::{
 };
 use crate::scalar::TensorOrScalar;
 use crate::shape::{Dim, Dims};
-use crate::{bail, storage::Storage, DType, Device, Error, Layout, Result, Shape};
+use crate::{storage::Storage, DType, Device, Error, Layout, Result, Shape};
 use std::sync::{Arc, RwLock};

 /// Unique identifier for tensors.
@ -385,21 +385,11 @@ impl Tensor {
        step: D,
        device: &Device,
    ) -> Result<Self> {
-        if D::is_zero(&step) {
-            crate::bail!("step cannot be zero")
-        }
        let mut data = vec![];
        let mut current = start;
-        if step >= D::zero() {
-            while current < end {
-                data.push(current);
-                current += step;
-            }
-        } else {
-            while current > end {
-                data.push(current);
-                current += step;
-            }
+        while current < end {
+            data.push(current);
+            current += step;
        }
        let len = data.len();
        Self::from_vec_impl(data, len, device, false)
@ -477,12 +467,6 @@ impl Tensor {
    broadcast_binary_op!(broadcast_div, div);
    broadcast_binary_op!(broadcast_maximum, maximum);
    broadcast_binary_op!(broadcast_minimum, minimum);
-    broadcast_binary_op!(broadcast_eq, eq);
-    broadcast_binary_op!(broadcast_ne, ne);
-    broadcast_binary_op!(broadcast_lt, lt);
-    broadcast_binary_op!(broadcast_le, le);
-    broadcast_binary_op!(broadcast_gt, gt);
-    broadcast_binary_op!(broadcast_ge, ge);

    unary_op!(recip, Recip);
    unary_op!(neg, Neg);
@ -529,7 +513,6 @@ impl Tensor {
        match &*self.storage() {
            Storage::Cpu(cpu_storage) => from_cpu_storage(cpu_storage),
            Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
-            Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
        }
    }

@ -856,20 +839,6 @@ impl Tensor {
        self.sum_impl(mean_dims, false)? * scale
    }

-    /// Returns the unbiased variance over the selected dimension.
-    pub fn var_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
-        let dim = dim.to_index(self.shape(), "var")?;
-        let mean = self.mean_keepdim(dim)?;
-        let squares = self.broadcast_sub(&mean)?.sqr()?;
-        squares.sum_impl(dim, true)? / (self.dim(dim)? - 1) as f64
-    }
-
-    /// Returns the unbiased variance over the selected dimension.
-    pub fn var<D: Dim>(&self, dim: D) -> Result<Self> {
-        let dim = dim.to_index(self.shape(), "var")?;
-        self.var_keepdim(dim)?.squeeze(dim)
-    }
-
    /// Gathers the maximum value across the selected dimension. The resulting shape has the same
    /// number of dimensions as the original tensor and the select dimension has a single element.
    pub fn max_keepdim<D: Dim>(&self, dim: D) -> Result<Self> {
@ -1217,16 +1186,14 @@ impl Tensor {
                op: "scatter-add (self, src)",
                lhs: self.shape().clone(),
                rhs: source.shape().clone(),
-            }
-            .bt())?
+            })?
        }
        if indexes.dims() != source.dims() {
            Err(Error::ShapeMismatchBinaryOp {
                op: "scatter-add (indexes, src)",
                lhs: indexes.shape().clone(),
                rhs: source.shape().clone(),
-            }
-            .bt())?
+            })?
        }
        let storage = self.storage().scatter_add(
            self.layout(),
@ -1298,8 +1265,7 @@ impl Tensor {
                op: "slice-scatter (self, src)",
                lhs: self.shape().clone(),
                rhs: src.shape().clone(),
-            }
-            .bt())?
+            })?
        }
        let mut storage = self.device().zeros(self.shape(), self.dtype())?;
        self.storage()
@ -1333,8 +1299,7 @@ impl Tensor {
                op: "index-add (self, source)",
                lhs: self.shape().clone(),
                rhs: source.shape().clone(),
-            }
-            .bt())?
+            })?
        }
        // The number of element in indexes must match the dimension on which the add is
        // performed on the source tensor (and the index values from `indexes` are taken from
@ -1345,8 +1310,7 @@ impl Tensor {
                op: "index-add (ids, source))",
                lhs: indexes.shape().clone(),
                rhs: source.shape().clone(),
-            }
-            .bt())?
+            })?
        }
        let storage = self.storage().index_add(
            self.layout(),
@ -1394,8 +1358,7 @@ impl Tensor {
                op: "gather",
                lhs: self.shape().clone(),
                rhs: indexes.shape().clone(),
-            }
-            .bt())?
+            })?
        }
        let storage =
            self.storage()
@ -1469,7 +1432,6 @@ impl Tensor {
        match &*self.storage() {
            Storage::Cpu(storage) => from_cpu_storage(storage),
            Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
-            Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
        }
    }

@ -1500,7 +1462,6 @@ impl Tensor {
        match &*self.storage() {
            Storage::Cpu(storage) => from_cpu_storage(storage),
            Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
-            Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
        }
    }

@ -1541,7 +1502,6 @@ impl Tensor {
        match &*self.storage() {
            Storage::Cpu(storage) => from_cpu_storage(storage),
            Storage::Cuda(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
-            Storage::Metal(storage) => from_cpu_storage(&storage.to_cpu_storage()?),
        }
    }

@ -1831,23 +1791,17 @@ impl Tensor {

    /// Returns a new tensor detached from the current graph, gradient are not propagated through
    /// this new node. The storage of this tensor is shared with the initial tensor.
-    ///
-    /// If the tensor is already detached from the computation graph, the same tensor is returned.
    pub fn detach(&self) -> Result<Tensor> {
-        if self.op.is_none() && !self.is_variable {
-            Ok(self.clone())
-        } else {
-            let tensor_ = Tensor_ {
-                id: TensorId::new(),
-                storage: self.storage.clone(),
-                layout: self.layout.clone(),
-                op: BackpropOp::none(),
-                is_variable: false,
-                dtype: self.dtype,
-                device: self.device.clone(),
-            };
-            Ok(Tensor(Arc::new(tensor_)))
-        }
+        let tensor_ = Tensor_ {
+            id: TensorId::new(),
+            storage: self.storage.clone(),
+            layout: self.layout.clone(),
+            op: BackpropOp::none(),
+            is_variable: false,
+            dtype: self.dtype,
+            device: self.device.clone(),
+        };
+        Ok(Tensor(Arc::new(tensor_)))
    }

    /// If the target device is the same as the tensor device, only a shallow copy is performed.
@ -1859,11 +1813,7 @@ impl Tensor {
                (Storage::Cpu(storage), Device::Cuda(cuda)) => {
                    Storage::Cuda(cuda.storage_from_cpu_storage(storage)?)
                }
-                (Storage::Cpu(storage), Device::Metal(metal)) => {
-                    Storage::Metal(metal.storage_from_cpu_storage(storage)?)
-                }
                (Storage::Cuda(storage), Device::Cpu) => Storage::Cpu(storage.to_cpu_storage()?),
-                (Storage::Metal(storage), Device::Cpu) => Storage::Cpu(storage.to_cpu_storage()?),
                (Storage::Cuda(storage), Device::Cuda(cuda)) => {
                    // TODO: Avoid passing through the cpu storage here, especially if the gpu ids
                    // are the same.
@ -1871,9 +1821,6 @@ impl Tensor {
                    Storage::Cuda(cuda.storage_from_cpu_storage(&cpu_storage)?)
                }
                (Storage::Cpu(storage), Device::Cpu) => Storage::Cpu(storage.clone()),
-                _ => {
-                    bail!("not implemented yet")
-                }
            };
            let op = BackpropOp::new1(self, Op::ToDevice);
            let tensor_ = Tensor_ {
@ -2318,11 +2265,6 @@ impl Tensor {
        m.forward(self)
    }

-    /// Run the `forward` method of `m` on `self`.
-    pub fn apply_t<M: crate::ModuleT>(&self, m: &M, train: bool) -> Result<Self> {
-        m.forward_t(self, train)
-    }
-
    pub(crate) fn storage(&self) -> std::sync::RwLockReadGuard<'_, Storage> {
        self.storage.read().unwrap()
    }
@ -2437,127 +2379,6 @@ impl Tensor {
    ) -> Result<Self> {
        self.apply_op3_arc(t2, t3, Arc::new(Box::new(c)))
    }
-
-    /// Normalize a 'relative' axis value: positive values are kept, negative
-    /// values means counting the dimensions from the back.
-    pub fn normalize_axis(&self, axis: i64) -> Result<usize> {
-        let rank = self.rank() as i64;
-        if rank <= axis {
-            crate::bail!("axis {axis} is too large, tensor rank {rank}")
-        } else if 0 <= axis {
-            Ok(axis as usize)
-        } else {
-            let naxis = rank + axis;
-            if naxis < 0 {
-                crate::bail!("axis {axis} is too small, tensor rank {rank}")
-            }
-            Ok(naxis as usize)
-        }
-    }
-
-    /// Returns a lower triangular matrix of ones of size n by n.
-    pub fn tril2(n: usize, dtype: DType, device: &Device) -> Result<Self> {
-        let t = Tensor::arange(0u32, n as u32, device)?;
-        let t1 = t.reshape((1, n))?.broadcast_as((n, n))?;
-        let t2 = t.reshape((n, 1))?.broadcast_as((n, n))?;
-        t1.le(&t2)?.to_dtype(dtype)
-    }
-
-    /// Returns an upper triangular matrix of ones of size n by n.
-    pub fn triu2(n: usize, dtype: DType, device: &Device) -> Result<Self> {
-        let t = Tensor::arange(0u32, n as u32, device)?;
-        let t1 = t.reshape((1, n))?.broadcast_as((n, n))?;
-        let t2 = t.reshape((n, 1))?.broadcast_as((n, n))?;
-        t1.ge(&t2)?.to_dtype(dtype)
-    }
-
-    /// Returns a matrix with a diagonal of ones of size n by n.
-    pub fn eye(n: usize, dtype: DType, device: &Device) -> Result<Self> {
-        let t = Tensor::arange(0u32, n as u32, device)?;
-        let t1 = t.reshape((1, n))?.broadcast_as((n, n))?;
-        let t2 = t.reshape((n, 1))?.broadcast_as((n, n))?;
-        t1.eq(&t2)?.to_dtype(dtype)
-    }
-
-    /// Returns the cumulative sum of elements of the input tensor summed over the specified
-    /// dimension.
-    ///
-    /// This operation is most efficient when dim is the last dimension of the tensor.
-    pub fn cumsum<D: Dim>(&self, dim: D) -> Result<Self> {
-        let dim = dim.to_index(self.shape(), "cumsum")?;
-        let rank = self.rank();
-        if rank == 0 {
-            return Ok(self.clone());
-        }
-        let n_axis = self.dim(dim)?;
-        let triu = Tensor::triu2(n_axis, self.dtype(), self.device())?;
-        if rank == 1 {
-            self.unsqueeze(0)?.matmul(&triu)?.squeeze(0)
-        } else {
-            let last = rank - 1;
-            let t = self.transpose(dim, last)?;
-            let t = t.broadcast_matmul(&triu)?;
-            t.transpose(dim, last)
-        }
-    }
-
-    /// Returns a copy of `self` where the values within `ranges` have been replaced with the
-    /// content of `src`.
-    pub fn slice_assign<D: std::ops::RangeBounds<usize>>(
-        &self,
-        ranges: &[D],
-        src: &Tensor,
-    ) -> Result<Self> {
-        let src_dims = src.dims();
-        let self_dims = self.dims();
-        if self_dims.len() != src_dims.len() {
-            crate::bail!(
-                "slice-assign requires input with the same rank {} <> {}",
-                self_dims.len(),
-                src_dims.len()
-            )
-        }
-        if self_dims.len() != ranges.len() {
-            crate::bail!(
-                "slice-assign requires input with the same rank as there are ranges {} <> {}",
-                self_dims.len(),
-                ranges.len()
-            )
-        }
-        let mut src = src.clone();
-        let mut mask = Self::ones(src.shape(), DType::U8, src.device())?;
-        for (i, range) in ranges.iter().enumerate() {
-            let start_included = match range.start_bound() {
-                std::ops::Bound::Unbounded => 0,
-                std::ops::Bound::Included(v) => *v,
-                std::ops::Bound::Excluded(v) => *v + 1,
-            };
-            let end_excluded = match range.end_bound() {
-                std::ops::Bound::Unbounded => self_dims[i],
-                std::ops::Bound::Included(v) => *v + 1,
-                std::ops::Bound::Excluded(v) => *v,
-            };
-            if end_excluded <= start_included {
-                crate::bail!(
-                    "slice-assign: empty range for dim {i}, {start_included} {end_excluded}"
-                )
-            }
-            if self_dims[i] < end_excluded {
-                crate::bail!(
-                    "slice-assign: upper bound is out of range for dim {i}, {end_excluded} {}",
-                    self_dims[i]
-                )
-            }
-            if end_excluded - start_included != src_dims[i] {
-                crate::bail!(
-                    "slice-assign: the range for dim {i} ({start_included}..{end_excluded}) does not match the size of src {}", src_dims[i]
-                )
-            }
-            src = src.pad_with_zeros(i, start_included, self_dims[i] - end_excluded)?;
-            mask = mask.pad_with_zeros(i, start_included, self_dims[i] - end_excluded)?
-        }
-        mask.where_cond(/* on_true= */ &src, /* on_false= */ self)
-    }
 }

 macro_rules! bin_trait {
--- a/candle-core/src/test_utils.rs
+++ b/candle-core/src/test_utils.rs
@ -4,7 +4,7 @@ use crate::{Result, Tensor};
 macro_rules! test_device {
    // TODO: Switch to generating the two last arguments automatically once concat_idents is
    // stable. https://github.com/rust-lang/rust/issues/29599
-    ($fn_name: ident, $test_cpu: ident, $test_cuda: ident, $test_metal: ident) => {
+    ($fn_name: ident, $test_cpu: ident, $test_cuda: ident) => {
        #[test]
        fn $test_cpu() -> Result<()> {
            $fn_name(&Device::Cpu)
@ -15,12 +15,6 @@ macro_rules! test_device {
        fn $test_cuda() -> Result<()> {
            $fn_name(&Device::new_cuda(0)?)
        }
-
-        #[cfg(feature = "metal")]
-        #[test]
-        fn $test_metal() -> Result<()> {
-            $fn_name(&Device::new_metal(0)?)
-        }
    };
 }

--- a/candle-core/src/utils.rs
+++ b/candle-core/src/utils.rs
@ -23,10 +23,6 @@ pub fn cuda_is_available() -> bool {
    cfg!(feature = "cuda")
 }

-pub fn metal_is_available() -> bool {
-    cfg!(feature = "metal")
-}
-
 pub fn with_avx() -> bool {
    cfg!(target_feature = "avx")
 }
--- a/candle-core/tests/conv_tests.rs
+++ b/candle-core/tests/conv_tests.rs
@ -13,11 +13,6 @@ res = torch.nn.functional.conv1d(t, w)
 print(res.flatten())
 res = torch.nn.functional.conv1d(t, w, padding=1)
 print(res.flatten())
-
-w_t = w.transpose(0, 1)
-res = torch.nn.functional.conv_transpose1d(t, w_t)
-print(res.shape)
-print(res)
 */
 fn conv1d(dev: &Device) -> Result<()> {
    let t = Tensor::new(
@ -50,17 +45,6 @@ fn conv1d(dev: &Device) -> Result<()> {
        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
        [2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352]
    );
-    if dev.is_cpu() {
-        let res = t.conv_transpose1d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
-        assert_eq!(res.dims(), [1, 2, 7]);
-        assert_eq!(
-            test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-            [
-                0.0699, -1.2899, 8.3018, 5.5873, 2.4572, -2.6143, -0.0706, 1.8765, 4.8318, 1.1538,
-                4.7076, -5.9745, -0.8276, 1.621
-            ],
-        );
-    }
    Ok(())
 }

@ -495,103 +479,17 @@ fn conv2d_grad(dev: &Device) -> Result<()> {
            ]
        ]
    );
-
-    // Replicate the issue from https://github.com/huggingface/candle/issues/1212
-    let res = t.i((.., .., 0..4, 0..4))?.conv2d(&w, 0, 2, 1, 1)?;
-    let loss = res.sqr()?.sum_all()?;
-    assert_eq!(test_utils::to_vec0_round(&loss, 2)?, 21.12f32);
-    let grads = loss.backward()?;
-    let grad_t = grads.get(&t).unwrap();
-    let grad_w = grads.get(&w).unwrap();
-    assert_eq!(grad_t.dims(), [1, 4, 5, 5]);
-    assert_eq!(grad_w.dims(), [2, 4, 3, 3]);
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_t.i(0)?, 2)?,
-        [
-            [
-                [9.29, -7.03, 7.87, 0.0, 0.0],
-                [-1.8, -7.82, 5.9, 0.0, 0.0],
-                [-3.12, 4.49, 5.52, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ],
-            [
-                [21.73, 3.39, 4.77, 0.0, 0.0],
-                [8.25, 3.73, 27.61, 0.0, 0.0],
-                [-20.55, -5.61, -2.77, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ],
-            [
-                [-8.98, 9.91, -7.15, 0.0, 0.0],
-                [4.93, -0.33, 4.56, 0.0, 0.0],
-                [-6.7, -5.76, -8.05, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ],
-            [
-                [23.54, 6.98, -10.0, 0.0, 0.0],
-                [9.65, 6.18, 18.72, 0.0, 0.0],
-                [3.29, -5.27, 0.79, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ]
-        ]
-    );
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_w.i(0)?, 2)?,
-        [
-            [
-                [-3.47, 7.44, 0.66],
-                [12.89, -3.4, -9.29],
-                [-14.16, -0.83, 7.14]
-            ],
-            [
-                [-3.23, 5.37, -3.02],
-                [-2.12, -11.24, 1.94],
-                [6.97, 7.2, 2.99]
-            ],
-            [
-                [-4.04, -3.31, 4.87],
-                [-6.68, -5.68, 1.73],
-                [-5.54, 4.32, 0.52]
-            ],
-            [[-4.72, 1.5, 4.72], [3.79, 4.04, 6.76], [-4.6, 5.8, 6.93]]
-        ]
-    );
-
    Ok(())
 }

-test_device!(conv1d, conv1d_cpu, conv1d_gpu, conv1d_metal);
-test_device!(
-    conv1d_small,
-    conv1d_small_cpu,
-    conv1d_small_gpu,
-    conv1d_small_metal
-);
-test_device!(conv2d, conv2d_cpu, conv2d_gpu, conv2d_metal);
+test_device!(conv1d, conv1d_cpu, conv1d_gpu);
+test_device!(conv1d_small, conv1d_small_cpu, conv1d_small_gpu);
+test_device!(conv2d, conv2d_cpu, conv2d_gpu);
 test_device!(
    conv2d_non_square,
    conv2d_non_square_cpu,
-    conv2d_non_square_gpu,
-    conv2d_non_square_metal
-);
-test_device!(
-    conv2d_small,
-    conv2d_small_cpu,
-    conv2d_small_gpu,
-    conv2d_small_metal
-);
-test_device!(
-    conv2d_smaller,
-    conv2d_smaller_cpu,
-    conv2d_smaller_gpu,
-    conv2d_smaller_metal
-);
-test_device!(
-    conv2d_grad,
-    conv2d_grad_cpu,
-    conv2d_grad_gpu,
-    conv2_grad_metal
+    conv2d_non_square_gpu
 );
+test_device!(conv2d_small, conv2d_small_cpu, conv2d_small_gpu);
+test_device!(conv2d_smaller, conv2d_smaller_cpu, conv2d_smaller_gpu);
+test_device!(conv2d_grad, conv2d_grad_cpu, conv2d_grad_gpu);
--- a/candle-core/tests/grad_tests.rs
+++ b/candle-core/tests/grad_tests.rs
@ -205,71 +205,6 @@ fn unary_grad(device: &Device) -> Result<()> {
        test_utils::to_vec1_round(grad_x, 4)?,
        [1.0116, 1.0830, 1.0003, 0.6188],
    );
-
-    // Testing compared to pytorch torch.erf
-    //
-    // import torch
-    // x = torch.tensor([3.0, 1.0, 4.0, 0.15], requires_grad=True)
-    // y = x.erf()
-    // print(y)
-    // loss = y.sum()
-    // loss.backward()
-    // print(x.grad)
-    let y = x.erf()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(test_utils::to_vec1_round(&y, 4)?, [1.0, 0.8427, 1.0, 0.168]);
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [0.0001, 0.4151, 0.0, 1.1033],
-    );
-
-    // Testing compared to pytorch nn.GELU(approximate = 'none')
-    //
-    // import torch
-    // import torch.nn.functional as F
-    // x = torch.tensor([3.0, 1.0, 4.0, 0.15], requires_grad=True)
-    // y = F.gelu(x, approximate='none')
-    // print(y)
-    // loss = y.sum()
-    // loss.backward()
-    // print(x.grad)
-    let y = x.gelu_erf()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [2.9960, 0.8413, 3.9999, 0.0839]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [1.0119, 1.0833, 1.0005, 0.6188],
-    );
-
-    // Testing compared to pytorch elu
-    //
-    // import torch
-    // import torch.nn.functional as F
-    // x = torch.tensor([-1.0, 0.0, -2.0, 3.0], requires_grad=True)
-    // y = F.elu(x, alpha=2.0)
-    // print(y)
-    // loss = y.min
-    // loss = y.sum()
-    // loss.backward()
-    // print(x.grad)
-    let elu_x = Var::new(&[-1.0f32, 0., -2., 3.], device)?;
-    let y = elu_x.elu(2.)?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(&elu_x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [-1.2642, 0.0000, -1.7293, 3.0000]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [0.7358, 2.0000, 0.2707, 1.0000]
-    );
-
    Ok(())
 }

@ -315,29 +250,9 @@ fn binary_grad(device: &Device) -> Result<()> {
    Ok(())
 }

-test_device!(
-    simple_grad,
-    simple_grad_cpu,
-    simple_grad_gpu,
-    simple_grad_metal
-);
-test_device!(sum_grad, sum_grad_cpu, sum_grad_gpu, sum_grad_metal);
-test_device!(
-    matmul_grad,
-    matmul_grad_cpu,
-    matmul_grad_gpu,
-    matmul_grad_metal
-);
-test_device!(
-    grad_descent,
-    grad_descent_cpu,
-    grad_descent_gpu,
-    grad_descent_metal
-);
-test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu, unary_grad_metal);
-test_device!(
-    binary_grad,
-    binary_grad_cpu,
-    binary_grad_gpu,
-    binary_grad_metal
-);
+test_device!(simple_grad, simple_grad_cpu, simple_grad_gpu);
+test_device!(sum_grad, sum_grad_cpu, sum_grad_gpu);
+test_device!(matmul_grad, matmul_grad_cpu, matmul_grad_gpu);
+test_device!(grad_descent, grad_descent_cpu, grad_descent_gpu);
+test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu);
+test_device!(binary_grad, binary_grad_cpu, binary_grad_gpu);
--- a/candle-core/tests/indexing_tests.rs
+++ b/candle-core/tests/indexing_tests.rs
@ -91,32 +91,3 @@ fn index_3d() -> Result<()> {
    assert_eq!(tensor.i((1, .., 3))?.to_vec1::<u32>()?, &[15, 19, 23]);
    Ok(())
 }
-
-#[test]
-fn slice_assign() -> Result<()> {
-    let dev = Device::Cpu;
-
-    let tensor = Tensor::arange(0u32, 4 * 5, &dev)?.reshape((4, 5))?;
-    let src = Tensor::arange(0u32, 2 * 3, &dev)?.reshape((3, 2))?;
-    let out = tensor.slice_assign(&[1..4, 3..5], &src)?;
-    assert_eq!(
-        out.to_vec2::<u32>()?,
-        &[
-            [0, 1, 2, 3, 4],
-            [5, 6, 7, 0, 1],
-            [10, 11, 12, 2, 3],
-            [15, 16, 17, 4, 5]
-        ]
-    );
-    let out = tensor.slice_assign(&[0..3, 0..2], &src)?;
-    assert_eq!(
-        out.to_vec2::<u32>()?,
-        &[
-            [0, 1, 2, 3, 4],
-            [2, 3, 7, 8, 9],
-            [4, 5, 12, 13, 14],
-            [15, 16, 17, 18, 19]
-        ]
-    );
-    Ok(())
-}
--- a/candle-core/tests/layout_tests.rs
+++ b/candle-core/tests/layout_tests.rs
@ -49,7 +49,7 @@ fn contiguous(device: &Device) -> Result<()> {
    Ok(())
 }

-test_device!(contiguous, contiguous_cpu, contiguous_gpu, contiguous_metal);
+test_device!(contiguous, contiguous_cpu, contiguous_gpu);

 #[test]
 fn strided_blocks() -> Result<()> {
--- a/candle-core/tests/pool_tests.rs
+++ b/candle-core/tests/pool_tests.rs
@ -98,17 +98,15 @@ fn upsample_nearest2d(dev: &Device) -> Result<()> {
    Ok(())
 }

-test_device!(avg_pool2d, avg_pool2d_cpu, avg_pool2d_gpu, avg_pool2d_metal);
+test_device!(avg_pool2d, avg_pool2d_cpu, avg_pool2d_gpu);
 test_device!(
    avg_pool2d_pytorch,
    avg_pool2d_pytorch_cpu,
-    avg_pool2d_pytorch_gpu,
-    avg_pool2d_pytorch_metal
+    avg_pool2d_pytorch_gpu
 );
-test_device!(max_pool2d, max_pool2d_cpu, max_pool2d_gpu, max_pool2d_metal);
+test_device!(max_pool2d, max_pool2d_cpu, max_pool2d_gpu);
 test_device!(
    upsample_nearest2d,
    upsample_nearest2d_cpu,
-    upsample_nearest2d_gpu,
-    upsample_nearest2d_metal
+    upsample_nearest2d_gpu
 );
--- a/candle-core/tests/quantized_tests.rs
+++ b/candle-core/tests/quantized_tests.rs
@ -1,7 +1,7 @@
 use candle_core::{
    quantized::{self, GgmlDType},
    test_utils::to_vec2_round,
-    Device, Module, Result, Tensor,
+    Device, Result, Tensor,
 };
 use quantized::{k_quants, GgmlType};
 use rand::prelude::*;
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@ -29,26 +29,7 @@ fn ones(device: &Device) -> Result<()> {
        Tensor::ones((2, 3), DType::F64, device)?.to_vec2::<f64>()?,
        [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]],
    );
-    Ok(())
-}

-fn arange(device: &Device) -> Result<()> {
-    assert_eq!(
-        Tensor::arange(0u8, 5u8, device)?.to_vec1::<u8>()?,
-        [0, 1, 2, 3, 4],
-    );
-    assert_eq!(
-        Tensor::arange_step(0u8, 5u8, 2, device)?.to_vec1::<u8>()?,
-        [0, 2, 4],
-    );
-    assert_eq!(
-        Tensor::arange_step(0u8, 5u8, 3, device)?.to_vec1::<u8>()?,
-        [0, 3],
-    );
-    assert_eq!(
-        Tensor::arange_step(5i64, 0i64, -1, device)?.to_vec1::<i64>()?,
-        [5, 4, 3, 2, 1],
-    );
    Ok(())
 }

@ -180,22 +161,6 @@ fn transpose(device: &Device) -> Result<()> {
    Ok(())
 }

-fn var(device: &Device) -> Result<()> {
-    // Values taken from https://pytorch.org/docs/stable/generated/torch.var.html
-    let data = &[
-        [0.2035f32, 1.2959, 1.8101, -0.4644],
-        [1.5027, -0.3270, 0.5905, 0.6538],
-        [-1.5745, 1.3330, -0.5596, -0.6548],
-        [0.1264, -0.5080, 1.6420, 0.1992],
-    ];
-    let tensor = Tensor::new(data, device)?;
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.var_keepdim(1)?, 4)?,
-        &[[1.0631], [0.559], [1.4893], [0.8258]]
-    );
-    Ok(())
-}
-
 fn sum(device: &Device) -> Result<()> {
    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
    let tensor = Tensor::new(data, device)?;
@ -1070,60 +1035,33 @@ fn randn(device: &Device) -> Result<()> {
    Ok(())
 }

-test_device!(zeros, zeros_cpu, zeros_gpu, zeros_metal);
-test_device!(ones, ones_cpu, ones_gpu, ones_metal);
-test_device!(arange, arange_cpu, arange_gpu, arange_metal);
-test_device!(add_mul, add_mul_cpu, add_mul_gpu, add_mul_metal);
-test_device!(tensor_2d, tensor_2d_cpu, tensor_2d_gpu, tensor_2d_metal);
-test_device!(narrow, narrow_cpu, narrow_gpu, narrow_metal);
-test_device!(broadcast, broadcast_cpu, broadcast_gpu, broadcast_metal);
-test_device!(cat, cat_cpu, cat_gpu, cat_metal);
-test_device!(sum, sum_cpu, sum_gpu, sum_metal);
-test_device!(min, min_cpu, min_gpu, min_metal);
-test_device!(max, max_cpu, max_gpu, max_metal);
-test_device!(argmax, argmax_cpu, argmax_gpu, argmax_metal);
-test_device!(argmin, argmin_cpu, argmin_gpu, argmin_metal);
-test_device!(transpose, transpose_cpu, transpose_gpu, transpose_metal);
-test_device!(unary_op, unary_op_cpu, unary_op_gpu, unary_op_metal);
-test_device!(binary_op, binary_op_cpu, binary_op_gpu, binary_op_metal);
-test_device!(embeddings, embeddings_cpu, embeddings_gpu, embeddings_metal);
-test_device!(cmp, cmp_cpu, cmp_gpu, cmp_metal);
-test_device!(matmul, matmul_cpu, matmul_gpu, matmul_metal);
-test_device!(
-    broadcast_matmul,
-    broadcast_matmul_cpu,
-    broadcast_matmul_gpu,
-    broadcast_matmul_metal
-);
-test_device!(
-    broadcasting,
-    broadcasting_cpu,
-    broadcasting_gpu,
-    broadcasting_metal
-);
-test_device!(
-    index_select,
-    index_select_cpu,
-    index_select_gpu,
-    index_select_metal
-);
-test_device!(index_add, index_add_cpu, index_add_gpu, index_add_metal);
-test_device!(gather, gather_cpu, gather_gpu, gather_metal);
-test_device!(
-    scatter_add,
-    scatter_add_cpu,
-    scatter_add_gpu,
-    scatter_add_metal
-);
-test_device!(
-    slice_scatter,
-    slice_scatter_cpu,
-    slice_scatter_gpu,
-    slice_scatter_metal
-);
-test_device!(randn, randn_cpu, randn_gpu, randn_metal);
-test_device!(clamp, clamp_cpu, clamp_gpu, clamp_metal);
-test_device!(var, var_cpu, var_gpu, var_metal);
+test_device!(zeros, zeros_cpu, zeros_gpu);
+test_device!(ones, ones_cpu, ones_gpu);
+test_device!(add_mul, add_mul_cpu, add_mul_gpu);
+test_device!(tensor_2d, tensor_2d_cpu, tensor_2d_gpu);
+test_device!(narrow, narrow_cpu, narrow_gpu);
+test_device!(broadcast, broadcast_cpu, broadcast_gpu);
+test_device!(cat, cat_cpu, cat_gpu);
+test_device!(sum, sum_cpu, sum_gpu);
+test_device!(min, min_cpu, min_gpu);
+test_device!(max, max_cpu, max_gpu);
+test_device!(argmax, argmax_cpu, argmax_gpu);
+test_device!(argmin, argmin_cpu, argmin_gpu);
+test_device!(transpose, transpose_cpu, transpose_gpu);
+test_device!(unary_op, unary_op_cpu, unary_op_gpu);
+test_device!(binary_op, binary_op_cpu, binary_op_gpu);
+test_device!(embeddings, embeddings_cpu, embeddings_gpu);
+test_device!(cmp, cmp_cpu, cmp_gpu);
+test_device!(matmul, matmul_cpu, matmul_gpu);
+test_device!(broadcast_matmul, broadcast_matmul_cpu, broadcast_matmul_gpu);
+test_device!(broadcasting, broadcasting_cpu, broadcasting_gpu);
+test_device!(index_select, index_select_cpu, index_select_gpu);
+test_device!(index_add, index_add_cpu, index_add_gpu);
+test_device!(gather, gather_cpu, gather_gpu);
+test_device!(scatter_add, scatter_add_cpu, scatter_add_gpu);
+test_device!(slice_scatter, slice_scatter_cpu, slice_scatter_gpu);
+test_device!(randn, randn_cpu, randn_gpu);
+test_device!(clamp, clamp_cpu, clamp_gpu);

 // There was originally a bug on the CPU implementation for randn
 // https://github.com/huggingface/candle/issues/381
@ -1151,73 +1089,3 @@ fn pad_with_same() -> Result<()> {
    );
    Ok(())
 }
-
-#[test]
-fn i64_abs() -> Result<()> {
-    let t = Tensor::new(&[-42i64, 1337], &Device::Cpu)?;
-    let t = t.abs()?;
-    assert_eq!(t.to_vec1::<i64>()?, [42, 1337]);
-    Ok(())
-}
-
-#[test]
-fn tril_triu_eye() -> Result<()> {
-    let t = Tensor::tril2(4, DType::F32, &Device::Cpu)?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        [
-            [1.0, 0.0, 0.0, 0.0],
-            [1.0, 1.0, 0.0, 0.0],
-            [1.0, 1.0, 1.0, 0.0],
-            [1.0, 1.0, 1.0, 1.0]
-        ],
-    );
-    let t = Tensor::triu2(4, DType::F32, &Device::Cpu)?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        [
-            [1.0, 1.0, 1.0, 1.0],
-            [0.0, 1.0, 1.0, 1.0],
-            [0.0, 0.0, 1.0, 1.0],
-            [0.0, 0.0, 0.0, 1.0]
-        ]
-    );
-    let t = Tensor::eye(4, DType::F32, &Device::Cpu)?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        [
-            [1.0, 0.0, 0.0, 0.0],
-            [0.0, 1.0, 0.0, 0.0],
-            [0.0, 0.0, 1.0, 0.0],
-            [0.0, 0.0, 0.0, 1.0]
-        ]
-    );
-    Ok(())
-}
-
-#[test]
-fn cumsum() -> Result<()> {
-    let t = &[3f32, 1., 4., 1., 5.];
-    let t = Tensor::new(t, &Device::Cpu)?;
-    assert_eq!(t.cumsum(0)?.to_vec1::<f32>()?, [3., 4., 8., 9., 14.]);
-    let t = t.unsqueeze(1)?;
-    assert_eq!(
-        t.cumsum(0)?.to_vec2::<f32>()?,
-        [[3.0], [4.0], [8.0], [9.0], [14.0]]
-    );
-    assert_eq!(
-        t.cumsum(1)?.to_vec2::<f32>()?,
-        [[3.0], [1.0], [4.0], [1.0], [5.0]]
-    );
-    let t = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
-    let t = Tensor::new(t, &Device::Cpu)?;
-    assert_eq!(
-        t.cumsum(1)?.to_vec2::<f32>()?,
-        [[3.0, 4.0, 8.0, 9.0, 14.0], [2.0, 3.0, 10.0, 18.0, 20.0]],
-    );
-    assert_eq!(
-        t.cumsum(0)?.to_vec2::<f32>()?,
-        [[3.0, 1.0, 4.0, 1.0, 5.0], [5.0, 2.0, 11.0, 9.0, 7.0]]
-    );
-    Ok(())
-}
--- a/candle-datasets/Cargo.toml
+++ b/candle-datasets/Cargo.toml
@ -11,8 +11,8 @@ readme = "README.md"

 [dependencies]
 byteorder = { workspace = true }
-candle = { path = "../candle-core", version = "0.3.1", package = "candle-core" }
-candle-nn = { path = "../candle-nn", version = "0.3.1" }
+candle = { path = "../candle-core", version = "0.3.0", package = "candle-core" }
+candle-nn = { path = "../candle-nn", version = "0.3.0" }
 hf-hub = { workspace = true}
 intel-mkl-src = { workspace = true, optional = true }
 memmap2 = { workspace = true }
--- a/candle-datasets/src/vision/cifar.rs
+++ b/candle-datasets/src/vision/cifar.rs
@ -4,9 +4,7 @@
 //! <https://www.cs.toronto.edu/~kriz/cifar.html>
 //! The binary version of the dataset is used.
 use crate::vision::Dataset;
-use candle::{DType, Device, Error, Result, Tensor};
-use hf_hub::{api::sync::Api, Repo, RepoType};
-use parquet::file::reader::{FileReader, SerializedFileReader};
+use candle::{DType, Device, Result, Tensor};
 use std::fs::File;
 use std::io::{BufReader, Read};

@ -62,58 +60,3 @@ pub fn load_dir<T: AsRef<std::path::Path>>(dir: T) -> Result<Dataset> {
        labels: 10,
    })
 }
-
-fn load_parquet(parquet: SerializedFileReader<std::fs::File>) -> Result<(Tensor, Tensor)> {
-    let samples = parquet.metadata().file_metadata().num_rows() as usize;
-    let mut buffer_images: Vec<u8> = Vec::with_capacity(samples * 1_024);
-    let mut buffer_labels: Vec<u8> = Vec::with_capacity(samples);
-    for row in parquet.into_iter().flatten() {
-        for (_name, field) in row.get_column_iter() {
-            if let parquet::record::Field::Group(subrow) = field {
-                for (_name, field) in subrow.get_column_iter() {
-                    if let parquet::record::Field::Bytes(value) = field {
-                        let image = image::load_from_memory(value.data()).unwrap();
-                        buffer_images.extend(image.to_rgb8().as_raw());
-                    }
-                }
-            } else if let parquet::record::Field::Long(label) = field {
-                buffer_labels.push(*label as u8);
-            }
-        }
-    }
-    let images = (Tensor::from_vec(buffer_images, (samples, 3, 32, 32), &Device::Cpu)?
-        .to_dtype(DType::U8)?
-        / 255.)?;
-    let labels = Tensor::from_vec(buffer_labels, (samples,), &Device::Cpu)?;
-    Ok((images, labels))
-}
-
-pub fn load() -> Result<Dataset> {
-    let api = Api::new().map_err(|e| Error::Msg(format!("Api error: {e}")))?;
-    let dataset_id = "cifar10".to_string();
-    let repo = Repo::with_revision(
-        dataset_id,
-        RepoType::Dataset,
-        "refs/convert/parquet".to_string(),
-    );
-    let repo = api.repo(repo);
-    let test_parquet_filename = repo
-        .get("plain_text/test/0000.parquet")
-        .map_err(|e| Error::Msg(format!("Api error: {e}")))?;
-    let train_parquet_filename = repo
-        .get("plain_text/train/0000.parquet")
-        .map_err(|e| Error::Msg(format!("Api error: {e}")))?;
-    let test_parquet = SerializedFileReader::new(std::fs::File::open(test_parquet_filename)?)
-        .map_err(|e| Error::Msg(format!("Parquet error: {e}")))?;
-    let train_parquet = SerializedFileReader::new(std::fs::File::open(train_parquet_filename)?)
-        .map_err(|e| Error::Msg(format!("Parquet error: {e}")))?;
-    let (test_images, test_labels) = load_parquet(test_parquet)?;
-    let (train_images, train_labels) = load_parquet(train_parquet)?;
-    Ok(crate::vision::Dataset {
-        train_images,
-        train_labels,
-        test_images,
-        test_labels,
-        labels: 10,
-    })
-}
--- a/candle-examples/Cargo.toml
+++ b/candle-examples/Cargo.toml
@ -11,18 +11,17 @@ readme = "README.md"

 [dependencies]
 accelerate-src = { workspace = true, optional = true }
-candle = { path = "../candle-core", version = "0.3.1", package = "candle-core" }
-candle-datasets = { path = "../candle-datasets", version = "0.3.1" }
-candle-nn = { path = "../candle-nn", version = "0.3.1" }
-candle-transformers = { path = "../candle-transformers", version = "0.3.1" }
-candle-flash-attn = { path = "../candle-flash-attn", version = "0.3.1", optional = true }
-candle-onnx = { path = "../candle-onnx", version = "0.3.1", optional = true }
+candle = { path = "../candle-core", version = "0.3.0", package = "candle-core" }
+candle-datasets = { path = "../candle-datasets", version = "0.3.0" }
+candle-nn = { path = "../candle-nn", version = "0.3.0" }
+candle-transformers = { path = "../candle-transformers", version = "0.3.0" }
+candle-flash-attn = { path = "../candle-flash-attn", version = "0.3.0", optional = true }
 cudarc = { workspace = true, optional = true }
 half = { workspace = true, optional = true }
 image = { workspace = true }
 intel-mkl-src = { workspace = true, optional = true }
 num-traits = { workspace = true }
-pyo3 = { version = "0.20.0", features = ["auto-initialize"], optional = true }
+pyo3 = { version = "0.19.0", features = ["auto-initialize"], optional = true }
 rayon = { workspace = true }
 safetensors = { workspace = true }
 serde = { workspace = true }
@ -56,8 +55,6 @@ cudnn = ["candle/cudnn"]
 flash-attn = ["cuda", "candle-transformers/flash-attn", "dep:candle-flash-attn"]
 mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl", "candle-transformers/mkl"]
 nccl = ["cuda", "cudarc/nccl", "dep:half"]
-onnx = ["candle-onnx"]
-metal = ["candle/metal", "candle-nn/metal"]

 [[example]]
 name = "llama_multiprocess"
@ -66,11 +63,3 @@ required-features = ["cuda", "nccl", "flash-attn"]
 [[example]]
 name = "reinforcement-learning"
 required-features = ["pyo3"]
-
-[[example]]
-name = "onnx"
-required-features = ["onnx"]
-
-[[example]]
-name = "onnx_basics"
-required-features = ["onnx"]
--- a/candle-examples/examples/blip/main.rs
+++ b/candle-examples/examples/blip/main.rs
@ -149,6 +149,6 @@ pub fn main() -> anyhow::Result<()> {
    if let Some(rest) = tokenizer.decode_rest().map_err(E::msg)? {
        print!("{rest}");
    }
-    println!();
+
    Ok(())
 }
--- a/candle-examples/examples/distilbert/README.md
+++ b/candle-examples/examples/distilbert/README.md
@ -1,22 +0,0 @@
-# candle-distilbert
-
-DistilBert is a distiled version of the Bert model.
-
-## Sentence embeddings
-
-DistilBert is used to compute the sentence embeddings for a prompt. The model weights
-are downloaded from the hub on the first run.
-
-```bash
-cargo run --example distilbert --release -- --prompt "Here is a test sentence"
-
-> [[[ 0.5109,  0.1280, -0.2635, ...,  0.3462, -1.0434,  0.1441],
->   [ 0.1735,  0.0818, -0.5549, ...,  0.3472, -0.8264, -0.0244],
->   [ 0.0702, -0.1311, -0.4914, ...,  0.3483, -0.6194,  0.1829],
->   ...
->   [ 0.2993, -0.0106, -0.4640, ...,  0.2844, -0.6732,  0.0042],
->   [ 0.1066, -0.0081, -0.4299, ...,  0.3435, -0.7729,  0.0190],
->   [ 0.8903,  0.2055, -0.2541, ...,  0.3208, -0.6585,  0.0586]]]
-> Tensor[[1, 7, 768], f32]
-
-```
--- a/candle-examples/examples/distilbert/main.rs
+++ b/candle-examples/examples/distilbert/main.rs
@ -1,135 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-use candle_transformers::models::distilbert::{Config, DistilBertModel, DTYPE};
-
-use anyhow::{Error as E, Result};
-use candle::{Device, Tensor};
-use candle_nn::VarBuilder;
-use clap::Parser;
-use hf_hub::{api::sync::Api, Repo, RepoType};
-use tokenizers::Tokenizer;
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Enable tracing (generates a trace-timestamp.json file).
-    #[arg(long)]
-    tracing: bool,
-
-    /// The model to use, check out available models: https://huggingface.co/models?library=sentence-transformers&sort=trending
-    #[arg(long)]
-    model_id: Option<String>,
-
-    #[arg(long)]
-    revision: Option<String>,
-
-    /// When set, compute embeddings for this prompt.
-    #[arg(long)]
-    prompt: String,
-
-    /// Use the pytorch weights rather than the safetensors ones
-    #[arg(long)]
-    use_pth: bool,
-
-    /// The number of times to run the prompt.
-    #[arg(long, default_value = "1")]
-    n: usize,
-
-    /// L2 normalization for embeddings.
-    #[arg(long, default_value = "true")]
-    normalize_embeddings: bool,
-}
-
-impl Args {
-    fn build_model_and_tokenizer(&self) -> Result<(DistilBertModel, Tokenizer)> {
-        let device = candle_examples::device(self.cpu)?;
-        let default_model = "distilbert-base-uncased".to_string();
-        let default_revision = "main".to_string();
-        let (model_id, revision) = match (self.model_id.to_owned(), self.revision.to_owned()) {
-            (Some(model_id), Some(revision)) => (model_id, revision),
-            (Some(model_id), None) => (model_id, "main".to_string()),
-            (None, Some(revision)) => (default_model, revision),
-            (None, None) => (default_model, default_revision),
-        };
-
-        let repo = Repo::with_revision(model_id, RepoType::Model, revision);
-        let (config_filename, tokenizer_filename, weights_filename) = {
-            let api = Api::new()?;
-            let api = api.repo(repo);
-            let config = api.get("config.json")?;
-            let tokenizer = api.get("tokenizer.json")?;
-            let weights = if self.use_pth {
-                api.get("pytorch_model.bin")?
-            } else {
-                api.get("model.safetensors")?
-            };
-            (config, tokenizer, weights)
-        };
-        let config = std::fs::read_to_string(config_filename)?;
-        let config: Config = serde_json::from_str(&config)?;
-        let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
-
-        let vb = if self.use_pth {
-            VarBuilder::from_pth(&weights_filename, DTYPE, &device)?
-        } else {
-            unsafe { VarBuilder::from_mmaped_safetensors(&[weights_filename], DTYPE, &device)? }
-        };
-        let model = DistilBertModel::load(vb, &config)?;
-        Ok((model, tokenizer))
-    }
-}
-
-fn get_mask(size: usize, device: &Device) -> Tensor {
-    let mask: Vec<_> = (0..size)
-        .flat_map(|i| (0..size).map(move |j| u8::from(j > i)))
-        .collect();
-    Tensor::from_slice(&mask, (size, size), device).unwrap()
-}
-
-fn main() -> Result<()> {
-    use tracing_chrome::ChromeLayerBuilder;
-    use tracing_subscriber::prelude::*;
-
-    let args = Args::parse();
-    let _guard = if args.tracing {
-        println!("tracing...");
-        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
-        tracing_subscriber::registry().with(chrome_layer).init();
-        Some(guard)
-    } else {
-        None
-    };
-    let (model, mut tokenizer) = args.build_model_and_tokenizer()?;
-    let device = &model.device;
-
-    let tokenizer = tokenizer
-        .with_padding(None)
-        .with_truncation(None)
-        .map_err(E::msg)?;
-    let tokens = tokenizer
-        .encode(args.prompt, true)
-        .map_err(E::msg)?
-        .get_ids()
-        .to_vec();
-    let token_ids = Tensor::new(&tokens[..], device)?.unsqueeze(0)?;
-    let mask = get_mask(tokens.len(), device);
-
-    println!("token_ids: {:?}", token_ids.to_vec2::<u32>());
-    println!("mask: {:?}", mask.to_vec2::<u8>());
-
-    let ys = model.forward(&token_ids, &mask)?;
-    println!("{ys}");
-
-    Ok(())
-}
-
-pub fn normalize_l2(v: &Tensor) -> Result<Tensor> {
-    Ok(v.broadcast_div(&v.sqr()?.sum_keepdim(1)?.sqrt()?)?)
-}
--- a/candle-examples/examples/jina-bert/README.md
+++ b/candle-examples/examples/jina-bert/README.md
@ -1,45 +0,0 @@
-# candle-jina-bert
-
-Jina-Bert is a general large language model with a context size of 8192, [model
-card](https://huggingface.co/jinaai/jina-embeddings-v2-base-en). In this example
-it can be used for two different tasks:
- Compute sentence embeddings for a prompt.
- Compute similarities between a set of sentences.
-
-
-## Sentence embeddings
-
-Jina-Bert is used to compute the sentence embeddings for a prompt. The model weights
-are downloaded from the hub on the first run.
-
-```bash
-cargo run --example jina-bert --release -- --prompt "Here is a test sentence"
-
-> [[[ 0.1595, -0.9885,  0.6494, ...,  0.3003, -0.6901, -1.2355],
->   [ 0.0374, -0.1798,  1.3359, ...,  0.6731,  0.2133, -1.6807],
->   [ 0.1700, -0.8534,  0.8924, ..., -0.1785, -0.0727, -1.5087],
->   ...
->   [-0.3113, -1.3665,  0.2027, ..., -0.2519,  0.1711, -1.5811],
->   [ 0.0907, -1.0492,  0.5382, ...,  0.0242, -0.7077, -1.0830],
->   [ 0.0369, -0.6343,  0.6105, ...,  0.0671,  0.3778, -1.1505]]]
-> Tensor[[1, 7, 768], f32]
-```
-
-## Similarities
-
-In this example, Jina-Bert is used to compute the sentence embeddings for a set of
-sentences (hardcoded in the examples). Then cosine similarities are computed for
-each sentence pair and they are reported by decreasing values, hence the first
-reported pair contains the two sentences that have the highest similarity score.
-The sentence embeddings are computed using average pooling through all the
-sentence tokens, including some potential padding.
-
-```bash
-cargo run --example jina-bert --release
-
-> score: 0.94 'The new movie is awesome' 'The new movie is so great'
-> score: 0.81 'The cat sits outside' 'The cat plays in the garden'
-> score: 0.78 'I love pasta' 'Do you like pizza?'
-> score: 0.68 'I love pasta' 'The new movie is awesome'
-> score: 0.67 'A man is playing guitar' 'A woman watches TV'
-```
--- a/candle-examples/examples/jina-bert/main.rs
+++ b/candle-examples/examples/jina-bert/main.rs
@ -1,180 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use candle_transformers::models::jina_bert::{BertModel, Config};
-
-use anyhow::Error as E;
-use candle::{DType, Module, Tensor};
-use candle_nn::VarBuilder;
-use clap::Parser;
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Enable tracing (generates a trace-timestamp.json file).
-    #[arg(long)]
-    tracing: bool,
-
-    /// When set, compute embeddings for this prompt.
-    #[arg(long)]
-    prompt: Option<String>,
-
-    /// The number of times to run the prompt.
-    #[arg(long, default_value = "1")]
-    n: usize,
-
-    /// L2 normalization for embeddings.
-    #[arg(long, default_value = "true")]
-    normalize_embeddings: bool,
-
-    #[arg(long)]
-    tokenizer: Option<String>,
-
-    #[arg(long)]
-    model: Option<String>,
-}
-
-impl Args {
-    fn build_model_and_tokenizer(&self) -> anyhow::Result<(BertModel, tokenizers::Tokenizer)> {
-        use hf_hub::{api::sync::Api, Repo, RepoType};
-        let model = match &self.model {
-            Some(model_file) => std::path::PathBuf::from(model_file),
-            None => Api::new()?
-                .repo(Repo::new(
-                    "jinaai/jina-embeddings-v2-base-en".to_string(),
-                    RepoType::Model,
-                ))
-                .get("model.safetensors")?,
-        };
-        let tokenizer = match &self.tokenizer {
-            Some(file) => std::path::PathBuf::from(file),
-            None => Api::new()?
-                .repo(Repo::new(
-                    "sentence-transformers/all-MiniLM-L6-v2".to_string(),
-                    RepoType::Model,
-                ))
-                .get("tokenizer.json")?,
-        };
-        let device = candle_examples::device(self.cpu)?;
-        let config = Config::v2_base();
-        let tokenizer = tokenizers::Tokenizer::from_file(tokenizer).map_err(E::msg)?;
-        let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model], DType::F32, &device)? };
-        let model = BertModel::new(vb, &config)?;
-        Ok((model, tokenizer))
-    }
-}
-
-fn main() -> anyhow::Result<()> {
-    use tracing_chrome::ChromeLayerBuilder;
-    use tracing_subscriber::prelude::*;
-
-    let args = Args::parse();
-    let _guard = if args.tracing {
-        println!("tracing...");
-        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
-        tracing_subscriber::registry().with(chrome_layer).init();
-        Some(guard)
-    } else {
-        None
-    };
-    let start = std::time::Instant::now();
-
-    let (model, mut tokenizer) = args.build_model_and_tokenizer()?;
-    let device = &model.device;
-
-    if let Some(prompt) = args.prompt {
-        let tokenizer = tokenizer
-            .with_padding(None)
-            .with_truncation(None)
-            .map_err(E::msg)?;
-        let tokens = tokenizer
-            .encode(prompt, true)
-            .map_err(E::msg)?
-            .get_ids()
-            .to_vec();
-        let token_ids = Tensor::new(&tokens[..], device)?.unsqueeze(0)?;
-        println!("Loaded and encoded {:?}", start.elapsed());
-        for idx in 0..args.n {
-            let start = std::time::Instant::now();
-            let ys = model.forward(&token_ids)?;
-            if idx == 0 {
-                println!("{ys}");
-            }
-            println!("Took {:?}", start.elapsed());
-        }
-    } else {
-        let sentences = [
-            "The cat sits outside",
-            "A man is playing guitar",
-            "I love pasta",
-            "The new movie is awesome",
-            "The cat plays in the garden",
-            "A woman watches TV",
-            "The new movie is so great",
-            "Do you like pizza?",
-        ];
-        let n_sentences = sentences.len();
-        if let Some(pp) = tokenizer.get_padding_mut() {
-            pp.strategy = tokenizers::PaddingStrategy::BatchLongest
-        } else {
-            let pp = tokenizers::PaddingParams {
-                strategy: tokenizers::PaddingStrategy::BatchLongest,
-                ..Default::default()
-            };
-            tokenizer.with_padding(Some(pp));
-        }
-        let tokens = tokenizer
-            .encode_batch(sentences.to_vec(), true)
-            .map_err(E::msg)?;
-        let token_ids = tokens
-            .iter()
-            .map(|tokens| {
-                let tokens = tokens.get_ids().to_vec();
-                Tensor::new(tokens.as_slice(), device)
-            })
-            .collect::<candle::Result<Vec<_>>>()?;
-
-        let token_ids = Tensor::stack(&token_ids, 0)?;
-        println!("running inference on batch {:?}", token_ids.shape());
-        let embeddings = model.forward(&token_ids)?;
-        println!("generated embeddings {:?}", embeddings.shape());
-        // Apply some avg-pooling by taking the mean embedding value for all tokens (including padding)
-        let (_n_sentence, n_tokens, _hidden_size) = embeddings.dims3()?;
-        let embeddings = (embeddings.sum(1)? / (n_tokens as f64))?;
-        let embeddings = if args.normalize_embeddings {
-            normalize_l2(&embeddings)?
-        } else {
-            embeddings
-        };
-        println!("pooled embeddings {:?}", embeddings.shape());
-
-        let mut similarities = vec![];
-        for i in 0..n_sentences {
-            let e_i = embeddings.get(i)?;
-            for j in (i + 1)..n_sentences {
-                let e_j = embeddings.get(j)?;
-                let sum_ij = (&e_i * &e_j)?.sum_all()?.to_scalar::<f32>()?;
-                let sum_i2 = (&e_i * &e_i)?.sum_all()?.to_scalar::<f32>()?;
-                let sum_j2 = (&e_j * &e_j)?.sum_all()?.to_scalar::<f32>()?;
-                let cosine_similarity = sum_ij / (sum_i2 * sum_j2).sqrt();
-                similarities.push((cosine_similarity, i, j))
-            }
-        }
-        similarities.sort_by(|u, v| v.0.total_cmp(&u.0));
-        for &(score, i, j) in similarities[..5].iter() {
-            println!("score: {score:.2} '{}' '{}'", sentences[i], sentences[j])
-        }
-    }
-    Ok(())
-}
-
-pub fn normalize_l2(v: &Tensor) -> candle::Result<Tensor> {
-    v.broadcast_div(&v.sqr()?.sum_keepdim(1)?.sqrt()?)
-}
--- a/candle-examples/examples/llama2-c/main.rs
+++ b/candle-examples/examples/llama2-c/main.rs
@ -6,10 +6,9 @@ extern crate accelerate_src;
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;

-use candle_transformers::models::llama2_c as model;
-use candle_transformers::models::llama2_c_weights as weights;
-use candle_transformers::models::quantized_llama2_c as qmodel;
+mod model;
 mod training;
+mod weights;
 use clap::{Parser, Subcommand};

 use anyhow::{Error as E, Result};
@ -20,7 +19,6 @@ use std::io::Write;
 use tokenizers::Tokenizer;

 use model::{Config, Llama};
-use qmodel::QLlama;
 use weights::TransformerWeights;

 #[derive(Parser, Debug, Clone)]
@ -154,20 +152,6 @@ fn main() -> anyhow::Result<()> {
    Ok(())
 }

-enum Model {
-    Llama(Llama),
-    QLlama(QLlama),
-}
-
-impl Model {
-    fn forward(&self, xs: &Tensor, pos: usize) -> anyhow::Result<Tensor> {
-        match self {
-            Self::Llama(l) => Ok(l.forward(xs, pos)?),
-            Self::QLlama(l) => Ok(l.forward(xs, pos)?),
-        }
-    }
-}
-
 fn run_eval(args: &EvaluationCmd, common_args: &Args) -> Result<()> {
    use std::io::BufRead;

@ -257,66 +241,24 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {

    let device = candle_examples::device(common_args.cpu)?;

-    let is_gguf = config_path.extension().map_or(false, |v| v == "gguf");
    let is_safetensors = config_path
        .extension()
        .map_or(false, |v| v == "safetensors");
-    let (model, config) = if is_gguf {
-        let vb = qmodel::VarBuilder::from_gguf(config_path)?;
-        let (_vocab_size, dim) = vb
-            .get_no_shape("model.embed_tokens.weight")?
-            .shape()
-            .dims2()?;
-        let config = match dim {
-            64 => Config::tiny_260k(),
-            288 => Config::tiny_15m(),
-            512 => Config::tiny_42m(),
-            768 => Config::tiny_110m(),
-            _ => anyhow::bail!("no config for dim {dim}"),
-        };
-        let freq_cis_real = vb
-            .get(
-                (config.seq_len, config.head_size() / 2),
-                "rot.freq_cis_real",
-            )?
-            .dequantize(&candle::Device::Cpu)?;
-        let freq_cis_imag = vb
-            .get(
-                (config.seq_len, config.head_size() / 2),
-                "rot.freq_cis_imag",
-            )?
-            .dequantize(&candle::Device::Cpu)?;
-
-        let fake_vb = candle_nn::VarBuilder::from_tensors(
-            [
-                ("freq_cis_real".to_string(), freq_cis_real),
-                ("freq_cis_imag".to_string(), freq_cis_imag),
-            ]
-            .into_iter()
-            .collect(),
-            candle::DType::F32,
-            &candle::Device::Cpu,
-        );
-        let cache = model::Cache::new(true, &config, fake_vb)?;
-        let model = Model::QLlama(QLlama::load(vb, &cache, config.clone())?);
-        (model, config)
-    } else if is_safetensors {
-        let config = Config::tiny_15m();
+    let (vb, config) = if is_safetensors {
+        let config = Config::tiny();
        let tensors = candle::safetensors::load(config_path, &device)?;
        let vb = candle_nn::VarBuilder::from_tensors(tensors, candle::DType::F32, &device);
-        let cache = model::Cache::new(true, &config, vb.pp("rot"))?;
-        let model = Model::Llama(Llama::load(vb, &cache, config.clone())?);
-        (model, config)
+        (vb, config)
    } else {
        let mut file = std::fs::File::open(config_path)?;
        let config = Config::from_reader(&mut file)?;
        println!("{config:?}");
        let weights = TransformerWeights::from_reader(&mut file, &config, &device)?;
        let vb = weights.var_builder(&config, &device)?;
-        let cache = model::Cache::new(true, &config, vb.pp("rot"))?;
-        let model = Model::Llama(Llama::load(vb, &cache, config.clone())?);
-        (model, config)
+        (vb, config)
    };
+    let cache = model::Cache::new(true, &config, vb.pp("rot"))?;
+    let model = Llama::load(vb, &cache, config)?;

    println!("starting the inference loop");
    let mut logits_processor = LogitsProcessor::new(299792458, args.temperature, args.top_p);
@ -331,7 +273,7 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {

    let start_gen = std::time::Instant::now();
    for index in 0.. {
-        if tokens.len() >= config.seq_len {
+        if tokens.len() >= model.config.seq_len {
            break;
        }
        let context_size = if index > 0 { 1 } else { tokens.len() };
--- a/candle-examples/examples/llama2-c/model.rs
+++ b/candle-examples/examples/llama2-c/model.rs
@ -17,20 +17,7 @@ pub struct Config {
 }

 impl Config {
-    pub fn tiny_260k() -> Self {
-        Self {
-            dim: 64,
-            hidden_dim: 768,
-            n_layers: 5,
-            n_heads: 8,
-            n_kv_heads: 4,
-            vocab_size: 32000,
-            seq_len: 512,
-            norm_eps: 1e-5,
-        }
-    }
-
-    pub fn tiny_15m() -> Self {
+    pub fn tiny() -> Self {
        Self {
            dim: 288,
            hidden_dim: 768,
@ -42,32 +29,6 @@ impl Config {
            norm_eps: 1e-5,
        }
    }
-
-    pub fn tiny_42m() -> Self {
-        Self {
-            dim: 512,
-            hidden_dim: 768,
-            n_layers: 8,
-            n_heads: 8,
-            n_kv_heads: 8,
-            vocab_size: 32000,
-            seq_len: 1024,
-            norm_eps: 1e-5,
-        }
-    }
-
-    pub fn tiny_110m() -> Self {
-        Self {
-            dim: 768,
-            hidden_dim: 768,
-            n_layers: 12,
-            n_heads: 12,
-            n_kv_heads: 12,
-            vocab_size: 32000,
-            seq_len: 1024,
-            norm_eps: 1e-5,
-        }
-    }
 }

 #[derive(Clone)]
@ -75,9 +36,9 @@ pub struct Cache {
    masks: Arc<Mutex<HashMap<usize, Tensor>>>,
    pub use_kv_cache: bool,
    #[allow(clippy::type_complexity)]
-    pub kvs: Arc<Mutex<Vec<Option<(Tensor, Tensor)>>>>,
-    pub cos: Tensor,
-    pub sin: Tensor,
+    kvs: Arc<Mutex<Vec<Option<(Tensor, Tensor)>>>>,
+    cos: Tensor,
+    sin: Tensor,
    device: Device,
 }

@ -114,7 +75,7 @@ impl Cache {
        })
    }

-    pub fn mask(&self, t: usize) -> Result<Tensor> {
+    fn mask(&self, t: usize) -> Result<Tensor> {
        let mut masks = self.masks.lock().unwrap();
        if let Some(mask) = masks.get(&t) {
            Ok(mask.clone())
--- a/candle-examples/examples/llama2-c/training.rs
+++ b/candle-examples/examples/llama2-c/training.rs
@ -33,7 +33,7 @@ pub fn run(args: &crate::TrainingCmd, common_args: &crate::Args) -> Result<()> {
    );
    let varmap = candle_nn::VarMap::new();
    let vb = candle_nn::VarBuilder::from_varmap(&varmap, DType::F32, &device);
-    let config = Config::tiny_15m();
+    let config = Config::tiny();
    let iter = DatasetRandomIter::new(&dataset, false, config.seq_len, device.clone());
    let batch_iter = candle_datasets::Batcher::new_r2(iter).batch_size(args.batch_size);

--- a/candle-transformers/src/models/llama2_c_weights.rs
+++ b/candle-transformers/src/models/llama2_c_weights.rs
@ -1,8 +1,9 @@
+use anyhow::Result;
 use byteorder::{LittleEndian, ReadBytesExt};
-use candle::{DType, Device, IndexOp, Result, Shape, Tensor};
+use candle::{DType, Device, IndexOp, Shape, Tensor};
 use candle_nn::VarBuilder;

-use super::llama2_c::Config;
+use crate::model::Config;

 pub struct TransformerWeights {
    // token embedding table
--- a/candle-examples/examples/marian-mt/README.md
+++ b/candle-examples/examples/marian-mt/README.md
@ -1,38 +0,0 @@
-# candle-marian-mt
-
-`marian-mt` is a neural machine translation model. In this example it is used to
-translate text from French to English. See the associated [model
-card](https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-fr-en) for details on
-the model itself.
-
-## Running an example
-
-```bash
-cargo run --example marian-mt --release -- \
-    --text "Demain, dès l'aube, à l'heure où blanchit la campagne, Je partirai. Vois-tu, je sais que tu m'attends. J'irai par la forêt, j'irai par la montagne. Je ne puis demeurer loin de toi plus longtemps."
-```
-
-```
-<NIL> Tomorrow, at dawn, at the time when the country is whitening, I will go. See,
-I know you are waiting for me. I will go through the forest, I will go through the
-mountain. I cannot stay far from you any longer.</s>
-```
-
-## Generating the tokenizer.json files
-
-You can use the following script to generate the `tokenizer.json` config files
-from the hf-hub repos. This requires the `tokenizers` and `sentencepiece`
-packages to be install and use the `convert_slow_tokenizer.py` script from this
-directory.
-
-```python
-from convert_slow_tokenizer import MarianConverter
-from transformers import AutoTokenizer
-
-
-tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en", use_fast=False)
-fast_tokenizer = MarianConverter(tokenizer, index=0).converted()
-fast_tokenizer.save(f"tokenizer-marian-base-fr.json")
-fast_tokenizer = MarianConverter(tokenizer, index=1).converted()
-fast_tokenizer.save(f"tokenizer-marian-base-en.json")
-```
--- a/candle-examples/examples/marian-mt/convert_slow_tokenizer.py
+++ b/candle-examples/examples/marian-mt/convert_slow_tokenizer.py
--- a/candle-examples/examples/marian-mt/main.rs
+++ b/candle-examples/examples/marian-mt/main.rs
@ -1,152 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use anyhow::Error as E;
-use clap::{Parser, ValueEnum};
-
-use candle::{DType, Tensor};
-use candle_examples::token_output_stream::TokenOutputStream;
-use candle_nn::VarBuilder;
-use candle_transformers::models::marian;
-
-use tokenizers::Tokenizer;
-
-#[derive(Clone, Debug, Copy, ValueEnum)]
-enum Which {
-    Base,
-    Big,
-}
-
-// TODO: Maybe add support for the conditional prompt.
-#[derive(Parser)]
-struct Args {
-    #[arg(long)]
-    model: Option<String>,
-
-    #[arg(long)]
-    tokenizer: Option<String>,
-
-    #[arg(long)]
-    tokenizer_dec: Option<String>,
-
-    /// Choose the variant of the model to run.
-    #[arg(long, default_value = "big")]
-    which: Which,
-
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Use the quantized version of the model.
-    #[arg(long)]
-    quantized: bool,
-
-    /// Text to be translated
-    #[arg(long)]
-    text: String,
-}
-
-pub fn main() -> anyhow::Result<()> {
-    use hf_hub::api::sync::Api;
-    let args = Args::parse();
-
-    let config = match args.which {
-        Which::Base => marian::Config::opus_mt_fr_en(),
-        Which::Big => marian::Config::opus_mt_tc_big_fr_en(),
-    };
-    let tokenizer = {
-        let tokenizer = match args.tokenizer {
-            Some(tokenizer) => std::path::PathBuf::from(tokenizer),
-            None => {
-                let name = match args.which {
-                    Which::Base => "tokenizer-marian-base-fr.json",
-                    Which::Big => "tokenizer-marian-fr.json",
-                };
-                Api::new()?
-                    .model("lmz/candle-marian".to_string())
-                    .get(name)?
-            }
-        };
-        Tokenizer::from_file(&tokenizer).map_err(E::msg)?
-    };
-
-    let tokenizer_dec = {
-        let tokenizer = match args.tokenizer_dec {
-            Some(tokenizer) => std::path::PathBuf::from(tokenizer),
-            None => {
-                let name = match args.which {
-                    Which::Base => "tokenizer-marian-base-en.json",
-                    Which::Big => "tokenizer-marian-en.json",
-                };
-                Api::new()?
-                    .model("lmz/candle-marian".to_string())
-                    .get(name)?
-            }
-        };
-        Tokenizer::from_file(&tokenizer).map_err(E::msg)?
-    };
-    let mut tokenizer_dec = TokenOutputStream::new(tokenizer_dec);
-
-    let device = candle_examples::device(args.cpu)?;
-    let vb = {
-        let model = match args.model {
-            Some(model) => std::path::PathBuf::from(model),
-            None => match args.which {
-                Which::Base => Api::new()?
-                    .repo(hf_hub::Repo::with_revision(
-                        "Helsinki-NLP/opus-mt-fr-en".to_string(),
-                        hf_hub::RepoType::Model,
-                        "refs/pr/4".to_string(),
-                    ))
-                    .get("model.safetensors")?,
-                Which::Big => Api::new()?
-                    .model("Helsinki-NLP/opus-mt-tc-big-fr-en".to_string())
-                    .get("model.safetensors")?,
-            },
-        };
-        unsafe { VarBuilder::from_mmaped_safetensors(&[&model], DType::F32, &device)? }
-    };
-    let mut model = marian::MTModel::new(&config, vb)?;
-
-    let mut logits_processor =
-        candle_transformers::generation::LogitsProcessor::new(1337, None, None);
-
-    let encoder_xs = {
-        let mut tokens = tokenizer
-            .encode(args.text, true)
-            .map_err(E::msg)?
-            .get_ids()
-            .to_vec();
-        tokens.push(config.eos_token_id);
-        let tokens = Tensor::new(tokens.as_slice(), &device)?.unsqueeze(0)?;
-        model.encoder().forward(&tokens, 0)?
-    };
-
-    let mut token_ids = vec![config.decoder_start_token_id];
-    for index in 0..1000 {
-        let context_size = if index >= 1 { 1 } else { token_ids.len() };
-        let start_pos = token_ids.len().saturating_sub(context_size);
-        let input_ids = Tensor::new(&token_ids[start_pos..], &device)?.unsqueeze(0)?;
-        let logits = model.decode(&input_ids, &encoder_xs, start_pos)?;
-        let logits = logits.squeeze(0)?;
-        let logits = logits.get(logits.dim(0)? - 1)?;
-        let token = logits_processor.sample(&logits)?;
-        token_ids.push(token);
-        if let Some(t) = tokenizer_dec.next_token(token)? {
-            use std::io::Write;
-            print!("{t}");
-            std::io::stdout().flush()?;
-        }
-        if token == config.eos_token_id || token == config.forced_eos_token_id {
-            break;
-        }
-    }
-    if let Some(rest) = tokenizer_dec.decode_rest().map_err(E::msg)? {
-        print!("{rest}");
-    }
-    println!();
-    Ok(())
-}
--- a/candle-examples/examples/mnist-training/main.rs
+++ b/candle-examples/examples/mnist-training/main.rs
@ -9,7 +9,7 @@ use clap::{Parser, ValueEnum};
 use rand::prelude::*;

 use candle::{DType, Result, Tensor, D};
-use candle_nn::{loss, ops, Conv2d, Linear, Module, ModuleT, Optimizer, VarBuilder, VarMap};
+use candle_nn::{loss, ops, Conv2d, Linear, Module, Optimizer, VarBuilder, VarMap};

 const IMAGE_DIM: usize = 784;
 const LABELS: usize = 10;
@ -95,7 +95,7 @@ impl ConvNet {
            .flatten_from(1)?
            .apply(&self.fc1)?
            .relu()?;
-        self.dropout.forward_t(&xs, train)?.apply(&self.fc2)
+        self.dropout.forward(&xs, train)?.apply(&self.fc2)
    }
 }

--- a/candle-examples/examples/musicgen/encodec_model.rs
+++ b/candle-examples/examples/musicgen/encodec_model.rs
@ -8,7 +8,6 @@ use candle_nn::{conv1d, Conv1d, Conv1dConfig, VarBuilder};
 #[derive(Debug, Clone, PartialEq)]
 enum NormType {
    WeightNorm,
-    TimeGroupNorm,
    None,
 }

@ -269,7 +268,6 @@ impl Module for EncodecConvTranspose1d {
 struct EncodecConv1d {
    causal: bool,
    conv: Conv1d,
-    norm: Option<candle_nn::GroupNorm>,
 }

 impl EncodecConv1d {
@ -294,7 +292,7 @@ impl EncodecConv1d {
                },
                vb.pp("conv"),
            )?,
-            NormType::None | NormType::TimeGroupNorm => conv1d(
+            NormType::None => conv1d(
                in_c,
                out_c,
                kernel_size,
@ -307,17 +305,9 @@ impl EncodecConv1d {
                vb.pp("conv"),
            )?,
        };
-        let norm = match cfg.norm_type {
-            NormType::None | NormType::WeightNorm => None,
-            NormType::TimeGroupNorm => {
-                let gn = candle_nn::group_norm(1, out_c, 1e-5, vb.pp("norm"))?;
-                Some(gn)
-            }
-        };
        Ok(Self {
            causal: cfg.use_causal_conv,
            conv,
-            norm,
        })
    }
 }
@ -326,10 +316,8 @@ impl Module for EncodecConv1d {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        // TODO: padding, depending on causal.
        let xs = self.conv.forward(xs)?;
-        match &self.norm {
-            None => Ok(xs),
-            Some(norm) => xs.apply(norm),
-        }
+        // If we add support for NormType "time_group_norm", we should add some normalization here.
+        Ok(xs)
    }
 }

--- a/candle-examples/examples/onnx/README.md
+++ b/candle-examples/examples/onnx/README.md
@ -1,10 +0,0 @@
-## Using ONNX models in Candle
-
-This example demonstrates how to run ONNX based models in Candle, the model
-being used here is a small sequeezenet variant.
-
-You can run the example with the following command:
-
-```bash
-cargo run --example squeezenet-onnx --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg
-```
--- a/candle-examples/examples/onnx/main.rs
+++ b/candle-examples/examples/onnx/main.rs
@ -1,78 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use candle::{IndexOp, D};
-use clap::{Parser, ValueEnum};
-
-#[derive(Clone, Copy, Debug, ValueEnum)]
-enum Which {
-    SqueezeNet,
-    EfficientNet,
-}
-
-#[derive(Parser)]
-struct Args {
-    #[arg(long)]
-    image: String,
-
-    #[arg(long)]
-    model: Option<String>,
-
-    /// The model to be used.
-    #[arg(value_enum, long, default_value_t = Which::SqueezeNet)]
-    which: Which,
-}
-
-pub fn main() -> anyhow::Result<()> {
-    let args = Args::parse();
-    let image = candle_examples::imagenet::load_image224(args.image)?;
-    let image = match args.which {
-        Which::SqueezeNet => image,
-        Which::EfficientNet => image.permute((1, 2, 0))?,
-    };
-
-    println!("loaded image {image:?}");
-
-    let model = match args.model {
-        Some(model) => std::path::PathBuf::from(model),
-        None => match args.which {
-            Which::SqueezeNet => hf_hub::api::sync::Api::new()?
-                .model("lmz/candle-onnx".into())
-                .get("squeezenet1.1-7.onnx")?,
-            Which::EfficientNet => hf_hub::api::sync::Api::new()?
-                .model("onnx/EfficientNet-Lite4".into())
-                .get("efficientnet-lite4-11.onnx")?,
-        },
-    };
-
-    let model = candle_onnx::read_file(model)?;
-    let graph = model.graph.as_ref().unwrap();
-    let mut inputs = std::collections::HashMap::new();
-    inputs.insert(graph.input[0].name.to_string(), image.unsqueeze(0)?);
-    let mut outputs = candle_onnx::simple_eval(&model, inputs)?;
-    let output = outputs.remove(&graph.output[0].name).unwrap();
-    let prs = match args.which {
-        Which::SqueezeNet => candle_nn::ops::softmax(&output, D::Minus1)?,
-        Which::EfficientNet => output,
-    };
-    let prs = prs.i(0)?.to_vec1::<f32>()?;
-
-    // Sort the predictions and take the top 5
-    let mut top: Vec<_> = prs.iter().enumerate().collect();
-    top.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());
-    let top = top.into_iter().take(5).collect::<Vec<_>>();
-
-    // Print the top predictions
-    for &(i, p) in &top {
-        println!(
-            "{:50}: {:.2}%",
-            candle_examples::imagenet::CLASSES[i],
-            p * 100.0
-        );
-    }
-
-    Ok(())
-}
--- a/candle-examples/examples/onnx_basics.rs
+++ b/candle-examples/examples/onnx_basics.rs
@ -1,87 +0,0 @@
-use anyhow::Result;
-use candle::{Device, Tensor};
-
-use clap::{Parser, Subcommand};
-
-#[derive(Subcommand, Debug, Clone)]
-enum Command {
-    Print {
-        #[arg(long)]
-        file: String,
-    },
-    SimpleEval {
-        #[arg(long)]
-        file: String,
-    },
-}
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-pub struct Args {
-    #[command(subcommand)]
-    command: Command,
-}
-
-pub fn main() -> Result<()> {
-    let args = Args::parse();
-    match args.command {
-        Command::Print { file } => {
-            let model = candle_onnx::read_file(file)?;
-            println!("{model:?}");
-            let graph = model.graph.unwrap();
-            for node in graph.node.iter() {
-                println!("{node:?}");
-            }
-        }
-        Command::SimpleEval { file } => {
-            let model = candle_onnx::read_file(file)?;
-            let graph = model.graph.as_ref().unwrap();
-            let constants: std::collections::HashSet<_> =
-                graph.initializer.iter().map(|i| i.name.as_str()).collect();
-            let mut inputs = std::collections::HashMap::new();
-            for input in graph.input.iter() {
-                use candle_onnx::onnx::tensor_proto::DataType;
-                if constants.contains(input.name.as_str()) {
-                    continue;
-                }
-
-                let type_ = input.r#type.as_ref().expect("no type for input");
-                let type_ = type_.value.as_ref().expect("no type.value for input");
-                let value = match type_ {
-                    candle_onnx::onnx::type_proto::Value::TensorType(tt) => {
-                        let dt = match DataType::try_from(tt.elem_type) {
-                            Ok(dt) => match candle_onnx::dtype(dt) {
-                                Some(dt) => dt,
-                                None => {
-                                    anyhow::bail!(
-                                        "unsupported 'value' data-type {dt:?} for {}",
-                                        input.name
-                                    )
-                                }
-                            },
-                            type_ => anyhow::bail!("unsupported input type {type_:?}"),
-                        };
-                        let shape = tt.shape.as_ref().expect("no tensortype.shape for input");
-                        let dims = shape
-                                .dim
-                                .iter()
-                                .map(|dim| match dim.value.as_ref().expect("no dim value") {
-                                    candle_onnx::onnx::tensor_shape_proto::dimension::Value::DimValue(v) => Ok(*v as usize),
-                                    candle_onnx::onnx::tensor_shape_proto::dimension::Value::DimParam(_) => Ok(42),
-                                })
-                                .collect::<Result<Vec<usize>>>()?;
-                        Tensor::zeros(dims, dt, &Device::Cpu)?
-                    }
-                    type_ => anyhow::bail!("unsupported input type {type_:?}"),
-                };
-                println!("input {}: {value:?}", input.name);
-                inputs.insert(input.name.clone(), value);
-            }
-            let outputs = candle_onnx::simple_eval(&model, inputs)?;
-            for (name, value) in outputs.iter() {
-                println!("output {name}: {value:?}")
-            }
-        }
-    }
-    Ok(())
-}
--- a/candle-examples/examples/phi/main.rs
+++ b/candle-examples/examples/phi/main.rs
@ -124,7 +124,6 @@ enum WhichModel {
    #[value(name = "1.5")]
    V1_5,
    PuffinPhiV2,
-    PhiHermes,
 }

 #[derive(Parser, Debug)]
@ -225,9 +224,7 @@ fn main() -> Result<()> {
                match args.model {
                    WhichModel::V1 => "microsoft/phi-1".to_string(),
                    WhichModel::V1_5 => "microsoft/phi-1_5".to_string(),
-                    WhichModel::PuffinPhiV2 | WhichModel::PhiHermes => {
-                        "lmz/candle-quantized-phi".to_string()
-                    }
+                    WhichModel::PuffinPhiV2 => "lmz/candle-quantized-phi".to_string(),
                }
            }
        }
@ -241,7 +238,7 @@ fn main() -> Result<()> {
                match args.model {
                    WhichModel::V1 => "refs/pr/2".to_string(),
                    WhichModel::V1_5 => "refs/pr/18".to_string(),
-                    WhichModel::PuffinPhiV2 | WhichModel::PhiHermes => "main".to_string(),
+                    WhichModel::PuffinPhiV2 => "main".to_string(),
                }
            }
        }
@ -251,9 +248,7 @@ fn main() -> Result<()> {
        Some(file) => std::path::PathBuf::from(file),
        None => match args.model {
            WhichModel::V1 | WhichModel::V1_5 => repo.get("tokenizer.json")?,
-            WhichModel::PuffinPhiV2 | WhichModel::PhiHermes => {
-                repo.get("tokenizer-puffin-phi-v2.json")?
-            }
+            WhichModel::PuffinPhiV2 => repo.get("tokenizer-puffin-phi-v2.json")?,
        },
    };
    let filename = match args.weight_file {
@ -264,13 +259,11 @@ fn main() -> Result<()> {
                    WhichModel::V1 => repo.get("model-v1-q4k.gguf")?,
                    WhichModel::V1_5 => repo.get("model-q4k.gguf")?,
                    WhichModel::PuffinPhiV2 => repo.get("model-puffin-phi-v2-q4k.gguf")?,
-                    WhichModel::PhiHermes => repo.get("model-phi-hermes-1_3B-q4k.gguf")?,
                }
            } else {
                match args.model {
                    WhichModel::V1 | WhichModel::V1_5 => repo.get("model.safetensors")?,
                    WhichModel::PuffinPhiV2 => repo.get("model-puffin-phi-v2.safetensors")?,
-                    WhichModel::PhiHermes => repo.get("model-phi-hermes-1_3B.safetensors")?,
                }
            }
        }
@ -283,7 +276,6 @@ fn main() -> Result<()> {
        WhichModel::V1 => Config::v1(),
        WhichModel::V1_5 => Config::v1_5(),
        WhichModel::PuffinPhiV2 => Config::puffin_phi_v2(),
-        WhichModel::PhiHermes => Config::phi_hermes_1_3b(),
    };
    let (model, device) = if args.quantized {
        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(&filename)?;
--- a/candle-examples/examples/quantized-t5/README.md
+++ b/candle-examples/examples/quantized-t5/README.md
@ -1,7 +1,5 @@
 # candle-quantized-t5

-## Seq2Seq example
-
 This example uses a quantized version of the t5 model.

 ```bash
@ -10,8 +8,6 @@ $ cargo run --example quantized-t5 --release -- --prompt "translate to German: A
 Eine schöne Kerze.
 ```

-## Generating Quantized weight files
-
 The weight file is automatically retrieved from the hub. It is also possible to
 generate quantized weight files from the original safetensors file by using the
 `tensor-tools` command line utility via:
@ -20,11 +16,8 @@ generate quantized weight files from the original safetensors file by using the
 $ cargo run --example tensor-tools --release -- quantize --quantization q6k PATH/TO/T5/model.safetensors /tmp/model.gguf
 ```

-## Using custom models
-
-To use a different model, specify the `model-id`.
-
-For example, for text editing, you can use quantized [CoEdit models](https://huggingface.co/jbochi/candle-coedit-quantized).
+To use a different model, specify the `model-id`. For example, you can use
+quantized [CoEdit models](https://huggingface.co/jbochi/candle-coedit-quantized).

 ```bash
 $ cargo run --example quantized-t5 --release  -- \
@ -33,7 +26,6 @@ $ cargo run --example quantized-t5 --release  -- \
  --temperature 0
 ...
 Although their flight is weak, they run quickly through the tree canopy.
-```

 By default, it will look for `model.gguf` and `config.json`, but you can specify
 custom local or remote `weight-file` and `config-file`s:
@ -48,16 +40,3 @@ cargo run --example quantized-t5 --release  -- \
 ...
 Note that a storm surge is what forecasters consider a hurricane's most dangerous part.
 ```
-
-### [MADLAD-400](https://arxiv.org/abs/2309.04662)
-
-MADLAD-400 is a series of multilingual machine translation T5 models trained on 250 billion tokens covering over 450 languages using publicly available data. These models are competitive with significantly larger models.
-
-```bash
-cargo run --example quantized-t5 --release  -- \
-  --model-id "jbochi/madlad400-3b-mt" --weight-file "model-q4k.gguf" \
-  --prompt "<2de> How are you, my friend?" \
-  --temperature 0
-...
- Wie geht es dir, mein Freund?
-```
--- a/candle-examples/examples/quantized-t5/main.rs
+++ b/candle-examples/examples/quantized-t5/main.rs
@ -173,11 +173,7 @@ fn main() -> Result<()> {
        .to_vec();
    let input_token_ids = Tensor::new(&tokens[..], device)?.unsqueeze(0)?;
    let mut model = builder.build_model()?;
-    let mut output_token_ids = [builder
-        .config
-        .decoder_start_token_id
-        .unwrap_or(builder.config.pad_token_id) as u32]
-    .to_vec();
+    let mut output_token_ids = [builder.config.pad_token_id as u32].to_vec();
    let temperature = if args.temperature <= 0. {
        None
    } else {
--- a/candle-examples/examples/quantized/main.rs
+++ b/candle-examples/examples/quantized/main.rs
@ -12,7 +12,6 @@ use candle::quantized::{ggml_file, gguf_file};
 use candle::{Device, Tensor};
 use candle_transformers::generation::LogitsProcessor;

-use candle_examples::token_output_stream::TokenOutputStream;
 use candle_transformers::models::quantized_llama as model;
 use model::ModelWeights;

@ -25,7 +24,7 @@ enum Prompt {
    One(String),
 }

-#[derive(Clone, Debug, Copy, PartialEq, Eq, ValueEnum)]
+#[derive(Clone, Debug, Copy, ValueEnum)]
 enum Which {
    #[value(name = "7b")]
    L7b,
@ -49,12 +48,8 @@ enum Which {
    Mistral7b,
    #[value(name = "7b-mistral-instruct")]
    Mistral7bInstruct,
-    #[value(name = "7b-zephyr-a")]
-    Zephyr7bAlpha,
-    #[value(name = "7b-zephyr-b")]
-    Zephyr7bBeta,
-    #[value(name = "7b-open-chat-3.5")]
-    OpenChat35,
+    #[value(name = "7b-zephyr")]
+    Zephyr7b,
 }

 impl Which {
@ -69,50 +64,7 @@ impl Which {
            | Self::L7bCode
            | Self::L13bCode
            | Self::L34bCode => false,
-            // Zephyr and OpenChat are fine tuned versions of mistral and should be treated in the
-            // same way.
-            Self::OpenChat35
-            | Self::Zephyr7bAlpha
-            | Self::Zephyr7bBeta
-            | Self::Mistral7b
-            | Self::Mistral7bInstruct => true,
-        }
-    }
-
-    fn is_zephyr(&self) -> bool {
-        match self {
-            Self::L7b
-            | Self::L13b
-            | Self::L70b
-            | Self::L7bChat
-            | Self::L13bChat
-            | Self::L70bChat
-            | Self::L7bCode
-            | Self::L13bCode
-            | Self::L34bCode
-            | Self::Mistral7b
-            | Self::Mistral7bInstruct
-            | Self::OpenChat35 => false,
-            Self::Zephyr7bAlpha | Self::Zephyr7bBeta => true,
-        }
-    }
-
-    fn is_open_chat(&self) -> bool {
-        match self {
-            Which::L7b
-            | Which::L13b
-            | Which::L70b
-            | Which::L7bChat
-            | Which::L13bChat
-            | Which::L70bChat
-            | Which::L7bCode
-            | Which::L13bCode
-            | Which::L34bCode
-            | Which::Mistral7b
-            | Which::Mistral7bInstruct
-            | Which::Zephyr7bAlpha
-            | Which::Zephyr7bBeta => false,
-            Which::OpenChat35 => true,
+            Self::Mistral7b | Self::Mistral7bInstruct | Self::Zephyr7b => true,
        }
    }
 }
@ -131,7 +83,7 @@ struct Args {
    prompt: Option<String>,

    /// The length of the sample to generate (in tokens).
-    #[arg(short = 'n', long, default_value_t = 1000)]
+    #[arg(short = 'n', long, default_value_t = 100)]
    sample_len: usize,

    /// The tokenizer config in json format.
@ -181,9 +133,7 @@ impl Args {
            Some(config) => std::path::PathBuf::from(config),
            None => {
                let api = hf_hub::api::sync::Api::new()?;
-                let repo = if self.which.is_open_chat() {
-                    "openchat/openchat_3.5"
-                } else if self.which.is_mistral() {
+                let repo = if self.which.is_mistral() {
                    "mistralai/Mistral-7B-v0.1"
                } else {
                    "hf-internal-testing/llama-tokenizer"
@ -226,14 +176,10 @@ impl Args {
                        "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
                        "mistral-7b-instruct-v0.1.Q4_K_S.gguf",
                    ),
-                    Which::Zephyr7bAlpha => (
+                    Which::Zephyr7b => (
                        "TheBloke/zephyr-7B-alpha-GGUF",
                        "zephyr-7b-alpha.Q4_K_M.gguf",
                    ),
-                    Which::Zephyr7bBeta => {
-                        ("TheBloke/zephyr-7B-beta-GGUF", "zephyr-7b-beta.Q4_K_M.gguf")
-                    }
-                    Which::OpenChat35 => ("TheBloke/openchat_3.5-GGUF", "openchat_3.5.Q4_K_M.gguf"),
                };
                let api = hf_hub::api::sync::Api::new()?;
                let api = api.model(repo.to_string());
@ -244,6 +190,31 @@ impl Args {
    }
 }

+fn print_token(next_token: u32, tokenizer: &Tokenizer) {
+    // Extracting the last token as a string is complicated, here we just apply some simple
+    // heuristics as it seems to work well enough for this example. See the following for more
+    // details:
+    // https://github.com/huggingface/tokenizers/issues/1141#issuecomment-1562644141
+    if let Some(text) = tokenizer.id_to_token(next_token) {
+        let text = text.replace('▁', " ");
+        let ascii = text
+            .strip_prefix("<0x")
+            .and_then(|t| t.strip_suffix('>'))
+            .and_then(|t| u8::from_str_radix(t, 16).ok());
+        match ascii {
+            None => print!("{text}"),
+            Some(ascii) => {
+                if let Some(chr) = char::from_u32(ascii as u32) {
+                    if chr.is_ascii() {
+                        print!("{chr}")
+                    }
+                }
+            }
+        }
+        let _ = std::io::stdout().flush();
+    }
+}
+
 fn format_size(size_in_bytes: usize) -> String {
    if size_in_bytes < 1_000 {
        format!("{}B", size_in_bytes)
@ -332,11 +303,9 @@ fn main() -> anyhow::Result<()> {
                | Which::L34bCode => 1,
                Which::Mistral7b
                | Which::Mistral7bInstruct
-                | Which::Zephyr7bAlpha
-                | Which::Zephyr7bBeta
+                | Which::Zephyr7b
                | Which::L70b
-                | Which::L70bChat
-                | Which::OpenChat35 => 8,
+                | Which::L70bChat => 8,
            };
            ModelWeights::from_ggml(model, args.gqa.unwrap_or(default_gqa))?
        }
@ -344,7 +313,6 @@ fn main() -> anyhow::Result<()> {
    println!("model built");

    let tokenizer = args.tokenizer()?;
-    let mut tos = TokenOutputStream::new(tokenizer);
    let prompt = match args.prompt.as_deref() {
        Some("chat") => Prompt::Chat,
        Some("interactive") => Prompt::Interactive,
@ -353,11 +321,10 @@ fn main() -> anyhow::Result<()> {
    };

    let mut pre_prompt_tokens = vec![];
-    for prompt_index in 0.. {
+    loop {
        let prompt_str = match &prompt {
            Prompt::One(prompt) => prompt.clone(),
            Prompt::Interactive | Prompt::Chat => {
-                let is_interactive = matches!(prompt, Prompt::Interactive);
                print!("> ");
                std::io::stdout().flush()?;
                let mut prompt = String::new();
@ -368,15 +335,7 @@ fn main() -> anyhow::Result<()> {
                        prompt.pop();
                    }
                }
-                if args.which.is_open_chat() {
-                    format!("User: {prompt}<|end_of_turn|>Assistant: ")
-                } else if args.which.is_zephyr() {
-                    if prompt_index == 0 || is_interactive {
-                        format!("<|system|>\n</s>\n<|user|>\n{prompt}</s>\n<|assistant|>",)
-                    } else {
-                        format!("<|user|>\n{prompt}</s>\n<|assistant|>")
-                    }
-                } else if args.which.is_mistral() {
+                if args.which.is_mistral() {
                    format!("[INST] {prompt} [/INST]")
                } else {
                    prompt
@ -384,8 +343,7 @@ fn main() -> anyhow::Result<()> {
            }
        };
        print!("{}", &prompt_str);
-        let tokens = tos
-            .tokenizer()
+        let tokens = tokenizer
            .encode(prompt_str, true)
            .map_err(anyhow::Error::msg)?;
        if args.verbose_prompt {
@ -415,19 +373,11 @@ fn main() -> anyhow::Result<()> {
        };
        let prompt_dt = start_prompt_processing.elapsed();
        all_tokens.push(next_token);
-        if let Some(t) = tos.next_token(next_token)? {
-            print!("{t}");
-            std::io::stdout().flush()?;
-        }
+        print_token(next_token, &tokenizer);
+
+        let eos_token = *tokenizer.get_vocab(true).get("</s>").unwrap();

-        let eos_token = if args.which.is_open_chat() {
-            "<|end_of_turn|>"
-        } else {
-            "</s>"
-        };
-        let eos_token = *tos.tokenizer().get_vocab(true).get(eos_token).unwrap();
        let start_post_prompt = std::time::Instant::now();
-        let mut sampled = 0;
        for index in 0..to_sample {
            let input = Tensor::new(&[next_token], &Device::Cpu)?.unsqueeze(0)?;
            let logits = model.forward(&input, prompt_tokens.len() + index)?;
@ -444,19 +394,11 @@ fn main() -> anyhow::Result<()> {
            };
            next_token = logits_processor.sample(&logits)?;
            all_tokens.push(next_token);
-            if let Some(t) = tos.next_token(next_token)? {
-                print!("{t}");
-                std::io::stdout().flush()?;
-            }
-            sampled += 1;
+            print_token(next_token, &tokenizer);
            if next_token == eos_token {
                break;
            };
        }
-        if let Some(rest) = tos.decode_rest().map_err(candle::Error::msg)? {
-            print!("{rest}");
-        }
-        std::io::stdout().flush()?;
        let dt = start_post_prompt.elapsed();
        println!(
            "\n\n{:4} prompt tokens processed: {:.2} token/s",
@ -464,8 +406,9 @@ fn main() -> anyhow::Result<()> {
            prompt_tokens.len() as f64 / prompt_dt.as_secs_f64(),
        );
        println!(
-            "{sampled:4} tokens generated: {:.2} token/s",
-            sampled as f64 / dt.as_secs_f64(),
+            "{:4} tokens generated: {:.2} token/s",
+            to_sample,
+            to_sample as f64 / dt.as_secs_f64(),
        );

        match prompt {
--- a/candle-examples/examples/reinforcement-learning/ddpg.rs
+++ b/candle-examples/examples/reinforcement-learning/ddpg.rs
@ -1,451 +0,0 @@
-use std::collections::VecDeque;
-use std::fmt::Display;
-
-use candle::{DType, Device, Error, Module, Result, Tensor, Var};
-use candle_nn::{
-    func, linear, sequential::seq, Activation, AdamW, Optimizer, ParamsAdamW, Sequential,
-    VarBuilder, VarMap,
-};
-use rand::{distributions::Uniform, thread_rng, Rng};
-
-pub struct OuNoise {
-    mu: f64,
-    theta: f64,
-    sigma: f64,
-    state: Tensor,
-}
-impl OuNoise {
-    pub fn new(mu: f64, theta: f64, sigma: f64, size_action: usize) -> Result<Self> {
-        Ok(Self {
-            mu,
-            theta,
-            sigma,
-            state: Tensor::ones(size_action, DType::F32, &Device::Cpu)?,
-        })
-    }
-
-    pub fn sample(&mut self) -> Result<Tensor> {
-        let rand = Tensor::randn_like(&self.state, 0.0, 1.0)?;
-        let dx = ((self.theta * (self.mu - &self.state)?)? + (self.sigma * rand)?)?;
-        self.state = (&self.state + dx)?;
-        Ok(self.state.clone())
-    }
-}
-
-#[derive(Clone)]
-struct Transition {
-    state: Tensor,
-    action: Tensor,
-    reward: Tensor,
-    next_state: Tensor,
-    terminated: bool,
-    truncated: bool,
-}
-impl Transition {
-    fn new(
-        state: &Tensor,
-        action: &Tensor,
-        reward: &Tensor,
-        next_state: &Tensor,
-        terminated: bool,
-        truncated: bool,
-    ) -> Self {
-        Self {
-            state: state.clone(),
-            action: action.clone(),
-            reward: reward.clone(),
-            next_state: next_state.clone(),
-            terminated,
-            truncated,
-        }
-    }
-}
-
-pub struct ReplayBuffer {
-    buffer: VecDeque<Transition>,
-    capacity: usize,
-    size: usize,
-}
-impl ReplayBuffer {
-    pub fn new(capacity: usize) -> Self {
-        Self {
-            buffer: VecDeque::with_capacity(capacity),
-            capacity,
-            size: 0,
-        }
-    }
-
-    pub fn push(
-        &mut self,
-        state: &Tensor,
-        action: &Tensor,
-        reward: &Tensor,
-        next_state: &Tensor,
-        terminated: bool,
-        truncated: bool,
-    ) {
-        if self.size == self.capacity {
-            self.buffer.pop_front();
-        } else {
-            self.size += 1;
-        }
-        self.buffer.push_back(Transition::new(
-            state, action, reward, next_state, terminated, truncated,
-        ));
-    }
-
-    #[allow(clippy::type_complexity)]
-    pub fn random_batch(
-        &self,
-        batch_size: usize,
-    ) -> Result<Option<(Tensor, Tensor, Tensor, Tensor, Vec<bool>, Vec<bool>)>> {
-        if self.size < batch_size {
-            Ok(None)
-        } else {
-            let transitions: Vec<&Transition> = thread_rng()
-                .sample_iter(Uniform::from(0..self.size))
-                .take(batch_size)
-                .map(|i| self.buffer.get(i).unwrap())
-                .collect();
-
-            let states: Vec<Tensor> = transitions
-                .iter()
-                .map(|t| t.state.unsqueeze(0))
-                .collect::<Result<_>>()?;
-            let actions: Vec<Tensor> = transitions
-                .iter()
-                .map(|t| t.action.unsqueeze(0))
-                .collect::<Result<_>>()?;
-            let rewards: Vec<Tensor> = transitions
-                .iter()
-                .map(|t| t.reward.unsqueeze(0))
-                .collect::<Result<_>>()?;
-            let next_states: Vec<Tensor> = transitions
-                .iter()
-                .map(|t| t.next_state.unsqueeze(0))
-                .collect::<Result<_>>()?;
-            let terminateds: Vec<bool> = transitions.iter().map(|t| t.terminated).collect();
-            let truncateds: Vec<bool> = transitions.iter().map(|t| t.truncated).collect();
-
-            Ok(Some((
-                Tensor::cat(&states, 0)?,
-                Tensor::cat(&actions, 0)?,
-                Tensor::cat(&rewards, 0)?,
-                Tensor::cat(&next_states, 0)?,
-                terminateds,
-                truncateds,
-            )))
-        }
-    }
-}
-
-fn track(
-    varmap: &mut VarMap,
-    vb: &VarBuilder,
-    target_prefix: &str,
-    network_prefix: &str,
-    dims: &[(usize, usize)],
-    tau: f64,
-) -> Result<()> {
-    for (i, &(in_dim, out_dim)) in dims.iter().enumerate() {
-        let target_w = vb.get((out_dim, in_dim), &format!("{target_prefix}-fc{i}.weight"))?;
-        let network_w = vb.get((out_dim, in_dim), &format!("{network_prefix}-fc{i}.weight"))?;
-        varmap.set_one(
-            format!("{target_prefix}-fc{i}.weight"),
-            ((tau * network_w)? + ((1.0 - tau) * target_w)?)?,
-        )?;
-
-        let target_b = vb.get(out_dim, &format!("{target_prefix}-fc{i}.bias"))?;
-        let network_b = vb.get(out_dim, &format!("{network_prefix}-fc{i}.bias"))?;
-        varmap.set_one(
-            format!("{target_prefix}-fc{i}.bias"),
-            ((tau * network_b)? + ((1.0 - tau) * target_b)?)?,
-        )?;
-    }
-    Ok(())
-}
-
-struct Actor<'a> {
-    varmap: VarMap,
-    vb: VarBuilder<'a>,
-    network: Sequential,
-    target_network: Sequential,
-    size_state: usize,
-    size_action: usize,
-    dims: Vec<(usize, usize)>,
-}
-
-impl Actor<'_> {
-    fn new(device: &Device, dtype: DType, size_state: usize, size_action: usize) -> Result<Self> {
-        let mut varmap = VarMap::new();
-        let vb = VarBuilder::from_varmap(&varmap, dtype, device);
-
-        let dims = vec![(size_state, 400), (400, 300), (300, size_action)];
-
-        let make_network = |prefix: &str| {
-            let seq = seq()
-                .add(linear(
-                    dims[0].0,
-                    dims[0].1,
-                    vb.pp(format!("{prefix}-fc0")),
-                )?)
-                .add(Activation::Relu)
-                .add(linear(
-                    dims[1].0,
-                    dims[1].1,
-                    vb.pp(format!("{prefix}-fc1")),
-                )?)
-                .add(Activation::Relu)
-                .add(linear(
-                    dims[2].0,
-                    dims[2].1,
-                    vb.pp(format!("{prefix}-fc2")),
-                )?)
-                .add(func(|xs| xs.tanh()));
-            Ok::<Sequential, Error>(seq)
-        };
-
-        let network = make_network("actor")?;
-        let target_network = make_network("target-actor")?;
-
-        // this sets the two networks to be equal to each other using tau = 1.0
-        track(&mut varmap, &vb, "target-actor", "actor", &dims, 1.0);
-
-        Ok(Self {
-            varmap,
-            vb,
-            network,
-            target_network,
-            size_state,
-            size_action,
-            dims,
-        })
-    }
-
-    fn forward(&self, state: &Tensor) -> Result<Tensor> {
-        self.network.forward(state)
-    }
-
-    fn target_forward(&self, state: &Tensor) -> Result<Tensor> {
-        self.target_network.forward(state)
-    }
-
-    fn track(&mut self, tau: f64) -> Result<()> {
-        track(
-            &mut self.varmap,
-            &self.vb,
-            "target-actor",
-            "actor",
-            &self.dims,
-            tau,
-        )
-    }
-}
-
-struct Critic<'a> {
-    varmap: VarMap,
-    vb: VarBuilder<'a>,
-    network: Sequential,
-    target_network: Sequential,
-    size_state: usize,
-    size_action: usize,
-    dims: Vec<(usize, usize)>,
-}
-
-impl Critic<'_> {
-    fn new(device: &Device, dtype: DType, size_state: usize, size_action: usize) -> Result<Self> {
-        let mut varmap = VarMap::new();
-        let vb = VarBuilder::from_varmap(&varmap, dtype, device);
-
-        let dims: Vec<(usize, usize)> = vec![(size_state + size_action, 400), (400, 300), (300, 1)];
-
-        let make_network = |prefix: &str| {
-            let seq = seq()
-                .add(linear(
-                    dims[0].0,
-                    dims[0].1,
-                    vb.pp(format!("{prefix}-fc0")),
-                )?)
-                .add(Activation::Relu)
-                .add(linear(
-                    dims[1].0,
-                    dims[1].1,
-                    vb.pp(format!("{prefix}-fc1")),
-                )?)
-                .add(Activation::Relu)
-                .add(linear(
-                    dims[2].0,
-                    dims[2].1,
-                    vb.pp(format!("{prefix}-fc2")),
-                )?);
-            Ok::<Sequential, Error>(seq)
-        };
-
-        let network = make_network("critic")?;
-        let target_network = make_network("target-critic")?;
-
-        // this sets the two networks to be equal to each other using tau = 1.0
-        track(&mut varmap, &vb, "target-critic", "critic", &dims, 1.0);
-
-        Ok(Self {
-            varmap,
-            vb,
-            network,
-            target_network,
-            size_state,
-            size_action,
-            dims,
-        })
-    }
-
-    fn forward(&self, state: &Tensor, action: &Tensor) -> Result<Tensor> {
-        let xs = Tensor::cat(&[action, state], 1)?;
-        self.network.forward(&xs)
-    }
-
-    fn target_forward(&self, state: &Tensor, action: &Tensor) -> Result<Tensor> {
-        let xs = Tensor::cat(&[action, state], 1)?;
-        self.target_network.forward(&xs)
-    }
-
-    fn track(&mut self, tau: f64) -> Result<()> {
-        track(
-            &mut self.varmap,
-            &self.vb,
-            "target-critic",
-            "critic",
-            &self.dims,
-            tau,
-        )
-    }
-}
-
-#[allow(clippy::upper_case_acronyms)]
-pub struct DDPG<'a> {
-    actor: Actor<'a>,
-    actor_optim: AdamW,
-    critic: Critic<'a>,
-    critic_optim: AdamW,
-    gamma: f64,
-    tau: f64,
-    replay_buffer: ReplayBuffer,
-    ou_noise: OuNoise,
-
-    size_state: usize,
-    size_action: usize,
-    pub train: bool,
-}
-
-impl DDPG<'_> {
-    #[allow(clippy::too_many_arguments)]
-    pub fn new(
-        device: &Device,
-        size_state: usize,
-        size_action: usize,
-        train: bool,
-        actor_lr: f64,
-        critic_lr: f64,
-        gamma: f64,
-        tau: f64,
-        buffer_capacity: usize,
-        ou_noise: OuNoise,
-    ) -> Result<Self> {
-        let filter_by_prefix = |varmap: &VarMap, prefix: &str| {
-            varmap
-                .data()
-                .lock()
-                .unwrap()
-                .iter()
-                .filter_map(|(name, var)| name.starts_with(prefix).then_some(var.clone()))
-                .collect::<Vec<Var>>()
-        };
-
-        let actor = Actor::new(device, DType::F32, size_state, size_action)?;
-        let actor_optim = AdamW::new(
-            filter_by_prefix(&actor.varmap, "actor"),
-            ParamsAdamW {
-                lr: actor_lr,
-                ..Default::default()
-            },
-        )?;
-
-        let critic = Critic::new(device, DType::F32, size_state, size_action)?;
-        let critic_optim = AdamW::new(
-            filter_by_prefix(&critic.varmap, "critic"),
-            ParamsAdamW {
-                lr: critic_lr,
-                ..Default::default()
-            },
-        )?;
-
-        Ok(Self {
-            actor,
-            actor_optim,
-            critic,
-            critic_optim,
-            gamma,
-            tau,
-            replay_buffer: ReplayBuffer::new(buffer_capacity),
-            ou_noise,
-            size_state,
-            size_action,
-            train,
-        })
-    }
-
-    pub fn remember(
-        &mut self,
-        state: &Tensor,
-        action: &Tensor,
-        reward: &Tensor,
-        next_state: &Tensor,
-        terminated: bool,
-        truncated: bool,
-    ) {
-        self.replay_buffer
-            .push(state, action, reward, next_state, terminated, truncated)
-    }
-
-    pub fn actions(&mut self, state: &Tensor) -> Result<f32> {
-        let actions = self
-            .actor
-            .forward(&state.detach()?.unsqueeze(0)?)?
-            .squeeze(0)?;
-        let actions = if self.train {
-            (actions + self.ou_noise.sample()?)?
-        } else {
-            actions
-        };
-        actions.squeeze(0)?.to_scalar::<f32>()
-    }
-
-    pub fn train(&mut self, batch_size: usize) -> Result<()> {
-        let (states, actions, rewards, next_states, _, _) =
-            match self.replay_buffer.random_batch(batch_size)? {
-                Some(v) => v,
-                _ => return Ok(()),
-            };
-
-        let q_target = self
-            .critic
-            .target_forward(&next_states, &self.actor.target_forward(&next_states)?)?;
-        let q_target = (rewards + (self.gamma * q_target)?.detach())?;
-        let q = self.critic.forward(&states, &actions)?;
-        let diff = (q_target - q)?;
-
-        let critic_loss = diff.sqr()?.mean_all()?;
-        self.critic_optim.backward_step(&critic_loss)?;
-
-        let actor_loss = self
-            .critic
-            .forward(&states, &self.actor.forward(&states)?)?
-            .mean_all()?
-            .neg()?;
-        self.actor_optim.backward_step(&actor_loss)?;
-
-        self.critic.track(self.tau)?;
-        self.actor.track(self.tau)?;
-
-        Ok(())
-    }
-}
--- a/candle-examples/examples/reinforcement-learning/gym_env.rs
+++ b/candle-examples/examples/reinforcement-learning/gym_env.rs
@ -7,22 +7,20 @@ use pyo3::types::PyDict;
 /// The return value for a step.
 #[derive(Debug)]
 pub struct Step<A> {
-    pub state: Tensor,
+    pub obs: Tensor,
    pub action: A,
    pub reward: f64,
-    pub terminated: bool,
-    pub truncated: bool,
+    pub is_done: bool,
 }

 impl<A: Copy> Step<A> {
    /// Returns a copy of this step changing the observation tensor.
-    pub fn copy_with_obs(&self, state: &Tensor) -> Step<A> {
+    pub fn copy_with_obs(&self, obs: &Tensor) -> Step<A> {
        Step {
-            state: state.clone(),
+            obs: obs.clone(),
            action: self.action,
            reward: self.reward,
-            terminated: self.terminated,
-            truncated: self.truncated,
+            is_done: self.is_done,
        }
    }
 }
@ -65,14 +63,14 @@ impl GymEnv {

    /// Resets the environment, returning the observation tensor.
    pub fn reset(&self, seed: u64) -> Result<Tensor> {
-        let state: Vec<f32> = Python::with_gil(|py| {
+        let obs: Vec<f32> = Python::with_gil(|py| {
            let kwargs = PyDict::new(py);
            kwargs.set_item("seed", seed)?;
-            let state = self.env.call_method(py, "reset", (), Some(kwargs))?;
-            state.as_ref(py).get_item(0)?.extract()
+            let obs = self.env.call_method(py, "reset", (), Some(kwargs))?;
+            obs.as_ref(py).get_item(0)?.extract()
        })
        .map_err(w)?;
-        Tensor::new(state, &Device::Cpu)
+        Tensor::new(obs, &Device::Cpu)
    }

    /// Applies an environment step using the specified action.
@ -80,23 +78,21 @@ impl GymEnv {
        &self,
        action: A,
    ) -> Result<Step<A>> {
-        let (state, reward, terminated, truncated) = Python::with_gil(|py| {
+        let (obs, reward, is_done) = Python::with_gil(|py| {
            let step = self.env.call_method(py, "step", (action.clone(),), None)?;
            let step = step.as_ref(py);
-            let state: Vec<f32> = step.get_item(0)?.extract()?;
+            let obs: Vec<f32> = step.get_item(0)?.extract()?;
            let reward: f64 = step.get_item(1)?.extract()?;
-            let terminated: bool = step.get_item(2)?.extract()?;
-            let truncated: bool = step.get_item(3)?.extract()?;
-            Ok((state, reward, terminated, truncated))
+            let is_done: bool = step.get_item(2)?.extract()?;
+            Ok((obs, reward, is_done))
        })
        .map_err(w)?;
-        let state = Tensor::new(state, &Device::Cpu)?;
+        let obs = Tensor::new(obs, &Device::Cpu)?;
        Ok(Step {
-            state,
-            action,
+            obs,
            reward,
-            terminated,
-            truncated,
+            is_done,
+            action,
        })
    }

--- a/candle-examples/examples/reinforcement-learning/main.rs
+++ b/candle-examples/examples/reinforcement-learning/main.rs
@ -9,34 +9,14 @@ extern crate accelerate_src;
 mod gym_env;
 mod vec_gym_env;

-mod ddpg;
-
-use candle::{Device, Result, Tensor};
+use candle::Result;
 use clap::Parser;
 use rand::Rng;

-// The impact of the q value of the next state on the current state's q value.
-const GAMMA: f64 = 0.99;
-// The weight for updating the target networks.
-const TAU: f64 = 0.005;
-// The capacity of the replay buffer used for sampling training data.
-const REPLAY_BUFFER_CAPACITY: usize = 100_000;
-// The training batch size for each training iteration.
-const TRAINING_BATCH_SIZE: usize = 100;
 // The total number of episodes.
 const MAX_EPISODES: usize = 100;
 // The maximum length of an episode.
 const EPISODE_LENGTH: usize = 200;
-// The number of training iterations after one episode finishes.
-const TRAINING_ITERATIONS: usize = 200;
-
-// Ornstein-Uhlenbeck process parameters.
-const MU: f64 = 0.0;
-const THETA: f64 = 0.15;
-const SIGMA: f64 = 0.1;
-
-const ACTOR_LEARNING_RATE: f64 = 1e-4;
-const CRITIC_LEARNING_RATE: f64 = 1e-3;

 #[derive(Parser, Debug, Clone)]
 #[command(author, version, about, long_about = None)]
@ -68,77 +48,28 @@ fn main() -> Result<()> {
    println!("action space: {}", env.action_space());
    println!("observation space: {:?}", env.observation_space());

-    let size_state = env.observation_space().iter().product::<usize>();
-    let size_action = env.action_space();
-
-    let mut agent = ddpg::DDPG::new(
-        &Device::Cpu,
-        size_state,
-        size_action,
-        true,
-        ACTOR_LEARNING_RATE,
-        CRITIC_LEARNING_RATE,
-        GAMMA,
-        TAU,
-        REPLAY_BUFFER_CAPACITY,
-        ddpg::OuNoise::new(MU, THETA, SIGMA, size_action)?,
-    )?;
+    let _num_obs = env.observation_space().iter().product::<usize>();
+    let _num_actions = env.action_space();

    let mut rng = rand::thread_rng();

    for episode in 0..MAX_EPISODES {
-        // let mut state = env.reset(episode as u64)?;
-        let mut state = env.reset(rng.gen::<u64>())?;
+        let mut obs = env.reset(episode as u64)?;

        let mut total_reward = 0.0;
        for _ in 0..EPISODE_LENGTH {
-            let mut action = 2.0 * agent.actions(&state)?;
-            action = action.clamp(-2.0, 2.0);
+            let actions = rng.gen_range(-2.0..2.0);

-            let step = env.step(vec![action])?;
+            let step = env.step(vec![actions])?;
            total_reward += step.reward;

-            agent.remember(
-                &state,
-                &Tensor::new(vec![action], &Device::Cpu)?,
-                &Tensor::new(vec![step.reward as f32], &Device::Cpu)?,
-                &step.state,
-                step.terminated,
-                step.truncated,
-            );
-
-            if step.terminated || step.truncated {
+            if step.is_done {
                break;
            }
-            state = step.state;
+            obs = step.obs;
        }

        println!("episode {episode} with total reward of {total_reward}");
-
-        for _ in 0..TRAINING_ITERATIONS {
-            agent.train(TRAINING_BATCH_SIZE)?;
-        }
-    }
-
-    println!("Testing...");
-    agent.train = false;
-    for episode in 0..10 {
-        // let mut state = env.reset(episode as u64)?;
-        let mut state = env.reset(rng.gen::<u64>())?;
-        let mut total_reward = 0.0;
-        for _ in 0..EPISODE_LENGTH {
-            let mut action = 2.0 * agent.actions(&state)?;
-            action = action.clamp(-2.0, 2.0);
-
-            let step = env.step(vec![action])?;
-            total_reward += step.reward;
-
-            if step.terminated || step.truncated {
-                break;
-            }
-            state = step.state;
-        }
-        println!("episode {episode} with total reward of {total_reward}");
    }
    Ok(())
 }
--- a/candle-examples/examples/stable-diffusion/main.rs
+++ b/candle-examples/examples/stable-diffusion/main.rs
@ -416,7 +416,7 @@ fn run(args: Args) -> Result<()> {

    println!("Building the autoencoder.");
    let vae_weights = ModelFile::Vae.get(vae_weights, sd_version, use_f16)?;
-    let vae = sd_config.build_vae(vae_weights, &device, dtype)?;
+    let vae = sd_config.build_vae(&vae_weights, &device, dtype)?;
    let init_latent_dist = match &img2img {
        None => None,
        Some(image) => {
@ -426,7 +426,7 @@ fn run(args: Args) -> Result<()> {
    };
    println!("Building the unet.");
    let unet_weights = ModelFile::Unet.get(unet_weights, sd_version, use_f16)?;
-    let unet = sd_config.build_unet(unet_weights, &device, 4, use_flash_attn, dtype)?;
+    let unet = sd_config.build_unet(&unet_weights, &device, 4, use_flash_attn, dtype)?;

    let t_start = if img2img.is_some() {
        n_steps - (n_steps as f64 * img2img_strength) as usize
--- a/candle-examples/examples/t5/README.md
+++ b/candle-examples/examples/t5/README.md
@ -5,26 +5,12 @@
 ```bash
 $ cargo run --example t5 --release -- --model-id "t5-small" --prompt "translate to German: A beautiful candle." --decode
 ...
+Running on CPU, to run on GPU, build this example with `--features cuda`
 Eine schöne Kerze.
 9 tokens generated (2.42 token/s)
 ```

-Variants such as [flan-t5](https://huggingface.co/google/flan-t5-small), [flan-ul2](https://huggingface.co/google/flan-ul2) (with `--revision "refs/pr/25"`), and [Co-EdIT](https://huggingface.co/grammarly/coedit-large) are also supported.
-
-## Translation with [MADLAD-400](https://arxiv.org/abs/2309.04662)
-
-MADLAD-400 is a series of multilingual machine translation T5 models trained on 250 billion tokens covering over 450 languages using publicly available data. These models are competitive with significantly larger models.
-
-```bash
-cargo run --example t5 --release  -- \
-  --model-id "jbochi/madlad400-3b-mt" \
-  --prompt "<2de> How are you, my friend?" \
-  --decode --temperature 0
-...
- Wie geht es dir, mein Freund?
-```
-
-## Sentence embedding example
+## Sentence embedding example:

 ```bash
 $ cargo run --example t5 --release -- --model-id "t5-small" --prompt "A beautiful candle."
--- a/candle-examples/examples/t5/main.rs
+++ b/candle-examples/examples/t5/main.rs
@ -104,17 +104,6 @@ impl T5ModelBuilder {
                api.get("model-00004-of-00005.safetensors")?,
                api.get("model-00005-of-00005.safetensors")?,
            ]
-        } else if model_id == "google/flan-ul2" {
-            vec![
-                api.get("model-00001-of-00008.safetensors")?,
-                api.get("model-00002-of-00008.safetensors")?,
-                api.get("model-00003-of-00008.safetensors")?,
-                api.get("model-00004-of-00008.safetensors")?,
-                api.get("model-00005-of-00008.safetensors")?,
-                api.get("model-00006-of-00008.safetensors")?,
-                api.get("model-00007-of-00008.safetensors")?,
-                api.get("model-00008-of-00008.safetensors")?,
-            ]
        } else {
            vec![api.get("model.safetensors")?]
        };
@ -183,12 +172,7 @@ fn main() -> Result<()> {
                println!("Took {:?}", start.elapsed());
            } else {
                let mut model = builder.build_conditional_generation()?;
-                let mut output_token_ids = [builder
-                    .config
-                    .decoder_start_token_id
-                    .unwrap_or(builder.config.pad_token_id)
-                    as u32]
-                .to_vec();
+                let mut output_token_ids = [builder.config.pad_token_id as u32].to_vec();
                if let Some(decoder_prompt) = &args.decoder_prompt {
                    print!("{decoder_prompt}");
                    output_token_ids.extend(
--- a/candle-examples/examples/trocr/assets/trocr.png
+++ b/candle-examples/examples/trocr/assets/trocr.png
--- a/candle-examples/examples/trocr/image_processor.rs
+++ b/candle-examples/examples/trocr/image_processor.rs
@ -1,154 +0,0 @@
-use image::{DynamicImage, ImageBuffer};
-use serde::Deserialize;
-use std::collections::HashMap;
-
-use candle::{DType, Device, Result, Tensor};
-
-#[derive(Debug, Clone, PartialEq, Deserialize)]
-pub struct ProcessorConfig {
-    do_resize: bool,
-    height: u32,
-    width: u32,
-    do_rescale: bool,
-    do_normalize: bool,
-    image_mean: Vec<f32>,
-    image_std: Vec<f32>,
-}
-
-impl Default for ProcessorConfig {
-    fn default() -> Self {
-        Self {
-            do_resize: true,
-            height: 384,
-            width: 384,
-            do_rescale: true,
-            do_normalize: true,
-            image_mean: vec![0.5, 0.5, 0.5],
-            image_std: vec![0.5, 0.5, 0.5],
-        }
-    }
-}
-
-pub struct ViTImageProcessor {
-    do_resize: bool,
-    height: u32,
-    width: u32,
-    do_normalize: bool,
-    image_mean: Vec<f32>,
-    image_std: Vec<f32>,
-}
-
-impl ViTImageProcessor {
-    pub fn new(config: &ProcessorConfig) -> Self {
-        Self {
-            do_resize: config.do_resize,
-            height: config.height,
-            width: config.width,
-            do_normalize: config.do_normalize,
-            image_mean: config.image_mean.clone(),
-            image_std: config.image_std.clone(),
-        }
-    }
-
-    pub fn preprocess(&self, images: Vec<&str>) -> Result<Tensor> {
-        let height = self.height as usize;
-        let width = self.width as usize;
-        let channels = 3;
-
-        let images = self.load_images(images)?;
-
-        let resized_images: Vec<DynamicImage> = if self.do_resize {
-            images
-                .iter()
-                .map(|image| self.resize(image.clone(), None).unwrap())
-                .collect()
-        } else {
-            images
-        };
-
-        let normalized_images: Vec<Tensor> = if self.do_normalize {
-            resized_images
-                .iter()
-                .map(|image| self.normalize(image.clone(), None, None).unwrap())
-                .collect()
-        } else {
-            let resized_images: Vec<ImageBuffer<image::Rgb<u8>, Vec<u8>>> =
-                resized_images.iter().map(|image| image.to_rgb8()).collect();
-            let data = resized_images
-                .into_iter()
-                .map(|image| image.into_raw())
-                .collect::<Vec<Vec<u8>>>();
-
-            data.iter()
-                .map(|image| {
-                    Tensor::from_vec(image.clone(), (height, width, channels), &Device::Cpu)
-                        .unwrap()
-                        .permute((2, 0, 1))
-                        .unwrap()
-                })
-                .collect::<Vec<Tensor>>()
-        };
-
-        Tensor::stack(&normalized_images, 0)
-    }
-
-    fn resize(
-        &self,
-        image: image::DynamicImage,
-        size: Option<HashMap<String, u32>>,
-    ) -> Result<image::DynamicImage> {
-        let (height, width) = match &size {
-            Some(size) => (size.get("height").unwrap(), size.get("width").unwrap()),
-            None => (&self.height, &self.width),
-        };
-
-        let resized_image =
-            image.resize_exact(*width, *height, image::imageops::FilterType::Triangle);
-
-        Ok(resized_image)
-    }
-
-    fn normalize(
-        &self,
-        image: image::DynamicImage,
-        mean: Option<Vec<f32>>,
-        std: Option<Vec<f32>>,
-    ) -> Result<Tensor> {
-        let mean = match mean {
-            Some(mean) => mean,
-            None => self.image_mean.clone(),
-        };
-
-        let std = match std {
-            Some(std) => std,
-            None => self.image_std.clone(),
-        };
-
-        let mean = Tensor::from_vec(mean, (3, 1, 1), &Device::Cpu)?;
-        let std = Tensor::from_vec(std, (3, 1, 1), &Device::Cpu)?;
-
-        let image = image.to_rgb8();
-        let data = image.into_raw();
-
-        let height = self.height as usize;
-        let width = self.width as usize;
-        let channels = 3;
-
-        let data =
-            Tensor::from_vec(data, &[height, width, channels], &Device::Cpu)?.permute((2, 0, 1))?;
-
-        (data.to_dtype(DType::F32)? / 255.)?
-            .broadcast_sub(&mean)?
-            .broadcast_div(&std)
-    }
-
-    pub fn load_images(&self, image_path: Vec<&str>) -> Result<Vec<image::DynamicImage>> {
-        let mut images: Vec<image::DynamicImage> = Vec::new();
-        for path in image_path {
-            let img = image::io::Reader::open(path)?.decode().unwrap();
-            images.push(img);
-        }
-
-        Ok(images)
-    }
-}
--- a/candle-examples/examples/trocr/main.rs
+++ b/candle-examples/examples/trocr/main.rs
@ -1,132 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use anyhow::Error as E;
-use clap::{Parser, ValueEnum};
-
-use candle::{DType, Tensor};
-use candle_examples::token_output_stream::TokenOutputStream;
-use candle_nn::VarBuilder;
-use candle_transformers::models::trocr;
-
-use tokenizers::Tokenizer;
-mod image_processor;
-
-#[derive(Clone, Debug, Copy, ValueEnum)]
-enum Which {
-    Base,
-    Large,
-}
-
-#[derive(Parser, Debug)]
-struct Args {
-    #[arg(long)]
-    model: Option<String>,
-
-    /// Choose the variant of the model to run.
-    #[arg(long, default_value = "base")]
-    which: Which,
-
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Text to be translated
-    #[arg(long)]
-    image: String,
-}
-
-pub fn main() -> anyhow::Result<()> {
-    use hf_hub::api::sync::Api;
-    let args = Args::parse();
-
-    let tokenizer_dec = {
-        let tokenizer = Api::new()?
-            .model(String::from("ToluClassics/candle-trocr-tokenizer"))
-            .get("tokenizer.json")?;
-
-        Tokenizer::from_file(&tokenizer).map_err(E::msg)?
-    };
-
-    let mut tokenizer_dec = TokenOutputStream::new(tokenizer_dec);
-
-    let device = candle_examples::device(args.cpu)?;
-
-    let vb = {
-        let model = match args.model {
-            Some(model) => std::path::PathBuf::from(model),
-            None => match args.which {
-                Which::Base => Api::new()?
-                    .repo(hf_hub::Repo::with_revision(
-                        "microsoft/trocr-base-handwritten".to_string(),
-                        hf_hub::RepoType::Model,
-                        "refs/pr/3".to_string(),
-                    ))
-                    .get("model.safetensors")?,
-                Which::Large => Api::new()?
-                    .repo(hf_hub::Repo::with_revision(
-                        "microsoft/trocr-large-handwritten".to_string(),
-                        hf_hub::RepoType::Model,
-                        "refs/pr/6".to_string(),
-                    ))
-                    .get("model.safetensors")?,
-            },
-        };
-        println!("model: {:?}", model);
-        unsafe { VarBuilder::from_mmaped_safetensors(&[model], DType::F32, &device)? }
-    };
-
-    let encoder_config = match args.which {
-        Which::Base => candle_transformers::models::vit::Config::microsoft_trocr_base_handwritten(),
-        Which::Large => {
-            candle_transformers::models::vit::Config::microsoft_trocr_base_handwritten()
-        }
-    };
-
-    let decoder_config = trocr::TrOCRConfig::default();
-    let mut model = trocr::TrOCRModel::new(&encoder_config, &decoder_config, vb)?;
-
-    let config = image_processor::ProcessorConfig::default();
-    let processor = image_processor::ViTImageProcessor::new(&config);
-
-    let image = vec![args.image.as_str()];
-    let image = processor.preprocess(image)?;
-
-    let encoder_xs = model.encoder().forward(&image)?;
-
-    let mut logits_processor =
-        candle_transformers::generation::LogitsProcessor::new(1337, None, None);
-
-    let mut token_ids: Vec<u32> = vec![decoder_config.decoder_start_token_id];
-    for index in 0..1000 {
-        let context_size = if index >= 1 { 1 } else { token_ids.len() };
-        let start_pos = token_ids.len().saturating_sub(context_size);
-        let input_ids = Tensor::new(&token_ids[start_pos..], &device)?.unsqueeze(0)?;
-
-        let logits = model.decode(&input_ids, &encoder_xs, start_pos)?;
-
-        let logits = logits.squeeze(0)?;
-        let logits = logits.get(logits.dim(0)? - 1)?;
-        let token = logits_processor.sample(&logits)?;
-        token_ids.push(token);
-
-        if let Some(t) = tokenizer_dec.next_token(token)? {
-            use std::io::Write;
-            print!("{t}");
-            std::io::stdout().flush()?;
-        }
-        if token == decoder_config.eos_token_id {
-            break;
-        }
-    }
-
-    if let Some(rest) = tokenizer_dec.decode_rest().map_err(E::msg)? {
-        print!("{rest}");
-    }
-    println!();
-
-    Ok(())
-}
--- a/candle-examples/examples/trocr/readme.md
+++ b/candle-examples/examples/trocr/readme.md
@ -1,16 +0,0 @@
-# candle-trocr
-
-`TrOCR` is a transformer OCR Model. In this example it is used to
-transcribe image text. See the associated [model
-card](https://huggingface.co/microsoft/trocr-base-printed) for details on
-the model itself.
-
-## Running an example
-
-```bash
-cargo run --example trocr --release --  --which base --cpu --image candle-examples/examples/trocr/assets/trocr.png
-```
-
-```
-<s> industry , Mr. Brown commented icily . " Let us have a</s>
-```
--- a/candle-examples/examples/vgg/README.md
+++ b/candle-examples/examples/vgg/README.md
@ -1,13 +0,0 @@
-## VGG Model Implementation
-
-This example demonstrates the implementation of VGG models (VGG13, VGG16, VGG19) using the Candle library.
-
-The VGG models are defined in `candle-transformers/src/models/vgg.rs`. The main function in `candle-examples/examples/vgg/main.rs` loads an image, selects the VGG model based on the provided argument, and applies the model to the loaded image.
-
-You can run the example with the following command:
-
-```bash
-cargo run --example vgg --release -- --image ../yolo-v8/assets/bike.jpg --which vgg13
-```
-
-In the command above, `--image` specifies the path to the image file and `--which` specifies the VGG model to use (vgg13, vgg16, or vgg19).
--- a/candle-examples/examples/vgg/main.rs
+++ b/candle-examples/examples/vgg/main.rs
@ -1,77 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use candle::{DType, IndexOp, D};
-use candle_nn::{ModuleT, VarBuilder};
-use candle_transformers::models::vgg::{Models, Vgg};
-use clap::{Parser, ValueEnum};
-
-#[derive(Clone, Copy, Debug, ValueEnum)]
-enum Which {
-    Vgg13,
-    Vgg16,
-    Vgg19,
-}
-
-#[derive(Parser)]
-struct Args {
-    #[arg(long)]
-    image: String,
-
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Variant of the model to use.
-    #[arg(value_enum, long, default_value_t = Which::Vgg13)]
-    which: Which,
-}
-
-pub fn main() -> anyhow::Result<()> {
-    let args = Args::parse();
-    let device = candle_examples::device(args.cpu)?;
-    let image = candle_examples::imagenet::load_image224(args.image)?;
-
-    println!("loaded image {image:?}");
-
-    let api = hf_hub::api::sync::Api::new()?;
-    let repo = match args.which {
-        Which::Vgg13 => "timm/vgg13.tv_in1k",
-        Which::Vgg16 => "timm/vgg16.tv_in1k",
-        Which::Vgg19 => "timm/vgg19.tv_in1k",
-    };
-    let api = api.model(repo.into());
-    let filename = "model.safetensors";
-    let model_file = api.get(filename)?;
-
-    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
-    let model = match args.which {
-        Which::Vgg13 => Vgg::new(vb, Models::Vgg13)?,
-        Which::Vgg16 => Vgg::new(vb, Models::Vgg16)?,
-        Which::Vgg19 => Vgg::new(vb, Models::Vgg19)?,
-    };
-    let logits = model.forward_t(&image, /*train=*/ false)?;
-
-    let prs = candle_nn::ops::softmax(&logits, D::Minus1)?
-        .i(0)?
-        .to_vec1::<f32>()?;
-
-    // Sort the predictions and take the top 5
-    let mut top: Vec<_> = prs.iter().enumerate().collect();
-    top.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());
-    let top = top.into_iter().take(5).collect::<Vec<_>>();
-
-    // Print the top predictions
-    for &(i, p) in &top {
-        println!(
-            "{:50}: {:.2}%",
-            candle_examples::imagenet::CLASSES[i],
-            p * 100.0
-        );
-    }
-
-    Ok(())
-}
--- a/candle-examples/examples/whisper/main.rs
+++ b/candle-examples/examples/whisper/main.rs
@ -128,13 +128,7 @@ impl Decoder {
        let transcribe_token = token_id(&tokenizer, m::TRANSCRIBE_TOKEN)?;
        let translate_token = token_id(&tokenizer, m::TRANSLATE_TOKEN)?;
        let eot_token = token_id(&tokenizer, m::EOT_TOKEN)?;
-        let no_speech_token = m::NO_SPEECH_TOKENS
-            .iter()
-            .find_map(|token| token_id(&tokenizer, token).ok());
-        let no_speech_token = match no_speech_token {
-            None => anyhow::bail!("unable to find any non-speech token"),
-            Some(n) => n,
-        };
+        let no_speech_token = token_id(&tokenizer, m::NO_SPEECH_TOKEN)?;
        Ok(Self {
            model,
            rng: rand::rngs::StdRng::seed_from_u64(seed),
@ -351,7 +345,7 @@ enum Task {
    Translate,
 }

-#[derive(Clone, Copy, Debug, PartialEq, Eq, ValueEnum)]
+#[derive(Clone, Copy, Debug, ValueEnum)]
 enum WhichModel {
    Tiny,
    #[value(name = "tiny.en")]
@ -367,27 +361,15 @@ enum WhichModel {
    MediumEn,
    Large,
    LargeV2,
-    LargeV3,
-    #[value(name = "distil-medium.en")]
-    DistilMediumEn,
-    #[value(name = "distil-large-v2")]
-    DistilLargeV2,
 }

 impl WhichModel {
    fn is_multilingual(&self) -> bool {
        match self {
-            Self::Tiny
-            | Self::Base
-            | Self::Small
-            | Self::Medium
-            | Self::Large
-            | Self::LargeV2
-            | Self::LargeV3
-            | Self::DistilLargeV2 => true,
-            Self::TinyEn | Self::BaseEn | Self::SmallEn | Self::MediumEn | Self::DistilMediumEn => {
-                false
+            Self::Tiny | Self::Base | Self::Small | Self::Medium | Self::Large | Self::LargeV2 => {
+                true
            }
+            Self::TinyEn | Self::BaseEn | Self::SmallEn | Self::MediumEn => false,
        }
    }

@ -403,9 +385,6 @@ impl WhichModel {
            Self::MediumEn => ("openai/whisper-medium.en", "main"),
            Self::Large => ("openai/whisper-large", "refs/pr/36"),
            Self::LargeV2 => ("openai/whisper-large-v2", "refs/pr/57"),
-            Self::LargeV3 => ("openai/whisper-large-v3", "main"),
-            Self::DistilMediumEn => ("distil-whisper/distil-medium.en", "main"),
-            Self::DistilLargeV2 => ("distil-whisper/distil-large-v2", "main"),
        }
    }
 }
@ -517,21 +496,17 @@ fn main() -> Result<()> {
                repo.get(&format!("model-{ext}-q80.gguf"))?,
            )
        } else {
-            let config = repo.get("config.json")?;
-            let tokenizer = repo.get("tokenizer.json")?;
-            let model = repo.get("model.safetensors")?;
-            (config, tokenizer, model)
+            (
+                repo.get("config.json")?,
+                repo.get("tokenizer.json")?,
+                repo.get("model.safetensors")?,
+            )
        };
        (config, tokenizer, model, sample)
    };
-    let config: Config = serde_json::from_str(&std::fs::read_to_string(config_filename)?)?;
    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;

-    let mel_bytes = match config.num_mel_bins {
-        80 => include_bytes!("melfilters.bytes").as_slice(),
-        128 => include_bytes!("melfilters128.bytes").as_slice(),
-        nmel => anyhow::bail!("unexpected num_mel_bins {nmel}"),
-    };
+    let mel_bytes = include_bytes!("melfilters.bytes");
    let mut mel_filters = vec![0f32; mel_bytes.len() / 4];
    <byteorder::LittleEndian as byteorder::ByteOrder>::read_f32_into(mel_bytes, &mut mel_filters);

@ -547,15 +522,12 @@ fn main() -> Result<()> {
        .map(|v| *v as f32 / 32768.)
        .collect();
    println!("pcm data loaded {}", pcm_data.len());
-    let mel = audio::pcm_to_mel(&config, &pcm_data, &mel_filters);
+    let mel = audio::pcm_to_mel(&pcm_data, &mel_filters);
    let mel_len = mel.len();
-    let mel = Tensor::from_vec(
-        mel,
-        (1, config.num_mel_bins, mel_len / config.num_mel_bins),
-        &device,
-    )?;
+    let mel = Tensor::from_vec(mel, (1, m::N_MELS, mel_len / m::N_MELS), &device)?;
    println!("loaded mel: {:?}", mel.dims());

+    let config: Config = serde_json::from_str(&std::fs::read_to_string(config_filename)?)?;
    let mut model = if args.quantized {
        let vb =
            candle_transformers::quantized_var_builder::VarBuilder::from_gguf(&weights_filename)?;
--- a/candle-examples/examples/whisper/melfilters128.bytes
+++ b/candle-examples/examples/whisper/melfilters128.bytes
--- a/candle-examples/examples/yi/main.rs
+++ b/candle-examples/examples/yi/main.rs
@ -1,268 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use anyhow::{Error as E, Result};
-use clap::{Parser, ValueEnum};
-
-use candle_transformers::models::yi::{Config, Model};
-
-use candle::{DType, Device, Tensor};
-use candle_examples::token_output_stream::TokenOutputStream;
-use candle_nn::VarBuilder;
-use candle_transformers::generation::LogitsProcessor;
-use hf_hub::{api::sync::Api, Repo, RepoType};
-use tokenizers::Tokenizer;
-
-#[derive(Clone, Debug, Copy, PartialEq, Eq, ValueEnum)]
-enum Which {
-    #[value(name = "6b")]
-    L6b,
-    #[value(name = "34b")]
-    L34b,
-}
-
-struct TextGeneration {
-    model: Model,
-    device: Device,
-    tokenizer: TokenOutputStream,
-    logits_processor: LogitsProcessor,
-    repeat_penalty: f32,
-    repeat_last_n: usize,
-}
-
-impl TextGeneration {
-    #[allow(clippy::too_many_arguments)]
-    fn new(
-        model: Model,
-        tokenizer: Tokenizer,
-        seed: u64,
-        temp: Option<f64>,
-        top_p: Option<f64>,
-        repeat_penalty: f32,
-        repeat_last_n: usize,
-        device: &Device,
-    ) -> Self {
-        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
-        Self {
-            model,
-            tokenizer: TokenOutputStream::new(tokenizer),
-            logits_processor,
-            repeat_penalty,
-            repeat_last_n,
-            device: device.clone(),
-        }
-    }
-
-    fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
-        use std::io::Write;
-        self.tokenizer.clear();
-        let mut tokens = self
-            .tokenizer
-            .tokenizer()
-            .encode(prompt, true)
-            .map_err(E::msg)?
-            .get_ids()
-            .to_vec();
-        for &t in tokens.iter() {
-            if let Some(t) = self.tokenizer.next_token(t)? {
-                print!("{t}")
-            }
-        }
-        std::io::stdout().flush()?;
-
-        let mut generated_tokens = 0usize;
-        let eos_token = match self.tokenizer.get_token("<|endoftext|>") {
-            Some(token) => token,
-            None => anyhow::bail!("cannot find the <|endoftext|> token"),
-        };
-        let start_gen = std::time::Instant::now();
-        for index in 0..sample_len {
-            let context_size = if index > 0 { 1 } else { tokens.len() };
-            let start_pos = tokens.len().saturating_sub(context_size);
-            let ctxt = &tokens[start_pos..];
-            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
-            let logits = self.model.forward(&input, start_pos)?;
-            let logits = logits.squeeze(0)?.squeeze(0)?.to_dtype(DType::F32)?;
-            let logits = if self.repeat_penalty == 1. {
-                logits
-            } else {
-                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
-                candle_transformers::utils::apply_repeat_penalty(
-                    &logits,
-                    self.repeat_penalty,
-                    &tokens[start_at..],
-                )?
-            };
-
-            let next_token = self.logits_processor.sample(&logits)?;
-            tokens.push(next_token);
-            generated_tokens += 1;
-            if next_token == eos_token {
-                break;
-            }
-            if let Some(t) = self.tokenizer.next_token(next_token)? {
-                print!("{t}");
-                std::io::stdout().flush()?;
-            }
-        }
-        let dt = start_gen.elapsed();
-        if let Some(rest) = self.tokenizer.decode_rest().map_err(E::msg)? {
-            print!("{rest}");
-        }
-        std::io::stdout().flush()?;
-        println!(
-            "\n{generated_tokens} tokens generated ({:.2} token/s)",
-            generated_tokens as f64 / dt.as_secs_f64(),
-        );
-        Ok(())
-    }
-}
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Enable tracing (generates a trace-timestamp.json file).
-    #[arg(long)]
-    tracing: bool,
-
-    #[arg(long)]
-    prompt: String,
-
-    /// The temperature used to generate samples.
-    #[arg(long)]
-    temperature: Option<f64>,
-
-    /// Nucleus sampling probability cutoff.
-    #[arg(long)]
-    top_p: Option<f64>,
-
-    /// The seed to use when generating random samples.
-    #[arg(long, default_value_t = 299792458)]
-    seed: u64,
-
-    /// The length of the sample to generate (in tokens).
-    #[arg(long, short = 'n', default_value_t = 100)]
-    sample_len: usize,
-
-    #[arg(long, default_value = "01-ai/Yi-6B")]
-    model_id: String,
-
-    #[arg(long, default_value = "main")]
-    revision: String,
-
-    #[arg(long)]
-    tokenizer_file: Option<String>,
-
-    #[arg(long)]
-    weight_files: Option<String>,
-
-    /// Penalty to be applied for repeating tokens, 1. means no penalty.
-    #[arg(long, default_value_t = 1.1)]
-    repeat_penalty: f32,
-
-    /// The context size to consider for the repeat penalty.
-    #[arg(long, default_value_t = 64)]
-    repeat_last_n: usize,
-
-    /// The model size to use.
-    #[arg(long, default_value = "6b")]
-    which: Which,
-}
-
-fn main() -> Result<()> {
-    use tracing_chrome::ChromeLayerBuilder;
-    use tracing_subscriber::prelude::*;
-
-    let args = Args::parse();
-    let _guard = if args.tracing {
-        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
-        tracing_subscriber::registry().with(chrome_layer).init();
-        Some(guard)
-    } else {
-        None
-    };
-    println!(
-        "avx: {}, neon: {}, simd128: {}, f16c: {}",
-        candle::utils::with_avx(),
-        candle::utils::with_neon(),
-        candle::utils::with_simd128(),
-        candle::utils::with_f16c()
-    );
-    println!(
-        "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
-        args.temperature.unwrap_or(0.),
-        args.repeat_penalty,
-        args.repeat_last_n
-    );
-
-    let start = std::time::Instant::now();
-    let api = Api::new()?;
-    let repo = api.repo(Repo::with_revision(
-        args.model_id,
-        RepoType::Model,
-        args.revision,
-    ));
-    let tokenizer_filename = match args.tokenizer_file {
-        Some(file) => std::path::PathBuf::from(file),
-        None => repo.get("tokenizer.json")?,
-    };
-    let filenames = match args.weight_files {
-        Some(files) => files
-            .split(',')
-            .map(std::path::PathBuf::from)
-            .collect::<Vec<_>>(),
-        None => match args.which {
-            Which::L6b => vec![
-                repo.get("model-00001-of-00002.safetensors")?,
-                repo.get("model-00002-of-00002.safetensors")?,
-            ],
-            Which::L34b => vec![
-                repo.get("model-00001-of-00007.safetensors")?,
-                repo.get("model-00002-of-00007.safetensors")?,
-                repo.get("model-00003-of-00007.safetensors")?,
-                repo.get("model-00004-of-00007.safetensors")?,
-                repo.get("model-00005-of-00007.safetensors")?,
-                repo.get("model-00006-of-00007.safetensors")?,
-                repo.get("model-00007-of-00007.safetensors")?,
-            ],
-        },
-    };
-    println!("retrieved the files in {:?}", start.elapsed());
-    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
-
-    let start = std::time::Instant::now();
-    let config = match args.which {
-        Which::L6b => Config::config_6b(),
-        Which::L34b => Config::config_34b(),
-    };
-    let device = candle_examples::device(args.cpu)?;
-    let dtype = if device.is_cuda() {
-        DType::BF16
-    } else {
-        DType::F32
-    };
-    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
-    let model = Model::new(&config, vb)?;
-
-    println!("loaded the model in {:?}", start.elapsed());
-
-    let mut pipeline = TextGeneration::new(
-        model,
-        tokenizer,
-        args.seed,
-        args.temperature,
-        args.top_p,
-        args.repeat_penalty,
-        args.repeat_last_n,
-        &device,
-    );
-    pipeline.run(&args.prompt, args.sample_len)?;
-    Ok(())
-}
--- a/candle-examples/examples/yolo-v3/main.rs
+++ b/candle-examples/examples/yolo-v3/main.rs
@ -43,7 +43,6 @@ pub fn report(
    confidence_threshold: f32,
    nms_threshold: f32,
 ) -> Result<DynamicImage> {
-    let pred = pred.to_device(&Device::Cpu)?;
    let (npreds, pred_size) = pred.dims2()?;
    let nclasses = pred_size - 5;
    // The bounding boxes grouped by (maximum) class index.
--- a/candle-examples/examples/yolo-v8/README.md
+++ b/candle-examples/examples/yolo-v8/README.md
@ -32,7 +32,7 @@ Image source:
 ### Pose Estimation
 ```bash
 cargo run --example yolo-v8 --release -- \
-  candle-examples/examples/yolo-v8/assets/bike.jpg --task pose
+  candle-examples/examples/yolo-v8/assets/peoples.jpeg --task pose
 ```

 ![Leading group, Giro d'Italia 2021](./assets/bike.pose.jpg)
--- a/candle-examples/examples/yolo-v8/main.rs
+++ b/candle-examples/examples/yolo-v8/main.rs
@ -7,7 +7,7 @@ extern crate accelerate_src;
 mod model;
 use model::{Multiples, YoloV8, YoloV8Pose};

-use candle::{DType, Device, IndexOp, Result, Tensor};
+use candle::{DType, IndexOp, Result, Tensor};
 use candle_nn::{Module, VarBuilder};
 use candle_transformers::object_detection::{non_maximum_suppression, Bbox, KeyPoint};
 use clap::{Parser, ValueEnum};
@ -61,7 +61,6 @@ pub fn report_detect(
    nms_threshold: f32,
    legend_size: u32,
 ) -> Result<DynamicImage> {
-    let pred = pred.to_device(&Device::Cpu)?;
    let (pred_size, npreds) = pred.dims2()?;
    let nclasses = pred_size - 4;
    // The bounding boxes grouped by (maximum) class index.
@ -154,7 +153,6 @@ pub fn report_pose(
    confidence_threshold: f32,
    nms_threshold: f32,
 ) -> Result<DynamicImage> {
-    let pred = pred.to_device(&Device::Cpu)?;
    let (pred_size, npreds) = pred.dims2()?;
    if pred_size != 17 * 3 + 4 + 1 {
        candle::bail!("unexpected pred-size {pred_size}");
--- a/candle-examples/examples/yolo-v8/model.rs
+++ b/candle-examples/examples/yolo-v8/model.rs
@ -1,5 +1,7 @@
 use candle::{DType, IndexOp, Result, Tensor, D};
-use candle_nn::{batch_norm, conv2d, conv2d_no_bias, Conv2d, Conv2dConfig, Module, VarBuilder};
+use candle_nn::{
+    batch_norm, conv2d, conv2d_no_bias, BatchNorm, Conv2d, Conv2dConfig, Module, VarBuilder,
+};

 #[derive(Clone, Copy, PartialEq, Debug)]
 pub struct Multiples {
@ -74,6 +76,7 @@ impl Module for Upsample {
 #[derive(Debug)]
 struct ConvBlock {
    conv: Conv2d,
+    bn: BatchNorm,
    span: tracing::Span,
 }

@ -93,10 +96,11 @@ impl ConvBlock {
            groups: 1,
            dilation: 1,
        };
+        let conv = conv2d_no_bias(c1, c2, k, cfg, vb.pp("conv"))?;
        let bn = batch_norm(c2, 1e-3, vb.pp("bn"))?;
-        let conv = conv2d_no_bias(c1, c2, k, cfg, vb.pp("conv"))?.absorb_bn(&bn)?;
        Ok(Self {
            conv,
+            bn,
            span: tracing::span!(tracing::Level::TRACE, "conv-block"),
        })
    }
@ -106,6 +110,7 @@ impl Module for ConvBlock {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        let _enter = self.span.enter();
        let xs = self.conv.forward(xs)?;
+        let xs = self.bn.forward(&xs)?;
        candle_nn::ops::silu(&xs)
    }
 }
--- a/candle-examples/src/lib.rs
+++ b/candle-examples/src/lib.rs
@ -2,28 +2,17 @@ pub mod coco_classes;
 pub mod imagenet;
 pub mod token_output_stream;

-use candle::utils::{cuda_is_available, metal_is_available};
 use candle::{Device, Result, Tensor};

 pub fn device(cpu: bool) -> Result<Device> {
    if cpu {
        Ok(Device::Cpu)
-    } else if cuda_is_available() {
-        Ok(Device::new_cuda(0)?)
-    } else if metal_is_available() {
-        Ok(Device::new_metal(0)?)
    } else {
-        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-        {
-            println!(
-                "Running on CPU, to run on GPU(metal), build this example with `--features metal`"
-            );
-        }
-        #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
-        {
+        let device = Device::cuda_if_available(0)?;
+        if !device.is_cuda() {
            println!("Running on CPU, to run on GPU, build this example with `--features cuda`");
        }
-        Ok(Device::Cpu)
+        Ok(device)
    }
 }

--- a/candle-flash-attn/Cargo.toml
+++ b/candle-flash-attn/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "candle-flash-attn"
-version = "0.3.1"
+version = "0.3.0"
 edition = "2021"

 description = "Flash attention layer for the candle ML framework."
@ -11,7 +11,7 @@ license = "MIT OR Apache-2.0"
 readme = "README.md"

 [dependencies]
-candle = { path = "../candle-core", features = ["cuda"], version = "0.3.1", package = "candle-core" }
+candle = { path = "../candle-core", features = ["cuda"], version = "0.3.0", package = "candle-core" }
 half = { version = "2.3.1", features = ["num-traits"] }

 [build-dependencies]
@ -21,4 +21,4 @@ rayon = "1.7.0"

 [dev-dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
-candle-nn = { path = "../candle-nn", version = "0.3.1", features = ["cuda"] }
+candle-nn = { path = "../candle-nn", version = "0.3.0", features = ["cuda"] }
--- a/candle-flash-attn/src/lib.rs
+++ b/candle-flash-attn/src/lib.rs
@ -233,8 +233,8 @@ impl FlashAttnVarLen {

        let (seqlens_q, seqlens_q_layout) = self.seqlens_q.storage_and_layout();
        let seqlens_q = match &*seqlens_q {
+            candle::Storage::Cpu(_) => candle::bail!("seqlens_q must be a cuda tensor"),
            candle::Storage::Cuda(c) => c.as_cuda_slice::<u32>()?, // Should be i32!
-            _ => candle::bail!("seqlens_q must be a cuda tensor"),
        };
        let seqlens_q = match seqlens_q_layout.contiguous_offsets() {
            Some((o1, o2)) => seqlens_q.slice(o1..o2),
@ -243,8 +243,8 @@ impl FlashAttnVarLen {

        let (seqlens_k, seqlens_k_layout) = self.seqlens_k.storage_and_layout();
        let seqlens_k = match &*seqlens_k {
+            candle::Storage::Cpu(_) => candle::bail!("seqlens_k must be a cuda tensor"),
            candle::Storage::Cuda(c) => c.as_cuda_slice::<u32>()?, // Should be i32!
-            _ => candle::bail!("seqlens_k must be a cuda tensor"),
        };
        let seqlens_k = match seqlens_k_layout.contiguous_offsets() {
            Some((o1, o2)) => seqlens_k.slice(o1..o2),
--- a/candle-kernels/Cargo.toml
+++ b/candle-kernels/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "candle-kernels"
-version = "0.3.1"
+version = "0.3.0"
 edition = "2021"

 description = "CUDA kernels for Candle"
@ -14,4 +14,4 @@ license = "MIT OR Apache-2.0"
 [build-dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
 glob = "0.3.1"
-rayon = "1.7.0"
+rayon = "1.7.0"
--- a/candle-metal-kernels/Cargo.toml
+++ b/candle-metal-kernels/Cargo.toml
@ -1,20 +0,0 @@
-[package]
-name = "candle-metal-kernels"
-version = "0.3.1"
-edition = "2021"
-
-description = "Metal kernels for Candle"
-repository = "https://github.com/huggingface/candle"
-keywords = ["blas", "tensor", "machine-learning"]
-categories = ["science"]
-license = "MIT OR Apache-2.0"
-
-[dependencies]
-metal = { version = "0.27.0", features = ["mps"]}
-once_cell = "1.18.0"
-thiserror = "1"
-tracing = "0.1.37"
-
-[dev-dependencies]
-half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
-rand = "0.8.5"
--- a/candle-metal-kernels/README.md
+++ b/candle-metal-kernels/README.md
@ -1,3 +0,0 @@
-# candle-metal-kernels
-
-This crate contains Metal kernels used from candle.
--- a/candle-metal-kernels/src/affine.metal
+++ b/candle-metal-kernels/src/affine.metal
@ -1,124 +0,0 @@
-#include <metal_stdlib>
-
-// Converts a linear element index into the storage offset of a strided layout.
-// Coordinates are peeled off `idx` starting from the last dimension, and each
-// one is weighted by the stride of its dimension.
-METAL_FUNC uint get_strided_index(
-    uint idx,
-    constant size_t &num_dims,
-    constant size_t *dims,
-    constant size_t *strides
-) {
-    uint offset = 0;
-    uint remaining = idx;
-    // `axis` runs from num_dims - 1 down to 0.
-    for (uint axis = num_dims; axis-- > 0;) {
-        offset += (remaining % dims[axis]) * strides[axis];
-        remaining /= dims[axis];
-    }
-    return offset;
-}
-
-using namespace metal;
-
-// AFFINE(FN_NAME, TYPENAME) defines two kernels computing `input * mul + add`:
-// FN_NAME reads the input contiguously, FN_NAME##_strided goes through
-// get_strided_index. The arithmetic is done in float and converted back to
-// TYPENAME. Threads whose id is not below `dim` do nothing.
-#define AFFINE(FN_NAME, TYPENAME) \
-kernel void FN_NAME( \
-    constant size_t &dim, \
-    constant float &mul, \
-    constant float &add, \
-    device const TYPENAME *input, \
-    device TYPENAME *output, \
-    uint id [[ thread_position_in_grid ]] \
-) { \
-    if (id < dim) { \
-        const float x = float(input[id]); \
-        output[id] = TYPENAME(x * mul + add); \
-    } \
-} \
-kernel void FN_NAME##_strided( \
-    constant size_t &dim, \
-    constant size_t &num_dims, \
-    constant size_t *dims, \
-    constant size_t *strides, \
-    constant float &mul, \
-    constant float &add, \
-    device const TYPENAME *input, \
-    device TYPENAME *output, \
-    uint id [[ thread_position_in_grid ]] \
-) { \
-    if (id < dim) { \
-        const uint src = get_strided_index(id, num_dims, dims, strides); \
-        const float x = float(input[src]); \
-        output[id] = TYPENAME(x * mul + add); \
-    } \
-}
-
-// POWF(FN_NAME, TYPENAME) defines a contiguous kernel and a strided kernel
-// that raise every input element to the power `mul`. The exponent is
-// converted to TYPENAME before calling pow.
-#define POWF(FN_NAME, TYPENAME) \
-kernel void FN_NAME( \
-    constant size_t &dim, \
-    constant float &mul, \
-    device const TYPENAME *input, \
-    device TYPENAME *output, \
-    uint id [[ thread_position_in_grid ]] \
-) { \
-    if (id < dim) { \
-        const TYPENAME exponent = TYPENAME(mul); \
-        output[id] = TYPENAME(pow(input[id], exponent)); \
-    } \
-} \
-kernel void FN_NAME##_strided( \
-    constant size_t &dim, \
-    constant size_t &num_dims, \
-    constant size_t *dims, \
-    constant size_t *strides, \
-    constant float &mul, \
-    device const TYPENAME *input, \
-    device TYPENAME *output, \
-    uint id [[ thread_position_in_grid ]] \
-) { \
-    if (id < dim) { \
-        const TYPENAME exponent = TYPENAME(mul); \
-        const uint src = get_strided_index(id, num_dims, dims, strides); \
-        output[id] = TYPENAME(pow(input[src], exponent)); \
-    } \
-}
-
-// ELU(FN_NAME, TYPENAME) defines a contiguous kernel and a strided kernel
-// applying the exponential linear unit with `mul` as the alpha parameter:
-//   elu(x) = x                    if x > 0
-//          = mul * (exp(x) - 1)   otherwise
-// The negative branch used to compute `mul * exp(x - 1)`, which is not ELU:
-// it is discontinuous at 0 and never reaches -mul for very negative inputs.
-#define ELU(FN_NAME, TYPENAME) \
-kernel void FN_NAME( \
-    constant size_t &dim, \
-    constant float &mul, \
-    device const TYPENAME *input,  \
-    device TYPENAME *output, \
-    uint id [[ thread_position_in_grid ]] \
-) { \
-    if (id >= dim) { \
-        return; \
-    } \
-    const TYPENAME x = input[id]; \
-    output[id] = TYPENAME((x > 0)?x: mul * (exp(x) - 1)); \
-} \
-kernel void FN_NAME##_strided( \
-    constant size_t &dim, \
-    constant size_t &num_dims, \
-    constant size_t *dims, \
-    constant size_t *strides, \
-    constant float &mul, \
-    device const TYPENAME *input,  \
-    device TYPENAME *output, \
-    uint id [[ thread_position_in_grid ]] \
-) { \
-    if (id >= dim) { \
-        return; \
-    } \
-    const TYPENAME x = input[get_strided_index(id, num_dims, dims, strides)]; \
-    output[id] = TYPENAME((x > 0)?x: mul * (exp(x) - 1)); \
-} \
-
-
-AFFINE(affine_f32, float)
-AFFINE(affine_f16, half)
-POWF(powf_f32, float)
-POWF(powf_f16, half)
-ELU(elu_f32, float)
-ELU(elu_f16, half)
-
-
-#if __METAL_VERSION__ >= 310
-AFFINE(affine_bf16, bfloat);
-POWF(powf_bf16, bfloat);
-ELU(elu_bf16, bfloat);
-#endif
--- a/candle-metal-kernels/src/binary.metal
+++ b/candle-metal-kernels/src/binary.metal
@ -1,90 +0,0 @@
-#include <metal_stdlib>
-
-#define MAX(x, y) ((x) > (y) ? (x) : (y))
-#define MIN(x, y) ((x) < (y) ? (x) : (y))
-
-// Converts a linear element index into the storage offset of a strided layout.
-// Coordinates are peeled off `idx` starting from the last dimension, and each
-// one is weighted by the stride of its dimension.
-METAL_FUNC uint get_strided_index(
-    uint idx,
-    constant size_t &num_dims,
-    constant size_t *dims,
-    constant size_t *strides
-) {
-    uint offset = 0;
-    uint remaining = idx;
-    // `axis` runs from num_dims - 1 down to 0.
-    for (uint axis = num_dims; axis-- > 0;) {
-        offset += (remaining % dims[axis]) * strides[axis];
-        remaining /= dims[axis];
-    }
-    return offset;
-}
-
-using namespace metal;
-
-// BINARY(FN, TYPENAME, OUT_TYPENAME, FN_NAME, FN_NAME_STRIDED) defines an
-// element-wise binary kernel pair. FN is an expression over the locals `x`
-// (left operand) and `y` (right operand); its value is converted to
-// OUT_TYPENAME. FN_NAME reads both operands contiguously, FN_NAME_STRIDED
-// resolves each operand through its own stride array.
-#define BINARY(FN, TYPENAME, OUT_TYPENAME, FN_NAME, FN_NAME_STRIDED) \
-kernel void FN_NAME( \
-    constant size_t &dim, \
-    device const TYPENAME *left, \
-    device const TYPENAME *right, \
-    device OUT_TYPENAME *output, \
-    uint tid [[ thread_position_in_grid ]] \
-) { \
-    if (tid < dim) { \
-        TYPENAME x = left[tid]; \
-        TYPENAME y = right[tid]; \
-        output[tid] = OUT_TYPENAME(FN); \
-    } \
-} \
-kernel void FN_NAME_STRIDED( \
-    constant size_t &dim, \
-    constant size_t &num_dims, \
-    constant size_t *dims, \
-    constant size_t *left_strides, \
-    constant size_t *right_strides, \
-    device const TYPENAME *left, \
-    device const TYPENAME *right, \
-    device OUT_TYPENAME *output, \
-    uint tid [[ thread_position_in_grid ]] \
-) { \
-    if (tid < dim) { \
-        const uint left_i = get_strided_index(tid, num_dims, dims, left_strides); \
-        const uint right_i = get_strided_index(tid, num_dims, dims, right_strides); \
-        TYPENAME x = left[left_i]; \
-        TYPENAME y = right[right_i]; \
-        output[tid] = OUT_TYPENAME(FN); \
-    } \
-}
-
-// Instantiates BINARY for float and half with the output type equal to the
-// input type. Argument order is (expression, kernel name prefix).
-#define BINARY_OP(FN, NAME) \
-BINARY(FN, float, float, NAME##_f32, NAME##_f32_strided); \
-BINARY(FN, half, half, NAME##_f16, NAME##_f16_strided);
-
-// Same as BINARY_OP but for bfloat only.
-#define BFLOAT_BINARY_OP(FN, NAME) \
-BINARY(FN, bfloat, bfloat, NAME##_bf16, NAME##_bf16_strided);
-
-// Instantiates BINARY for float and half inputs with a uint8_t output.
-// NOTE: the argument order is (kernel name prefix, expression), i.e. swapped
-// relative to BINARY_OP and BFLOAT_BINARY_OP.
-#define BINARY_OP_OUT(NAME, FN) \
-BINARY(FN, float, uint8_t, NAME##_f32, NAME##_f32_strided); \
-BINARY(FN, half, uint8_t, NAME##_f16, NAME##_f16_strided);
-
-
-BINARY_OP(x + y, add)
-BINARY_OP(x - y, sub)
-BINARY_OP(x * y, mul)
-BINARY_OP(x / y, div)
-BINARY_OP(MIN(x, y), min)
-BINARY_OP(MAX(x, y), max)
-
-BINARY_OP_OUT(eq, x == y)
-BINARY_OP_OUT(ne, x != y)
-BINARY_OP_OUT(le, x <= y)
-BINARY_OP_OUT(lt, x < y)
-BINARY_OP_OUT(ge, x >= y)
-BINARY_OP_OUT(gt, x > y)
-
-#if __METAL_VERSION__ >= 310
-BFLOAT_BINARY_OP(x + y, add)
-BFLOAT_BINARY_OP(x - y, sub)
-BFLOAT_BINARY_OP(x * y, mul)
-BFLOAT_BINARY_OP(x / y, div)
-BFLOAT_BINARY_OP(MIN(x, y), min)
-BFLOAT_BINARY_OP(MAX(x, y), max)
-#endif
--- a/candle-metal-kernels/src/cast.metal
+++ b/candle-metal-kernels/src/cast.metal
@ -1,56 +0,0 @@
-#include <metal_stdlib>
-
-// Converts a linear element index into the storage offset of a strided layout.
-// Coordinates are peeled off `idx` starting from the last dimension, and each
-// one is weighted by the stride of its dimension.
-METAL_FUNC uint get_strided_index(
-    uint idx,
-    constant size_t &num_dims,
-    constant size_t *dims,
-    constant size_t *strides
-) {
-    uint offset = 0;
-    uint remaining = idx;
-    // `axis` runs from num_dims - 1 down to 0.
-    for (uint axis = num_dims; axis-- > 0;) {
-        offset += (remaining % dims[axis]) * strides[axis];
-        remaining /= dims[axis];
-    }
-    return offset;
-}
-
-
-using namespace metal;
-
-// CAST(FN_NAME, FN_NAME_STRIDED, LEFT_TYPENAME, RIGHT_TYPENAME) defines a
-// kernel pair converting elements from LEFT_TYPENAME to RIGHT_TYPENAME.
-// FN_NAME reads the input contiguously, FN_NAME_STRIDED resolves the input
-// position through get_strided_index. The output is always written at `tid`.
-#define CAST(FN_NAME, FN_NAME_STRIDED, LEFT_TYPENAME, RIGHT_TYPENAME) \
-kernel void FN_NAME( \
-    constant size_t &dim, \
-    device const LEFT_TYPENAME *input, \
-    device RIGHT_TYPENAME *output, \
-    uint tid [[ thread_position_in_grid ]] \
-) { \
-    if (tid < dim) { \
-        output[tid] = RIGHT_TYPENAME(input[tid]); \
-    } \
-} \
-kernel void FN_NAME_STRIDED( \
-    constant size_t &dim, \
-    constant size_t &num_dims, \
-    constant size_t *dims, \
-    constant size_t *strides, \
-    device const LEFT_TYPENAME *input, \
-    device RIGHT_TYPENAME *output, \
-    uint tid [[ thread_position_in_grid ]] \
-) { \
-    if (tid < dim) { \
-        const uint src = get_strided_index(tid, num_dims, dims, strides); \
-        output[tid] = RIGHT_TYPENAME(input[src]); \
-    } \
-} \
-
-CAST(cast_u32_f32, cast_u32_f32_strided, uint32_t, float)
-CAST(cast_u32_u8, cast_u32_u8_strided, uint32_t, uint8_t)
-CAST(cast_u8_u32, cast_u8_u32_strided, uint8_t, uint32_t)
-CAST(cast_u8_f32, cast_u8_f32_strided, uint8_t, float)
-CAST(cast_f16_f32, cast_f16_f32_strided, half, float)
-CAST(cast_f32_f16, cast_f32_f16_strided, float, half)
-
-#if __METAL_VERSION__ >= 310
-#endif
--- a/candle-metal-kernels/src/indexing.metal
+++ b/candle-metal-kernels/src/indexing.metal
@ -1,198 +0,0 @@
-#include <metal_stdlib>
-using namespace metal;
-
-// Index-select body: the flat output position `tid` is split into
-// (left, id, right) coordinates, the id coordinate is looked up in
-// `input_ids`, and the matching input element is copied to `output[tid]`.
-// `left_size` is accepted but not used by this function.
-template<typename TYPENAME, typename INDEX_TYPENAME>
-METAL_FUNC void index(
-    constant size_t &dst_size,
-    constant size_t &left_size,
-    constant size_t &src_dim_size,
-    constant size_t &right_size,
-    constant size_t &ids_size,
-    const device TYPENAME *input,
-    const device INDEX_TYPENAME *input_ids,
-    device TYPENAME *output,
-    uint tid [[ thread_position_in_grid ]]
-) {
-    if (tid >= dst_size) {
-        return;
-    }
-    const size_t inner = tid % right_size;
-    const size_t outer_and_id = tid / right_size;
-    const size_t id_i = outer_and_id % ids_size;
-    const size_t outer = outer_and_id / ids_size;
-    // Clamp the looked-up index to the last valid position so that a bad id
-    // cannot read out of bounds; the original note says there is no good way
-    // to force a crash, and only unsigned ids are expected so no check
-    // against zero is made.
-    const INDEX_TYPENAME last = (INDEX_TYPENAME)(src_dim_size - 1);
-    const INDEX_TYPENAME picked = min(input_ids[id_i], last);
-    const size_t src_i = outer * src_dim_size * right_size + picked * right_size + inner;
-    output[tid] = input[src_i];
-}
-
-// Defines a kernel entry point NAME that forwards all of its arguments to
-// index<TYPENAME, INDEX_TYPENAME>. Note the macro takes the index type before
-// the element type, while the template takes them in the opposite order.
-# define INDEX_OP(NAME, INDEX_TYPENAME, TYPENAME) \
-kernel void NAME( \
-    constant size_t &dst_size, \
-    constant size_t &left_size, \
-    constant size_t &src_dim_size, \
-    constant size_t &right_size, \
-    constant size_t &ids_size, \
-    const device TYPENAME *input, \
-    const device INDEX_TYPENAME *input_ids, \
-    device TYPENAME *output, \
-    uint tid [[ thread_position_in_grid ]] \
-) { \
-    index<TYPENAME, INDEX_TYPENAME>(dst_size, left_size, src_dim_size, right_size, ids_size, input, input_ids, output, tid); \
-}
-
-
-// Gather body: every output position `tid` has its own entry in `input_ids`
-// naming which element along the gathered dimension to copy. Unlike `index`,
-// the looked-up id is used as-is (no clamping). `left_size` is not used.
-template<typename TYPENAME, typename INDEX_TYPENAME>
-METAL_FUNC void gather(
-    constant size_t &dst_size,
-    constant size_t &left_size,
-    constant size_t &src_dim_size,
-    constant size_t &right_size,
-    constant size_t &ids_size,
-    const device TYPENAME *input,
-    const device INDEX_TYPENAME *input_ids,
-    device TYPENAME *output,
-    uint tid [[ thread_position_in_grid ]]
-) {
-    if (tid >= dst_size) {
-        return;
-    }
-    const size_t inner = tid % right_size;
-    const size_t outer = tid / right_size / ids_size;
-    const INDEX_TYPENAME picked = input_ids[tid];
-    const size_t src_row = outer * src_dim_size + picked;
-    output[tid] = input[src_row * right_size + inner];
-}
-
-// Defines a kernel entry point NAME that forwards all of its arguments to
-// gather<TYPENAME, INDEX_TYPENAME>. The macro takes the index type before the
-// element type.
-# define GATHER_OP(NAME, INDEX_TYPENAME, TYPENAME) \
-kernel void NAME( \
-    constant size_t &dst_size, \
-    constant size_t &left_size, \
-    constant size_t &src_dim_size, \
-    constant size_t &right_size, \
-    constant size_t &ids_size, \
-    const device TYPENAME *input, \
-    const device INDEX_TYPENAME *input_ids, \
-    device TYPENAME *output, \
-    uint tid [[ thread_position_in_grid ]] \
-) { \
-    gather<TYPENAME, INDEX_TYPENAME>(dst_size, left_size, src_dim_size, right_size, ids_size, input, input_ids, output, tid); \
-}
-
-// Scatter-add body: each thread owns one (left, right) coordinate pair and
-// walks the whole source dimension sequentially, adding every input element
-// into the output slot named by the matching entry of `input_ids`.
-// `left_size` is not used.
-template<typename TYPENAME, typename INDEX_TYPENAME>
-METAL_FUNC void scatter_add(
-    constant size_t &dst_size,
-    constant size_t &left_size,
-    constant size_t &src_dim_size,
-    constant size_t &right_size,
-    constant size_t &dst_dim_size,
-    const device TYPENAME *input,
-    const device INDEX_TYPENAME *input_ids,
-    device TYPENAME *output,
-    uint tid [[ thread_position_in_grid ]]
-) {
-    if (tid >= dst_size) {
-        return;
-    }
-    const size_t inner = tid % right_size;
-    const size_t outer = tid / right_size;
-    const size_t src_base = outer * src_dim_size;
-    const size_t dst_base = outer * dst_dim_size;
-    for (unsigned int j = 0; j < src_dim_size; ++j) {
-        const size_t src_i = (src_base + j) * right_size + inner;
-        const INDEX_TYPENAME target = input_ids[src_i];
-        const size_t dst_i = (dst_base + target) * right_size + inner;
-        output[dst_i] += input[src_i];
-    }
-}
-
-// Defines a kernel entry point NAME that forwards all of its arguments to
-// scatter_add<TYPENAME, INDEX_TYPENAME>. The macro takes the index type
-// before the element type.
-# define SCATTER_ADD_OP(NAME, INDEX_TYPENAME, TYPENAME) \
-kernel void NAME( \
-    constant size_t &dst_size, \
-    constant size_t &left_size, \
-    constant size_t &src_dim_size, \
-    constant size_t &right_size, \
-    constant size_t &dst_dim_size, \
-    const device TYPENAME *input, \
-    const device INDEX_TYPENAME *input_ids, \
-    device TYPENAME *output, \
-    uint tid [[ thread_position_in_grid ]] \
-) { \
-    scatter_add<TYPENAME, INDEX_TYPENAME>(dst_size, left_size, src_dim_size, right_size, dst_dim_size, input, input_ids, output, tid); \
-}
-
-// Index-add body: each thread owns one (left, right) coordinate pair and, for
-// every j below `ids_dim_size`, adds source row j into the destination row
-// named by `input_ids[j]`. The id list is read as a flat array shared by all
-// threads. `left_size` is not used.
-template<typename TYPENAME, typename INDEX_TYPENAME>
-METAL_FUNC void index_add(
-    constant size_t &dst_size,
-    constant size_t &left_size,
-    constant size_t &src_dim_size,
-    constant size_t &right_size,
-    constant size_t &dst_dim_size,
-    constant size_t &ids_dim_size,
-    const device TYPENAME *input,
-    const device INDEX_TYPENAME *input_ids,
-    device TYPENAME *output,
-    uint tid [[ thread_position_in_grid ]]
-) {
-    if (tid >= dst_size) {
-        return;
-    }
-    const size_t inner = tid % right_size;
-    const size_t outer = tid / right_size;
-    const size_t src_base = outer * src_dim_size;
-    const size_t dst_base = outer * dst_dim_size;
-    for (unsigned int j = 0; j < ids_dim_size; ++j) {
-        const INDEX_TYPENAME target = input_ids[j];
-        const size_t src_i = (src_base + j) * right_size + inner;
-        const size_t dst_i = (dst_base + target) * right_size + inner;
-        output[dst_i] += input[src_i];
-    }
-}
-
-// Defines a kernel entry point NAME that forwards all of its arguments to
-// index_add<TYPENAME, INDEX_TYPENAME>. The macro takes the index type before
-// the element type.
-# define INDEX_ADD_OP(NAME, INDEX_TYPENAME, TYPENAME) \
-kernel void NAME( \
-    constant size_t &dst_size, \
-    constant size_t &left_size, \
-    constant size_t &src_dim_size, \
-    constant size_t &right_size, \
-    constant size_t &dst_dim_size, \
-    constant size_t &ids_dim_size, \
-    const device TYPENAME *input, \
-    const device INDEX_TYPENAME *input_ids, \
-    device TYPENAME *output, \
-    uint tid [[ thread_position_in_grid ]] \
-) { \
-    index_add<TYPENAME, INDEX_TYPENAME>(dst_size, left_size, src_dim_size, right_size, dst_dim_size, ids_dim_size, input, input_ids, output, tid); \
-}
-
-
-INDEX_OP(is_u32_f32, uint, float)
-INDEX_OP(is_u32_f16, uint, half)
-GATHER_OP(gather_u32_f32, uint, float)
-GATHER_OP(gather_u32_f16, uint, half)
-SCATTER_ADD_OP(sa_u32_f32, uint, float)
-SCATTER_ADD_OP(sa_u32_f16, uint, half)
-
-
-#if __METAL_VERSION__ >= 310
-INDEX_ADD_OP(ia_i64_bf16, int64_t, bfloat)
-INDEX_ADD_OP(ia_u32_bf16, uint32_t, bfloat)
-INDEX_ADD_OP(ia_u8_bf16, uint8_t, bfloat)
-#endif
-
-INDEX_ADD_OP(ia_u32_f16, uint32_t, half)
-INDEX_ADD_OP(ia_u8_f16, uint8_t, half)
-
-INDEX_ADD_OP(ia_i64_f32, int64_t, float)
-INDEX_ADD_OP(ia_i64_u8, int64_t, uint8_t)
-INDEX_ADD_OP(ia_i64_i64, int64_t, int64_t)
-INDEX_ADD_OP(ia_i64_u32, int64_t, uint32_t)
-
-INDEX_ADD_OP(ia_u32_f32, uint32_t, float)
-INDEX_ADD_OP(ia_u32_u8, uint32_t, uint8_t)
-INDEX_ADD_OP(ia_u32_i64, uint32_t, int64_t)
-INDEX_ADD_OP(ia_u32_u32, uint32_t, uint32_t)
-
-INDEX_ADD_OP(ia_u8_f32, uint8_t, float)
-INDEX_ADD_OP(ia_u8_u8, uint8_t, uint8_t)
-INDEX_ADD_OP(ia_u8_u32, uint8_t, uint32_t)
-INDEX_ADD_OP(ia_u8_i64, uint8_t, int64_t)
--- a/candle-metal-kernels/src/lib.rs
+++ b/candle-metal-kernels/src/lib.rs
--- a/candle-metal-kernels/src/libMetalFlashAttention.metallib
+++ b/candle-metal-kernels/src/libMetalFlashAttention.metallib
--- a/candle-metal-kernels/src/reduce.metal
+++ b/candle-metal-kernels/src/reduce.metal
@ -1,292 +0,0 @@
-#include <metal_stdlib>
-using namespace metal;
-
-#define MAX(x, y) ((x) > (y) ? (x) : (y))
-#define MIN(x, y) ((x) < (y) ? (x) : (y))
-
-// Converts a linear element index into the storage offset of a strided layout.
-// Coordinates are peeled off `idx` starting from the last dimension, and each
-// one is weighted by the stride of its dimension.
-METAL_FUNC uint get_strided_index(
-    uint idx,
-    constant size_t &num_dims,
-    constant size_t *dims,
-    constant size_t *strides
-) {
-    uint offset = 0;
-    uint remaining = idx;
-    // `axis` runs from num_dims - 1 down to 0.
-    for (uint axis = num_dims; axis-- > 0;) {
-        offset += (remaining % dims[axis]) * strides[axis];
-        remaining /= dims[axis];
-    }
-    return offset;
-}
-
-constant int THREADGROUP_SIZE = 2048;
-
-
-// ARGMIN(NAME, T, MAXVALUE) defines a kernel in which threadgroup `dst_id`
-// scans the `el_to_sum_per_block` elements starting at
-// dst_id * el_to_sum_per_block and writes to dst[dst_id] the position, within
-// the last dimension, of the smallest one. MAXVALUE seeds the per-thread
-// scratch slot; the `notset` flag makes the first visited element always win.
-//
-// The partial results live in threadgroup memory and are read by other
-// threads during the tree reduction, so the barriers must use
-// mem_flags::mem_threadgroup. mem_none only synchronises execution and does
-// not order the threadgroup-memory writes.
-// The tree reduction halves `block_dim` at every step and therefore assumes a
-// power-of-two threadgroup size (TODO confirm against the host-side dispatch).
-#define ARGMIN(NAME, T, MAXVALUE) \
-kernel void NAME( \
-    constant size_t &num_dims, \
-    constant size_t *dims, \
-    constant size_t *strides, \
-    constant size_t &el_to_sum_per_block, \
-    device const T *src, \
-    device uint *dst,  \
-    uint id [[ thread_position_in_grid ]],  \
-    uint tid [[ thread_index_in_threadgroup ]],  \
-    uint dst_id [[ threadgroup_position_in_grid ]],  \
-    uint block_dim [[ threads_per_threadgroup ]]  \
-) {  \
-      \
-   threadgroup T shared_memory[THREADGROUP_SIZE];  \
-   threadgroup uint shared_indices[THREADGROUP_SIZE];  \
-       \
-   shared_memory[tid] = MAXVALUE;  \
-   shared_indices[tid] = 0xFFFFFFFF; \
-   bool notset = true; \
-   /*  \
-   // Elements scanned by this block range from dst_id * el_to_sum_per_block   \
-   // to (dst_id + 1) * el_to_sum_per_block.  \
-   */  \
-   size_t start_idx = dst_id * el_to_sum_per_block;  \
-   size_t stop_idx = start_idx + el_to_sum_per_block;  \
-   size_t idx = start_idx + tid;  \
-   while (idx < stop_idx) {  \
-     /*  \
-     // TODO: Fast version for the contiguous case.  \
-     */  \
-     size_t strided_i = get_strided_index(idx, num_dims, dims, strides);  \
-     if (notset || src[strided_i] < shared_memory[tid]) {  \
-         shared_memory[tid] = src[strided_i];  \
-          /* Assume that the reduction takes place over the last dimension which is contiguous. */ \
-          shared_indices[tid] = idx % dims[num_dims - 1]; \
-          notset = false; \
-     }  \
-     idx += block_dim;  \
-   }  \
-       \
-   /* Publish every thread's partial result before the tree reduction reads it. */ \
-   threadgroup_barrier(mem_flags::mem_threadgroup);  \
-     \
-   /*  \
-   // reduction in shared memory  \
-   */  \
-   for (uint s = block_dim / 2; s > 0; s >>= 1) {  \
-       if (tid < s && shared_memory[tid + s] < shared_memory[tid]) {  \
-           shared_indices[tid] = shared_indices[tid + s];  \
-           shared_memory[tid] = shared_memory[tid + s];  \
-       }  \
-       threadgroup_barrier(mem_flags::mem_threadgroup);  \
-   }  \
-     \
-     if (tid == 0){ \
-       dst[dst_id] = shared_indices[0];  \
-     } \
-} \
-
-
-// ARGMAX(NAME, T, MINVALUE) defines a kernel in which threadgroup `dst_id`
-// scans the `el_to_sum_per_block` elements starting at
-// dst_id * el_to_sum_per_block and writes to dst[dst_id] the position, within
-// the last dimension, of the largest one. MINVALUE seeds the per-thread
-// scratch slot; the `notset` flag makes the first visited element always win.
-//
-// The partial results live in threadgroup memory and are read by other
-// threads during the tree reduction, so the barriers must use
-// mem_flags::mem_threadgroup. mem_none only synchronises execution and does
-// not order the threadgroup-memory writes.
-// The tree reduction halves `block_dim` at every step and therefore assumes a
-// power-of-two threadgroup size (TODO confirm against the host-side dispatch).
-#define ARGMAX(NAME, T, MINVALUE) \
-kernel void NAME( \
-    constant size_t &num_dims, \
-    constant size_t *dims, \
-    constant size_t *strides, \
-    constant size_t &el_to_sum_per_block, \
-    device const T *src, \
-    device uint *dst,  \
-    uint id [[ thread_position_in_grid ]],  \
-    uint tid [[ thread_index_in_threadgroup ]],  \
-    uint dst_id [[ threadgroup_position_in_grid ]],  \
-    uint block_dim [[ threads_per_threadgroup ]]  \
-) {  \
-      \
-   threadgroup T shared_memory[THREADGROUP_SIZE];  \
-   threadgroup uint shared_indices[THREADGROUP_SIZE];  \
-       \
-   shared_memory[tid] = MINVALUE;  \
-   shared_indices[tid] = 0xFFFFFFFF; \
-   /*  \
-   // Elements scanned by this block range from dst_id * el_to_sum_per_block   \
-   // to (dst_id + 1) * el_to_sum_per_block.  \
-   */  \
-   size_t start_idx = dst_id * el_to_sum_per_block;  \
-   size_t stop_idx = start_idx + el_to_sum_per_block;  \
-   size_t idx = start_idx + tid;  \
-   bool notset = true; \
-   while (idx < stop_idx) {  \
-     /*  \
-     // TODO: Fast version for the contiguous case.  \
-     */  \
-     size_t strided_i = get_strided_index(idx, num_dims, dims, strides);  \
-     if (notset || shared_memory[tid] < src[strided_i]) {  \
-         shared_memory[tid] = src[strided_i];  \
-         /* Assume that the reduction takes place over the last dimension which is contiguous. */ \
-         shared_indices[tid] = idx % dims[num_dims - 1]; \
-         notset = false; \
-     }  \
-     idx += block_dim;  \
-   }  \
-       \
-   /* Publish every thread's partial result before the tree reduction reads it. */ \
-   threadgroup_barrier(mem_flags::mem_threadgroup);  \
-     \
-   /*  \
-   // reduction in shared memory  \
-   */  \
-   for (uint s = block_dim / 2; s > 0; s >>= 1) {  \
-       if (tid < s && shared_memory[tid + s] > shared_memory[tid]) {  \
-           shared_indices[tid] = shared_indices[tid + s];  \
-           shared_memory[tid] = shared_memory[tid + s];  \
-       }  \
-       threadgroup_barrier(mem_flags::mem_threadgroup);  \
-   }  \
-     \
-   if (tid == 0){ \
-       dst[dst_id] = shared_indices[0];  \
-   } \
-} \
-
-// REDUCE(FN, NAME, T, START) defines a kernel in which threadgroup `dst_id`
-// folds the `el_to_sum_per_block` elements starting at
-// dst_id * el_to_sum_per_block with the expression FN and stores the result
-// in dst[dst_id]. FN is written over the locals `x` (accumulator) and `y`
-// (next value); START is the initial accumulator value.
-//
-// The partial results live in threadgroup memory and are read by other
-// threads during the tree reduction, so the barriers must use
-// mem_flags::mem_threadgroup. mem_none only synchronises execution and does
-// not order the threadgroup-memory writes.
-// The tree reduction halves `block_dim` at every step and therefore assumes a
-// power-of-two threadgroup size (TODO confirm against the host-side dispatch).
-#define REDUCE(FN, NAME, T, START) \
-kernel void NAME( \
-    constant size_t &num_dims, \
-    constant size_t *dims, \
-    constant size_t *strides, \
-    constant size_t &el_to_sum_per_block, \
-    device const T *src,  \
-    device T *dst, \
-    uint id [[ thread_position_in_grid ]], \
-    uint tid [[ thread_index_in_threadgroup ]], \
-    uint dst_id [[ threadgroup_position_in_grid ]], \
-    uint block_dim [[ threads_per_threadgroup ]] \
-) { \
-     \
-   threadgroup T shared_memory[THREADGROUP_SIZE]; \
-      \
-   shared_memory[tid] = START; \
-   /* \
-   // Elements reduced in this block range from dst_id * el_to_sum_per_block  \
-   // to (dst_id + 1) * el_to_sum_per_block. \
-   */ \
-   size_t start_idx = dst_id * el_to_sum_per_block; \
-   size_t stop_idx = start_idx + el_to_sum_per_block; \
-   size_t idx = start_idx + tid; \
-   while (idx < stop_idx) { \
-     /* \
-     // TODO: Fast version for the contiguous case. \
-     */ \
-     size_t strided_i = get_strided_index(idx, num_dims, dims, strides); \
-     T x = shared_memory[tid]; \
-     T y = src[strided_i]; \
-     shared_memory[tid] = FN; \
-     idx += block_dim; \
-   } \
-      \
-   /* Publish every thread's partial result before the tree reduction reads it. */ \
-   threadgroup_barrier(mem_flags::mem_threadgroup); \
-    \
-   /* \
-   // reduction in shared memory \
-   */ \
-   for (uint s = block_dim / 2; s > 0; s >>= 1) { \
-       if (tid < s) { \
-           T x = shared_memory[tid]; \
-           T y = shared_memory[tid + s]; \
-           shared_memory[tid] = FN; \
-       } \
-       threadgroup_barrier(mem_flags::mem_threadgroup); \
-   } \
-    \
-   /* One store per threadgroup is enough, matching ARGMIN and ARGMAX. */ \
-   if (tid == 0) { \
-       dst[dst_id] = shared_memory[0]; \
-   } \
-} \
-
-
-// SOFTMAX(NAME, T) defines a kernel in which threadgroup `dst_id` normalises
-// the `el_to_sum_per_block` contiguous elements starting at
-// dst_id * el_to_sum_per_block (clipped to `src_numel`) in three passes:
-// maximum, exp(x - max) with its running sum, then scaling by 1 / sum.
-// Intermediate values are kept in float regardless of T.
-#define SOFTMAX(NAME, T) \
-kernel void NAME( \
-    constant size_t &src_numel, \
-    constant size_t &el_to_sum_per_block, \
-    device const T *src, \
-    device T *dst, \
-    \
-    uint id [[ thread_position_in_grid ]], \
-    uint tid [[ thread_index_in_threadgroup ]], \
-    uint dst_id [[ threadgroup_position_in_grid ]], \
-    uint block_dim [[ threads_per_threadgroup ]] \
-) { \
-    threadgroup float shared_memory[THREADGROUP_SIZE]; \
-    shared_memory[tid] = -INFINITY; \
-    size_t start_idx = dst_id * el_to_sum_per_block; \
-    size_t stop_idx = min(start_idx + el_to_sum_per_block, src_numel); \
-    size_t idx; \
-    \
-    /* Pass 1: per-thread maximum, then tree reduction across the threadgroup. */ \
-    float tmp = -INFINITY; \
-    for (idx = start_idx + tid; idx < stop_idx; idx += block_dim) { \
-        tmp = MAX(tmp, float(src[idx])); \
-    } \
-    shared_memory[tid] = tmp; \
-    \
-    threadgroup_barrier(mem_flags::mem_threadgroup); \
-    \
-    for (uint s = block_dim / 2; s > 0; s >>= 1) { \
-        if (tid < s) { \
-            shared_memory[tid] = MAX(shared_memory[tid], shared_memory[tid + s]); \
-        } \
-        threadgroup_barrier(mem_flags::mem_threadgroup); \
-    } \
-    \
-    /* wait for shared_memory[0] to be filled */ \
-    threadgroup_barrier(mem_flags::mem_threadgroup); \
-    \
-    float _max = shared_memory[0]; \
-    \
-    /* every thread must have read _max before the scratch buffer is reused */ \
-    threadgroup_barrier(mem_flags::mem_threadgroup); \
-    \
-    /* Pass 2: write the unnormalised exponentials and accumulate their sum. */ \
-    float partial = 0; \
-    for (idx = start_idx + tid; idx < stop_idx; idx += block_dim) { \
-        const float val = exp(float(src[idx]) - _max); \
-        dst[idx] = T(val); \
-        partial += val; \
-    } \
-    shared_memory[tid] = partial; \
-    threadgroup_barrier(mem_flags::mem_threadgroup); \
-    for (uint s = block_dim / 2; s > 0; s >>= 1) { \
-        if (tid < s) { \
-            shared_memory[tid] += shared_memory[tid + s]; \
-        } \
-        threadgroup_barrier(mem_flags::mem_threadgroup); \
-    } \
-    \
-    /* Pass 3: scale every element by the inverse of the total. */ \
-    const T inv_acc = T(1.0/shared_memory[0]); \
-    for (idx = start_idx + tid; idx < stop_idx; idx += block_dim) { \
-        dst[idx] *= inv_acc; \
-    } \
-} \
-
-REDUCE(x + y, fast_sum_f32_strided, float, 0)
-REDUCE(x + y, fast_sum_u32_strided, uint, 0)
-REDUCE(x + y, fast_sum_f16_strided, half, 0)
-REDUCE(x * y, fast_mul_f32_strided, float, 1)
-REDUCE(x * y, fast_mul_u32_strided, uint, 1)
-REDUCE(x * y, fast_mul_f16_strided, half, 1)
-REDUCE(MAX(x, y), fast_max_f32_strided, float, -HUGE_VALF)
-REDUCE(MAX(x, y), fast_max_u32_strided, uint, 0)
-REDUCE(MAX(x, y), fast_max_f16_strided, half, -HUGE_VALH)
-REDUCE(MIN(x, y), fast_min_f32_strided, float, HUGE_VALF)
-REDUCE(MIN(x, y), fast_min_u32_strided, uint, 0xFFFFFFFF)
-REDUCE(MIN(x, y), fast_min_f16_strided, half, HUGE_VALH)
-ARGMIN(fast_argmin_f32_strided, float, HUGE_VALF)
-ARGMIN(fast_argmin_f16_strided, half, HUGE_VALH)
-ARGMIN(fast_argmin_u32_strided, uint, 0xFFFFFFFF)
-ARGMAX(fast_argmax_f32_strided, float, -HUGE_VALF)
-ARGMAX(fast_argmax_f16_strided, half, -HUGE_VALH)
-ARGMAX(fast_argmax_u32_strided, uint, 0)
-
-SOFTMAX(softmax_f32, float)
-SOFTMAX(softmax_f16, half)
-#if __METAL_VERSION__ >= 310
-REDUCE(x + y, fast_sum_bf16, bfloat, 0)
-REDUCE(x * y, fast_mul_bf16, bfloat, 1)
-REDUCE(MAX(x, y), fast_max_bf16, bfloat, -HUGE_VALBF)
-REDUCE(MIN(x, y), fast_min_bf16, bfloat, HUGE_VALBF)
-ARGMIN(fast_argmin_bf16, bfloat, HUGE_VALBF)
-ARGMAX(fast_argmax_bf16, bfloat, -HUGE_VALBF)
-SOFTMAX(softmax_bf16, bfloat)
-#endif
--- a/candle-metal-kernels/src/ternary.metal
+++ b/candle-metal-kernels/src/ternary.metal
@ -1,60 +0,0 @@
-#include <metal_stdlib>
-#
-using namespace metal;
-
-// Converts a linear element index into the storage offset of a strided layout.
-// Coordinates are peeled off `idx` starting from the last dimension, and each
-// one is weighted by the stride of its dimension.
-METAL_FUNC uint get_strided_index(
-    uint idx,
-    constant size_t &num_dims,
-    constant size_t *dims,
-    constant size_t *strides
-) {
-    uint offset = 0;
-    uint remaining = idx;
-    // `axis` runs from num_dims - 1 down to 0.
-    for (uint axis = num_dims; axis-- > 0;) {
-        offset += (remaining % dims[axis]) * strides[axis];
-        remaining /= dims[axis];
-    }
-    return offset;
-}
-
-
-// WHERE_OP(TYPENAME, ID_TYPENAME, FN_NAME) defines an element-wise select
-// kernel: out[i] takes the value from `t` when the matching entry of `ids` is
-// non-zero and from `f` otherwise. The three inputs share `dims` but each is
-// addressed through its own stride array; the output is written contiguously.
-#define WHERE_OP(TYPENAME, ID_TYPENAME, FN_NAME) \
-kernel void FN_NAME( \
-    constant size_t &numel, \
-    constant size_t &num_dims, \
-    constant size_t *dims, \
-    constant size_t *strides, \
-    constant size_t *strides_t, \
-    constant size_t *strides_f, \
-    device const ID_TYPENAME *ids, \
-    device const TYPENAME *t, \
-    device const TYPENAME *f, \
-    device TYPENAME *out, \
-    uint i [[ thread_position_in_grid ]] \
-) { \
-    if (i < numel) { \
-        const uint cond_i = get_strided_index(i, num_dims, dims, strides); \
-        const uint true_i = get_strided_index(i, num_dims, dims, strides_t); \
-        const uint false_i = get_strided_index(i, num_dims, dims, strides_f); \
-        if (ids[cond_i]) { \
-            out[i] = t[true_i]; \
-        } else { \
-            out[i] = f[false_i]; \
-        } \
-    } \
-} \
-
-// WHERE_OP(float, int64_t, where_i64_f32)
-// WHERE_OP(double, int64_t, where_i64_f64)
-// WHERE_OP(uint8_t, int64_t, where_i64_u8)
-// WHERE_OP(uint32_t, int64_t, where_i64_u32)
-// WHERE_OP(int64_t, int64_t, where_i64_i64)
-// 
-// WHERE_OP(float, uint32_t, where_u32_f32)
-// WHERE_OP(double, uint32_t, where_u32_f64)
-// WHERE_OP(uint8_t, uint32_t, where_u32_u8)
-// WHERE_OP(uint32_t, uint32_t, where_u32_u32)
-// WHERE_OP(int64_t, uint32_t, where_u32_i64)
-
-WHERE_OP(float, uint8_t, where_u8_f32)
-// WHERE_OP(double, uint8_t, where_u8_f64)
-// WHERE_OP(uint8_t, uint8_t, where_u8_u8)
-// WHERE_OP(uint32_t, uint8_t, where_u8_u32)
-// WHERE_OP(int64_t, uint8_t, where_u8_i64)
--- a/candle-metal-kernels/src/tests.rs
+++ b/candle-metal-kernels/src/tests.rs
@ -1,875 +0,0 @@
-use super::*;
-use half::{bf16, f16};
-use metal::{CompileOptions, Device, MTLResourceOptions, MTLSize, NSUInteger};
-
-fn read_to_vec<T: Clone>(buffer: &Buffer, n: usize) -> Vec<T> {
-    let ptr = buffer.contents() as *const T;
-    assert!(!ptr.is_null());
-    let slice = unsafe { std::slice::from_raw_parts(ptr, n) };
-    slice.to_vec()
-}
-
-fn new_buffer<T>(device: &Device, data: &[T]) -> Buffer {
-    let options = MTLResourceOptions::StorageModeManaged;
-    let ptr = data.as_ptr() as *const core::ffi::c_void;
-    let size = (data.len() * std::mem::size_of::<T>()) as u64;
-    device.new_buffer_with_data(ptr, size, options)
-}
-
-fn device() -> Device {
-    Device::system_default().unwrap()
-}
-
-fn approx(v: Vec<f32>, digits: i32) -> Vec<f32> {
-    let b = 10f32.powi(digits);
-    v.iter().map(|t| f32::round(t * b) / b).collect()
-}
-
-fn approx_f16(v: Vec<f16>, digits: i32) -> Vec<f32> {
-    let b = 10f32.powi(digits);
-    v.iter().map(|t| f32::round(t.to_f32() * b) / b).collect()
-}
-
-fn approx_bf16(v: Vec<bf16>, digits: i32) -> Vec<f32> {
-    let b = 10f32.powi(digits);
-    v.iter().map(|t| f32::round(t.to_f32() * b) / b).collect()
-}
-
-fn run<T: Clone>(v: &[T], name: unary::contiguous::Kernel) -> Vec<T> {
-    let device = device();
-    let fence = device.new_fence();
-    let kernels = Kernels::new(fence);
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-    let input = new_buffer(&device, v);
-    let output = new_buffer(&device, v);
-    call_unary_contiguous(
-        &device,
-        command_buffer,
-        &kernels,
-        name,
-        v.len(),
-        &input,
-        &output,
-    )
-    .unwrap();
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-    read_to_vec(&output, v.len())
-}
-
-fn run_binary<T: Clone>(x: &[T], y: &[T], name: binary::contiguous::Kernel) -> Vec<T> {
-    let device = device();
-    let fence = device.new_fence();
-    let kernels = Kernels::new(fence);
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-    let options = MTLResourceOptions::StorageModeManaged;
-    let left = new_buffer(&device, x);
-    let right = new_buffer(&device, y);
-    let output = device.new_buffer(std::mem::size_of_val(x) as u64, options);
-    call_binary_contiguous(
-        &device,
-        command_buffer,
-        &kernels,
-        name,
-        x.len(),
-        &left,
-        &right,
-        &output,
-    )
-    .unwrap();
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-    read_to_vec(&output, x.len())
-}
-
-fn run_strided<T: Clone>(
-    v: &[T],
-    kernel: unary::strided::Kernel,
-    shape: &[usize],
-    strides: &[usize],
-    offset: usize,
-) -> Vec<T> {
-    let device = device();
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-    let input = new_buffer(&device, v);
-    let output = new_buffer(&device, v);
-    let fence = device.new_fence();
-    let kernels = Kernels::new(fence);
-    call_unary_strided(
-        &device,
-        command_buffer,
-        &kernels,
-        kernel,
-        shape,
-        &input,
-        strides,
-        offset,
-        &output,
-        0,
-    )
-    .unwrap();
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-    read_to_vec(&output, v.len())
-}
-
-#[test]
-fn cos_f32() {
-    let v = vec![1.0f32, 2.0, 3.0];
-    let results = run(&v, unary::contiguous::cos::FLOAT);
-    let expected: Vec<_> = v.iter().map(|v| v.cos()).collect();
-    assert_eq!(approx(results, 4), vec![0.5403, -0.4161, -0.99]);
-    assert_eq!(approx(expected, 4), vec![0.5403, -0.4161, -0.99]);
-
-    let v = vec![1.0f32; 10_000];
-    let results = run(&v, unary::contiguous::cos::FLOAT);
-    let expected: Vec<_> = v.iter().map(|v| v.cos()).collect();
-    assert_eq!(approx(results, 4), vec![0.5403; 10_000]);
-    assert_eq!(approx(expected, 4), vec![0.5403; 10_000]);
-}
-
-#[test]
-fn cos_f32_strided() {
-    let v = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
-    let shape = vec![6];
-    let strides = vec![1];
-    let offset = 0;
-    let results = run_strided(&v, unary::strided::cos::FLOAT, &shape, &strides, offset);
-    let expected: Vec<_> = v.iter().map(|v| v.cos()).collect();
-    assert_eq!(
-        approx(results, 4),
-        vec![0.5403, -0.4161, -0.99, -0.6536, 0.2837, 0.9602]
-    );
-    assert_eq!(
-        approx(expected, 4),
-        vec![0.5403, -0.4161, -0.99, -0.6536, 0.2837, 0.9602]
-    );
-
-    // Contiguous
-    let v = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
-    let shape = vec![3, 2];
-    let strides = vec![2, 1];
-    let offset = 0;
-    let results = run_strided(&v, unary::strided::cos::FLOAT, &shape, &strides, offset);
-    let expected: Vec<_> = v.iter().map(|v| v.cos()).collect();
-    assert_eq!(
-        approx(results, 4),
-        vec![0.5403, -0.4161, -0.99, -0.6536, 0.2837, 0.9602]
-    );
-    assert_eq!(
-        approx(expected, 4),
-        vec![0.5403, -0.4161, -0.99, -0.6536, 0.2837, 0.9602]
-    );
-
-    // Transposed
-    let v = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
-    let shape = vec![3, 2];
-    let strides = vec![1, 3];
-    let offset = 0;
-    let results = run_strided(&v, unary::strided::cos::FLOAT, &shape, &strides, offset);
-    let expected: Vec<_> = v.iter().map(|v| v.cos()).collect();
-    assert_eq!(
-        approx(results, 4),
-        vec![0.5403, -0.6536, -0.4161, 0.2837, -0.99, 0.9602]
-    );
-    assert_eq!(
-        approx(expected, 4),
-        vec![0.5403, -0.4161, -0.99, -0.6536, 0.2837, 0.9602]
-    );
-
-    // Very large
-    let v = vec![1.0f32; 10_000];
-    let shape = vec![2, 5_000];
-    let strides = vec![2, 1];
-    let offset = 0;
-    let results = run_strided(&v, unary::strided::cos::FLOAT, &shape, &strides, offset);
-    let expected: Vec<_> = v.iter().map(|v| v.cos()).collect();
-    assert_eq!(approx(results, 4), vec![0.5403; 10_000]);
-    assert_eq!(approx(expected, 4), vec![0.5403; 10_000]);
-}
-
-#[test]
-fn cos_strided_random() {
-    let v: Vec<_> = (0..10_000).map(|_| rand::random::<f32>()).collect();
-    let shape = vec![5_000, 2];
-    let strides = vec![1, 5_000];
-    let offset = 0;
-    let results = run_strided(&v, unary::strided::cos::FLOAT, &shape, &strides, offset);
-    let expected: Vec<_> = v.iter().map(|v| v.cos()).collect();
-    assert_eq!(approx(vec![results[0]], 4), approx(vec![expected[0]], 4));
-    assert_eq!(
-        approx(vec![results[1]], 4),
-        approx(vec![expected[5_000]], 4)
-    );
-    assert_eq!(approx(vec![results[2]], 4), approx(vec![expected[1]], 4));
-    assert_eq!(
-        approx(vec![results[3]], 4),
-        approx(vec![expected[5_001]], 4)
-    );
-    assert_eq!(
-        approx(vec![results[5_000]], 4),
-        approx(vec![expected[2_500]], 4)
-    );
-}
-
-#[test]
-fn gelu_f16() {
-    let v: Vec<f16> = [-10f32, -1.0, 0., 1., 2., 3., 10.0, 20.0]
-        .iter()
-        .map(|v| f16::from_f32(*v))
-        .collect();
-    let expected: Vec<f32> = vec![-0.0, -0.16, 0.0, 0.84, 1.96, 3.0, 10.0, 20.0];
-    let results = run(&v, unary::contiguous::gelu::HALF);
-    assert_eq!(approx_f16(results, 2), expected);
-}
-
-#[test]
-fn gelu_f32() {
-    let v: Vec<f32> = vec![-10f32, -1.0, 0., 1., 2., 3., 10.0, 20.0];
-    let expected: Vec<f32> = vec![-0.0, -0.159, 0.0, 0.841, 1.955, 2.996, 10.0, 20.0];
-    let results = run(&v, unary::contiguous::gelu::FLOAT);
-    assert_eq!(approx(results, 3), expected);
-}
-
-#[test]
-fn binary_add_f32() {
-    let left = vec![1.0f32, 2.0, 3.0];
-    let right = vec![2.0f32, 3.1, 4.2];
-    let results = run_binary(&left, &right, binary::contiguous::add::FLOAT);
-    let expected: Vec<_> = left
-        .iter()
-        .zip(right.iter())
-        .map(|(&x, &y)| x + y)
-        .collect();
-    assert_eq!(approx(results, 4), vec![3.0f32, 5.1, 7.2]);
-    assert_eq!(approx(expected, 4), vec![3.0f32, 5.1, 7.2]);
-}
-
-fn cast<T: Clone, U: Clone>(v: &[T], name: &'static str) -> Vec<U> {
-    let device = device();
-    let fence = device.new_fence();
-    let kernels = Kernels::new(fence);
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-    let input = new_buffer(&device, v);
-    let options = MTLResourceOptions::StorageModeManaged;
-    let size = (v.len() * std::mem::size_of::<U>()) as u64;
-    let output = device.new_buffer(size, options);
-
-    call_cast_contiguous(
-        &device,
-        command_buffer,
-        &kernels,
-        name,
-        v.len(),
-        &input,
-        0,
-        &output,
-    )
-    .unwrap();
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-    read_to_vec(&output, v.len())
-}
-
-#[test]
-fn cast_u32_f32() {
-    let v = vec![1u32, 2, 3];
-    let results = cast(&v, "cast_u32_f32");
-    let expected: Vec<_> = v.iter().map(|&v| v as f32).collect();
-    assert_eq!(approx(results, 4), vec![1.0f32, 2.0, 3.0]);
-    assert_eq!(approx(expected, 4), vec![1.0f32, 2.0, 3.0]);
-
-    let v = vec![1.0f32, 2.0, 3.0];
-    let input: Vec<f16> = v.iter().map(|v| f16::from_f32(*v)).collect();
-    let results: Vec<f32> = cast(&input, "cast_f16_f32");
-    assert_eq!(results, vec![1.0f32, 2.0, 3.0]);
-
-    let v = vec![1.0f32; 10_000];
-    let input: Vec<f16> = v.iter().map(|v| f16::from_f32(*v)).collect();
-    let results: Vec<f32> = cast(&input, "cast_f16_f32");
-    assert_eq!(results.len(), 10_000);
-    assert_eq!(&results[..10], vec![1.0f32; 10]);
-    assert_eq!(results, vec![1.0f32; 10_000]);
-}
-
-fn run_affine<T: Clone>(v: &[T], mul: f64, add: f64) -> Vec<T> {
-    let device = device();
-    let fence = device.new_fence();
-    let kernels = Kernels::new(fence);
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-
-    let input = new_buffer(&device, v);
-    let output = new_buffer(&device, v);
-
-    let size = v.len();
-
-    call_affine(
-        &device,
-        command_buffer,
-        &kernels,
-        "affine_f32",
-        size,
-        &input,
-        &output,
-        mul as f32,
-        add as f32,
-    )
-    .unwrap();
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-
-    read_to_vec(&output, v.len())
-}
-
-fn run_affine_strided<T: Clone>(
-    v: &[T],
-    shape: &[usize],
-    strides: &[usize],
-    mul: f64,
-    add: f64,
-) -> Vec<T> {
-    let device = device();
-    let fence = device.new_fence();
-    let kernels = Kernels::new(fence);
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-
-    let input = new_buffer(&device, v);
-    let output = new_buffer(&device, v);
-
-    call_affine_strided(
-        &device,
-        command_buffer,
-        &kernels,
-        "affine_f32_strided",
-        shape,
-        &input,
-        strides,
-        0,
-        &output,
-        mul as f32,
-        add as f32,
-    )
-    .unwrap();
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-
-    let len: usize = shape.iter().product();
-    read_to_vec(&output, len)
-}
-
-#[test]
-fn affine() {
-    let input = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
-    let mul = 1.5;
-    let add = 1.1;
-    let result = run_affine(&input, mul, add);
-    assert_eq!(result, vec![2.6, 4.1, 5.6, 7.1, 8.6, 10.1, 11.6, 13.1]);
-
-    let input = [1.0f32; 40_000];
-    let mul = 1.5;
-    let add = 1.1;
-    let result = run_affine(&input, mul, add);
-    assert_eq!(result, vec![2.6; 40_000]);
-}
-
-#[test]
-fn affine_strided() {
-    let input = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
-    let mul = 1.5;
-    let add = 1.1;
-    let shape = [4];
-    let strides = [2];
-    let result = run_affine_strided(&input, &shape, &strides, mul, add);
-    // 1 on 2
-    assert_eq!(result, vec![2.6, 5.6, 8.6, 11.6]);
-}
-
-#[test]
-fn index_select() {
-    let embedding = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
-    let shape = [5, 2];
-    let ids = [0u32, 4, 2];
-    let dim = 0;
-    let result = run_index_select(&embedding, &shape, &ids, dim);
-    assert_eq!(result, vec![1.0f32, 2.0, 9.0, 10.0, 5.0, 6.0]);
-
-    let embedding = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
-    let shape = [2, 5];
-    let ids = [0u32, 1, 0];
-    let dim = 0;
-    let result = run_index_select(&embedding, &shape, &ids, dim);
-    assert_eq!(
-        result,
-        vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 1.0f32, 2.0, 3.0, 4.0, 5.0]
-    );
-}
-
-#[test]
-fn index_select_f16() {
-    let embedding: Vec<_> = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
-        .into_iter()
-        .map(|x| f16::from_f32(x))
-        .collect();
-    let shape = [5, 2];
-    let ids = [0u32, 4, 2];
-    let dim = 0;
-    let result = run_index_select(&embedding, &shape, &ids, dim);
-    assert_eq!(
-        approx_f16(result, 4),
-        vec![1.0f32, 2.0, 9.0, 10.0, 5.0, 6.0]
-    );
-}
-
-#[test]
-fn index_select_dim1() {
-    let embedding = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
-    let shape = [5, 2];
-    let ids = [0u32, 1, 0];
-    let dim = 1;
-    let result = run_index_select(&embedding, &shape, &ids, dim);
-    assert_eq!(
-        result,
-        vec![1.0f32, 2.0, 1.0, 3.0, 4.0, 3.0, 5.0, 6.0, 5.0, 7.0, 8.0f32, 7.0, 9.0, 10.0, 9.0]
-    );
-}
-
-fn run_index_select<T: Clone, I: Clone + std::fmt::Debug>(
-    embeddings: &[T],
-    shape: &[usize],
-    ids: &[I],
-    dim: usize,
-) -> Vec<T> {
-    let device = Device::system_default().expect("no device found");
-
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-    let embeddings_buffer = new_buffer(&device, &embeddings);
-    let ids_buffer = new_buffer(&device, &ids);
-
-    let left_size: usize = shape[..dim].iter().product();
-    let right_size: usize = shape[dim + 1..].iter().product();
-    let dst_el = ids.len() * left_size * right_size;
-    let dst_buffer = new_buffer(&device, &vec![0.0f32; dst_el]);
-
-    let name = match core::mem::size_of::<T>() {
-        4 => "is_u32_f32",
-        2 => "is_u32_f16",
-        _ => unimplemented!(),
-    };
-
-    let fence = device.new_fence();
-    let kernels = Kernels::new(fence);
-    call_index_select(
-        &device,
-        &command_buffer,
-        &kernels,
-        name,
-        shape,
-        ids.len(),
-        dim,
-        &embeddings_buffer,
-        &ids_buffer,
-        &dst_buffer,
-    )
-    .unwrap();
-
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-
-    read_to_vec(&dst_buffer, dst_el)
-}
-
-#[test]
-fn index_add() {
-    let device = Device::system_default().expect("no device found");
-
-    let options = CompileOptions::new();
-    let library = device.new_library_with_source(INDEXING, &options).unwrap();
-
-    let left = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
-    let right = [1.0f32; 15];
-    let index = [0u32, 4, 2];
-    let ids_dim_size = index.len() as u32;
-    let dst_dim_size: u32 = 15;
-    let left_size: u32 = 3;
-    let right_size: u32 = 3;
-
-    let function = library.get_function("ia_u32_f32", None).unwrap();
-    let pipeline = device
-        .new_compute_pipeline_state_with_function(&function)
-        .unwrap();
-
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-    let encoder = command_buffer.new_compute_command_encoder();
-
-    encoder.set_compute_pipeline_state(&pipeline);
-
-    let index_buffer = new_buffer(&device, &index);
-    let inputs_buffer = new_buffer(&device, &left);
-    let outputs_buffer = new_buffer(&device, &right);
-
-    set_params!(
-        encoder,
-        (
-            &index_buffer,
-            &inputs_buffer,
-            &outputs_buffer,
-            ids_dim_size,
-            left_size,
-            dst_dim_size,
-            right_size
-        )
-    );
-
-    let grid_size = MTLSize {
-        width: right.len() as NSUInteger,
-        height: 1,
-        depth: 1,
-    };
-
-    let thread_group_size = MTLSize {
-        width: pipeline.max_total_threads_per_threadgroup(),
-        height: 1,
-        depth: 1,
-    };
-
-    encoder.dispatch_thread_groups(grid_size, thread_group_size);
-    encoder.end_encoding();
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-
-    let expected = vec![
-        2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 8.0, 9.0, 10.0, 1.0, 1.0, 1.0, 5.0, 6.0, 7.0,
-    ];
-    let result: Vec<f32> = read_to_vec(&outputs_buffer, right.len());
-    assert_eq!(result, expected);
-}
-
-#[test]
-fn cos_f16() {
-    let v: Vec<f16> = [1.0f32, 2.0, 3.0]
-        .iter()
-        .map(|v| f16::from_f32(*v))
-        .collect();
-    let results = run(&v, unary::contiguous::cos::HALF);
-    let expected: Vec<f16> = v.iter().map(|v| f16::from_f32(v.to_f32().cos())).collect();
-    assert_eq!(approx_f16(results, 2), vec![0.54, -0.42, -0.99]);
-    assert_eq!(approx_f16(expected, 2), vec![0.54, -0.42, -0.99]);
-}
-
-fn run_reduce<T: Clone>(v: &[T], out_length: usize, name: &'static str) -> Vec<T> {
-    let device = device();
-    let fence = device.new_fence();
-    let kernels = Kernels::new(fence);
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-    let input = new_buffer(&device, v);
-
-    let options = MTLResourceOptions::StorageModeManaged;
-    let output = device.new_buffer((out_length * core::mem::size_of::<T>()) as u64, options);
-    let dims = vec![v.len()];
-    let strides = vec![1];
-    call_reduce_strided(
-        &device,
-        command_buffer,
-        &kernels,
-        name,
-        &dims,
-        &strides,
-        out_length,
-        &input,
-        0,
-        &output,
-    )
-    .unwrap();
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-
-    read_to_vec(&output, out_length)
-}
-
-fn run_softmax<T: Clone + std::fmt::Debug>(v: &[T], last_dim: usize, name: &'static str) -> Vec<T> {
-    let device = device();
-    let fence = device.new_fence();
-    let kernels = Kernels::new(fence);
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-    let input = new_buffer(&device, v);
-    let output = new_buffer(&device, v);
-    call_last_softmax(
-        &device,
-        command_buffer,
-        &kernels,
-        name,
-        v.len(),
-        last_dim,
-        &input,
-        0,
-        &output,
-    )
-    .unwrap();
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-
-    read_to_vec(&output, v.len())
-}
-
-#[test]
-fn reduce_sum() {
-    let v = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
-    let out_length = 1;
-
-    let results = run_reduce(&v, out_length, "fast_sum_f32_strided");
-    assert_eq!(approx(results, 4), vec![21.0]);
-}
-
-#[test]
-fn reduce_sum2() {
-    let v = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
-    let out_length = 2;
-
-    let results = run_reduce(&v, out_length, "fast_sum_f32_strided");
-    assert_eq!(approx(results, 4), vec![6.0, 15.0]);
-}
-
-#[test]
-fn softmax() {
-    let v = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
-    let last_dim = 6;
-    let results = run_softmax(&v, last_dim, "softmax_f32");
-    assert_eq!(
-        approx(results, 4),
-        vec![0.0043, 0.0116, 0.0315, 0.0858, 0.2331, 0.6337]
-    );
-
-    let last_dim = 4096;
-    let n = 200;
-    let mut v = vec![0.0; n * last_dim];
-    for i in 0..n {
-        v[i * last_dim] = 20.0;
-    }
-    let results = run_softmax(&v, last_dim, "softmax_f32");
-    let results = approx(results, 4);
-    println!("{results:?}");
-    assert_eq!(
-        results.iter().map(|&s| s.round() as usize).sum::<usize>(),
-        n
-    );
-    assert_eq!(results[0], 1.0);
-    assert_eq!(results[1], 0.0);
-    assert_eq!(results[last_dim], 1.0);
-    assert_eq!(results[2 * last_dim], 1.0);
-
-    let v = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
-    let last_dim = 6;
-    let results = run_softmax(&v, last_dim, "softmax_f32");
-    assert_eq!(
-        approx(results, 4),
-        vec![0.0043, 0.0116, 0.0315, 0.0858, 0.2331, 0.6337]
-    );
-
-    let v = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
-    let last_dim = 3;
-    let results = run_softmax(&v, last_dim, "softmax_f32");
-    assert_eq!(
-        approx(results, 4),
-        vec![0.0900, 0.2447, 0.6652, 0.0900, 0.2447, 0.6652]
-    );
-
-    let v = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0]
-        .iter()
-        .map(|v| f16::from_f32(*v))
-        .collect::<Vec<_>>();
-    let last_dim = 6;
-    let results = run_softmax(&v, last_dim, "softmax_f16");
-    assert_eq!(
-        approx_f16(results, 4),
-        vec![0.0043, 0.0116, 0.0316, 0.0858, 0.2332, 0.6338]
-    );
-
-    let v = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0]
-        .iter()
-        .map(|v| bf16::from_f32(*v))
-        .collect::<Vec<_>>();
-    let last_dim = 6;
-    let results = run_softmax(&v, last_dim, "softmax_bf16");
-    assert_eq!(
-        approx_bf16(results, 4),
-        vec![0.0043, 0.0116, 0.0315, 0.0859, 0.2324, 0.6328]
-    );
-}
-
-fn run_where_cond<I: Clone, T: Clone>(
-    shape: &[usize],
-    cond: &[I],
-    (cond_stride, cond_offset): (Vec<usize>, usize),
-    left_true: &[T],
-    (left_stride, left_offset): (Vec<usize>, usize),
-    right_false: &[T],
-    (_right_stride, _right_offset): (Vec<usize>, usize),
-    name: &'static str,
-) -> Vec<T> {
-    let device = device();
-    let fence = device.new_fence();
-    let kernels = Kernels::new(fence);
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-    let options = MTLResourceOptions::StorageModeManaged;
-
-    let length = cond.len();
-    let cond = device.new_buffer_with_data(
-        cond.as_ptr() as *const core::ffi::c_void,
-        std::mem::size_of_val(cond) as u64,
-        options,
-    );
-    let left = device.new_buffer_with_data(
-        left_true.as_ptr() as *const core::ffi::c_void,
-        (length * core::mem::size_of::<T>()) as u64,
-        options,
-    );
-    let right = device.new_buffer_with_data(
-        right_false.as_ptr() as *const core::ffi::c_void,
-        (length * core::mem::size_of::<T>()) as u64,
-        options,
-    );
-
-    let output = device.new_buffer((length * core::mem::size_of::<T>()) as u64, options);
-    call_where_cond_strided(
-        &device,
-        command_buffer,
-        &kernels,
-        name,
-        shape,
-        &cond,
-        (&cond_stride, cond_offset),
-        &left,
-        (&left_stride, left_offset),
-        &right,
-        (&cond_stride, cond_offset),
-        &output,
-    )
-    .unwrap();
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-
-    read_to_vec(&output, length)
-}
-
-#[test]
-fn where_cond() {
-    let shape = vec![6];
-    let cond = vec![0u8, 1, 0, 0, 1, 1];
-    let cond_l = (vec![1], 0);
-    let left_true = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0];
-    let left_l = (vec![1], 0);
-    let right_false = vec![-1.0f32, -2.0, -3.0, -4.0, -5.0, -6.0];
-    let right_l = (vec![1], 0);
-    let results = run_where_cond(
-        &shape,
-        &cond,
-        cond_l,
-        &left_true,
-        left_l,
-        &right_false,
-        right_l,
-        "where_u8_f32",
-    );
-    assert_eq!(approx(results, 4), vec![-1.0f32, 2.0, -3.0, -4.0, 5.0, 6.0]);
-}
-
-fn run_gemm<T: Clone>(
-    (b, m, n, k): (usize, usize, usize, usize),
-    lhs: &[T],
-    lhs_stride: Vec<usize>,
-    lhs_offset: usize,
-    rhs: &[T],
-    rhs_stride: Vec<usize>,
-    rhs_offset: usize,
-) -> Vec<T> {
-    let device = device();
-    let fence = device.new_fence();
-    let kernels = Kernels::new(fence);
-    let command_queue = device.new_command_queue();
-    let command_buffer = command_queue.new_command_buffer();
-    let options = MTLResourceOptions::StorageModeManaged;
-
-    let lhs = device.new_buffer_with_data(
-        lhs.as_ptr() as *const core::ffi::c_void,
-        std::mem::size_of_val(lhs) as u64,
-        options,
-    );
-    let rhs = device.new_buffer_with_data(
-        rhs.as_ptr() as *const core::ffi::c_void,
-        std::mem::size_of_val(rhs) as u64,
-        options,
-    );
-    let length = b * m * n;
-    let output = device.new_buffer((length * core::mem::size_of::<T>()) as u64, options);
-    call_gemm(
-        &device,
-        command_buffer,
-        &kernels,
-        "sgemm",
-        (b, m, n, k),
-        &lhs_stride,
-        lhs_offset,
-        &lhs,
-        &rhs_stride,
-        rhs_offset,
-        &rhs,
-        &output,
-    )
-    .unwrap();
-    command_buffer.commit();
-    command_buffer.wait_until_completed();
-
-    read_to_vec(&output, length)
-}
-
-#[test]
-fn gemm() {
-    let (b, m, n, k) = (1, 2, 4, 3);
-    let lhs_stride = vec![m * k, k, 1];
-    let lhs: Vec<f32> = (0..b * m * k).map(|f| f as f32).collect();
-    let rhs_stride = vec![n * k, n, 1];
-    let rhs: Vec<f32> = (0..b * n * k).map(|f| f as f32).collect();
-    let results = run_gemm((b, m, n, k), &lhs, lhs_stride, 0, &rhs, rhs_stride, 0);
-    assert_eq!(
-        approx(results, 4),
-        vec![20.0, 23.0, 26.0, 29.0, 56.0, 68.0, 80.0, 92.0]
-    );
-
-    let (b, m, n, k) = (2, 2, 4, 3);
-    let lhs_stride = vec![m * k, k, 1];
-    let lhs: Vec<f32> = (0..b * m * k).map(|f| f as f32).collect();
-    let rhs_stride = vec![n * k, n, 1];
-    let rhs: Vec<f32> = (0..b * n * k).map(|f| f as f32).collect();
-    let results = run_gemm((b, m, n, k), &lhs, lhs_stride, 0, &rhs, rhs_stride, 0);
-    assert_eq!(
-        approx(results, 4),
-        vec![
-            20.0, 23.0, 26.0, 29.0, 56.0, 68.0, 80.0, 92.0, 344.0, 365.0, 386.0, 407.0, 488.0,
-            518.0, 548.0, 578.0
-        ]
-    );
-
-    // OFFSET
-    let (b, m, n, k) = (2, 2, 4, 3);
-    let lhs_stride = vec![m * k, k, 1];
-    let lhs: Vec<f32> = (0..b * m * k).map(|f| f as f32).collect();
-    let rhs_stride = vec![n * k, n, 1];
-    let rhs: Vec<f32> = (0..b * n * k).map(|f| f as f32).collect();
-    // Manually set batch_size=1 and offset 12 elements * 4 the number of bytes for f32
-    let results = run_gemm((1, m, n, k), &lhs, lhs_stride, 0, &rhs, rhs_stride, 12 * 4);
-    assert_eq!(
-        approx(results, 4),
-        vec![56.0, 59.0, 62.0, 65.0, 200.0, 212.0, 224.0, 236.0]
-    );
-}
--- a/candle-metal-kernels/src/unary.metal
+++ b/candle-metal-kernels/src/unary.metal
@ -1,133 +0,0 @@
-#include <metal_stdlib>
-#include <metal_math>
-#
-using namespace metal;
-
-METAL_FUNC uint get_strided_index(
-    uint idx,
-    constant size_t &num_dims,
-    constant size_t *dims,
-    constant size_t *strides
-) {
-    uint strided_i = 0;
-    for (uint d = 0; d < num_dims; d++) {
-        uint dim_idx = num_dims - 1 - d;
-        strided_i += (idx % dims[dim_idx]) * strides[dim_idx];
-        idx /= dims[dim_idx];
-    }
-    return strided_i;
-}
-
-template <typename T> METAL_FUNC T sqr(T in){ return in * in; }
-template <typename T> METAL_FUNC T neg(T in){ return -in; }
-template <typename T> METAL_FUNC T erf(T in){
-    float x = (float) in;
-    // constants
-    float a1 =  0.254829592;
-    float a2 = -0.284496736;
-    float a3 =  1.421413741;
-    float a4 = -1.453152027;
-    float a5 =  1.061405429;
-    float p  =  0.3275911;
-
-    // Save the sign of x
-    int sign = 1;
-    if (x < 0)
-        sign = -1;
-    x = fabs(x);
-
-    // A&S formula 7.1.26
-    float t = 1.0/(1.0 + p*x);
-    float y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-x*x);
-
-    return T(sign*y);
-}
-template <typename T> METAL_FUNC T id(T in) { return in; }
-template <typename T> METAL_FUNC T gelu_erf(T x) {
-    return T(x * (1 + erf(x * M_SQRT1_2_F)) / 2);
-}
-template <typename T> METAL_FUNC T gelu(T x) {
-    if (x > 5) {
-        return x;
-    }
-    T x_sq = x * x;
-    T x_cube = x_sq * x;
-    T alpha = x + static_cast<T>(0.044715) * x_cube;
-    T beta =  (static_cast<T>(M_2_SQRTPI_F * M_SQRT1_2_F) * alpha);
-    return static_cast<T>(0.5) * x * (static_cast<T>(1.0) + T(tanh(beta)));
-}
-
-
-
-#define UNARY(FN, TYPENAME, FN_NAME, FN_NAME_STRIDED) \
-kernel void FN_NAME( \
-    constant size_t &dim, \
-    device const TYPENAME *input,  \
-    device TYPENAME *output, \
-    uint thread_position_in_grid [[ thread_position_in_grid ]] \
-) { \
-    if (thread_position_in_grid >= dim) { \
-        return; \
-    } \
-    output[thread_position_in_grid] = TYPENAME(FN(float(input[thread_position_in_grid]))); \
-}\
-kernel void FN_NAME_STRIDED( \
-    constant size_t &dim, \
-    constant size_t &num_dims, \
-    constant size_t *dims, \
-    constant size_t *strides, \
-    device const TYPENAME *input,  \
-    device TYPENAME *output, \
-    uint thread_position_in_grid [[ thread_position_in_grid ]] \
-) { \
-    if (thread_position_in_grid >= dim) { \
-        return; \
-    } \
-    output[thread_position_in_grid] = TYPENAME(FN(float(input[get_strided_index(thread_position_in_grid, num_dims, dims, strides)]))); \
-}
-
-#define UNARY_OP(NAME) \
-UNARY(NAME, float, NAME##_f32, NAME##_f32_strided); \
-UNARY(NAME, half, NAME##_f16, NAME##_f16_strided);
-
-#define BFLOAT_UNARY_OP(NAME) \
-UNARY(NAME, bfloat, NAME##_bf16, NAME##_bf16_strided);
-
-
-UNARY_OP(cos)
-UNARY_OP(sin)
-UNARY_OP(sqr)
-UNARY_OP(sqrt)
-UNARY_OP(neg)
-UNARY_OP(exp)
-UNARY_OP(log)
-UNARY_OP(gelu)
-UNARY_OP(ceil)
-UNARY_OP(floor)
-UNARY_OP(round)
-UNARY_OP(gelu_erf)
-UNARY_OP(erf)
-UNARY_OP(tanh)
-UNARY(id, float, copy_f32, copy_f32_strided)
-UNARY(id, half, copy_f16, copy_f16_strided)
-UNARY(id, uint8_t, copy_u8, copy_u8_strided)
-UNARY(id, uint32_t, copy_u32, copy_u32_strided)
-
-#if __METAL_VERSION__ >= 310
-BFLOAT_UNARY_OP(cos)
-BFLOAT_UNARY_OP(sin)
-BFLOAT_UNARY_OP(sqr)
-BFLOAT_UNARY_OP(sqrt)
-BFLOAT_UNARY_OP(neg)
-BFLOAT_UNARY_OP(exp)
-BFLOAT_UNARY_OP(log)
-BFLOAT_UNARY_OP(gelu)
-BFLOAT_UNARY_OP(ceil)
-BFLOAT_UNARY_OP(floor)
-BFLOAT_UNARY_OP(round)
-BFLOAT_UNARY_OP(gelu_erf)
-BFLOAT_UNARY_OP(erf)
-BFLOAT_UNARY_OP(tanh)
-
-UNARY(id, bfloat, copy_bf16, copy_bf16_strided)
-#endif
--- a/candle-metal-kernels/tmp/affine.rs
+++ b/candle-metal-kernels/tmp/affine.rs
@ -1,76 +0,0 @@
-use candle_metal_kernels::{call_affine, Kernels};
-use metal::objc::rc::autoreleasepool;
-use metal::{Device, MTLResourceOptions};
-use rand;
-use std::any::type_name;
-use std::time::Instant;
-
-fn main() {
-    let device = Device::system_default().unwrap();
-    let kernels = Kernels::new();
-
-    let f32_1k = (0..1000).map(|_| rand::random::<f32>()).collect::<Vec<_>>();
-    let f32_10k = (0..10000)
-        .map(|_| rand::random::<f32>())
-        .collect::<Vec<_>>();
-    let f32_100k = (0..100000)
-        .map(|_| rand::random::<f32>())
-        .collect::<Vec<_>>();
-
-    println!(
-        "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11} | {5: <11}",
-        "dtype", "kernel", "size", "runs", "total time", "avg time"
-    );
-
-    // f32
-    run_affine_bench(&device, &kernels, &f32_1k);
-    run_affine_bench(&device, &kernels, &f32_10k);
-    run_affine_bench(&device, &kernels, &f32_100k);
-}
-
-fn run_affine_bench<T: Clone>(device: &Device, kernels: &Kernels, v: &[T]) {
-    let command_queue = device.new_command_queue();
-    let options = MTLResourceOptions::StorageModeManaged;
-
-    let iterations = 10000;
-    let input = device.new_buffer_with_data(
-        v.as_ptr() as *const core::ffi::c_void,
-        core::mem::size_of_val(v) as u64,
-        options,
-    );
-    let mut output = device.new_buffer(core::mem::size_of_val(v) as u64, options);
-
-    let mul: f32 = 1.2345;
-    let add: f32 = 2.3456;
-    let total_time = autoreleasepool(|| {
-        let command_buffer = command_queue.new_command_buffer();
-        let start = Instant::now();
-        for _ in 0..iterations {
-            call_affine(
-                &device,
-                command_buffer,
-                &kernels,
-                "affine_float",
-                v.len(),
-                &input,
-                &mut output,
-                mul,
-                add,
-            )
-            .unwrap();
-        }
-        command_buffer.commit();
-        command_buffer.wait_until_completed();
-
-        start.elapsed()
-    });
-    println!(
-        "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
-        type_name::<T>().split("::").last().unwrap(),
-        "affine",
-        v.len(),
-        iterations,
-        total_time,
-        total_time / iterations
-    );
-}
--- a/Show More
+++ b/Show More