Tmp gemm.

Reuse buffers on our own reference counts.
Metal operational.
2025-06-17 02:58:50 +00:00 · 2023-11-19 20:43:59 +01:00 · 2023-11-18 23:28:59 +01:00 · 2023-11-18 00:52:38 +01:00 · 2023-11-17 10:36:57 +01:00 · 2023-11-16 11:07:56 +01:00
332 changed files with 6568 additions and 39838 deletions
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -1,7 +0,0 @@
-version: 2
-updates:
-  - package-ecosystem: "cargo"
-    directory: "/"
-    schedule:
-      interval: "weekly"
-    open-pull-requests-limit: 5
--- a/.github/workflows/ci_cuda.yaml
+++ b/.github/workflows/ci_cuda.yaml
@ -5,15 +5,47 @@ on:
  pull_request:

 jobs:
+  start-runner:
+    name: Start self-hosted EC2 runner
+    runs-on: ubuntu-latest
+    env:
+      AWS_REGION: us-east-1
+      EC2_AMI_ID: ami-03cfed9ea28f4b002
+      EC2_INSTANCE_TYPE: g5.xlarge
+      EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc
+      EC2_SECURITY_GROUP: sg-030175c435ac141d6
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ env.AWS_REGION }}
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: philschmid/philschmid-ec2-github-runner@main
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ${{ env.EC2_AMI_ID }}
+          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
+          subnet-id: ${{ env.EC2_SUBNET_ID }}
+          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
+          aws-resource-tags: > # optional, requires additional permissions
+            [
+              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
+            ]
+
  test-cuda:
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
-    container:
-      image: nvidia/cuda:12.3.1-devel-ubuntu22.04
-      options: --gpus 0 
-    if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }}
+    needs: start-runner # required to start the main job when the runner is ready
+    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
    permissions:
      contents: write
      packages: write
@ -24,10 +56,32 @@ jobs:
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
-      - name: Install dependencies
-        run: apt-get update && apt install curl build-essential libssl-dev protobuf-compiler pkg-config -y
      - name: Install Rust Stable
-        uses: actions-rust-lang/setup-rust-toolchain@v1
+        run: curl https://sh.rustup.rs -sSf | sh -s -- -y
      - uses: Swatinem/rust-cache@v2
+      - run: apt-get update -y && apt-get install libssl-dev protobuf-compiler -y
      - name: Test (cuda)
-        run: cargo test --features cuda
+        run: PATH=$PATH:/usr/local/cuda-11.8/bin/ /root/.cargo/bin/cargo test --features cuda
+  stop-runner:
+    name: Stop self-hosted EC2 runner
+    needs:
+      - start-runner
+      - test-cuda
+    runs-on: ubuntu-latest
+    env:
+      AWS_REGION: us-east-1
+    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ env.AWS_REGION }}
+      - name: Stop EC2 runner
+        uses: philschmid/philschmid-ec2-github-runner@main
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -63,7 +63,7 @@ This documents the main changes to the `candle` crate.
  [760](https://github.com/huggingface/candle/pull/760).
 - Add the Segment-Anything Model (SAM) as an example
  [773](https://github.com/huggingface/candle/pull/773).
- TinyViT backbone for the segment anything example
+- TinyViT backbone for the segemnt anything example
  [787](https://github.com/huggingface/candle/pull/787).
 - Shape with holes support
  [770](https://github.com/huggingface/candle/pull/770).
--- a/Cargo.toml
+++ b/Cargo.toml
@ -19,7 +19,7 @@ exclude = [
 resolver = "2"

 [workspace.package]
-version = "0.4.2"
+version = "0.3.0"
 edition = "2021"
 description = "Minimalist ML framework."
 repository = "https://github.com/huggingface/candle"
@ -28,50 +28,40 @@ categories = ["science"]
 license = "MIT OR Apache-2.0"

 [workspace.dependencies]
-ab_glyph = "0.2.23"
 accelerate-src = { version = "0.3.2" }
 anyhow = { version = "1", features = ["backtrace"] }
 byteorder = "1.4.3"
-candle = { path = "./candle-core", package = "candle-core", version = "0.4.2" }
-candle-datasets = { path = "./candle-datasets", version = "0.4.2" }
-candle-flash-attn = { path = "./candle-flash-attn", version = "0.4.2" }
-candle-kernels = { path = "./candle-kernels", version = "0.4.2" }
-candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.4.2" }
-candle-nn = { path = "./candle-nn", version = "0.4.2" }
-candle-onnx = { path = "./candle-onnx", version = "0.4.2" }
-candle-transformers = { path = "./candle-transformers", version = "0.4.2" }
 clap = { version = "4.2.4", features = ["derive"] }
-criterion = { version = "0.5.1", default-features=false }
-cudarc = { version = "0.10.0", features = ["f16"] }
-fancy-regex = "0.13.0"
-gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] }
+cudarc = { version = "0.9.14", features = ["f16"] }
+gemm = { version = "0.16.6", features = ["wasm-simd128-enable"] }
 hf-hub = "0.3.0"
 half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
-image = { version = "0.25.0", default-features = false, features = ["jpeg", "png"] }
-imageproc = { version = "0.24.0", default-features = false }
+image = { version = "0.24.7", default-features = false, features = ["jpeg", "png"] }
+imageproc = { version = "0.23.0", default-features = false }
 intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
 libc = { version = "0.2.147" }
 log = "0.4"
-memmap2 = { version = "0.9.3", features = ["stable_deref_trait"] }
+memmap2 = { version = "0.7.1", features = ["stable_deref_trait"] }
 num_cpus = "1.15.0"
 num-traits = "0.2.15"
-parquet = { version = "50.0.0" }
+parquet = { version = "45.0.0" }
 rand = "0.8.5"
 rand_distr = "0.4.3"
 rayon = "1.7.0"
-safetensors = "0.4.1"
+rusttype = { version = "0.9", default-features = false }
+safetensors = "0.3.1"
 serde = { version = "1.0.171", features = ["derive"] }
 serde_plain = "1.0.2"
 serde_json = "1.0.99"
 thiserror = "1"
-tokenizers = { version = "0.15.0", default-features = false }
+tokenizers = { version = "0.13.4", default-features = false }
 tracing = "0.1.37"
 tracing-chrome = "0.7.1"
 tracing-subscriber = "0.3.7"
 wav = "1.0.0"
 yoke = { version = "0.7.2", features = ["derive"] }
 zip = { version = "0.6.6", default-features = false }
-metal = { version = "0.27.0", features = ["mps"]}
+metal = { git = "https://github.com/ivarflakstad/metal-rs.git", features = ["mps"] }

 [profile.release-with-debug]
 inherits = "release"
--- a/README.md
+++ b/README.md
@ -54,33 +54,20 @@ These online demos run entirely in your browser:
 - [whisper](https://huggingface.co/spaces/lmz/candle-whisper): speech recognition.
 - [LLaMA2](https://huggingface.co/spaces/lmz/candle-llama2): text generation.
 - [T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm): text generation.
- [Phi-1.5, and Phi-2](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm): text generation.
+- [Phi-v1.5](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm): text generation.
 - [Segment Anything Model](https://huggingface.co/spaces/radames/candle-segment-anything-wasm): Image segmentation.
 - [BLIP](https://huggingface.co/spaces/radames/Candle-BLIP-Image-Captioning): image captioning.

 We also provide a some command line based examples using state of the art models:

- [LLaMA and LLaMA-v2](./candle-examples/examples/llama/): general LLM, includes
-  the SOLAR-10.7B variant.
+- [LLaMA and LLaMA-v2](./candle-examples/examples/llama/): general LLM.
 - [Falcon](./candle-examples/examples/falcon/): general LLM.
- [Gemma](./candle-examples/examples/gemma/): 2b and 7b general LLMs from Google
-  Deepmind.
- [Phi-1, Phi-1.5, and Phi-2](./candle-examples/examples/phi/): 1.3b and 2.7b general LLMs with performance on par with LLaMA-v2 7b.
+- [Phi-v1 and Phi-v1.5](./candle-examples/examples/phi/): a 1.3b general LLM with performance on par with LLaMA-v2 7b.
 - [StableLM-3B-4E1T](./candle-examples/examples/stable-lm/): a 3b general LLM
-  pre-trained on 1T tokens of English and code datasets. Also supports
-  StableLM-2, a 1.6b LLM trained on 2T tokens, as well as the code variants.
- [Mamba](./candle-examples/examples/mamba/): an inference only
-  implementation of the Mamba state space model.
+  pre-trained on 1T tokens of English and code datasets.
 - [Mistral7b-v0.1](./candle-examples/examples/mistral/): a 7b general LLM with
-  better performance than all publicly available 13b models as of 2023-09-28.
- [Mixtral8x7b-v0.1](./candle-examples/examples/mixtral/): a sparse mixture of
-  experts 8x7b general LLM with better performance than a Llama 2 70B model with
-  much faster inference.
- [StarCoder](./candle-examples/examples/bigcode/) and
-  [StarCoder2](./candle-examples/examples/starcoder2/): LLM specialized to code generation.
- [Qwen1.5](./candle-examples/examples/qwen/): Bilingual (English/Chinese) LLMs.
- [RWKV v5 and v6](./candle-examples/examples/rwkv/): An RNN with transformer level LLM
-  performance.
+  performance larger than all publicly available 13b models as of 2023-09-28.
+- [StarCoder](./candle-examples/examples/bigcode/): LLM specialized to code generation.
 - [Replit-code-v1.5](./candle-examples/examples/replit-code/): a 3.3b LLM specialized for code completion.
 - [Yi-6B / Yi-34B](./candle-examples/examples/yi/): two bilingual
  (English/Chinese) general LLMs with 6b and 34b parameters.
@ -91,7 +78,7 @@ We also provide a some command line based examples using state of the art models
 <img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/quantized/assets/aoc.gif" width="600">
  
 - [Stable Diffusion](./candle-examples/examples/stable-diffusion/): text to
-  image generative model, support for the 1.5, 2.1, SDXL 1.0 and Turbo versions.
+  image generative model, support for the 1.5, 2.1, and SDXL 1.0 versions.

 <img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg" width="200">

@ -110,23 +97,14 @@ We also provide a some command line based examples using state of the art models

 <img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/segment-anything/assets/sam_merged.jpg" width="200">

- [SegFormer](./candle-examples/examples/segformer/): transformer based semantic segmantation model.
 - [Whisper](./candle-examples/examples/whisper/): speech recognition model.
- [EnCodec](./candle-examples/examples/encodec/): high-quality audio compression
-  model using residual vector quantization.
- [MetaVoice](./candle-examples/examples/metavoice/): foundational model for
-  text-to-speech.
 - [T5](./candle-examples/examples/t5), [Bert](./candle-examples/examples/bert/),
  [JinaBert](./candle-examples/examples/jina-bert/) : useful for sentence embeddings.
 - [DINOv2](./candle-examples/examples/dinov2/): computer vision model trained
  using self-supervision (can be used for imagenet classification, depth
  evaluation, segmentation).
- [VGG](./candle-examples/examples/vgg/),
-  [RepVGG](./candle-examples/examples/repvgg): computer vision models.
 - [BLIP](./candle-examples/examples/blip/): image to text model, can be used to
  generate captions for an image.
- [TrOCR](./candle-examples/examples/trocr/): a transformer OCR model, with
-  dedicated submodels for hand-writing and printed recognition.
 - [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation
  model, generates the translated text from the input text.

@ -144,7 +122,7 @@ There are also some wasm examples for whisper and
 [whisper](https://huggingface.co/spaces/lmz/candle-whisper),
 [llama2](https://huggingface.co/spaces/lmz/candle-llama2),
 [T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm),
-[Phi-1.5, and Phi-2](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm),
+[Phi-v1.5](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm),
 [Segment Anything Model](https://huggingface.co/spaces/radames/candle-segment-anything-wasm).

 For LLaMA2, run the following command to retrieve the weight files and start a
@ -161,21 +139,17 @@ And then head over to
 <!--- ANCHOR: useful_libraries --->

 ## Useful External Resources
- [`candle-tutorial`](https://github.com/ToluClassics/candle-tutorial): A
+- [`candle-tutorial`](https://github.com/ToluClassics/candle-tutorial): a
  very detailed tutorial showing how to convert a PyTorch model to Candle.
- [`candle-lora`](https://github.com/EricLBuehler/candle-lora): Efficient and
-  ergonomic LoRA implementation for Candle. `candle-lora` has      
-  out-of-the-box LoRA support for many models from Candle, which can be found
-  [here](https://github.com/EricLBuehler/candle-lora/tree/master/candle-lora-transformers/examples).
- [`optimisers`](https://github.com/KGrewal1/optimisers): A collection of optimisers
+- [`optimisers`](https://github.com/KGrewal1/optimisers): a collection of optimisers
  including SGD with momentum, AdaGrad, AdaDelta, AdaMax, NAdam, RAdam, and RMSprop.
+- [`candle-lora`](https://github.com/EricLBuehler/candle-lora): a LoRA implementation
+  that conforms to the official `peft` implementation.
 - [`candle-vllm`](https://github.com/EricLBuehler/candle-vllm): Efficient platform for inference and
  serving local LLMs including an OpenAI compatible API server.
- [`candle-ext`](https://github.com/mokeyish/candle-ext): An extension library to Candle that provides PyTorch functions not currently available in Candle.
- [`kalosm`](https://github.com/floneum/floneum/tree/master/interfaces/kalosm): A multi-modal meta-framework in Rust for interfacing with local pre-trained models with support for controlled generation, custom samplers, in-memory vector databases, audio transcription, and more.
+- [`candle-ext`](https://github.com/mokeyish/candle-ext): an extension library to Candle that provides PyTorch functions not currently available in Candle.
+- [`kalosm`](https://github.com/floneum/floneum/tree/master/kalosm): A multi-modal meta-framework in Rust for interfacing with local pre-trained models with support for controlled generation, custom samplers, in-memory vector databases, audio transcription, and more.
 - [`candle-sampling`](https://github.com/EricLBuehler/candle-sampling): Sampling techniques for Candle.
- [`gpt-from-scratch-rs`](https://github.com/jeroenvlek/gpt-from-scratch-rs): A port of Andrej Karpathy's _Let's build GPT_ tutorial on YouTube showcasing the Candle API on a toy problem.
- [`candle-einops`](https://github.com/tomsanbear/candle-einops): A pure rust implementation of the python [einops](https://github.com/arogozhnikov/einops) library.

 If you have an addition to this list, please submit a pull request.

@ -194,45 +168,28 @@ If you have an addition to this list, please submit a pull request.
    - WASM support, run your models in a browser.
 - Included models.
    - Language Models.
-        - LLaMA v1 and v2 with variants such as SOLAR-10.7B.
+        - LLaMA v1 and v2.
        - Falcon.
-        - StarCoder, StarCoder2.
-        - Phi 1, 1.5, and 2.
-        - Mamba, Minimal Mamba
-        - Gemma 2b and 7b.
+        - StarCoder.
+        - Phi v1.5.
        - Mistral 7b v0.1.
-        - Mixtral 8x7b v0.1.
-        - StableLM-3B-4E1T, StableLM-2-1.6B, Stable-Code-3B.
+        - StableLM-3B-4E1T.
        - Replit-code-v1.5-3B.
        - Bert.
        - Yi-6B and Yi-34B.
-        - Qwen1.5.
-        - RWKV v5 and v6.
-    - Quantized LLMs.
-        - Llama 7b, 13b, 70b, as well as the chat and code variants.
-        - Mistral 7b, and 7b instruct.
-        - Mixtral 8x7b.
-        - Zephyr 7b a and b (Mistral-7b based).
-        - OpenChat 3.5 (Mistral-7b based).
    - Text to text.
        - T5 and its variants: FlanT5, UL2, MADLAD400 (translation), CoEdit (Grammar correction).
        - Marian MT (Machine Translation).
+    - Whisper (multi-lingual support).
    - Text to image.
        - Stable Diffusion v1.5, v2.1, XL v1.0.
        - Wurstchen v2.
    - Image to text.
        - BLIP.
-        - TrOCR.
-    - Audio.
-        - Whisper, multi-lingual speech-to-text.
-        - EnCodec, audio compression model.
-        - MetaVoice-1B, text-to-speech model.
    - Computer Vision Models.
-        - DINOv2, ConvMixer, EfficientNet, ResNet, ViT, VGG, RepVGG, ConvNeXT,
-          ConvNeXTv2, MobileOne, EfficientVit (MSRA).
+        - DINOv2, ConvMixer, EfficientNet, ResNet, ViT.
        - yolo-v3, yolo-v8.
        - Segment-Anything Model (SAM).
-        - SegFormer.
 - File formats: load models from safetensors, npz, ggml, or PyTorch files.
 - Serverless (on CPU), small and fast deployments.
 - Quantization support using the llama.cpp quantized types.
--- a/candle-book/Cargo.toml
+++ b/candle-book/Cargo.toml
@ -11,11 +11,11 @@ readme = "README.md"

 [dependencies]
 accelerate-src = { workspace = true, optional = true }
-candle = { workspace = true }
-candle-datasets = { workspace = true }
-candle-nn = { workspace = true }
-candle-transformers = { workspace = true }
-candle-flash-attn = { workspace = true, optional = true }
+candle = { path = "../candle-core", version = "0.3.0", package = "candle-core" }
+candle-datasets = { path = "../candle-datasets", version = "0.3.0" }
+candle-nn = { path = "../candle-nn", version = "0.3.0" }
+candle-transformers = { path = "../candle-transformers", version = "0.3.0" }
+candle-flash-attn = { path = "../candle-flash-attn", version = "0.3.0", optional = true }
 safetensors = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
--- a/candle-book/src/apps/dekstop.md
+++ b/candle-book/src/apps/dekstop.md
--- a/candle-book/src/lib.rs
+++ b/candle-book/src/lib.rs
@ -28,7 +28,6 @@ let weights = candle::safetensors::load(weights_filename, &Device::Cpu).unwrap()
    #[rustfmt::skip]
    #[test]
    fn book_hub_2() {
-        {
 // ANCHOR: book_hub_2
 use candle::Device;
 use hf_hub::api::sync::Api;
@ -46,10 +45,9 @@ let weights = candle::safetensors::load_buffer(&mmap[..], &Device::Cpu).unwrap()
        assert_eq!(weights.len(), 206);
    }

-    // #[rustfmt::skip]
-    // #[test]
-    // fn book_hub_3() {
-    {
+    #[rustfmt::skip]
+    #[test]
+    fn book_hub_3() {
 // ANCHOR: book_hub_3
 use candle::{DType, Device, Tensor};
 use hf_hub::api::sync::Api;
@ -104,7 +102,6 @@ let tp_tensor = Tensor::from_raw_buffer(&raw, dtype, &tp_shape, &Device::Cpu).un
        assert_eq!(view.shape(), &[768, 768]);
        assert_eq!(tp_tensor.dims(), &[192, 768]);
    }
-}

    #[rustfmt::skip]
    #[test]
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@ -12,8 +12,8 @@ readme = "README.md"
 [dependencies]
 accelerate-src = { workspace = true, optional = true }
 byteorder = { workspace = true }
-candle-kernels = { workspace = true, optional = true }
-candle-metal-kernels = { workspace = true, optional = true }
+candle-kernels = { path = "../candle-kernels", version = "0.3.0", optional = true }
+candle-metal-kernels = { path = "../candle-metal-kernels", version = "0.3.0", optional = true }
 metal = { workspace = true, optional = true}
 cudarc = { workspace = true, optional = true }
 gemm = { workspace = true }
@ -34,8 +34,6 @@ zip = { workspace = true }
 [dev-dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-criterion = { workspace = true }
-

 [features]
 default = []
@ -44,7 +42,3 @@ cudnn = ["cuda", "cudarc/cudnn"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
 accelerate = ["dep:libc", "dep:accelerate-src"]
 metal = ["dep:metal", "dep:candle-metal-kernels"]
-
-[[bench]]
-name = "bench_main"
-harness = false
--- a/candle-core/benches/bench_main.rs
+++ b/candle-core/benches/bench_main.rs
@ -1,10 +0,0 @@
-mod benchmarks;
-
-use criterion::criterion_main;
-criterion_main!(
-    benchmarks::affine::benches,
-    benchmarks::matmul::benches,
-    benchmarks::random::benches,
-    benchmarks::where_cond::benches,
-    benchmarks::conv_transpose2d::benches,
-);
--- a/candle-core/benches/benchmarks/affine.rs
+++ b/candle-core/benches/benchmarks/affine.rs
@ -1,43 +0,0 @@
-use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
-use candle_core::{DType, Device, Tensor};
-use criterion::{black_box, criterion_group, Criterion, Throughput};
-use std::time::Instant;
-
-fn run(a: &Tensor) {
-    a.affine(12.34, 56.78).unwrap();
-}
-
-fn run_affine_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
-    let b = 1;
-    let m = 1024;
-    let k = 1024;
-
-    let tensor = Tensor::zeros((b, m, k), dtype, &device).unwrap();
-
-    let flops = b * m * k * dtype.size_in_bytes();
-
-    let mut group = c.benchmark_group(device.bench_name(name));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |b| {
-        b.iter_custom(|iters| {
-            let start = Instant::now();
-            for _i in 0..iters {
-                run(black_box(&tensor));
-            }
-            device.sync().unwrap();
-            start.elapsed()
-        })
-    });
-    group.finish();
-}
-
-fn criterion_benchmark(c: &mut Criterion) {
-    let handler = BenchDeviceHandler::new().unwrap();
-    for device in handler.devices {
-        run_affine_benchmark(c, &device, DType::F32, "affine_f32");
-        run_affine_benchmark(c, &device, DType::F16, "affine_f16");
-        run_affine_benchmark(c, &device, DType::BF16, "affine_bf16");
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
--- a/candle-core/benches/benchmarks/conv_transpose2d.rs
+++ b/candle-core/benches/benchmarks/conv_transpose2d.rs
@ -1,59 +0,0 @@
-use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
-use candle_core::{DType, Device, Tensor};
-use criterion::{black_box, criterion_group, Criterion, Throughput};
-use std::time::Instant;
-
-fn run(
-    x: &Tensor,
-    k: &Tensor,
-    padding: usize,
-    output_padding: usize,
-    stride: usize,
-    dilation: usize,
-) {
-    x.conv_transpose2d(k, padding, output_padding, stride, dilation)
-        .unwrap();
-}
-
-fn run_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
-    let t = Tensor::arange(0.0f32, 10000.0, device)
-        .unwrap()
-        .reshape((1, 4, 50, 50))
-        .unwrap()
-        .to_dtype(dtype)
-        .unwrap();
-
-    let kernel = Tensor::arange(0.0f32, 100.0, device)
-        .unwrap()
-        .reshape((4, 1, 5, 5))
-        .unwrap()
-        .to_dtype(dtype)
-        .unwrap();
-
-    let flops = t.dims().iter().product::<usize>() * dtype.size_in_bytes();
-
-    let mut group = c.benchmark_group(device.bench_name(name));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |b| {
-        b.iter_custom(|iters| {
-            let start = Instant::now();
-            for _i in 0..iters {
-                run(black_box(&t), black_box(&kernel), 1, 0, 1, 2);
-            }
-            device.sync().unwrap();
-            start.elapsed()
-        })
-    });
-    group.finish();
-}
-
-fn criterion_benchmark(c: &mut Criterion) {
-    let handler = BenchDeviceHandler::new().unwrap();
-    for device in handler.devices {
-        run_benchmark(c, &device, DType::F32, "conv_transpose2d_f32");
-        run_benchmark(c, &device, DType::F16, "conv_transpose2d_f16");
-        run_benchmark(c, &device, DType::BF16, "conv_transpose2d_bf16");
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
--- a/candle-core/benches/benchmarks/matmul.rs
+++ b/candle-core/benches/benchmarks/matmul.rs
@ -1,44 +0,0 @@
-use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
-use candle_core::{DType, Device, Tensor};
-use criterion::{black_box, criterion_group, Criterion, Throughput};
-use std::time::Instant;
-
-fn run(a: &Tensor, b: &Tensor) {
-    a.matmul(&b.t().unwrap()).unwrap();
-}
-
-fn run_bench(c: &mut Criterion, device: &Device) {
-    let b = 1;
-    let m = 1;
-    let n = 2048;
-    let k = 2048;
-
-    let dtype = DType::F32;
-    let lhs = Tensor::zeros((b, m, k), dtype, device).unwrap();
-    let rhs = Tensor::zeros((b, n, k), dtype, device).unwrap();
-
-    let flops = b * m * n * k;
-
-    let mut group = c.benchmark_group(device.bench_name("matmul"));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |b| {
-        b.iter_custom(|iters| {
-            let start = Instant::now();
-            for _i in 0..iters {
-                run(black_box(&lhs), black_box(&rhs));
-            }
-            device.sync().unwrap();
-            start.elapsed()
-        })
-    });
-    group.finish();
-}
-
-fn criterion_benchmark(c: &mut Criterion) {
-    let handler = BenchDeviceHandler::new().unwrap();
-    for device in handler.devices {
-        run_bench(c, &device);
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
--- a/candle-core/benches/benchmarks/mod.rs
+++ b/candle-core/benches/benchmarks/mod.rs
@ -1,67 +0,0 @@
-pub(crate) mod affine;
-pub(crate) mod conv_transpose2d;
-pub(crate) mod matmul;
-pub(crate) mod random;
-pub(crate) mod where_cond;
-
-use candle_core::{Device, Result};
-
-pub(crate) trait BenchDevice {
-    fn sync(&self) -> Result<()>;
-
-    fn bench_name<S: Into<String>>(&self, name: S) -> String;
-}
-
-impl BenchDevice for Device {
-    fn sync(&self) -> Result<()> {
-        match self {
-            Device::Cpu => Ok(()),
-            Device::Cuda(device) => {
-                #[cfg(feature = "cuda")]
-                return Ok(device.synchronize()?);
-                #[cfg(not(feature = "cuda"))]
-                panic!("Cuda device without cuda feature enabled: {:?}", device)
-            }
-            Device::Metal(device) => {
-                #[cfg(feature = "metal")]
-                return Ok(device.wait_until_completed()?);
-                #[cfg(not(feature = "metal"))]
-                panic!("Metal device without metal feature enabled: {:?}", device)
-            }
-        }
-    }
-
-    fn bench_name<S: Into<String>>(&self, name: S) -> String {
-        match self {
-            Device::Cpu => {
-                let cpu_type = if cfg!(feature = "accelerate") {
-                    "accelerate"
-                } else if cfg!(feature = "mkl") {
-                    "mkl"
-                } else {
-                    "cpu"
-                };
-                format!("{}_{}", cpu_type, name.into())
-            }
-            Device::Cuda(_) => format!("cuda_{}", name.into()),
-            Device::Metal(_) => format!("metal_{}", name.into()),
-        }
-    }
-}
-
-struct BenchDeviceHandler {
-    devices: Vec<Device>,
-}
-
-impl BenchDeviceHandler {
-    pub fn new() -> Result<Self> {
-        let mut devices = Vec::new();
-        if cfg!(feature = "metal") {
-            devices.push(Device::new_metal(0)?);
-        } else if cfg!(feature = "cuda") {
-            devices.push(Device::new_cuda(0)?);
-        }
-        devices.push(Device::Cpu);
-        Ok(Self { devices })
-    }
-}
--- a/candle-core/benches/benchmarks/random.rs
+++ b/candle-core/benches/benchmarks/random.rs
@ -1,63 +0,0 @@
-use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
-use candle_core::{DType, Device, Tensor};
-use criterion::{black_box, criterion_group, Criterion, Throughput};
-use std::time::Instant;
-
-fn rand_uniform(a: &Tensor) {
-    a.rand_like(-1.0, 123.0).unwrap();
-}
-
-fn rand_normal(a: &Tensor) {
-    a.randn_like(100.0, 15.0).unwrap();
-}
-
-fn run_random_bench(c: &mut Criterion, device: &Device) {
-    let b = 1;
-
-    let rows = 2048;
-    let cols = 2048;
-
-    let dtype = DType::F32;
-    let tensor = Tensor::zeros((b, rows, cols), dtype, device).unwrap();
-
-    let flops = b * rows * cols * dtype.size_in_bytes();
-
-    let mut group = c.benchmark_group(device.bench_name("random_uniform"));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |benches| {
-        benches.iter_custom(|iters| {
-            let start = Instant::now();
-            for _i in 0..iters {
-                rand_uniform(black_box(&tensor));
-            }
-            device.sync().unwrap();
-            start.elapsed()
-        })
-    });
-    group.finish();
-
-    let tensor = Tensor::zeros((b, rows, cols), dtype, device).unwrap();
-
-    let mut group = c.benchmark_group(device.bench_name("random_normal"));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |benches| {
-        benches.iter_custom(|iters| {
-            let start = Instant::now();
-            for _i in 0..iters {
-                rand_normal(black_box(&tensor));
-            }
-            device.sync().unwrap();
-            start.elapsed()
-        })
-    });
-    group.finish();
-}
-
-fn criterion_benchmark(c: &mut Criterion) {
-    let handler = BenchDeviceHandler::new().unwrap();
-    for device in handler.devices {
-        run_random_bench(c, &device);
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
--- a/candle-core/benches/benchmarks/where_cond.rs
+++ b/candle-core/benches/benchmarks/where_cond.rs
@ -1,64 +0,0 @@
-use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
-use candle_core::{DType, Device, Tensor};
-use criterion::{black_box, criterion_group, Criterion, Throughput};
-use std::time::Instant;
-
-fn run(a: &Tensor, b: &Tensor, c: &Tensor) {
-    a.where_cond(b, c).unwrap();
-}
-
-const fn create_cond_arr<const N: usize>() -> [u8; N] {
-    let mut arr = [0u8; N];
-    let mut i = 0;
-    while i < N {
-        arr[i] = (i % 2) as u8;
-        i += 1;
-    }
-    arr
-}
-
-const B: usize = 1;
-const M: usize = 1024;
-const K: usize = 1024;
-const SIZE: usize = B * M * K;
-
-const DATA: [u8; SIZE] = create_cond_arr::<SIZE>();
-
-fn run_where_cond_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
-    let tensor = Tensor::from_slice(DATA.as_slice(), (B, M, K), &device).unwrap();
-    let on_true = Tensor::ones((B, M, K), dtype, &device).unwrap();
-    let on_false = Tensor::zeros((B, M, K), dtype, &device).unwrap();
-
-    let elements = B * M * K;
-    // E.g. 2 f32 tensors + 1 u8 tensor
-    let flops = (2 * elements * dtype.size_in_bytes()) + elements;
-
-    let mut group = c.benchmark_group(device.bench_name(name));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |b| {
-        b.iter_custom(|iters| {
-            let start = Instant::now();
-            for _i in 0..iters {
-                run(
-                    black_box(&tensor),
-                    black_box(&on_true),
-                    black_box(&on_false),
-                );
-            }
-            device.sync().unwrap();
-            start.elapsed()
-        })
-    });
-    group.finish();
-}
-
-fn criterion_benchmark(c: &mut Criterion) {
-    let device = BenchDeviceHandler::new().unwrap();
-    for d in device.devices {
-        run_where_cond_benchmark(c, &d, DType::F32, "where_cond_f32");
-        run_where_cond_benchmark(c, &d, DType::BF16, "where_cond_bf16");
-        run_where_cond_benchmark(c, &d, DType::F16, "where_cond_f16");
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
--- a/candle-core/examples/basics.rs
+++ b/candle-core/examples/basics.rs
@ -8,10 +8,11 @@ use anyhow::Result;
 use candle_core::{Device, Tensor};

 fn main() -> Result<()> {
-    let a = Tensor::new(&[[0.0f32, 1.0, 2.0], [3.0, 4.0, 5.0]], &Device::Cpu)?;
-    let b = Tensor::new(&[[88.0f32, 99.0]], &Device::Cpu)?;
-    let new_a = a.slice_scatter(&b, 1, 2)?;
-    assert_eq!(a.to_vec2::<f32>()?, [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
-    assert_eq!(new_a.to_vec2::<f32>()?, [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
+    let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?;
+    let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?;
+    let start = std::time::Instant::now();
+    let res = inp.conv2d(&w, 0, 1, 1, 1)?;
+    println!("{:?}", start.elapsed());
+    println!("{res:?}");
    Ok(())
 }
--- a/candle-core/examples/cuda_basics.rs
+++ b/candle-core/examples/cuda_basics.rs
@ -5,32 +5,25 @@ extern crate accelerate_src;
 extern crate intel_mkl_src;

 use anyhow::Result;
-use candle_core::{Device, Module, Tensor};
-
-use candle_core::quantized::{QMatMul, QTensor};
+use candle_core::{Device, Tensor};

 fn main() -> Result<()> {
    let device = Device::new_cuda(0)?;
-    let q = Tensor::randn(0f32, 1.0, (72, 256), &device)?;
-    let q_cpu = q.to_device(&Device::Cpu)?;
-    let q = QTensor::quantize(&q, candle_core::quantized::GgmlDType::Q8K)?;
-    let q = QMatMul::from_qtensor(q)?;
-    let x = Tensor::randn(0f32, 1.0, (5, 256), &device)?;
-    let res_q_cuda = q.forward(&x)?;
-    println!("{res_q_cuda}");
-
-    let q_cpu = QTensor::quantize(&q_cpu, candle_core::quantized::GgmlDType::Q8K)?;
-    let q_cpu_tensor = q_cpu.dequantize(&Device::Cpu)?;
-    let q_cpu = QMatMul::from_qtensor(q_cpu)?;
-    let x_cpu = x.to_device(&Device::Cpu)?;
-    let res_q_cpu = q_cpu.forward(&x_cpu)?;
-    println!("{res_q_cpu}");
-
-    let res_mm = x_cpu.matmul(&q_cpu_tensor.t()?)?;
-    let diff = (res_mm - res_q_cuda.to_device(&Device::Cpu))?
-        .abs()?
-        .flatten_all()?
-        .max(0)?;
+    let in_t = Tensor::rand(-1f32, 1f32, (1, 3, 12, 7), &device)?;
+    let k_t = Tensor::rand(-1f32, 1f32, (6, 3, 1, 1), &device)?;
+    let out_t = in_t.conv2d(&k_t, 0, 1, 1, 1)?;
+    println!("{out_t}");
+    let in_t = in_t.to_device(&Device::Cpu)?;
+    let k_t = k_t.to_device(&Device::Cpu)?;
+    let out_t2 = in_t.conv2d(&k_t, 0, 1, 1, 1)?;
+    let diff = (out_t.to_device(&Device::Cpu)? - out_t2)?
+        .sqr()?
+        .sum_all()?;
    println!("{diff}");
+
+    let t = Tensor::randn(0f32, 1f32, (2, 4, 96, 96), &device)?;
+    let w = Tensor::randn(0f32, 1f32, (320, 4, 3, 3), &device)?;
+    let res = t.conv2d(&w, 1, 1, 1, 1)?;
+    println!("{res:?}");
    Ok(())
 }
--- a/candle-core/examples/tensor-tools.rs
+++ b/candle-core/examples/tensor-tools.rs
@ -1,5 +1,5 @@
-use candle_core::quantized::{gguf_file, GgmlDType, QTensor};
-use candle_core::{Device, Result};
+use candle_core::quantized::{gguf_file, k_quants, QTensor};
+use candle_core::{Device, Result, Tensor};
 use clap::{Parser, Subcommand, ValueEnum};
 use rayon::prelude::*;

@ -11,7 +11,12 @@ enum QuantizationMode {
 }

 impl QuantizationMode {
-    fn quantize(&self, name: &str, tensor: QTensor, dtype: GgmlDType) -> Result<QTensor> {
+    fn quantize(
+        &self,
+        name: &str,
+        tensor: QTensor,
+        default: fn(&Tensor) -> Result<QTensor>,
+    ) -> Result<QTensor> {
        match self {
            Self::Llama => {
                // Same behavior as the llama.cpp quantization.
@ -19,9 +24,9 @@ impl QuantizationMode {
                if should_quantize {
                    let tensor = tensor.dequantize(&Device::Cpu)?;
                    if name == "output.weight" {
-                        QTensor::quantize(&tensor, GgmlDType::Q6K)
+                        QTensor::quantize::<k_quants::BlockQ6K>(&tensor)
                    } else {
-                        QTensor::quantize(&tensor, dtype)
+                        default(&tensor)
                    }
                } else {
                    Ok(tensor)
@ -55,27 +60,6 @@ enum Quantization {
    F32,
 }

-impl Quantization {
-    fn dtype(&self) -> GgmlDType {
-        match self {
-            Quantization::Q4_0 => GgmlDType::Q4_0,
-            Quantization::Q4_1 => GgmlDType::Q4_1,
-            Quantization::Q5_0 => GgmlDType::Q5_0,
-            Quantization::Q5_1 => GgmlDType::Q5_1,
-            Quantization::Q8_0 => GgmlDType::Q8_0,
-            Quantization::Q8_1 => GgmlDType::Q8_1,
-            Quantization::Q2k => GgmlDType::Q2K,
-            Quantization::Q3k => GgmlDType::Q3K,
-            Quantization::Q4k => GgmlDType::Q4K,
-            Quantization::Q5k => GgmlDType::Q5K,
-            Quantization::Q6k => GgmlDType::Q6K,
-            Quantization::Q8k => GgmlDType::Q8K,
-            Quantization::F16 => GgmlDType::F16,
-            Quantization::F32 => GgmlDType::F32,
-        }
-    }
-}
-
 #[derive(ValueEnum, Debug, Clone)]
 enum Format {
    Safetensors,
@ -118,7 +102,7 @@ enum Command {
    },

    Quantize {
-        /// The input file(s), in safetensors format.
+        /// The input file, in gguf format.
        in_file: Vec<std::path::PathBuf>,

        /// The output file, in gguf format.
@ -133,15 +117,6 @@ enum Command {
        #[arg(long, value_enum, default_value_t = QuantizationMode::Llama)]
        mode: QuantizationMode,
    },
-
-    Dequantize {
-        /// The input file, in gguf format.
-        in_file: std::path::PathBuf,
-
-        /// The output file, in safetensors format.
-        #[arg(long)]
-        out_file: std::path::PathBuf,
-    },
 }

 #[derive(Parser, Debug, Clone)]
@ -150,12 +125,7 @@ struct Args {
    command: Command,
 }

-fn run_ls(
-    file: &std::path::PathBuf,
-    format: Option<Format>,
-    verbose: bool,
-    device: &Device,
-) -> Result<()> {
+fn run_ls(file: &std::path::PathBuf, format: Option<Format>, verbose: bool) -> Result<()> {
    let format = match format {
        Some(format) => format,
        None => match Format::infer(file) {
@ -196,7 +166,7 @@ fn run_ls(
            }
        }
        Format::Pth => {
-            let mut tensors = candle_core::pickle::read_pth_tensor_info(file, verbose, None)?;
+            let mut tensors = candle_core::pickle::read_pth_tensor_info(file, verbose)?;
            tensors.sort_by(|a, b| a.name.cmp(&b.name));
            for tensor_info in tensors.iter() {
                println!(
@ -221,7 +191,7 @@ fn run_ls(
        }
        Format::Ggml => {
            let mut file = std::fs::File::open(file)?;
-            let content = candle_core::quantized::ggml_file::Content::read(&mut file, device)?;
+            let content = candle_core::quantized::ggml_file::Content::read(&mut file)?;
            let mut tensors = content.tensors.into_iter().collect::<Vec<_>>();
            tensors.sort_by(|a, b| a.0.cmp(&b.0));
            for (name, qtensor) in tensors.iter() {
@ -262,8 +232,37 @@ fn run_quantize_safetensors(
    }
    println!("tensors: {}", tensors.len());

-    let dtype = q.dtype();
-    let block_size = dtype.block_size();
+    let quantize_fn = match q {
+        Quantization::Q4_0 => QTensor::quantize::<k_quants::BlockQ4_0>,
+        Quantization::Q4_1 => QTensor::quantize::<k_quants::BlockQ4_1>,
+        Quantization::Q5_0 => QTensor::quantize::<k_quants::BlockQ5_0>,
+        Quantization::Q5_1 => QTensor::quantize::<k_quants::BlockQ5_1>,
+        Quantization::Q8_0 => QTensor::quantize::<k_quants::BlockQ8_0>,
+        Quantization::Q8_1 => QTensor::quantize::<k_quants::BlockQ8_1>,
+        Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>,
+        Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>,
+        Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>,
+        Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>,
+        Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>,
+        Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>,
+        Quantization::F16 => QTensor::quantize::<half::f16>,
+        Quantization::F32 => QTensor::quantize::<f32>,
+    };
+    let block_size = match q {
+        Quantization::Q4_0 => k_quants::QK4_0,
+        Quantization::Q4_1 => k_quants::QK4_1,
+        Quantization::Q5_0 => k_quants::QK5_0,
+        Quantization::Q5_1 => k_quants::QK5_1,
+        Quantization::Q8_0 => k_quants::QK8_0,
+        Quantization::Q8_1 => k_quants::QK8_1,
+        Quantization::Q2k
+        | Quantization::Q3k
+        | Quantization::Q4k
+        | Quantization::Q5k
+        | Quantization::Q6k
+        | Quantization::Q8k => k_quants::QK_K,
+        Quantization::F16 | Quantization::F32 => 1,
+    };

    let qtensors = tensors
        .into_par_iter()
@ -271,9 +270,9 @@ fn run_quantize_safetensors(
            let should_quantize = tensor.rank() == 2 && tensor.dim(1)? % block_size == 0;
            println!("  quantizing {name} {tensor:?} {should_quantize}");
            let tensor = if should_quantize {
-                QTensor::quantize(&tensor, dtype)?
+                quantize_fn(&tensor)?
            } else {
-                QTensor::quantize(&tensor, GgmlDType::F32)?
+                QTensor::quantize::<f32>(&tensor)?
            };
            Ok((name, tensor))
        })
@ -286,29 +285,11 @@ fn run_quantize_safetensors(
    Ok(())
 }

-fn run_dequantize(
-    in_file: std::path::PathBuf,
-    out_file: std::path::PathBuf,
-    device: &Device,
-) -> Result<()> {
-    let mut in_file = std::fs::File::open(in_file)?;
-    let content = gguf_file::Content::read(&mut in_file)?;
-    let mut tensors = std::collections::HashMap::new();
-    for (tensor_name, _) in content.tensor_infos.iter() {
-        let tensor = content.tensor(&mut in_file, tensor_name, device)?;
-        let tensor = tensor.dequantize(device)?;
-        tensors.insert(tensor_name.to_string(), tensor);
-    }
-    candle_core::safetensors::save(&tensors, out_file)?;
-    Ok(())
-}
-
 fn run_quantize(
    in_files: &[std::path::PathBuf],
    out_file: std::path::PathBuf,
    q: Quantization,
    qmode: QuantizationMode,
-    device: &Device,
 ) -> Result<()> {
    if in_files.is_empty() {
        candle_core::bail!("no specified input files")
@ -334,15 +315,31 @@ fn run_quantize(
    let content = gguf_file::Content::read(&mut in_)?;
    println!("tensors: {}", content.tensor_infos.len());

-    let dtype = q.dtype();
+    let quantize_fn = match q {
+        Quantization::Q4_0 => QTensor::quantize::<k_quants::BlockQ4_0>,
+        Quantization::Q4_1 => QTensor::quantize::<k_quants::BlockQ4_1>,
+        Quantization::Q5_0 => QTensor::quantize::<k_quants::BlockQ5_0>,
+        Quantization::Q5_1 => QTensor::quantize::<k_quants::BlockQ5_1>,
+        Quantization::Q8_0 => QTensor::quantize::<k_quants::BlockQ8_0>,
+        Quantization::Q8_1 => QTensor::quantize::<k_quants::BlockQ8_1>,
+        Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>,
+        Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>,
+        Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>,
+        Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>,
+        Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>,
+        Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>,
+        Quantization::F16 => QTensor::quantize::<half::f16>,
+        Quantization::F32 => QTensor::quantize::<f32>,
+    };
+
    let qtensors = content
        .tensor_infos
        .par_iter()
        .map(|(name, _)| {
            println!("  quantizing {name}");
            let mut in_file = std::fs::File::open(&in_files[0])?;
-            let tensor = content.tensor(&mut in_file, name, device)?;
-            let tensor = qmode.quantize(name, tensor, dtype)?;
+            let tensor = content.tensor(&mut in_file, name)?;
+            let tensor = qmode.quantize(name, tensor, quantize_fn)?;
            Ok((name, tensor))
        })
        .collect::<Result<Vec<_>>>()?;
@ -362,7 +359,6 @@ fn run_quantize(

 fn main() -> anyhow::Result<()> {
    let args = Args::parse();
-    let device = Device::Cpu;
    match args.command {
        Command::Ls {
            files,
@ -374,7 +370,7 @@ fn main() -> anyhow::Result<()> {
                if multiple_files {
                    println!("--- {file:?} ---");
                }
-                run_ls(file, format.clone(), verbose, &device)?
+                run_ls(file, format.clone(), verbose)?
            }
        }
        Command::Quantize {
@ -382,8 +378,7 @@ fn main() -> anyhow::Result<()> {
            out_file,
            quantization,
            mode,
-        } => run_quantize(&in_file, out_file, quantization, mode, &device)?,
-        Command::Dequantize { in_file, out_file } => run_dequantize(in_file, out_file, &device)?,
+        } => run_quantize(&in_file, out_file, quantization, mode)?,
    }
    Ok(())
 }
--- a/candle-core/src/accelerate.rs
+++ b/candle-core/src/accelerate.rs
@ -380,16 +380,6 @@ pub fn vd_tanh_inplace(y: &mut [f64]) {
    unsafe { ffi::vvtanh(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
 }

-#[inline]
-pub fn vs_exp_inplace(y: &mut [f32]) {
-    unsafe { ffi::vvexpf(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
-#[inline]
-pub fn vd_exp_inplace(y: &mut [f64]) {
-    unsafe { ffi::vvexp(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
 #[inline]
 pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
@ -412,28 +402,6 @@ pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
    }
 }

-#[inline]
-pub fn vs_silu(vs: &[f32], ys: &mut [f32]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vs_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
-#[inline]
-pub fn vd_silu(vs: &[f64], ys: &mut [f64]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vd_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
 macro_rules! binary_op {
    ($fn_name:ident, $ty:ty, $accelerate_name:ident) => {
        #[inline]
--- a/candle-core/src/backend.rs
+++ b/candle-core/src/backend.rs
@ -98,19 +98,6 @@ pub trait BackendStorage: Sized {
    ) -> Result<Self>;

    fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()>;
-
-    #[allow(clippy::too_many_arguments)]
-    // Similar to cudaMemcpy2D, though values are in elements and not in bytes.
-    fn copy2d(
-        &self,
-        _: &mut Self,
-        _d1: usize,
-        _d2: usize,
-        _src_stride1: usize,
-        _dst_stride1: usize,
-        _src_offset: usize,
-        _dst_offset: usize,
-    ) -> Result<()>;
 }

 pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
@ -127,16 +114,8 @@ pub trait BackendDevice: Sized + std::fmt::Debug + Clone {

    fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;

-    /// # Safety
-    /// This function is unsafe as it doesn't initialize the underlying data store.
-    /// The caller should ensure that the data is properly initialized as early as possible
-    /// after this call.
-    unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;
-
    fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage>;

-    fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self::Storage>;
-
    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;

    fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;
--- a/candle-core/src/backprop.rs
+++ b/candle-core/src/backprop.rs
@ -1,4 +1,3 @@
-/// Methods for backpropagation of gradients.
 use crate::op::{BinaryOp, Op, ReduceOp, UnaryOp};
 use crate::{Error, Result, Tensor, TensorId};
 use std::collections::HashMap;
@ -114,8 +113,8 @@ impl Tensor {
                    | Op::Unary(_node, UnaryOp::Floor)
                    | Op::Unary(_node, UnaryOp::Round) => nodes,
                    Op::Reshape(node)
-                    | Op::UpsampleNearest1D { arg: node, .. }
-                    | Op::UpsampleNearest2D { arg: node, .. }
+                    | Op::UpsampleNearest1D(node)
+                    | Op::UpsampleNearest2D(node)
                    | Op::AvgPool2D { arg: node, .. }
                    | Op::MaxPool2D { arg: node, .. }
                    | Op::Copy(node)
@ -176,7 +175,7 @@ impl Tensor {
            // the backprop graph of the backprop itself. This would be an issue for second order
            // derivatives but these are out of scope at the moment.
            let do_not_detach = CANDLE_GRAD_DO_NOT_DETACH.with(|b| *b);
-            let grad = if do_not_detach { grad } else { grad.detach() };
+            let grad = if do_not_detach { grad } else { grad.detach()? };
            if let Some(op) = node.op() {
                match op {
                    Op::Binary(lhs, rhs, BinaryOp::Add) => {
@ -251,7 +250,6 @@ impl Tensor {
                            out_padding,
                            *stride,
                            *dilation,
-                            /* groups */ 1,
                        )?;
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&grad_arg)?;
@ -311,32 +309,9 @@ impl Tensor {
                    Op::ConvTranspose1D { .. } => Err(Error::BackwardNotSupported {
                        op: "conv-transpose1d",
                    })?,
-                    Op::ConvTranspose2D {
-                        arg,
-                        kernel,
-                        padding,
-                        stride,
-                        dilation,
-                        output_padding: _output_padding,
-                    } => {
-                        let grad_arg = grad.conv2d(kernel, *padding, *dilation, *stride, 1)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad_arg)?;
-
-                        let grad_kernel = grad
-                            .transpose(0, 1)?
-                            .conv2d(&arg.transpose(0, 1)?, *padding, *stride, *dilation, 1)?
-                            .transpose(0, 1)?;
-                        let sum_grad = grads.or_insert(kernel)?;
-                        let (_, _, k0, k1) = kernel.dims4()?;
-                        let (_, _, g_k0, g_k1) = grad_kernel.dims4()?;
-                        let grad_kernel = if g_k0 != k0 || g_k1 != k1 {
-                            grad_kernel.narrow(2, 0, k0)?.narrow(3, 0, k1)?
-                        } else {
-                            grad_kernel
-                        };
-                        *sum_grad = sum_grad.add(&grad_kernel)?;
-                    }
+                    Op::ConvTranspose2D { .. } => Err(Error::BackwardNotSupported {
+                        op: "conv-transpose2d",
+                    })?,
                    Op::AvgPool2D {
                        arg,
                        kernel_size,
@ -372,39 +347,12 @@ impl Tensor {
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&grad_arg)?;
                    }
-                    Op::UpsampleNearest1D { arg, target_size } => {
-                        let (_n, c, size) = arg.dims3()?;
-                        if target_size % size != 0 {
-                            crate::bail!("backward not supported for non integer upscaling factors")
-                        }
-                        let scale = target_size / size;
-
-                        let kernel = Tensor::ones((c, 1, scale), arg.dtype(), arg.device())?;
-                        let conv_sum = grad.conv1d(&kernel, 0, scale, 1, c)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = conv_sum;
-                    }
-                    Op::UpsampleNearest2D {
-                        arg,
-                        target_h,
-                        target_w,
-                    } => {
-                        let (_n, c, h, w) = arg.dims4()?;
-                        if target_h % h != 0 || target_w % w != 0 {
-                            crate::bail!("backward not supported for non integer upscaling factors")
-                        }
-                        let scale_h = target_h / h;
-                        let scale_w = target_w / w;
-
-                        if scale_h != scale_w {
-                            crate::bail!("backward not supported for non uniform upscaling factors")
-                        };
-                        let kernel =
-                            Tensor::ones((c, 1, scale_h, scale_w), arg.dtype(), arg.device())?;
-                        let conv_sum = grad.conv2d(&kernel, 0, scale_h, 1, c)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = conv_sum;
-                    }
+                    Op::UpsampleNearest1D { .. } => Err(Error::BackwardNotSupported {
+                        op: "upsample-nearest1d",
+                    })?,
+                    Op::UpsampleNearest2D { .. } => Err(Error::BackwardNotSupported {
+                        op: "upsample-nearest2d",
+                    })?,
                    Op::SliceScatter0(lhs, rhs, start_rhs) => {
                        let rhs_sum_grad = grads.or_insert(rhs)?;
                        let rhs_grad = grad.narrow(0, *start_rhs, rhs.dim(0)?)?;
@ -623,13 +571,6 @@ impl Tensor {
                        let relu_grad = arg.ge(&arg.zeros_like()?)?.to_dtype(arg.dtype())?;
                        *sum_grad = sum_grad.add(&(&grad * relu_grad)?)?
                    }
-                    Op::Unary(arg, UnaryOp::Silu) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        // d/dx silu = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
-                        let sigmoid_arg = (*node / arg)?;
-                        let silu_grad = (&sigmoid_arg * (1. + (arg * (1. - &sigmoid_arg)?)?)?)?;
-                        *sum_grad = sum_grad.add(&(&grad * silu_grad)?)?
-                    }
                    Op::Elu(arg, alpha) => {
                        // d/dx elu(x) = 1 for x > 0, alpha * e^x for x <= 0
                        let sum_grad = grads.or_insert(arg)?;
@ -714,38 +655,30 @@ impl Tensor {
    }
 }

-/// A store for gradients, associating a tensor id to the corresponding gradient tensor, used for back propagation.
 #[derive(Debug)]
 pub struct GradStore(HashMap<TensorId, Tensor>);

 impl GradStore {
-    /// Create a new gradient store
    fn new() -> Self {
        GradStore(HashMap::new())
    }

-    /// Get the gradient tensor corresponding to the given tensor id
    pub fn get_id(&self, id: TensorId) -> Option<&Tensor> {
        self.0.get(&id)
    }

-    /// Get the gradient tensor associated with the given tensor
    pub fn get(&self, tensor: &Tensor) -> Option<&Tensor> {
        self.0.get(&tensor.id())
    }

-    /// Remove the gradient tensor associated with the given tensor, returning it if it exists
    pub fn remove(&mut self, tensor: &Tensor) -> Option<Tensor> {
        self.0.remove(&tensor.id())
    }

-    /// Insert a gradient tensor associated with the given tensor, returning the previous gradient tensor if it existed
    pub fn insert(&mut self, tensor: &Tensor, grad: Tensor) -> Option<Tensor> {
        self.0.insert(tensor.id(), grad)
    }

-    /// Get the gradient tensor associated with the given tensor, or, if it does not exist,
-    /// insert a tensor of zeroes, with the same shape and type as the given tensors and return it
    fn or_insert(&mut self, tensor: &Tensor) -> Result<&mut Tensor> {
        use std::collections::hash_map::Entry;
        let grad = match self.0.entry(tensor.id()) {
--- a/candle-core/src/conv.rs
+++ b/candle-core/src/conv.rs
@ -187,16 +187,36 @@ impl Tensor {
        }
    }

-    fn conv_transpose1d_single_group(
+    /// Applies a 1D transposed convolution over the input tensor.
+    pub fn conv_transpose1d(
        &self,
        kernel: &Self,
-        params: &ParamsConvTranspose1D,
+        padding: usize,
+        output_padding: usize,
+        stride: usize,
+        dilation: usize,
    ) -> Result<Self> {
+        let (b_size, c_in, l_in) = self.dims3()?;
+        let (c_in_k, c_out, k_size) = kernel.dims3()?;
+        if c_in != c_in_k {
+            crate::bail!("in_channel mismatch between input ({c_in}) and kernel ({c_in_k})")
+        }
+        let params = ParamsConvTranspose1D {
+            b_size,
+            l_in,
+            k_size,
+            c_out,
+            c_in,
+            padding,
+            output_padding,
+            stride,
+            dilation,
+        };
        let storage = self.storage().conv_transpose1d(
            self.layout(),
            &kernel.storage(),
            kernel.layout(),
-            params,
+            &params,
        )?;
        let op = BackpropOp::new2(self, kernel, |arg, kernel| Op::ConvTranspose1D {
            arg,
@ -210,49 +230,6 @@ impl Tensor {
        Ok(crate::tensor::from_storage(storage, out_dims, op, false))
    }

-    /// Applies a 1D transposed convolution over the input tensor.
-    pub fn conv_transpose1d(
-        &self,
-        kernel: &Self,
-        padding: usize,
-        output_padding: usize,
-        stride: usize,
-        dilation: usize,
-        groups: usize,
-    ) -> Result<Self> {
-        let (c_in_k, c_out, k_size) = kernel.dims3()?;
-        let (b_size, c_in, l_in) = self.dims3()?;
-        if c_in != c_in_k {
-            crate::bail!("in_channel mismatch between input ({c_in}) and kernel ({c_in_k})")
-        }
-        if c_in % groups != 0 {
-            crate::bail!("in_channel {c_in} is not divisible by the number of groups")
-        }
-        let params = ParamsConvTranspose1D {
-            b_size,
-            l_in,
-            k_size,
-            c_out,
-            c_in: c_in / groups,
-            padding,
-            output_padding,
-            stride,
-            dilation,
-        };
-        if groups == 1 {
-            self.conv_transpose1d_single_group(kernel, &params)
-        } else {
-            let blocks = self.chunk(groups, 1)?;
-            let kernel = kernel.chunk(groups, 0)?;
-            let blocks = blocks
-                .iter()
-                .zip(&kernel)
-                .map(|(block, kernel)| block.conv_transpose1d_single_group(kernel, &params))
-                .collect::<Result<Vec<_>>>()?;
-            Tensor::cat(&blocks, 1)
-        }
-    }
-
    fn conv2d_single_group(&self, kernel: &Self, params: &ParamsConv2D) -> Result<Self> {
        let storage =
            self.storage()
--- a/candle-core/src/cpu_backend.rs
+++ b/candle-core/src/cpu_backend.rs
@ -5,7 +5,6 @@ use half::{bf16, f16};
 use rayon::prelude::*;

 const USE_IM2COL_CONV1D: bool = true;
-const USE_IM2COL_CONV1D_TR: bool = true;
 const USE_IM2COL_CONV2D: bool = true;

 // TODO: Maybe we should not implement [Clone] here and instead have an explicit allocator +
@ -1023,26 +1022,6 @@ impl<'a, I: IntDType> Map2 for IndexAdd<'a, I> {
    }
 }

-#[allow(clippy::too_many_arguments)]
-fn copy2d_<T: Copy>(
-    src: &[T],
-    dst: &mut [T],
-    d1: usize,
-    d2: usize,
-    src_stride1: usize,
-    dst_stride1: usize,
-    src_offset: usize,
-    dst_offset: usize,
-) {
-    for i1 in 0..d1 {
-        let dst_idx = i1 * dst_stride1 + dst_offset;
-        let src_idx = i1 * src_stride1 + src_offset;
-        let dst = &mut dst[dst_idx..dst_idx + d2];
-        let src = &src[src_idx..src_idx + d2];
-        dst.copy_from_slice(src)
-    }
-}
-
 fn copy_strided_src_<T: Copy>(src: &[T], dst: &mut [T], dst_offset: usize, src_l: &Layout) {
    match src_l.strided_blocks() {
        crate::StridedBlocks::SingleBlock { start_offset, len } => {
@ -1277,34 +1256,6 @@ impl Map1 for Im2Col {
    }
 }

-struct Col2Im1D {
-    stride: usize,
-}
-
-impl Map1 for Col2Im1D {
-    fn f<T: WithDType>(&self, col: &[T], l: &Layout) -> Result<Vec<T>> {
-        let (b_size, l_in, c_out, k_size) = l.shape().dims4()?;
-        let stride = self.stride;
-        let l_out = (l_in - 1) * stride + k_size;
-        let mut im = vec![T::zero(); b_size * c_out * l_out];
-        let (dst_s0, dst_s1) = (c_out * l_out, l_out);
-        let (src_s0, src_s1, src_s2) = (c_out * k_size * l_in, c_out * k_size, k_size);
-        for l_in_i in 0..l_in {
-            for k_i in 0..k_size {
-                let l_out_i = l_in_i * stride + k_i;
-                for b_i in 0..b_size {
-                    for c_i in 0..c_out {
-                        let dst_idx = b_i * dst_s0 + c_i * dst_s1 + l_out_i;
-                        let src_idx = b_i * src_s0 + l_in_i * src_s1 + c_i * src_s2 + k_i;
-                        im[dst_idx] += col[src_idx]
-                    }
-                }
-            }
-        }
-        Ok(im)
-    }
-}
-
 struct ConvTranspose1D<'a>(&'a crate::conv::ParamsConvTranspose1D);

 impl<'a> Map2 for ConvTranspose1D<'a> {
@ -1312,7 +1263,6 @@ impl<'a> Map2 for ConvTranspose1D<'a> {
    fn f<T: WithDType>(&self, inp: &[T], inp_l: &Layout, k: &[T], k_l: &Layout) -> Result<Vec<T>> {
        let p = self.0;
        let inp = &inp[inp_l.start_offset()..];
-        let k = &k[k_l.start_offset()..];
        let (inp_s0, inp_s1, inp_s2) = crate::shape::dims3(inp_l.stride())?;
        let (k_s0, k_s1, k_s2) = crate::shape::dims3(k_l.stride())?;
        let l_out = p.l_out();
@ -2472,48 +2422,6 @@ impl BackendStorage for CpuStorage {
        }
    }

-    fn copy2d(
-        &self,
-        dst: &mut Self,
-        d1: usize,
-        d2: usize,
-        src_s: usize,
-        dst_s: usize,
-        src_o: usize,
-        dst_o: usize,
-    ) -> Result<()> {
-        match (self, dst) {
-            (Self::U8(src), Self::U8(dst)) => copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o),
-            (Self::U32(src), Self::U32(dst)) => {
-                copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
-            }
-            (Self::I64(src), Self::I64(dst)) => {
-                copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
-            }
-            (Self::BF16(src), Self::BF16(dst)) => {
-                copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
-            }
-            (Self::F16(src), Self::F16(dst)) => {
-                copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
-            }
-            (Self::F32(src), Self::F32(dst)) => {
-                copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
-            }
-            (Self::F64(src), Self::F64(dst)) => {
-                copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o)
-            }
-            (_, dst) => {
-                return Err(Error::DTypeMismatchBinaryOp {
-                    lhs: self.dtype(),
-                    rhs: dst.dtype(),
-                    op: "copy2d",
-                }
-                .bt());
-            }
-        }
-        Ok(())
-    }
-
    fn copy_strided_src(&self, dst: &mut Self, dst_offset: usize, src_l: &Layout) -> Result<()> {
        match (self, dst) {
            (Self::U8(src), Self::U8(dst)) => copy_strided_src_(src, dst, dst_offset, src_l),
@ -2582,10 +2490,7 @@ impl BackendStorage for CpuStorage {
            col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)?
        } else {
            // Make the kernel contiguous if not already the case.
-            let mut kernel_c = unsafe {
-                self.device()
-                    .alloc_uninit(kernel_l.shape(), kernel.dtype())?
-            };
+            let mut kernel_c = self.device().zeros_impl(kernel_l.shape(), kernel.dtype())?;
            kernel.copy_strided_src(&mut kernel_c, 0, kernel_l)?;
            let kernel_l = Layout::contiguous_with_offset((1, n, k), kernel_l.start_offset())
                .transpose(1, 2)?
@ -2593,7 +2498,7 @@ impl BackendStorage for CpuStorage {
            col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)?
        };
        let res_l = Layout::contiguous((b, l_out, params.c_out)).transpose(1, 2)?;
-        let mut res_t = unsafe { self.device().alloc_uninit(res_l.shape(), res.dtype())? };
+        let mut res_t = self.device().zeros_impl(res_l.shape(), res.dtype())?;
        res.copy_strided_src(&mut res_t, 0, &res_l)?;
        Ok(res_t)
    }
@ -2605,52 +2510,7 @@ impl BackendStorage for CpuStorage {
        kernel_l: &Layout,
        params: &crate::conv::ParamsConvTranspose1D,
    ) -> Result<Self> {
-        let can_use_col2im = kernel_l.is_contiguous()
-            && params.dilation == 1
-            && params.padding == 0
-            && params.output_padding == 0;
-        if USE_IM2COL_CONV1D_TR && can_use_col2im {
-            let (b_size, c_in, l_in) = l.shape().dims3()?;
-            let (c_in2, c_out, k_size) = kernel_l.shape().dims3()?;
-            if !kernel_l.is_contiguous() {
-                crate::bail!(
-                    "convtr1d: the second argument (kernel) has to be contiguous {kernel_l:?}"
-                )
-            }
-            if c_in != c_in2 {
-                crate::bail!(
-                    "convtr1d: shape mismatch on c_in {:?} {:?}",
-                    l.shape(),
-                    kernel_l.shape()
-                )
-            }
-            let col = {
-                // This merges the last two dimensions of the kernel together.
-                let kernel_l_mm = Layout::new(
-                    (b_size, c_in, k_size * c_out).into(),
-                    vec![0, k_size * c_out, 1],
-                    kernel_l.start_offset(),
-                );
-                self.matmul(
-                    kernel,
-                    (
-                        b_size,
-                        /* m */ l_in,
-                        /* n */ c_out * k_size,
-                        /* k */ c_in,
-                    ),
-                    &l.transpose(1, 2)?,
-                    &kernel_l_mm,
-                )?
-            };
-            let col_l = Layout::contiguous((b_size, l_in, c_out, k_size));
-            Col2Im1D {
-                stride: params.stride,
-            }
-            .map(&col, &col_l)
-        } else {
-            ConvTranspose1D(params).map(self, l, kernel, kernel_l)
-        }
+        ConvTranspose1D(params).map(self, l, kernel, kernel_l)
    }

    fn conv2d(
@ -2684,10 +2544,7 @@ impl BackendStorage for CpuStorage {
            col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)?
        } else {
            // Make the kernel contiguous if not already the case.
-            let mut kernel_c = unsafe {
-                self.device()
-                    .alloc_uninit(kernel_l.shape(), kernel.dtype())?
-            };
+            let mut kernel_c = self.device().zeros_impl(kernel_l.shape(), kernel.dtype())?;
            kernel.copy_strided_src(&mut kernel_c, 0, kernel_l)?;
            let kernel_l = Layout::contiguous_with_offset((1, n, k), kernel_l.start_offset())
                .transpose(1, 2)?
@ -2697,7 +2554,7 @@ impl BackendStorage for CpuStorage {
        let res_l = Layout::contiguous((b, h_out, w_out, params.c_out))
            .transpose(1, 2)?
            .transpose(1, 3)?;
-        let mut res_t = unsafe { self.device().alloc_uninit(res_l.shape(), res.dtype())? };
+        let mut res_t = self.device().zeros_impl(res_l.shape(), res.dtype())?;
        res.copy_strided_src(&mut res_t, 0, &res_l)?;
        Ok(res_t)
    }
@ -2717,7 +2574,7 @@ impl BackendStorage for CpuStorage {
            Self::U8(ids) => IndexSelect { ids, ids_l, dim }.map(self, l),
            Self::U32(ids) => IndexSelect { ids, ids_l, dim }.map(self, l),
            Self::I64(ids) => IndexSelect { ids, ids_l, dim }.map(self, l),
-            _ => Err(Error::UnsupportedDTypeForOp(self.dtype(), "index-select").bt()),
+            _ => Err(Error::UnsupportedDTypeForOp(self.dtype(), "index-select")),
        }
    }

@ -2726,7 +2583,7 @@ impl BackendStorage for CpuStorage {
            Self::U8(ids) => Gather { ids, ids_l, dim }.map(self, l),
            Self::U32(ids) => Gather { ids, ids_l, dim }.map(self, l),
            Self::I64(ids) => Gather { ids, ids_l, dim }.map(self, l),
-            _ => Err(Error::UnsupportedDTypeForOp(self.dtype(), "gather").bt()),
+            _ => Err(Error::UnsupportedDTypeForOp(self.dtype(), "gather")),
        }
    }

@ -2743,7 +2600,7 @@ impl BackendStorage for CpuStorage {
            Self::U8(ids) => ScatterAdd { ids, ids_l, dim }.map(self, l, src, src_l),
            Self::U32(ids) => ScatterAdd { ids, ids_l, dim }.map(self, l, src, src_l),
            Self::I64(ids) => ScatterAdd { ids, ids_l, dim }.map(self, l, src, src_l),
-            _ => Err(Error::UnsupportedDTypeForOp(self.dtype(), "scatter-add").bt()),
+            _ => Err(Error::UnsupportedDTypeForOp(self.dtype(), "scatter-add")),
        }
    }

@ -2820,10 +2677,6 @@ impl BackendDevice for CpuDevice {
        Ok(s.clone())
    }

-    fn storage_from_cpu_storage_owned(&self, s: CpuStorage) -> Result<Self::Storage> {
-        Ok(s)
-    }
-
    fn new(_: usize) -> Result<Self> {
        Ok(Self)
    }
@ -2925,53 +2778,6 @@ impl BackendDevice for CpuDevice {
        }
    }

-    #[allow(clippy::uninit_vec)]
-    unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result<CpuStorage> {
-        let elem_count = shape.elem_count();
-        // The code below is highly unsafe but hopefully not directly unsound as we only consider
-        // types that are Copy, not Drop, and for which all bit patterns are proper values.
-        // It's still pretty risky, see the following for more details:
-        // https://github.com/rust-lang/rust-clippy/issues/4483
-        let storage = match dtype {
-            DType::U8 => {
-                let mut v = Vec::with_capacity(elem_count);
-                v.set_len(elem_count);
-                CpuStorage::U8(v)
-            }
-            DType::U32 => {
-                let mut v = Vec::with_capacity(elem_count);
-                v.set_len(elem_count);
-                CpuStorage::U32(v)
-            }
-            DType::I64 => {
-                let mut v = Vec::with_capacity(elem_count);
-                v.set_len(elem_count);
-                CpuStorage::I64(v)
-            }
-            DType::BF16 => {
-                let mut v = Vec::with_capacity(elem_count);
-                v.set_len(elem_count);
-                CpuStorage::BF16(v)
-            }
-            DType::F16 => {
-                let mut v = Vec::with_capacity(elem_count);
-                v.set_len(elem_count);
-                CpuStorage::F16(v)
-            }
-            DType::F32 => {
-                let mut v = Vec::with_capacity(elem_count);
-                v.set_len(elem_count);
-                CpuStorage::F32(v)
-            }
-            DType::F64 => {
-                let mut v = Vec::with_capacity(elem_count);
-                v.set_len(elem_count);
-                CpuStorage::F64(v)
-            }
-        };
-        Ok(storage)
-    }
-
    fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result<CpuStorage> {
        let elem_count = shape.elem_count();
        let storage = match dtype {
--- a/candle-core/src/cuda_backend.rs
+++ b/candle-core/src/cuda_backend.rs
@ -11,31 +11,6 @@ use cudarc::driver::{
 use half::{bf16, f16};
 use std::sync::{Arc, Mutex};

-enum SlicePtrOrNull<T> {
-    Ptr(CudaSlice<T>),
-    Null,
-}
-
-unsafe impl<T: DeviceRepr> DeviceRepr for &SlicePtrOrNull<T> {
-    fn as_kernel_param(&self) -> *mut std::ffi::c_void {
-        match self {
-            SlicePtrOrNull::Ptr(slice) => slice.as_kernel_param(),
-            SlicePtrOrNull::Null => 0usize.as_kernel_param(),
-        }
-    }
-}
-
-impl SlicePtrOrNull<usize> {
-    fn params_from_layout(dev: &CudaDevice, l: &Layout) -> Result<Self> {
-        let ds = if l.is_contiguous() {
-            SlicePtrOrNull::Null
-        } else {
-            SlicePtrOrNull::Ptr(dev.htod_copy([l.dims(), l.stride()].concat()).w()?)
-        };
-        Ok(ds)
-    }
-}
-
 /// cudarc related errors
 #[derive(thiserror::Error, Debug)]
 pub enum CudaError {
@ -384,44 +359,6 @@ impl BackendDevice for CudaDevice {
        self.const_impl(1., shape, dtype)
    }

-    unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result<Self::Storage> {
-        let elem_count = shape.elem_count();
-        let slice = match dtype {
-            DType::U8 => {
-                let data = self.alloc::<u8>(elem_count).w()?;
-                CudaStorageSlice::U8(data)
-            }
-            DType::U32 => {
-                let data = self.alloc::<u32>(elem_count).w()?;
-                CudaStorageSlice::U32(data)
-            }
-            DType::I64 => {
-                let data = self.alloc::<i64>(elem_count).w()?;
-                CudaStorageSlice::I64(data)
-            }
-            DType::BF16 => {
-                let data = self.alloc::<bf16>(elem_count).w()?;
-                CudaStorageSlice::BF16(data)
-            }
-            DType::F16 => {
-                let data = self.alloc::<f16>(elem_count).w()?;
-                CudaStorageSlice::F16(data)
-            }
-            DType::F32 => {
-                let data = self.alloc::<f32>(elem_count).w()?;
-                CudaStorageSlice::F32(data)
-            }
-            DType::F64 => {
-                let data = self.alloc::<f64>(elem_count).w()?;
-                CudaStorageSlice::F64(data)
-            }
-        };
-        Ok(CudaStorage {
-            slice,
-            device: self.clone(),
-        })
-    }
-
    fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<CudaStorage> {
        let slice = match storage {
            CpuStorage::U8(storage) => {
@ -458,43 +395,6 @@ impl BackendDevice for CudaDevice {
            device: self.clone(),
        })
    }
-
-    fn storage_from_cpu_storage_owned(&self, storage: CpuStorage) -> Result<CudaStorage> {
-        let slice = match storage {
-            CpuStorage::U8(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::U8(data)
-            }
-            CpuStorage::U32(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::U32(data)
-            }
-            CpuStorage::I64(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::I64(data)
-            }
-            CpuStorage::BF16(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::BF16(data)
-            }
-            CpuStorage::F16(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::F16(data)
-            }
-            CpuStorage::F32(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::F32(data)
-            }
-            CpuStorage::F64(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::F64(data)
-            }
-        };
-        Ok(CudaStorage {
-            slice,
-            device: self.clone(),
-        })
-    }
 }

 #[derive(Debug)]
@ -664,7 +564,7 @@ impl Map1 for Affine {
        let dims = shape.dims();
        let el = shape.elem_count();
        let cfg = LaunchConfig::for_num_elems(el as u32);
-        let ds = SlicePtrOrNull::params_from_layout(dev, layout)?;
+        let ds = dev.htod_copy([dims, layout.stride()].concat()).w()?;
        let src = &src.slice(layout.start_offset()..);
        let func = dev.get_or_load_func(&kernel_name::<T>("affine"), kernels::AFFINE)?;
        // SAFETY: Set later by running the kernel.
@ -696,7 +596,7 @@ impl Map1 for Elu {
        let dims = shape.dims();
        let el = shape.elem_count();
        let cfg = LaunchConfig::for_num_elems(el as u32);
-        let ds = SlicePtrOrNull::params_from_layout(dev, layout)?;
+        let ds = dev.htod_copy([dims, layout.stride()].concat()).w()?;
        let src = &src.slice(layout.start_offset()..);
        let func = dev.get_or_load_func(&kernel_name::<T>("uelu"), kernels::UNARY)?;
        // SAFETY: Set later by running the kernel.
@ -819,7 +719,7 @@ impl Map1 for Powf {
        let dims = shape.dims();
        let el = shape.elem_count();
        let cfg = LaunchConfig::for_num_elems(el as u32);
-        let ds = SlicePtrOrNull::params_from_layout(dev, layout)?;
+        let ds = dev.htod_copy([dims, layout.stride()].concat()).w()?;
        let src = &src.slice(layout.start_offset()..);
        let func = dev.get_or_load_func(&kernel_name::<T>("upowf"), kernels::UNARY)?;
        // SAFETY: Set later by running the kernel.
@ -952,7 +852,7 @@ impl<U: UnaryOpT> Map1 for U {
        let dims = shape.dims();
        let el_count = shape.elem_count();
        let cfg = LaunchConfig::for_num_elems(el_count as u32);
-        let ds = SlicePtrOrNull::params_from_layout(dev, layout)?;
+        let ds = dev.htod_copy([dims, layout.stride()].concat()).w()?;
        let src = &src.slice(layout.start_offset()..);
        let func = dev.get_or_load_func(&kernel_name::<T>(U::KERNEL), kernels::UNARY)?;
        // SAFETY: Set later by running the kernel.
@ -1249,55 +1149,6 @@ impl<'a> Map2 for Conv2D<'a> {
    }
 }

-struct ConvTranspose1D<'a>(&'a crate::conv::ParamsConvTranspose1D);
-impl<'a> Map2 for ConvTranspose1D<'a> {
-    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
-        &self,
-        inp: &CudaSlice<T>,
-        inp_l: &Layout,
-        k: &CudaSlice<T>,
-        k_l: &Layout,
-        dev: &CudaDevice,
-    ) -> Result<CudaSlice<T>> {
-        // Kernel shape: (c_in_k, c_out, l_k)
-        // Input shape: (b_size, c_in, l_in)
-        let p = &self.0;
-        let l_out = p.l_out();
-        let dst_el = p.c_out * l_out * p.b_size;
-        let inp = &inp.slice(inp_l.start_offset()..);
-        let k = &k.slice(k_l.start_offset()..);
-        let shape = inp_l.shape();
-        let dims = shape.dims();
-        let el = shape.elem_count();
-
-        // SAFETY: Set later by running the kernel.
-        let out = unsafe { dev.alloc::<T>(dst_el) }.w()?;
-        let cfg = LaunchConfig::for_num_elems(dst_el as u32);
-        let func = dev.get_or_load_func(&kernel_name::<T>("conv_transpose1d"), kernels::CONV)?;
-        let ds = if dims.len() == 3 {
-            [dims, inp_l.stride(), k_l.dims(), k_l.stride()].concat()
-        } else {
-            crate::bail!("unexpected input shape for conv_transpose1d {dims:?}")
-        };
-        let ds = dev.htod_copy(ds).w()?;
-        let params = (
-            el,
-            l_out,
-            p.stride,
-            p.padding,
-            p.output_padding,
-            p.dilation,
-            &ds,
-            inp,
-            k,
-            &out,
-        );
-        // SAFETY: ffi.
-        unsafe { func.launch(cfg, params) }.w()?;
-        Ok(out)
-    }
-}
-
 struct ConvTranspose2D<'a>(&'a crate::conv::ParamsConvTranspose2D);
 impl<'a> Map2 for ConvTranspose2D<'a> {
    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
@ -1502,14 +1353,9 @@ impl<U: crate::op::BinaryOpT> Map2 for U {
        let dims = shape.dims();
        let elem_count = shape.elem_count();
        let cfg = LaunchConfig::for_num_elems(elem_count as u32);
-        let dims_and_strides = if lhs_l.is_contiguous() && rhs_l.is_contiguous() {
-            SlicePtrOrNull::Null
-        } else {
-            SlicePtrOrNull::Ptr(
-                dev.htod_copy([dims, lhs_l.stride(), rhs_l.stride()].concat())
-                    .w()?,
-            )
-        };
+        let dims_and_strides = dev
+            .htod_copy([dims, lhs_l.stride(), rhs_l.stride()].concat())
+            .w()?;
        let lhs = &lhs.slice(lhs_l.start_offset()..);
        let rhs = &rhs.slice(rhs_l.start_offset()..);
        let func = dev.get_or_load_func(&kernel_name::<T>(U::KERNEL), kernels::BINARY)?;
@ -1536,14 +1382,9 @@ impl Map2Any for Cmp {
        let dims = shape.dims();
        let elem_count = shape.elem_count();
        let cfg = LaunchConfig::for_num_elems(elem_count as u32);
-        let dims_and_strides = if lhs_l.is_contiguous() && rhs_l.is_contiguous() {
-            SlicePtrOrNull::Null
-        } else {
-            SlicePtrOrNull::Ptr(
-                dev.htod_copy([dims, lhs_l.stride(), rhs_l.stride()].concat())
-                    .w()?,
-            )
-        };
+        let dims_and_strides = dev
+            .htod_copy([dims, lhs_l.stride(), rhs_l.stride()].concat())
+            .w()?;
        let lhs = &lhs.slice(lhs_l.start_offset()..);
        let rhs = &rhs.slice(rhs_l.start_offset()..);
        let name = match self.0 {
@ -1750,7 +1591,7 @@ impl BackendStorage for CudaStorage {
        let el = shape.elem_count();
        let cfg = LaunchConfig::for_num_elems(el as u32);
        let dev = self.device();
-        let ds = SlicePtrOrNull::params_from_layout(dev, layout)?;
+        let ds = dev.htod_copy([dims, layout.stride()].concat()).w()?;
        let start_o = layout.start_offset();
        // This returns an i64 rather than a &i64, this is useful to get around some temporary
        // lifetime issue and is safe as long as self.slice does not go out of scope before inp
@ -1954,10 +1795,7 @@ impl BackendStorage for CudaStorage {
            col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)?
        } else {
            // Make the kernel contiguous if not already the case.
-            let mut kernel_c = unsafe {
-                self.device()
-                    .alloc_uninit(kernel_l.shape(), kernel.dtype())?
-            };
+            let mut kernel_c = self.device().zeros_impl(kernel_l.shape(), kernel.dtype())?;
            kernel.copy_strided_src(&mut kernel_c, 0, kernel_l)?;
            let kernel_l = Layout::contiguous_with_offset((1, n, k), kernel_l.start_offset())
                .transpose(1, 2)?
@ -1965,22 +1803,19 @@ impl BackendStorage for CudaStorage {
            col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)?
        };
        let res_l = Layout::contiguous((b, l_out, n)).transpose(1, 2)?;
-        let mut res_t = unsafe { self.device().alloc_uninit(res_l.shape(), res.dtype())? };
+        let mut res_t = self.device().zeros_impl(res_l.shape(), res.dtype())?;
        res.copy_strided_src(&mut res_t, 0, &res_l)?;
        Ok(res_t)
    }

    fn conv_transpose1d(
        &self,
-        l: &Layout,
-        kernel: &Self,
-        kernel_l: &Layout,
-        params: &crate::conv::ParamsConvTranspose1D,
+        _: &Layout,
+        _: &Self,
+        _: &Layout,
+        _: &crate::conv::ParamsConvTranspose1D,
    ) -> Result<Self> {
-        let device = self.device().clone();
-        let slice =
-            ConvTranspose1D(params).map(&self.slice, l, &kernel.slice, kernel_l, &device)?;
-        Ok(Self { slice, device })
+        todo!()
    }

    #[cfg(not(feature = "cudnn"))]
@ -2022,10 +1857,7 @@ impl BackendStorage for CudaStorage {
            col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)?
        } else {
            // Make the kernel contiguous if not already the case.
-            let mut kernel_c = unsafe {
-                self.device()
-                    .alloc_uninit(kernel_l.shape(), kernel.dtype())?
-            };
+            let mut kernel_c = self.device().zeros_impl(kernel_l.shape(), kernel.dtype())?;
            kernel.copy_strided_src(&mut kernel_c, 0, kernel_l)?;
            let kernel_l = Layout::contiguous_with_offset((1, n, k), kernel_l.start_offset())
                .transpose(1, 2)?
@ -2035,7 +1867,7 @@ impl BackendStorage for CudaStorage {
        let res_l = Layout::contiguous((b, h_out, w_out, n))
            .transpose(1, 2)?
            .transpose(1, 3)?;
-        let mut res_t = unsafe { self.device().alloc_uninit(res_l.shape(), res.dtype())? };
+        let mut res_t = self.device().zeros_impl(res_l.shape(), res.dtype())?;
        res.copy_strided_src(&mut res_t, 0, &res_l)?;
        Ok(res_t)
    }
@ -2172,7 +2004,7 @@ impl BackendStorage for CudaStorage {
        dim: usize,
    ) -> Result<Self> {
        let device = self.device().clone();
-        let mut acc = unsafe { device.alloc_uninit(l.shape(), self.dtype())? };
+        let mut acc = device.zeros_impl(l.shape(), self.dtype())?;
        self.copy_strided_src(&mut acc, 0, l)?;
        ScatterAdd(ids, ids_l, dim).map(&mut acc.slice, l.shape(), &src.slice, src_l, &device)?;
        Ok(acc)
@ -2187,7 +2019,7 @@ impl BackendStorage for CudaStorage {
        dim: usize,
    ) -> Result<Self> {
        let device = self.device().clone();
-        let mut acc = unsafe { device.alloc_uninit(l.shape(), self.dtype())? };
+        let mut acc = device.zeros_impl(l.shape(), self.dtype())?;
        self.copy_strided_src(&mut acc, 0, l)?;
        IndexAdd(ids, ids_l, dim).map(&mut acc.slice, l.shape(), &src.slice, src_l, &device)?;
        Ok(acc)
@ -2261,67 +2093,6 @@ impl BackendStorage for CudaStorage {
        Ok(Self { slice, device })
    }

-    fn copy2d(
-        &self,
-        dst: &mut Self,
-        d1: usize,
-        d2: usize,
-        src_s: usize,
-        dst_s: usize,
-        src_o: usize,
-        dst_o: usize,
-    ) -> Result<()> {
-        let dev = &self.device;
-        let d1 = d1 as u32;
-        let d2 = d2 as u32;
-        let dst_s = dst_s as u32;
-        let src_s = src_s as u32;
-        let (src, dst, kname) = match (&self.slice, &mut dst.slice) {
-            (S::U8(s), S::U8(d)) => (
-                *s.slice(src_o..).device_ptr(),
-                *d.slice(dst_o..).device_ptr(),
-                "copy2d_u8",
-            ),
-            (S::U32(s), S::U32(d)) => (
-                *s.slice(src_o..).device_ptr(),
-                *d.slice(dst_o..).device_ptr(),
-                "copy2d_u32",
-            ),
-            (S::I64(s), S::I64(d)) => (
-                *s.slice(src_o..).device_ptr(),
-                *d.slice(dst_o..).device_ptr(),
-                "copy2d_i64",
-            ),
-            (S::BF16(s), S::BF16(d)) => (
-                *s.slice(src_o..).device_ptr(),
-                *d.slice(dst_o..).device_ptr(),
-                "copy2d_bf16",
-            ),
-            (S::F16(s), S::F16(d)) => (
-                *s.slice(src_o..).device_ptr(),
-                *d.slice(dst_o..).device_ptr(),
-                "copy2d_f16",
-            ),
-            (S::F32(s), S::F32(d)) => (
-                *s.slice(src_o..).device_ptr(),
-                *d.slice(dst_o..).device_ptr(),
-                "copy2d_f32",
-            ),
-            (S::F64(s), S::F64(d)) => (
-                *s.slice(src_o..).device_ptr(),
-                *d.slice(dst_o..).device_ptr(),
-                "copy2d_f64",
-            ),
-            _ => Err(CudaError::InternalError("dtype mismatch in copy2d"))?,
-        };
-        let func = dev.get_or_load_func(kname, kernels::FILL)?;
-        let cfg = LaunchConfig::for_num_elems(d1 * d2);
-        let params = (src, dst, d1, d2, src_s, dst_s);
-        // SAFETY: ffi.
-        unsafe { func.launch(cfg, params) }.w()?;
-        Ok(())
-    }
-
    fn copy_strided_src(&self, dst: &mut Self, dst_offset: usize, src_l: &Layout) -> Result<()> {
        let src_shape = src_l.shape();
        let dims = src_shape.dims();
@ -2331,7 +2102,7 @@ impl BackendStorage for CudaStorage {
        }
        let cfg = LaunchConfig::for_num_elems(el_count as u32);
        let dev = &self.device;
-        let ds = SlicePtrOrNull::params_from_layout(dev, src_l)?;
+        let ds = dev.htod_copy([dims, src_l.stride()].concat()).w()?;
        match (&self.slice, &mut dst.slice) {
            (CudaStorageSlice::BF16(src), CudaStorageSlice::BF16(dst)) => {
                let (src, mut dst) = slice_src_and_dst(src, src_l, dst, dst_offset);
--- a/candle-core/src/custom_op.rs
+++ b/candle-core/src/custom_op.rs
@ -1,377 +0,0 @@
-use crate::op::{BackpropOp, Op};
-use crate::tensor::from_storage;
-use crate::{CpuStorage, CudaStorage, Layout, MetalStorage, Result, Shape, Tensor};
-use std::sync::Arc;
-
-/// Unary ops that can be defined in user-land.
-pub trait CustomOp1 {
-    // Box<dyn> does not support const yet, so use a function to get the name.
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result<(CudaStorage, Shape)> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _storage: &MetalStorage,
-        _layout: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// This function takes as argument the argument `arg` used in the forward pass, the result
-    /// produced by the forward operation `res` and the gradient of the result `grad_res`.
-    /// The function should return the gradient of the argument.
-    fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Result<Option<Tensor>> {
-        Err(crate::Error::BackwardNotSupported { op: self.name() })
-    }
-}
-
-pub trait CustomOp2 {
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(
-        &self,
-        s1: &CpuStorage,
-        l1: &Layout,
-        s2: &CpuStorage,
-        l2: &Layout,
-    ) -> Result<(CpuStorage, Shape)>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(
-        &self,
-        _: &CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-    ) -> Result<(CudaStorage, Shape)> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
-    fn bwd(
-        &self,
-        _arg1: &Tensor,
-        _arg2: &Tensor,
-        _res: &Tensor,
-        _grad_res: &Tensor,
-    ) -> Result<(Option<Tensor>, Option<Tensor>)> {
-        Err(crate::Error::BackwardNotSupported { op: self.name() })
-    }
-}
-
-pub trait CustomOp3 {
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(
-        &self,
-        s1: &CpuStorage,
-        l1: &Layout,
-        s2: &CpuStorage,
-        l2: &Layout,
-        s3: &CpuStorage,
-        l3: &Layout,
-    ) -> Result<(CpuStorage, Shape)>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(
-        &self,
-        _: &CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-    ) -> Result<(CudaStorage, Shape)> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
-    fn bwd(
-        &self,
-        _arg1: &Tensor,
-        _arg2: &Tensor,
-        _arg3: &Tensor,
-        _res: &Tensor,
-        _grad_res: &Tensor,
-    ) -> Result<(Option<Tensor>, Option<Tensor>, Option<Tensor>)> {
-        Err(crate::Error::BackwardNotSupported { op: self.name() })
-    }
-}
-
-impl Tensor {
-    /// Applies a unary custom op without backward support
-    pub fn apply_op1_no_bwd<C: CustomOp1>(&self, c: &C) -> Result<Self> {
-        let (storage, shape) = self.storage().apply_op1(self.layout(), c)?;
-        Ok(from_storage(storage, shape, BackpropOp::none(), false))
-    }
-
-    /// Applies a binary custom op without backward support
-    pub fn apply_op2_no_bwd<C: CustomOp2>(&self, rhs: &Self, c: &C) -> Result<Self> {
-        let (storage, shape) =
-            self.storage()
-                .apply_op2(self.layout(), &rhs.storage(), rhs.layout(), c)?;
-        Ok(from_storage(storage, shape, BackpropOp::none(), false))
-    }
-
-    /// Applies a ternary custom op without backward support
-    pub fn apply_op3_no_bwd<C: CustomOp3>(&self, t2: &Self, t3: &Self, c: &C) -> Result<Self> {
-        let (storage, shape) = self.storage().apply_op3(
-            self.layout(),
-            &t2.storage(),
-            t2.layout(),
-            &t3.storage(),
-            t3.layout(),
-            c,
-        )?;
-        Ok(from_storage(storage, shape, BackpropOp::none(), false))
-    }
-
-    /// Applies a unary custom op.
-    pub fn apply_op1_arc(&self, c: Arc<Box<dyn CustomOp1 + Send + Sync>>) -> Result<Self> {
-        let (storage, shape) = self
-            .storage()
-            .apply_op1(self.layout(), c.as_ref().as_ref())?;
-        let op = BackpropOp::new1(self, |s| Op::CustomOp1(s, c.clone()));
-        Ok(from_storage(storage, shape, op, false))
-    }
-
-    pub fn apply_op1<C: 'static + CustomOp1 + Send + Sync>(&self, c: C) -> Result<Self> {
-        self.apply_op1_arc(Arc::new(Box::new(c)))
-    }
-
-    /// Applies a binary custom op.
-    pub fn apply_op2_arc(
-        &self,
-        rhs: &Self,
-        c: Arc<Box<dyn CustomOp2 + Send + Sync>>,
-    ) -> Result<Self> {
-        let (storage, shape) = self.storage().apply_op2(
-            self.layout(),
-            &rhs.storage(),
-            rhs.layout(),
-            c.as_ref().as_ref(),
-        )?;
-        let op = BackpropOp::new2(self, rhs, |t1, t2| Op::CustomOp2(t1, t2, c.clone()));
-        Ok(from_storage(storage, shape, op, false))
-    }
-
-    pub fn apply_op2<C: 'static + CustomOp2 + Send + Sync>(&self, r: &Self, c: C) -> Result<Self> {
-        self.apply_op2_arc(r, Arc::new(Box::new(c)))
-    }
-
-    /// Applies a ternary custom op.
-    pub fn apply_op3_arc(
-        &self,
-        t2: &Self,
-        t3: &Self,
-        c: Arc<Box<dyn CustomOp3 + Send + Sync>>,
-    ) -> Result<Self> {
-        let (storage, shape) = self.storage().apply_op3(
-            self.layout(),
-            &t2.storage(),
-            t2.layout(),
-            &t3.storage(),
-            t3.layout(),
-            c.as_ref().as_ref(),
-        )?;
-        let op = BackpropOp::new3(self, t2, t3, |t1, t2, t3| {
-            Op::CustomOp3(t1, t2, t3, c.clone())
-        });
-        Ok(from_storage(storage, shape, op, false))
-    }
-
-    pub fn apply_op3<C: 'static + CustomOp3 + Send + Sync>(
-        &self,
-        t2: &Self,
-        t3: &Self,
-        c: C,
-    ) -> Result<Self> {
-        self.apply_op3_arc(t2, t3, Arc::new(Box::new(c)))
-    }
-}
-
-// In place ops.
-
-/// Unary ops that can be defined in user-land.
-/// These ops work in place and as such back-prop is unsupported.
-pub trait InplaceOp1 {
-    // Box<dyn> does not support const yet, so use a function to get the name.
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(&self, storage: &mut CpuStorage, layout: &Layout) -> Result<()>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(&self, _storage: &mut CudaStorage, _layout: &Layout) -> Result<()> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(&self, _storage: &mut MetalStorage, _layout: &Layout) -> Result<()> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-}
-
-pub trait InplaceOp2 {
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(&self, s1: &mut CpuStorage, l1: &Layout, s2: &CpuStorage, l2: &Layout)
-        -> Result<()>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(&self, _: &mut CudaStorage, _: &Layout, _: &CudaStorage, _: &Layout) -> Result<()> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &mut MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<()> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-}
-
-pub trait InplaceOp3 {
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(
-        &self,
-        s1: &mut CpuStorage,
-        l1: &Layout,
-        s2: &CpuStorage,
-        l2: &Layout,
-        s3: &CpuStorage,
-        l3: &Layout,
-    ) -> Result<()>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(
-        &self,
-        _: &mut CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-    ) -> Result<()> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &mut MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<()> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-}
-
-impl Tensor {
-    /// Applies a unary custom op in place.
-    pub fn inplace_op1<C: InplaceOp1>(&self, c: &C) -> Result<()> {
-        self.storage_mut().inplace_op1(self.layout(), c)
-    }
-
-    /// Applies a unary custom op in place (for the first tensor).
-    pub fn inplace_op2<C: InplaceOp2>(&self, rhs: &Self, c: &C) -> Result<()> {
-        self.storage_mut()
-            .inplace_op2(self.layout(), &rhs.storage(), rhs.layout(), c)
-    }
-
-    /// Applies a ternary custom op in place (for the first tensor).
-    pub fn inplace_op3<C: InplaceOp3>(&self, t2: &Self, t3: &Self, c: &C) -> Result<()> {
-        self.storage_mut().inplace_op3(
-            self.layout(),
-            &t2.storage(),
-            t2.layout(),
-            &t3.storage(),
-            t3.layout(),
-            c,
-        )
-    }
-}
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@ -201,9 +201,10 @@ impl Device {
                    Ok(Storage::Cuda(storage))
                }
            }
-            Device::Metal(device) => {
-                let storage = device.rand_uniform(shape, dtype, lo, up)?;
-                Ok(Storage::Metal(storage))
+            Device::Metal(_device) => {
+                // let storage = device.rand_uniform(shape, dtype, lo, up)?;
+                // Ok(Storage::Metal(storage))
+                crate::bail!("Metal rand_uniform not implemented")
            }
        }
    }
@ -289,34 +290,17 @@ impl Device {
        }
    }

-    pub(crate) unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result<Storage> {
-        match self {
-            Device::Cpu => {
-                let storage = CpuDevice.alloc_uninit(shape, dtype)?;
-                Ok(Storage::Cpu(storage))
-            }
-            Device::Cuda(device) => {
-                let storage = device.alloc_uninit(shape, dtype)?;
-                Ok(Storage::Cuda(storage))
-            }
-            Device::Metal(device) => {
-                let storage = device.alloc_uninit(shape, dtype)?;
-                Ok(Storage::Metal(storage))
-            }
-        }
-    }
-
    pub(crate) fn storage<A: NdArray>(&self, array: A) -> Result<Storage> {
        match self {
            Device::Cpu => Ok(Storage::Cpu(array.to_cpu_storage())),
            Device::Cuda(device) => {
                let storage = array.to_cpu_storage();
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
+                let storage = device.storage_from_cpu_storage(&storage)?;
                Ok(Storage::Cuda(storage))
            }
            Device::Metal(device) => {
                let storage = array.to_cpu_storage();
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
+                let storage = device.storage_from_cpu_storage(&storage)?;
                Ok(Storage::Metal(storage))
            }
        }
@ -327,12 +311,12 @@ impl Device {
            Device::Cpu => Ok(Storage::Cpu(S::to_cpu_storage_owned(data))),
            Device::Cuda(device) => {
                let storage = S::to_cpu_storage_owned(data);
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
+                let storage = device.storage_from_cpu_storage(&storage)?;
                Ok(Storage::Cuda(storage))
            }
            Device::Metal(device) => {
                let storage = S::to_cpu_storage_owned(data);
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
+                let storage = device.storage_from_cpu_storage(&storage)?;
                Ok(Storage::Metal(storage))
            }
        }
--- a/candle-core/src/display.rs
+++ b/candle-core/src/display.rs
@ -65,13 +65,12 @@ impl std::fmt::Debug for Tensor {
 }

 /// Options for Tensor pretty printing
-#[derive(Debug, Clone)]
 pub struct PrinterOptions {
-    pub precision: usize,
-    pub threshold: usize,
-    pub edge_items: usize,
-    pub line_width: usize,
-    pub sci_mode: Option<bool>,
+    precision: usize,
+    threshold: usize,
+    edge_items: usize,
+    line_width: usize,
+    sci_mode: Option<bool>,
 }

 static PRINT_OPTS: std::sync::Mutex<PrinterOptions> =
@ -90,10 +89,6 @@ impl PrinterOptions {
    }
 }

-pub fn print_options() -> &'static std::sync::Mutex<PrinterOptions> {
-    &PRINT_OPTS
-}
-
 pub fn set_print_options(options: PrinterOptions) {
    *PRINT_OPTS.lock().unwrap() = options
 }
@ -122,26 +117,6 @@ pub fn set_print_options_full() {
    }
 }

-pub fn set_line_width(line_width: usize) {
-    PRINT_OPTS.lock().unwrap().line_width = line_width
-}
-
-pub fn set_precision(precision: usize) {
-    PRINT_OPTS.lock().unwrap().precision = precision
-}
-
-pub fn set_edge_items(edge_items: usize) {
-    PRINT_OPTS.lock().unwrap().edge_items = edge_items
-}
-
-pub fn set_threshold(threshold: usize) {
-    PRINT_OPTS.lock().unwrap().threshold = threshold
-}
-
-pub fn set_sci_mode(sci_mode: Option<bool>) {
-    PRINT_OPTS.lock().unwrap().sci_mode = sci_mode
-}
-
 struct FmtSize {
    current_size: usize,
 }
--- a/candle-core/src/dtype.rs
+++ b/candle-core/src/dtype.rs
@ -23,15 +23,7 @@ pub enum DType {
 }

 #[derive(Debug, PartialEq, Eq)]
-pub struct DTypeParseError(String);
-
-impl std::fmt::Display for DTypeParseError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "cannot parse '{}' as a dtype", self.0)
-    }
-}
-
-impl std::error::Error for DTypeParseError {}
+pub struct DTypeParseError;

 impl std::str::FromStr for DType {
    type Err = DTypeParseError;
@ -44,7 +36,7 @@ impl std::str::FromStr for DType {
            "f16" => Ok(Self::F16),
            "f32" => Ok(Self::F32),
            "f64" => Ok(Self::F64),
-            _ => Err(DTypeParseError(s.to_string())),
+            _ => Err(DTypeParseError),
        }
    }
 }
--- a/candle-core/src/dummy_cuda_backend.rs
+++ b/candle-core/src/dummy_cuda_backend.rs
@ -154,19 +154,6 @@ impl crate::backend::BackendStorage for CudaStorage {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn copy2d(
-        &self,
-        _: &mut Self,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-    ) -> Result<()> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
        Err(Error::NotCompiledWithCudaSupport)
    }
@ -210,18 +197,10 @@ impl crate::backend::BackendDevice for CudaDevice {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithCudaSupport)
    }
--- a/candle-core/src/dummy_metal_backend.rs
+++ b/candle-core/src/dummy_metal_backend.rs
@ -166,19 +166,6 @@ impl crate::backend::BackendStorage for MetalStorage {
        Err(Error::NotCompiledWithMetalSupport)
    }

-    fn copy2d(
-        &self,
-        _: &mut Self,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-    ) -> Result<()> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }
@ -222,18 +209,10 @@ impl crate::backend::BackendDevice for MetalDevice {
        Err(Error::NotCompiledWithMetalSupport)
    }

-    unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
    fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithMetalSupport)
    }

-    fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithMetalSupport)
    }
--- a/candle-core/src/indexer.rs
+++ b/candle-core/src/indexer.rs
@ -64,7 +64,7 @@ impl Tensor {
 #[derive(Debug)]
 /// Generic structure used to index a slice of the tensor
 pub enum TensorIndexer {
-    /// This selects the elements for which an index has some specific value.
+    /// This selects the elemnts for which an index has some specific value.
    Select(usize),
    /// This is a regular slice, purely indexing a chunk of the tensor
    Narrow(Bound<usize>, Bound<usize>),
@ -104,31 +104,37 @@ impl From<&Tensor> for TensorIndexer {
    }
 }

-trait RB: RangeBounds<usize> {}
-impl RB for Range<usize> {}
-impl RB for RangeFrom<usize> {}
-impl RB for RangeFull {}
-impl RB for RangeInclusive<usize> {}
-impl RB for RangeTo<usize> {}
-impl RB for RangeToInclusive<usize> {}
+macro_rules! impl_from_range {
+    ($range_type:ty) => {
+        impl From<$range_type> for TensorIndexer {
+            fn from(range: $range_type) -> Self {
+                use std::ops::Bound::*;

-impl<T: RB> From<T> for TensorIndexer {
-    fn from(range: T) -> Self {
-        use std::ops::Bound::*;
-        let start = match range.start_bound() {
-            Included(idx) => Included(*idx),
-            Excluded(idx) => Excluded(*idx),
-            Unbounded => Unbounded,
-        };
-        let end = match range.end_bound() {
-            Included(idx) => Included(*idx),
-            Excluded(idx) => Excluded(*idx),
-            Unbounded => Unbounded,
-        };
-        TensorIndexer::Narrow(start, end)
-    }
+                let start = match range.start_bound() {
+                    Included(idx) => Included(*idx),
+                    Excluded(idx) => Excluded(*idx),
+                    Unbounded => Unbounded,
+                };
+
+                let end = match range.end_bound() {
+                    Included(idx) => Included(*idx),
+                    Excluded(idx) => Excluded(*idx),
+                    Unbounded => Unbounded,
+                };
+
+                TensorIndexer::Narrow(start, end)
+            }
+        }
+    };
 }

+impl_from_range!(Range<usize>);
+impl_from_range!(RangeFrom<usize>);
+impl_from_range!(RangeFull);
+impl_from_range!(RangeInclusive<usize>);
+impl_from_range!(RangeTo<usize>);
+impl_from_range!(RangeToInclusive<usize>);
+
 /// Trait used to implement multiple signatures for ease of use of the slicing
 /// of a tensor
 pub trait IndexOp<T> {
--- a/candle-core/src/layout.rs
+++ b/candle-core/src/layout.rs
@ -70,7 +70,7 @@ impl Layout {
        self.shape.is_fortran_contiguous(&self.stride)
    }

-    pub fn narrow(&self, dim: usize, start: usize, len: usize) -> Result<Self> {
+    pub(crate) fn narrow(&self, dim: usize, start: usize, len: usize) -> Result<Self> {
        let dims = self.shape().dims();
        if dim >= dims.len() {
            Err(Error::DimOutOfRange {
@ -99,7 +99,7 @@ impl Layout {
        })
    }

-    pub fn transpose(&self, dim1: usize, dim2: usize) -> Result<Self> {
+    pub(crate) fn transpose(&self, dim1: usize, dim2: usize) -> Result<Self> {
        let rank = self.shape.rank();
        if rank <= dim1 || rank <= dim2 {
            Err(Error::UnexpectedNumberOfDims {
@ -120,7 +120,7 @@ impl Layout {
        })
    }

-    pub fn permute(&self, idxs: &[usize]) -> Result<Self> {
+    pub(crate) fn permute(&self, idxs: &[usize]) -> Result<Self> {
        let is_permutation =
            idxs.len() == self.shape.rank() && (0..idxs.len()).all(|i| idxs.contains(&i));
        if !is_permutation {
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@ -45,7 +45,6 @@ pub mod cpu_backend;
 pub mod cuda_backend;
 #[cfg(feature = "cudnn")]
 pub mod cudnn;
-mod custom_op;
 mod device;
 pub mod display;
 mod dtype;
@ -68,18 +67,17 @@ pub mod shape;
 mod storage;
 mod strided_index;
 mod tensor;
-mod tensor_cat;
 pub mod test_utils;
 pub mod utils;
 mod variable;

 pub use cpu_backend::CpuStorage;
-pub use custom_op::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3};
-pub use device::{Device, DeviceLocation, NdArray};
+pub use device::{Device, DeviceLocation};
 pub use dtype::{DType, FloatDType, IntDType, WithDType};
 pub use error::{Error, Result};
 pub use indexer::IndexOp;
 pub use layout::Layout;
+pub use op::{CustomOp1, CustomOp2, CustomOp3};
 pub use shape::{Shape, D};
 pub use storage::Storage;
 pub use strided_index::{StridedBlocks, StridedIndex};
@ -125,18 +123,15 @@ pub trait Module {
    fn forward(&self, xs: &Tensor) -> Result<Tensor>;
 }

-impl<T: Fn(&Tensor) -> Result<Tensor>> Module for T {
+impl Module for quantized::QMatMul {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        self(xs)
+        self.forward(xs)
    }
 }

-impl<M: Module> Module for Option<&M> {
+impl<T: Fn(&Tensor) -> Result<Tensor>> Module for T {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        match self {
-            None => Ok(xs.clone()),
-            Some(m) => m.forward(xs),
-        }
+        self(xs)
    }
 }

--- a/candle-core/src/metal_backend.rs
+++ b/candle-core/src/metal_backend.rs
--- a/candle-core/src/mkl.rs
+++ b/candle-core/src/mkl.rs
@ -333,16 +333,6 @@ pub fn vd_tanh_inplace(y: &mut [f64]) {
    unsafe { ffi::vdTanh(y.len() as i32, y.as_ptr(), y.as_mut_ptr()) }
 }

-#[inline]
-pub fn vs_exp_inplace(y: &mut [f32]) {
-    unsafe { ffi::vsExp(y.len() as i32, y.as_ptr(), y.as_mut_ptr()) }
-}
-
-#[inline]
-pub fn vd_exp_inplace(y: &mut [f64]) {
-    unsafe { ffi::vdExp(y.len() as i32, y.as_ptr(), y.as_mut_ptr()) }
-}
-
 #[inline]
 pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
@ -365,28 +355,6 @@ pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
    }
 }

-#[inline]
-pub fn vs_silu(vs: &[f32], ys: &mut [f32]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vs_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
-#[inline]
-pub fn vd_silu(vs: &[f64], ys: &mut [f64]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vd_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
 macro_rules! binary_op {
    ($fn_name:ident, $ty:ty, $mkl_name:ident) => {
        #[inline]
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@ -1,5 +1,5 @@
 #![allow(clippy::redundant_closure_call)]
-use crate::Tensor;
+use crate::{CpuStorage, CudaStorage, Layout, MetalStorage, Result, Shape, Tensor};
 use half::{bf16, f16};
 use num_traits::float::Float;

@ -61,7 +61,6 @@ pub enum UnaryOp {
    GeluErf,
    Erf,
    Relu,
-    Silu,
    Tanh,
    Floor,
    Ceil,
@ -132,15 +131,8 @@ pub enum Op {
        stride: (usize, usize),
    },

-    UpsampleNearest1D {
-        arg: Tensor,
-        target_size: usize,
-    },
-    UpsampleNearest2D {
-        arg: Tensor,
-        target_h: usize,
-        target_w: usize,
-    },
+    UpsampleNearest1D(Tensor),
+    UpsampleNearest2D(Tensor),

    Cat(Vec<Tensor>, usize),

@ -161,23 +153,168 @@ pub enum Op {
    Permute(Tensor, Vec<usize>),
    Elu(Tensor, f64),
    Powf(Tensor, f64),
-    CustomOp1(
-        Tensor,
-        std::sync::Arc<Box<dyn crate::CustomOp1 + Send + Sync>>,
-    ),
+    CustomOp1(Tensor, std::sync::Arc<Box<dyn CustomOp1 + Send + Sync>>),
    CustomOp2(
        Tensor,
        Tensor,
-        std::sync::Arc<Box<dyn crate::CustomOp2 + Send + Sync>>,
+        std::sync::Arc<Box<dyn CustomOp2 + Send + Sync>>,
    ),
    CustomOp3(
        Tensor,
        Tensor,
        Tensor,
-        std::sync::Arc<Box<dyn crate::CustomOp3 + Send + Sync>>,
+        std::sync::Arc<Box<dyn CustomOp3 + Send + Sync>>,
    ),
 }

+/// Unary ops that can be defined in user-land.
+pub trait CustomOp1 {
+    // Box<dyn> does not support const yet, so use a function to get the name.
+    fn name(&self) -> &'static str;
+
+    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)>;
+
+    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result<(CudaStorage, Shape)> {
+        Err(crate::Error::Cuda(
+            format!("no cuda implementation for {}", self.name()).into(),
+        ))
+    }
+
+    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn metal_fwd(
+        &self,
+        _storage: &MetalStorage,
+        _layout: &Layout,
+    ) -> Result<(MetalStorage, Shape)> {
+        Err(crate::Error::Metal(
+            format!("no metal implementation for {}", self.name()).into(),
+        ))
+    }
+
+    /// This function takes as argument the argument `arg` used in the forward pass, the result
+    /// produced by the forward operation `res` and the gradient of the result `grad_res`.
+    /// The function should return the gradient of the argument.
+    fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Result<Option<Tensor>> {
+        Err(crate::Error::BackwardNotSupported { op: self.name() })
+    }
+}
+
+pub trait CustomOp2 {
+    fn name(&self) -> &'static str;
+
+    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cpu_fwd(
+        &self,
+        s1: &CpuStorage,
+        l1: &Layout,
+        s2: &CpuStorage,
+        l2: &Layout,
+    ) -> Result<(CpuStorage, Shape)>;
+
+    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cuda_fwd(
+        &self,
+        _: &CudaStorage,
+        _: &Layout,
+        _: &CudaStorage,
+        _: &Layout,
+    ) -> Result<(CudaStorage, Shape)> {
+        Err(crate::Error::Cuda(
+            format!("no cuda implementation for {}", self.name()).into(),
+        ))
+    }
+
+    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn metal_fwd(
+        &self,
+        _: &MetalStorage,
+        _: &Layout,
+        _: &MetalStorage,
+        _: &Layout,
+    ) -> Result<(MetalStorage, Shape)> {
+        Err(crate::Error::Metal(
+            format!("no metal implementation for {}", self.name()).into(),
+        ))
+    }
+
+    fn bwd(
+        &self,
+        _arg1: &Tensor,
+        _arg2: &Tensor,
+        _res: &Tensor,
+        _grad_res: &Tensor,
+    ) -> Result<(Option<Tensor>, Option<Tensor>)> {
+        Err(crate::Error::BackwardNotSupported { op: self.name() })
+    }
+}
+
+pub trait CustomOp3 {
+    fn name(&self) -> &'static str;
+
+    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cpu_fwd(
+        &self,
+        s1: &CpuStorage,
+        l1: &Layout,
+        s2: &CpuStorage,
+        l2: &Layout,
+        s3: &CpuStorage,
+        l3: &Layout,
+    ) -> Result<(CpuStorage, Shape)>;
+
+    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cuda_fwd(
+        &self,
+        _: &CudaStorage,
+        _: &Layout,
+        _: &CudaStorage,
+        _: &Layout,
+        _: &CudaStorage,
+        _: &Layout,
+    ) -> Result<(CudaStorage, Shape)> {
+        Err(crate::Error::Cuda(
+            format!("no cuda implementation for {}", self.name()).into(),
+        ))
+    }
+
+    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn metal_fwd(
+        &self,
+        _: &MetalStorage,
+        _: &Layout,
+        _: &MetalStorage,
+        _: &Layout,
+        _: &MetalStorage,
+        _: &Layout,
+    ) -> Result<(MetalStorage, Shape)> {
+        Err(crate::Error::Metal(
+            format!("no metal implementation for {}", self.name()).into(),
+        ))
+    }
+
+    fn bwd(
+        &self,
+        _arg1: &Tensor,
+        _arg2: &Tensor,
+        _arg3: &Tensor,
+        _res: &Tensor,
+        _grad_res: &Tensor,
+    ) -> Result<(Option<Tensor>, Option<Tensor>, Option<Tensor>)> {
+        Err(crate::Error::BackwardNotSupported { op: self.name() })
+    }
+}
+
 pub trait UnaryOpT {
    const NAME: &'static str;
    const KERNEL: &'static str;
@ -249,7 +386,6 @@ pub(crate) struct Gelu;
 pub(crate) struct GeluErf;
 pub(crate) struct Erf;
 pub(crate) struct Relu;
-pub(crate) struct Silu;
 pub(crate) struct Tanh;
 pub(crate) struct Floor;
 pub(crate) struct Ceil;
@ -584,77 +720,6 @@ impl UnaryOpT for Erf {
    }
 }

-/// Silu operation
-impl UnaryOpT for Silu {
-    const NAME: &'static str = "silu";
-    const V: Self = Silu;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v / (bf16::ONE + (-v).exp())
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v / (f16::ONE + (-v).exp())
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v / (1.0 + (-v).exp())
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v / (1.0 + (-v).exp())
-    }
-    #[inline(always)]
-    fn u8(_: u8) -> u8 {
-        0
-    }
-    #[inline(always)]
-    fn u32(_: u32) -> u32 {
-        0
-    }
-    #[inline(always)]
-    fn i64(_: i64) -> i64 {
-        0
-    }
-    const KERNEL: &'static str = "usilu";
-
-    #[cfg(feature = "mkl")]
-    const F32_VEC: bool = true;
-
-    #[cfg(feature = "mkl")]
-    #[inline(always)]
-    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
-        crate::mkl::vs_silu(xs, ys)
-    }
-
-    #[cfg(feature = "mkl")]
-    const F64_VEC: bool = true;
-
-    #[cfg(feature = "mkl")]
-    #[inline(always)]
-    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
-        crate::mkl::vd_silu(xs, ys)
-    }
-
-    #[cfg(feature = "accelerate")]
-    const F32_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
-        crate::accelerate::vs_silu(xs, ys)
-    }
-
-    #[cfg(feature = "accelerate")]
-    const F64_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
-        crate::accelerate::vd_silu(xs, ys)
-    }
-}
-
 impl UnaryOpT for Abs {
    const NAME: &'static str = "abs";
    const KERNEL: &'static str = "uabs";
--- a/candle-core/src/pickle.rs
+++ b/candle-core/src/pickle.rs
@ -42,7 +42,7 @@ pub enum OpCode {
    Stop = b'.',
    NewObj = 0x81,
    EmptyList = b']',
-    BinFloat = b'G',
+    BinFloat = b'g',
    Append = b'a',
    Appends = b'e',
 }
@ -217,13 +217,6 @@ impl Object {
                let args = args.remove(1);
                (callable, args)
            }
-            Object::Class {
-                module_name,
-                class_name,
-            } if module_name == "torch._utils" && class_name == "_rebuild_parameter" => {
-                let mut args = args.tuple()?;
-                args.remove(0).reduce()?
-            }
            _ => (callable, args),
        };
        match callable {
@ -234,11 +227,13 @@ impl Object {
            _ => return Ok(None),
        };
        let (layout, dtype, file_path, storage_size) = rebuild_args(args)?;
+        let mut path = dir_name.to_path_buf();
+        path.push(file_path);
        Ok(Some(TensorInfo {
            name,
            dtype,
            layout,
-            path: format!("{}/{}", dir_name.to_string_lossy(), file_path),
+            path: path.to_string_lossy().into_owned(),
            storage_size,
        }))
    }
@ -350,10 +345,8 @@ impl Stack {
                module_name,
                class_name,
            } => {
-                if module_name == "collections"
-                    && (class_name == "OrderedDict" || class_name == "defaultdict")
-                {
-                    // TODO: have a separate ordered dict and a separate default dict.
+                if module_name == "collections" && class_name == "OrderedDict" {
+                    // TODO: have a separate ordered dict.
                    Some(Object::Dict(vec![]))
                } else {
                    None
@ -462,10 +455,7 @@ impl Stack {
                self.push(Object::Int(arg))
            }
            OpCode::BinFloat => {
-                // Somehow floats are encoded using BigEndian whereas int types use LittleEndian.
-                // https://github.com/python/cpython/blob/0c80da4c14d904a367968955544dd6ae58c8101c/Lib/pickletools.py#L855
-                // https://github.com/pytorch/pytorch/blob/372d078f361e726bb4ac0884ac334b04c58179ef/torch/_weights_only_unpickler.py#L243
-                let arg = r.read_f64::<byteorder::BigEndian>()?;
+                let arg = r.read_f64::<LittleEndian>()?;
                self.push(Object::Float(arg))
            }
            OpCode::BinUnicode => {
@ -637,16 +627,9 @@ pub struct TensorInfo {
    pub storage_size: usize,
 }

-/// Read the tensor info from a .pth file.
-///
-/// # Arguments
-/// * `file` - The path to the .pth file.
-/// * `verbose` - Whether to print debug information.
-/// * `key` - Optional key to retrieve `state_dict` from the pth file.
 pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
    file: P,
    verbose: bool,
-    key: Option<&str>,
 ) -> Result<Vec<TensorInfo>> {
    let file = std::fs::File::open(file)?;
    let zip_reader = std::io::BufReader::new(file);
@ -668,9 +651,8 @@ pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
        stack.read_loop(&mut reader)?;
        let obj = stack.finalize()?;
        if VERBOSE || verbose {
-            println!("{obj:#?}");
+            println!("{obj:?}");
        }
-
        let obj = match obj {
            Object::Build { callable, args } => match *callable {
                Object::Reduce { callable, args: _ } => match *callable {
@ -684,24 +666,6 @@ pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
            },
            obj => obj,
        };
-
-        // If key is provided, then we need to extract the state_dict from the object.
-        let obj = if let Some(key) = key {
-            if let Object::Dict(key_values) = obj {
-                key_values
-                    .into_iter()
-                    .find(|(k, _)| *k == Object::Unicode(key.to_owned()))
-                    .map(|(_, v)| v)
-                    .ok_or_else(|| E::Msg(format!("key {key} not found")))?
-            } else {
-                obj
-            }
-        } else {
-            obj
-        };
-
-        // If the object is a dict, then we can extract the tensor info from it.
-        // NOTE: We are assuming that the `obj` is state_dict by this stage.
        if let Object::Dict(key_values) = obj {
            for (name, value) in key_values.into_iter() {
                match value.into_tensor_info(name, &dir_name) {
@ -724,8 +688,8 @@ pub struct PthTensors {
 }

 impl PthTensors {
-    pub fn new<P: AsRef<std::path::Path>>(path: P, key: Option<&str>) -> Result<Self> {
-        let tensor_infos = read_pth_tensor_info(path.as_ref(), false, key)?;
+    pub fn new<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
+        let tensor_infos = read_pth_tensor_info(path.as_ref(), false)?;
        let tensor_infos = tensor_infos
            .into_iter()
            .map(|ti| (ti.name.to_string(), ti))
@ -739,7 +703,6 @@ impl PthTensors {
    }

    pub fn get(&self, name: &str) -> Result<Option<Tensor>> {
-        use std::io::Read;
        let tensor_info = match self.tensor_infos.get(name) {
            None => return Ok(None),
            Some(tensor_info) => tensor_info,
@ -748,56 +711,27 @@ impl PthTensors {
        let zip_reader = std::io::BufReader::new(std::fs::File::open(&self.path)?);
        let mut zip = zip::ZipArchive::new(zip_reader)?;
        let mut reader = zip.by_name(&tensor_info.path)?;
-        let is_fortran_contiguous = tensor_info.layout.is_fortran_contiguous();
-        let rank = tensor_info.layout.shape().rank();

-        // Reading the data is a bit tricky as it can be strided, for now only support the basic
-        // case and when the tensor is fortran contiguous.
-        if !tensor_info.layout.is_contiguous() && !is_fortran_contiguous {
+        // Reading the data is a bit tricky as it can be strided, use an offset, etc.
+        // For now only support the basic case.
+        if tensor_info.layout.start_offset() != 0 || !tensor_info.layout.is_contiguous() {
            crate::bail!(
                "cannot retrieve non-contiguous tensors {:?}",
                tensor_info.layout
            )
        }
-        let start_offset = tensor_info.layout.start_offset();
-        if start_offset > 0 {
-            std::io::copy(
-                &mut reader.by_ref().take(start_offset as u64),
-                &mut std::io::sink(),
-            )?;
-        }
        let tensor = Tensor::from_reader(
            tensor_info.layout.shape().clone(),
            tensor_info.dtype,
            &mut reader,
        )?;
-
-        if rank > 1 && is_fortran_contiguous {
-            // Reverse the shape, e.g. Shape(2, 3, 4) -> Shape(4, 3, 2)
-            let shape_reversed: Vec<_> = tensor_info.layout.dims().iter().rev().cloned().collect();
-            let tensor = tensor.reshape(shape_reversed)?;
-
-            // Permute (transpose) the dimensions, e.g. Shape(4, 3, 2) -> Shape(2, 3, 4)
-            let dim_indeces_reversed: Vec<_> = (0..rank).rev().collect();
-            let tensor = tensor.permute(dim_indeces_reversed)?;
-            Ok(Some(tensor))
-        } else {
-            Ok(Some(tensor))
-        }
+        Ok(Some(tensor))
    }
 }

-/// Read all the tensors from a PyTorch pth file with a given key.
-///
-/// # Arguments
-/// * `path` - Path to the pth file.
-/// * `key` - Optional key to retrieve `state_dict` from the pth file. Sometimes the pth file
-///           contains multiple objects and the state_dict is the one we are interested in.
-pub fn read_all_with_key<P: AsRef<std::path::Path>>(
-    path: P,
-    key: Option<&str>,
-) -> Result<Vec<(String, Tensor)>> {
-    let pth = PthTensors::new(path, key)?;
+/// Read all the tensors from a PyTorch pth file.
+pub fn read_all<P: AsRef<std::path::Path>>(path: P) -> Result<Vec<(String, Tensor)>> {
+    let pth = PthTensors::new(path)?;
    let tensor_names = pth.tensor_infos.keys();
    let mut tensors = Vec::with_capacity(tensor_names.len());
    for name in tensor_names {
@ -807,11 +741,3 @@ pub fn read_all_with_key<P: AsRef<std::path::Path>>(
    }
    Ok(tensors)
 }
-
-/// Read all the tensors from a PyTorch pth file.
-///
-/// # Arguments
-/// * `path` - Path to the pth file.
-pub fn read_all<P: AsRef<std::path::Path>>(path: P) -> Result<Vec<(String, Tensor)>> {
-    read_all_with_key(path, None)
-}
--- a/candle-core/src/quantized/avx.rs
+++ b/candle-core/src/quantized/avx.rs
@ -353,7 +353,7 @@ pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Res
                q3 = q3.add(32);

                // Prepare low and high bits
-                // We hardcode the shifts here to avoid loading them into a separate register
+                // We hardcode the shifts here to avoid loading them into a seperate register
                let q3l_0 = _mm256_and_si256(q3bits, m3);
                let q3h_0 = if j == 0 {
                    _mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, 0)), 0)
@ -586,7 +586,7 @@ pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Res
                let q5bits = _mm256_loadu_si256(q5 as *const __m256i);
                q5 = q5.add(32);

-                //Similar to q3k we hardcode the shifts here to avoid loading them into a separate register
+                //Similar to q3k we hardcode the shifts here to avoid loading them into a seperate register
                let q5l_0 = _mm256_and_si256(q5bits, m4);
                let q5l_0_shift_input = _mm256_and_si256(hbits, hmask);
                let q5l_0_right_shift = match j {
--- a/candle-core/src/quantized/cuda.rs
+++ b/candle-core/src/quantized/cuda.rs
@ -1,343 +0,0 @@
-use super::{GgmlDType, QStorage};
-use crate::{backend::BackendDevice, cuda_backend::WrapErr};
-use crate::{CudaDevice, CudaStorage, Result};
-
-use cudarc::driver::{CudaSlice, DeviceSlice};
-
-pub struct QCudaStorage {
-    data: CudaSlice<u8>,
-    dtype: GgmlDType,
-    device: CudaDevice,
-}
-
-pub const WARP_SIZE: usize = 32;
-pub const MMQ_X_Q4_0_AMPERE: usize = 4;
-pub const MMQ_Y_Q4_0_AMPERE: usize = 32;
-pub const NWARPS_Q4_0_AMPERE: usize = 4;
-pub const GGML_CUDA_MMV_X: usize = 32;
-pub const GGML_CUDA_MMV_Y: usize = 1;
-pub const CUDA_DEQUANTIZE_BLOCK_SIZE: usize = 256;
-
-fn dequantize(
-    data: &CudaSlice<u8>,
-    dtype: GgmlDType,
-    elem_count: usize,
-    dev: &CudaDevice,
-) -> Result<CudaStorage> {
-    use cudarc::driver::LaunchAsync;
-
-    let nb = (elem_count + 255) / 256;
-    let (kernel_name, is_k, block_dim, num_blocks) = match dtype {
-        GgmlDType::Q4_0 => ("dequantize_block_q4_0", false, 32, nb),
-        GgmlDType::Q4_1 => ("dequantize_block_q4_1", false, 32, nb),
-        GgmlDType::Q5_0 => {
-            let nb = (elem_count + 2 * CUDA_DEQUANTIZE_BLOCK_SIZE - 1)
-                / (2 * CUDA_DEQUANTIZE_BLOCK_SIZE);
-            (
-                "dequantize_block_q5_0",
-                false,
-                CUDA_DEQUANTIZE_BLOCK_SIZE,
-                nb,
-            )
-        }
-        GgmlDType::Q5_1 => {
-            let nb = (elem_count + 2 * CUDA_DEQUANTIZE_BLOCK_SIZE - 1)
-                / (2 * CUDA_DEQUANTIZE_BLOCK_SIZE);
-            (
-                "dequantize_block_q5_1",
-                false,
-                CUDA_DEQUANTIZE_BLOCK_SIZE,
-                nb,
-            )
-        }
-        GgmlDType::Q8_0 => ("dequantize_block_q8_0", false, 32, nb),
-        GgmlDType::Q2K => ("dequantize_block_q2_K", true, 64, nb),
-        GgmlDType::Q3K => ("dequantize_block_q3_K", true, 64, nb),
-        GgmlDType::Q4K => ("dequantize_block_q4_K", true, 32, nb),
-        GgmlDType::Q5K => ("dequantize_block_q5_K", true, 64, nb),
-        GgmlDType::Q6K => ("dequantize_block_q6_K", true, 64, nb),
-        GgmlDType::Q8K => ("dequantize_block_q8_K", true, 32, nb),
-        _ => crate::bail!("unsupported dtype for dequantize {dtype:?}"),
-    };
-    let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?;
-    let dst = dev.alloc_zeros::<f32>(elem_count).w()?;
-    // See e.g.
-    // https://github.com/ggerganov/llama.cpp/blob/cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b/ggml-cuda.cu#L7270
-    let cfg = cudarc::driver::LaunchConfig {
-        grid_dim: (num_blocks as u32, 1, 1),
-        block_dim: (block_dim as u32, 1, 1),
-        shared_mem_bytes: 0,
-    };
-
-    if is_k {
-        let params = (data, &dst);
-        unsafe { func.launch(cfg, params) }.w()?;
-    } else {
-        let nb32 = match dtype {
-            GgmlDType::Q5_0 | GgmlDType::Q5_1 => elem_count,
-            _ => elem_count / 32,
-        };
-        let params = (data, &dst, nb32 as i32);
-        unsafe { func.launch(cfg, params) }.w()?;
-    }
-    Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
-}
-
-fn dequantize_mut_mal_vec(
-    data: &CudaSlice<u8>,
-    y: &cudarc::driver::CudaView<f32>,
-    dtype: GgmlDType,
-    ncols: usize,
-    nrows: usize,
-    dev: &CudaDevice,
-) -> Result<CudaStorage> {
-    use cudarc::driver::LaunchAsync;
-
-    let kernel_name = match dtype {
-        GgmlDType::Q4_0 => "dequantize_mul_mat_vec_q4_0_cuda",
-        GgmlDType::Q4_1 => "dequantize_mul_mat_vec_q4_1_cuda",
-        GgmlDType::Q5_0 => "dequantize_mul_mat_vec_q5_0_cuda",
-        GgmlDType::Q5_1 => "dequantize_mul_mat_vec_q5_1_cuda",
-        GgmlDType::Q8_0 => "dequantize_mul_mat_vec_q8_0_cuda",
-        GgmlDType::Q2K => "dequantize_mul_mat_vec_q2_k",
-        GgmlDType::Q3K => "dequantize_mul_mat_vec_q3_k",
-        GgmlDType::Q4K => "dequantize_mul_mat_vec_q4_k",
-        GgmlDType::Q5K => "dequantize_mul_mat_vec_q5_k",
-        GgmlDType::Q6K => "dequantize_mul_mat_vec_q6_k",
-        _ => crate::bail!("unsupported dtype for quantized matmul {dtype:?}"),
-    };
-    let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?;
-    let dst = dev.alloc_zeros::<f32>(nrows).w()?;
-    let block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    let cfg = cudarc::driver::LaunchConfig {
-        grid_dim: (block_num_y as u32, 1, 1),
-        block_dim: (WARP_SIZE as u32, GGML_CUDA_MMV_Y as u32, 1),
-        shared_mem_bytes: 0,
-    };
-
-    let params = (data, y, &dst, ncols as i32, nrows as i32);
-    unsafe { func.launch(cfg, params) }.w()?;
-    Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
-}
-
-impl QCudaStorage {
-    pub fn zeros(device: &CudaDevice, el_count: usize, dtype: GgmlDType) -> Result<Self> {
-        let size_in_bytes = el_count * dtype.type_size() / dtype.block_size();
-        let data = device.alloc_zeros::<u8>(size_in_bytes).w()?;
-        Ok(QCudaStorage {
-            data,
-            device: device.clone(),
-            dtype,
-        })
-    }
-
-    pub fn dtype(&self) -> GgmlDType {
-        self.dtype
-    }
-
-    pub fn device(&self) -> &CudaDevice {
-        &self.device
-    }
-
-    pub fn dequantize(&self, elem_count: usize) -> Result<CudaStorage> {
-        let fast_kernel = matches!(
-            self.dtype,
-            GgmlDType::Q4_0
-                | GgmlDType::Q4_1
-                | GgmlDType::Q5_0
-                | GgmlDType::Q5_1
-                | GgmlDType::Q8_0
-                | GgmlDType::Q2K
-                | GgmlDType::Q3K
-                | GgmlDType::Q4K
-                | GgmlDType::Q5K
-                | GgmlDType::Q6K
-                | GgmlDType::Q8K
-        );
-        if fast_kernel {
-            return dequantize(&self.data, self.dtype, elem_count, self.device());
-        }
-        // Run the dequantization on cpu.
-        use crate::quantized::k_quants::GgmlType;
-
-        let buffer = self.device.dtoh_sync_copy(&self.data).w()?;
-        let mut out = vec![0.0; elem_count];
-        let block_len = elem_count / self.dtype.block_size();
-        match self.dtype {
-            GgmlDType::F32 => {
-                let slice =
-                    unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const f32, block_len) };
-                out.copy_from_slice(slice)
-            }
-            GgmlDType::F16 => {
-                let vec: Vec<half::f16> = read_to_vec(&buffer, block_len);
-                half::f16::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4_0 => {
-                let vec: Vec<crate::quantized::BlockQ4_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4_1 => {
-                let vec: Vec<crate::quantized::BlockQ4_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5_0 => {
-                let vec: Vec<crate::quantized::BlockQ5_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5_1 => {
-                let vec: Vec<crate::quantized::BlockQ5_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8_0 => {
-                let vec: Vec<crate::quantized::BlockQ8_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8_1 => {
-                let vec: Vec<crate::quantized::BlockQ8_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q2K => {
-                let vec: Vec<crate::quantized::BlockQ2K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ2K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q3K => {
-                let vec: Vec<crate::quantized::BlockQ3K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ3K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4K => {
-                let vec: Vec<crate::quantized::BlockQ4K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5K => {
-                let vec: Vec<crate::quantized::BlockQ5K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q6K => {
-                let vec: Vec<crate::quantized::BlockQ6K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ6K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8K => {
-                let vec: Vec<crate::quantized::BlockQ8K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8K::to_float(&vec, &mut out)?;
-            }
-        }
-
-        self.device
-            .storage_from_cpu_storage(&crate::CpuStorage::F32(out))
-    }
-
-    pub fn quantize(&mut self, src: &CudaStorage) -> Result<()> {
-        // Run the quantization on cpu.
-        let src = match &src.slice {
-            crate::cuda_backend::CudaStorageSlice::F32(data) => {
-                self.device.dtoh_sync_copy(data).w()?
-            }
-            _ => crate::bail!("only f32 can be quantized"),
-        };
-        let src_len = src.len();
-        let src = crate::Storage::Cpu(crate::CpuStorage::F32(src));
-        let mut qcpu_storage = crate::Device::Cpu.qzeros(src_len, self.dtype)?;
-        qcpu_storage.quantize(&src)?;
-        let data = qcpu_storage.data()?;
-        let data = self.device.htod_sync_copy(data.as_ref()).w()?;
-        self.data = data;
-        Ok(())
-    }
-
-    pub fn storage_size_in_bytes(&self) -> usize {
-        self.data.len()
-    }
-
-    pub fn fwd(
-        &self,
-        self_shape: &crate::Shape,
-        storage: &CudaStorage,
-        layout: &crate::Layout,
-    ) -> Result<(CudaStorage, crate::Shape)> {
-        if matches!(layout.shape().dims(), [1, 1, _] | [1, _]) {
-            self.dequantize_matmul_vec(self_shape, storage, layout)
-        } else {
-            self.dequantize_matmul(self_shape, storage, layout)
-        }
-    }
-}
-
-impl QCudaStorage {
-    fn dequantize_matmul_vec(
-        &self,
-        self_shape: &crate::Shape,
-        rhs: &CudaStorage,
-        rhs_l: &crate::Layout,
-    ) -> Result<(CudaStorage, crate::Shape)> {
-        let (nrows, ncols) = self_shape.dims2()?;
-        let rhs = rhs.as_cuda_slice::<f32>()?;
-        let rhs = match rhs_l.contiguous_offsets() {
-            Some((o1, o2)) => rhs.slice(o1..o2),
-            None => Err(crate::Error::RequiresContiguous { op: "dmmv" }.bt())?,
-        };
-        let (with_batch, k) = match rhs_l.shape().dims() {
-            [1, 1, k] => (true, k),
-            [1, k] => (false, k),
-            _ => crate::bail!("unexpected rhs shape in dmmv {:?}", rhs_l.shape()),
-        };
-        if ncols != *k {
-            crate::bail!("mismatch on matmul dim {self_shape:?} {:?}", rhs_l.shape())
-        }
-
-        let out =
-            dequantize_mut_mal_vec(&self.data, &rhs, self.dtype, ncols, nrows, self.device())?;
-        let out_shape = if with_batch {
-            vec![1, 1, nrows]
-        } else {
-            vec![1, nrows]
-        };
-        Ok((out, out_shape.into()))
-    }
-
-    fn dequantize_matmul(
-        &self,
-        self_shape: &crate::Shape,
-        storage: &CudaStorage,
-        layout: &crate::Layout,
-    ) -> Result<(CudaStorage, crate::Shape)> {
-        use crate::backend::BackendStorage;
-        let (n, k) = self_shape.dims2()?;
-        let (b, m, k2) = match layout.shape().dims() {
-            &[b, m, k2] => (b, m, k2),
-            &[m, k2] => (1, m, k2),
-            s => crate::bail!("unexpected shape for input {s:?}"),
-        };
-        if k2 != k {
-            crate::bail!("mismatch on matmul dim {self_shape:?} {:?}", layout.shape())
-        }
-
-        let data_f32 = self.dequantize(n * k)?;
-        let rhs_l = crate::Layout::new((k, n).into(), vec![1, k], 0).broadcast_as((b, k, n))?;
-        let out = storage.matmul(&data_f32, (b, m, n, k), layout, &rhs_l)?;
-        let mut out_shape = layout.shape().dims().to_vec();
-        out_shape.pop();
-        out_shape.push(n);
-        Ok((out, out_shape.into()))
-    }
-}
-
-fn read_to_vec<T: Clone>(buffer: &[u8], n: usize) -> Vec<T> {
-    let slice = unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const T, n) };
-    slice.to_vec()
-}
-
-pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
-    device: &CudaDevice,
-    data: &[T],
-) -> Result<super::QStorage> {
-    let data = unsafe {
-        std::slice::from_raw_parts(data.as_ptr() as *const u8, core::mem::size_of_val(data))
-    };
-    let data = device.htod_sync_copy(data).w()?;
-    Ok(QStorage::Cuda(QCudaStorage {
-        data,
-        device: device.clone(),
-        dtype: T::DTYPE,
-    }))
-}
--- a/candle-core/src/quantized/dummy_cuda.rs
+++ b/candle-core/src/quantized/dummy_cuda.rs
@ -1,50 +0,0 @@
-#![allow(unused)]
-use super::GgmlDType;
-use crate::{CudaDevice, CudaStorage, Error, Result};
-
-pub struct QCudaStorage {
-    dtype: GgmlDType,
-    device: CudaDevice,
-}
-
-impl QCudaStorage {
-    pub fn zeros(_: &CudaDevice, _: usize, _: GgmlDType) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    pub fn dtype(&self) -> GgmlDType {
-        self.dtype
-    }
-
-    pub fn device(&self) -> &CudaDevice {
-        &self.device
-    }
-
-    pub fn dequantize(&self, _elem_count: usize) -> Result<CudaStorage> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    pub fn quantize(&mut self, _src: &CudaStorage) -> Result<()> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    pub fn storage_size_in_bytes(&self) -> usize {
-        0
-    }
-
-    pub fn fwd(
-        &self,
-        _self_shape: &crate::Shape,
-        _storage: &CudaStorage,
-        _layout: &crate::Layout,
-    ) -> Result<(CudaStorage, crate::Shape)> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-}
-
-pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
-    _device: &CudaDevice,
-    _data: &[T],
-) -> Result<super::QStorage> {
-    Err(Error::NotCompiledWithCudaSupport)
-}
--- a/candle-core/src/quantized/dummy_metal.rs
+++ b/candle-core/src/quantized/dummy_metal.rs
@ -1,50 +0,0 @@
-#![allow(unused)]
-use super::GgmlDType;
-use crate::{Error, MetalDevice, MetalStorage, Result};
-
-pub struct QMetalStorage {
-    dtype: GgmlDType,
-    device: MetalDevice,
-}
-
-impl QMetalStorage {
-    pub fn zeros(_: &MetalDevice, _: usize, _: GgmlDType) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    pub fn dtype(&self) -> GgmlDType {
-        self.dtype
-    }
-
-    pub fn device(&self) -> &MetalDevice {
-        &self.device
-    }
-
-    pub fn dequantize(&self, _elem_count: usize) -> Result<MetalStorage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    pub fn quantize(&mut self, _src: &MetalStorage) -> Result<()> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    pub fn storage_size_in_bytes(&self) -> usize {
-        0
-    }
-
-    pub fn fwd(
-        &self,
-        _self_shape: &crate::Shape,
-        _storage: &MetalStorage,
-        _layout: &crate::Layout,
-    ) -> Result<(MetalStorage, crate::Shape)> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-}
-
-pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
-    _device: &MetalDevice,
-    _data: &[T],
-) -> Result<super::QStorage> {
-    Err(Error::NotCompiledWithMetalSupport)
-}
--- a/candle-core/src/quantized/ggml_file.rs
+++ b/candle-core/src/quantized/ggml_file.rs
@ -1,7 +1,7 @@
 //! Support for the GGML file format.

-use super::{k_quants, GgmlDType, QStorage};
-use crate::{Device, Result};
+use super::{k_quants, GgmlDType};
+use crate::Result;
 use byteorder::{LittleEndian, ReadBytesExt};
 use std::collections::HashMap;

@ -121,17 +121,11 @@ fn from_raw_data<T: super::GgmlType + Send + Sync + 'static>(
    raw_data: &[u8],
    size_in_bytes: usize,
    dims: Vec<usize>,
-    device: &Device,
 ) -> Result<super::QTensor> {
    let raw_data_ptr = raw_data.as_ptr();
    let n_blocks = size_in_bytes / std::mem::size_of::<T>();
    let data = unsafe { std::slice::from_raw_parts(raw_data_ptr as *const T, n_blocks) };
-    let data: QStorage = match device {
-        Device::Cpu => QStorage::Cpu(Box::new(data.to_vec())),
-        Device::Metal(metal) => super::metal::load_quantized(metal, data)?,
-        Device::Cuda(cuda) => super::cuda::load_quantized(cuda, data)?,
-    };
-    super::QTensor::new(data, dims)
+    super::QTensor::new(data.to_vec(), dims)
 }

 /// Creates a [Tensor] from a raw GGML tensor.
@ -139,50 +133,29 @@ pub fn qtensor_from_ggml(
    ggml_dtype: GgmlDType,
    raw_data: &[u8],
    dims: Vec<usize>,
-    device: &Device,
 ) -> Result<super::QTensor> {
    let tensor_elems = dims.iter().product::<usize>();
-    let block_size = ggml_dtype.block_size();
-    if tensor_elems % block_size != 0 {
+    let blck_size = ggml_dtype.blck_size();
+    if tensor_elems % blck_size != 0 {
        crate::bail!(
-            "the number of elements {tensor_elems} is not divisible by the block size {block_size}"
+            "the number of elements {tensor_elems} is not divisible by the block size {blck_size}"
        )
    }
-    let size_in_bytes = tensor_elems / block_size * ggml_dtype.type_size();
+    let size_in_bytes = tensor_elems / blck_size * ggml_dtype.type_size();

    match ggml_dtype {
-        GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims, device),
-        GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims, device),
-        GgmlDType::Q4_0 => {
-            from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q4_1 => {
-            from_raw_data::<k_quants::BlockQ4_1>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5_0 => {
-            from_raw_data::<k_quants::BlockQ5_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5_1 => {
-            from_raw_data::<k_quants::BlockQ5_1>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q8_0 => {
-            from_raw_data::<k_quants::BlockQ8_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q2K => {
-            from_raw_data::<k_quants::BlockQ2K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q3K => {
-            from_raw_data::<k_quants::BlockQ3K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q4K => {
-            from_raw_data::<k_quants::BlockQ4K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5K => {
-            from_raw_data::<k_quants::BlockQ5K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q6K => {
-            from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims, device)
-        }
+        GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims),
+        GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4_0 => from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4_1 => from_raw_data::<k_quants::BlockQ4_1>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5_0 => from_raw_data::<k_quants::BlockQ5_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5_1 => from_raw_data::<k_quants::BlockQ5_1>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q8_0 => from_raw_data::<k_quants::BlockQ8_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q2K => from_raw_data::<k_quants::BlockQ2K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q3K => from_raw_data::<k_quants::BlockQ3K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4K => from_raw_data::<k_quants::BlockQ4K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5K => from_raw_data::<k_quants::BlockQ5K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q6K => from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims),
        _ => crate::bail!("quantized type {ggml_dtype:?} is not supported yet"),
    }
 }
@ -190,7 +163,6 @@ pub fn qtensor_from_ggml(
 fn read_one_tensor<R: std::io::Seek + std::io::Read>(
    reader: &mut R,
    magic: VersionedMagic,
-    device: &Device,
 ) -> Result<(String, super::QTensor)> {
    let n_dims = reader.read_u32::<LittleEndian>()?;
    let name_len = reader.read_u32::<LittleEndian>()?;
@ -211,11 +183,11 @@ fn read_one_tensor<R: std::io::Seek + std::io::Read>(
    }
    let dims = dims.iter().map(|&u| u as usize).collect::<Vec<_>>();
    let tensor_elems = dims.iter().product::<usize>();
-    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.block_size();
+    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();
    // TODO: Mmap version to avoid copying the data around?
    let mut raw_data = vec![0u8; size_in_bytes];
    reader.read_exact(&mut raw_data)?;
-    match qtensor_from_ggml(ggml_dtype, &raw_data, dims, device) {
+    match qtensor_from_ggml(ggml_dtype, &raw_data, dims) {
        Ok(tensor) => Ok((name, tensor)),
        Err(e) => crate::bail!("Error creating tensor {name}: {e}"),
    }
@ -226,14 +198,10 @@ pub struct Content {
    pub hparams: HParams,
    pub vocab: Vocab,
    pub tensors: HashMap<String, super::QTensor>,
-    pub device: Device,
 }

 impl Content {
-    pub fn read<R: std::io::Seek + std::io::Read>(
-        reader: &mut R,
-        device: &Device,
-    ) -> Result<Content> {
+    pub fn read<R: std::io::Seek + std::io::Read>(reader: &mut R) -> Result<Content> {
        // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.cpp#L505
        let last_position = reader.seek(std::io::SeekFrom::End(0))?;
        reader.seek(std::io::SeekFrom::Start(0))?;
@ -243,16 +211,14 @@ impl Content {
        let mut tensors = HashMap::new();

        while reader.stream_position()? != last_position {
-            let (name, tensor) = read_one_tensor(reader, magic, device)?;
+            let (name, tensor) = read_one_tensor(reader, magic)?;
            tensors.insert(name, tensor);
        }
-        let device = device.clone();
        Ok(Self {
            magic,
            hparams,
            vocab,
            tensors,
-            device,
        })
    }

--- a/candle-core/src/quantized/gguf_file.rs
+++ b/candle-core/src/quantized/gguf_file.rs
@ -3,7 +3,7 @@
 //! Spec: https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md

 use super::{GgmlDType, QTensor};
-use crate::{Device, Result};
+use crate::Result;
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use std::collections::HashMap;

@ -41,7 +41,7 @@ impl VersionedMagic {
            (Magic::Gguf, 1) => Self::GgufV1,
            (Magic::Gguf, 2) => Self::GgufV2,
            (Magic::Gguf, 3) => Self::GgufV3,
-            _ => crate::bail!("gguf: unsupported magic/version {magic:?}/{version}"),
+            _ => crate::bail!("ggml: unsupported magic/version {magic:?}/{version}"),
        };
        Ok(versioned_magic)
    }
@ -59,25 +59,19 @@ impl TensorInfo {
        &self,
        reader: &mut R,
        tensor_data_offset: u64,
-        device: &Device,
    ) -> Result<QTensor> {
        let tensor_elems = self.shape.elem_count();
-        let block_size = self.ggml_dtype.block_size();
-        if tensor_elems % block_size != 0 {
+        let blck_size = self.ggml_dtype.blck_size();
+        if tensor_elems % blck_size != 0 {
            crate::bail!(
-            "the number of elements {tensor_elems} is not divisible by the block size {block_size}"
+            "the number of elements {tensor_elems} is not divisible by the block size {blck_size}"
        )
        }
-        let size_in_bytes = tensor_elems / block_size * self.ggml_dtype.type_size();
+        let size_in_bytes = tensor_elems / blck_size * self.ggml_dtype.type_size();
        let mut raw_data = vec![0u8; size_in_bytes];
        reader.seek(std::io::SeekFrom::Start(tensor_data_offset + self.offset))?;
        reader.read_exact(&mut raw_data)?;
-        super::ggml_file::qtensor_from_ggml(
-            self.ggml_dtype,
-            &raw_data,
-            self.shape.dims().to_vec(),
-            device,
-        )
+        super::ggml_file::qtensor_from_ggml(self.ggml_dtype, &raw_data, self.shape.dims().to_vec())
    }
 }

@ -466,13 +460,12 @@ impl Content {
        &self,
        reader: &mut R,
        name: &str,
-        device: &Device,
    ) -> Result<QTensor> {
        let tensor_info = match self.tensor_infos.get(name) {
            Some(tensor_info) => tensor_info,
-            None => crate::bail!("cannot find tensor info for {name}"),
+            None => crate::bail!("cannot find tensor-infor for {name}"),
        };
-        tensor_info.read(reader, self.tensor_data_offset, device)
+        tensor_info.read(reader, self.tensor_data_offset)
    }
 }

@ -524,9 +517,10 @@ pub fn write<W: std::io::Seek + std::io::Write>(
                "internal error, unexpected current position {tensor_start_pos} {offset} {pos}"
            )
        }
-        let data = tensor.data()?;
-        let size_in_bytes = data.len();
-        w.write_all(&data)?;
+        let data_ptr = tensor.as_ptr();
+        let size_in_bytes = tensor.storage_size_in_bytes();
+        let data = unsafe { std::slice::from_raw_parts(data_ptr, size_in_bytes) };
+        w.write_all(data)?;
        let padding = 31 - (31 + size_in_bytes) % 32;
        w.write_all(&vec![0u8; padding])?;
    }
--- a/candle-core/src/quantized/k_quants.rs
+++ b/candle-core/src/quantized/k_quants.rs
@ -1545,13 +1545,13 @@ impl GgmlType for BlockQ5K {
                let d2 = d * sc as f32;
                let m2 = min * m as f32;
                for (ql, qh) in ql.iter().zip(qh) {
-                    let to_add = if qh & u1 != 0 { 16f32 } else { 0f32 };
-                    y[ys_index] = d1 * ((ql & 0xF) as f32 + to_add) - m1;
+                    let to_add = if qh & u1 != 0 { 16 } else { 1 };
+                    y[ys_index] = d1 * ((ql & 0xF) + to_add) as f32 - m1;
                    ys_index += 1;
                }
                for (ql, qh) in ql.iter().zip(qh) {
-                    let to_add = if qh & u2 != 0 { 16f32 } else { 0f32 };
-                    y[ys_index] = d2 * ((ql >> 4) as f32 + to_add) - m2;
+                    let to_add = if qh & u2 != 0 { 16 } else { 1 };
+                    y[ys_index] = d2 * ((ql >> 4) + to_add) as f32 - m2;
                    ys_index += 1;
                }
                is += 2;
--- a/candle-core/src/quantized/metal.rs
+++ b/candle-core/src/quantized/metal.rs
@ -1,222 +0,0 @@
-use super::{GgmlDType, QStorage};
-use crate::backend::BackendStorage;
-use crate::{DType, MetalDevice, MetalStorage, Result, Shape};
-use metal::Buffer;
-use std::sync::Arc;
-
-pub struct QMetalStorage {
-    dtype: GgmlDType,
-    device: MetalDevice,
-    buffer: Arc<Buffer>,
-}
-
-impl QMetalStorage {
-    pub fn zeros(device: &MetalDevice, elem_count: usize, dtype: GgmlDType) -> Result<Self> {
-        let size = elem_count * dtype.type_size() / dtype.block_size();
-        let buffer = device.allocate_zeros(size)?;
-        Ok(Self {
-            buffer,
-            device: device.clone(),
-            dtype,
-        })
-    }
-
-    pub fn dtype(&self) -> GgmlDType {
-        self.dtype
-    }
-
-    pub fn device(&self) -> &MetalDevice {
-        &self.device
-    }
-
-    pub fn buffer(&self) -> &Buffer {
-        &self.buffer
-    }
-
-    pub fn dequantize(&self, elem_count: usize) -> Result<MetalStorage> {
-        use crate::quantized::k_quants::GgmlType;
-
-        let buffer = self.device.new_buffer_managed(self.buffer.length())?;
-        let command_buffer = self.device.command_buffer()?;
-        command_buffer.set_label("to_cpu");
-        let blit = command_buffer.new_blit_command_encoder();
-        blit.set_label("blit_to_cpu");
-        blit.copy_from_buffer(&self.buffer, 0, &buffer, 0, self.buffer.length());
-        blit.end_encoding();
-        self.device.wait_until_completed()?;
-        let mut out = vec![0.0; elem_count];
-        let block_len = elem_count / self.dtype.block_size();
-        match self.dtype {
-            GgmlDType::F32 => {
-                let vec: Vec<f32> = read_to_vec(&buffer, block_len);
-                f32::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::F16 => {
-                let vec: Vec<half::f16> = read_to_vec(&buffer, block_len);
-                half::f16::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4_0 => {
-                let vec: Vec<crate::quantized::BlockQ4_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4_1 => {
-                let vec: Vec<crate::quantized::BlockQ4_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5_0 => {
-                let vec: Vec<crate::quantized::BlockQ5_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5_1 => {
-                let vec: Vec<crate::quantized::BlockQ5_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8_0 => {
-                let vec: Vec<crate::quantized::BlockQ8_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8_1 => {
-                let vec: Vec<crate::quantized::BlockQ8_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q2K => {
-                let vec: Vec<crate::quantized::BlockQ2K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ2K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q3K => {
-                let vec: Vec<crate::quantized::BlockQ3K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ3K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4K => {
-                let vec: Vec<crate::quantized::BlockQ4K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5K => {
-                let vec: Vec<crate::quantized::BlockQ5K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q6K => {
-                let vec: Vec<crate::quantized::BlockQ6K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ6K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8K => {
-                let vec: Vec<crate::quantized::BlockQ8K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8K::to_float(&vec, &mut out)?;
-            }
-        }
-
-        let buffer = self.device.new_buffer_with_data(&out)?;
-        Ok(MetalStorage::new(
-            buffer,
-            self.device.clone(),
-            elem_count,
-            DType::F32,
-        ))
-    }
-
-    pub fn quantize(&mut self, src: &MetalStorage) -> Result<()> {
-        // Quantization only happens on CPU for now.
-        let src = src.to_cpu::<f32>()?;
-        let elem_count = src.len();
-        let src = crate::Storage::Cpu(crate::CpuStorage::F32(src));
-        let mut qcpu_storage = crate::Device::Cpu.qzeros(elem_count, self.dtype)?;
-        qcpu_storage.quantize(&src)?;
-        let buffer = self.device.new_buffer_with_data(&qcpu_storage.data()?)?;
-        self.buffer = buffer;
-        Ok(())
-    }
-
-    pub fn storage_size_in_bytes(&self) -> usize {
-        self.buffer.length() as usize
-    }
-
-    pub fn fwd(
-        &self,
-        self_shape: &Shape,
-        storage: &MetalStorage,
-        layout: &crate::Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        use crate::MetalError;
-
-        if !layout.is_contiguous() {
-            crate::bail!("input tensor is not contiguous {layout:?}")
-        }
-        let src_shape = layout.shape();
-        // self is transposed so n is first then k.
-        if src_shape.rank() < 2 {
-            crate::bail!("input tensor has only one dimension {layout:?}")
-        }
-        let (n, k) = self_shape.dims2()?;
-        let mut dst_shape = src_shape.dims().to_vec();
-
-        let (b, m) = match dst_shape.len() {
-            3 => (dst_shape[0], dst_shape[1]),
-            2 => (1, dst_shape[0]),
-            n => crate::bail!("Invalid rank {n} for quantized matmul metal"),
-        };
-        let last_k = dst_shape.pop().unwrap();
-        if last_k != k {
-            crate::bail!("input tensor {layout:?} incompatible with {:?}", self_shape)
-        }
-        dst_shape.push(n);
-        let dst_shape = Shape::from(dst_shape);
-        let device = storage.device().clone();
-        let dst = device.new_buffer(dst_shape.elem_count(), DType::F32, "qmatmul")?;
-        let command_buffer = device.command_buffer()?;
-        candle_metal_kernels::call_quantized_matmul_t(
-            device.device(),
-            &command_buffer,
-            device.kernels(),
-            self.dtype.into(),
-            (b, m, n, k),
-            storage.buffer(),
-            layout.start_offset() * storage.dtype().size_in_bytes(),
-            &self.buffer,
-            &dst,
-        )
-        .map_err(MetalError::from)?;
-        let dst_storage = crate::MetalStorage::new(dst, device, dst_shape.elem_count(), DType::F32);
-        Ok((dst_storage, dst_shape))
-    }
-}
-
-pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
-    device: &MetalDevice,
-    data: &[T],
-) -> Result<QStorage> {
-    let buffer = device.new_buffer_with_data(data)?;
-    let device = device.clone();
-    Ok(QStorage::Metal(QMetalStorage {
-        dtype: T::DTYPE,
-        device,
-        buffer,
-    }))
-}
-
-fn read_to_vec<T: Clone>(buffer: &Buffer, n: usize) -> Vec<T> {
-    let ptr = buffer.contents() as *const T;
-    assert!(!ptr.is_null());
-    let slice = unsafe { std::slice::from_raw_parts(ptr, n) };
-    slice.to_vec()
-}
-
-impl From<GgmlDType> for candle_metal_kernels::GgmlDType {
-    fn from(value: GgmlDType) -> Self {
-        match value {
-            GgmlDType::Q4_0 => candle_metal_kernels::GgmlDType::Q4_0,
-            GgmlDType::Q4_1 => candle_metal_kernels::GgmlDType::Q4_1,
-            GgmlDType::Q5_0 => candle_metal_kernels::GgmlDType::Q5_0,
-            GgmlDType::Q5_1 => candle_metal_kernels::GgmlDType::Q5_1,
-            GgmlDType::Q8_0 => candle_metal_kernels::GgmlDType::Q8_0,
-            GgmlDType::Q8_1 => candle_metal_kernels::GgmlDType::Q8_1,
-            GgmlDType::Q2K => candle_metal_kernels::GgmlDType::Q2K,
-            GgmlDType::Q3K => candle_metal_kernels::GgmlDType::Q3K,
-            GgmlDType::Q4K => candle_metal_kernels::GgmlDType::Q4K,
-            GgmlDType::Q5K => candle_metal_kernels::GgmlDType::Q5K,
-            GgmlDType::Q6K => candle_metal_kernels::GgmlDType::Q6K,
-            GgmlDType::Q8K => candle_metal_kernels::GgmlDType::Q8K,
-            GgmlDType::F16 => candle_metal_kernels::GgmlDType::F16,
-            GgmlDType::F32 => candle_metal_kernels::GgmlDType::F32,
-        }
-    }
-}
--- a/candle-core/src/quantized/mod.rs
+++ b/candle-core/src/quantized/mod.rs
@ -1,134 +1,23 @@
-use crate::{CpuStorage, Device, Result, Shape, Storage, Tensor};
-use k_quants::*;
-use std::borrow::Cow;
+use crate::{Device, Result, Shape, Tensor};

 #[cfg(target_feature = "avx")]
 pub mod avx;
-mod dummy_cuda;
-mod dummy_metal;
 pub mod ggml_file;
 pub mod gguf_file;
 pub mod k_quants;
-#[cfg(feature = "metal")]
-pub mod metal;
-#[cfg(not(feature = "metal"))]
-mod metal {
-    pub use super::dummy_metal::*;
-}
-#[cfg(feature = "cuda")]
-pub mod cuda;
-#[cfg(not(feature = "cuda"))]
-mod cuda {
-    pub use super::dummy_cuda::*;
-}
-
 #[cfg(target_feature = "neon")]
 pub mod neon;
 #[cfg(target_feature = "simd128")]
 pub mod simd128;
 pub mod utils;
-use half::f16;

 pub use k_quants::GgmlType;

 pub struct QTensor {
-    storage: QStorage,
+    data: Box<dyn QuantizedType>,
    shape: Shape,
 }

-impl Device {
-    fn qzeros(&self, elem_count: usize, dtype: GgmlDType) -> Result<QStorage> {
-        match self {
-            Device::Cpu => {
-                let storage = dtype.cpu_zeros(elem_count);
-                Ok(QStorage::Cpu(storage))
-            }
-            Device::Metal(metal) => {
-                let storage = metal::QMetalStorage::zeros(metal, elem_count, dtype)?;
-                Ok(QStorage::Metal(storage))
-            }
-            Device::Cuda(cuda) => {
-                let storage = cuda::QCudaStorage::zeros(cuda, elem_count, dtype)?;
-                Ok(QStorage::Cuda(storage))
-            }
-        }
-    }
-}
-
-pub enum QStorage {
-    Cpu(Box<dyn QuantizedType>),
-    Metal(metal::QMetalStorage),
-    Cuda(cuda::QCudaStorage),
-}
-
-impl QStorage {
-    fn block_size(&self) -> usize {
-        match self {
-            QStorage::Cpu(storage) => storage.block_size(),
-            QStorage::Metal(storage) => storage.dtype().block_size(),
-            QStorage::Cuda(storage) => storage.dtype().block_size(),
-        }
-    }
-
-    fn dtype(&self) -> GgmlDType {
-        match self {
-            QStorage::Cpu(storage) => storage.dtype(),
-            QStorage::Metal(storage) => storage.dtype(),
-            QStorage::Cuda(storage) => storage.dtype(),
-        }
-    }
-
-    fn device(&self) -> Device {
-        match self {
-            QStorage::Cpu(_storage) => Device::Cpu,
-            QStorage::Metal(storage) => Device::Metal(storage.device().clone()),
-            QStorage::Cuda(storage) => Device::Cuda(storage.device().clone()),
-        }
-    }
-
-    fn size_in_bytes(&self) -> usize {
-        match self {
-            QStorage::Cpu(storage) => storage.storage_size_in_bytes(),
-            QStorage::Metal(storage) => storage.storage_size_in_bytes(),
-            QStorage::Cuda(storage) => storage.storage_size_in_bytes(),
-        }
-    }
-
-    fn quantize(&mut self, src: &Storage) -> Result<()> {
-        match (self, src) {
-            (QStorage::Cpu(storage), Storage::Cpu(src)) => {
-                storage.from_float(src.as_slice::<f32>()?)?;
-            }
-            (QStorage::Metal(storage), Storage::Metal(src)) => storage.quantize(src)?,
-            (QStorage::Cuda(storage), Storage::Cuda(src)) => storage.quantize(src)?,
-            _ => crate::bail!("Invalid dequantize storage locations do not match"),
-        }
-        Ok(())
-    }
-
-    fn dequantize(&self, elem_count: usize) -> Result<Storage> {
-        match self {
-            QStorage::Cpu(storage) => Ok(Storage::Cpu(storage.dequantize(elem_count)?)),
-            QStorage::Metal(storage) => Ok(Storage::Metal(storage.dequantize(elem_count)?)),
-            QStorage::Cuda(storage) => Ok(Storage::Cuda(storage.dequantize(elem_count)?)),
-        }
-    }
-
-    fn data(&self) -> Result<Cow<[u8]>> {
-        match self {
-            QStorage::Cpu(storage) => {
-                let data_ptr = storage.as_ptr();
-                let size_in_bytes = storage.storage_size_in_bytes();
-                let data = unsafe { std::slice::from_raw_parts(data_ptr, size_in_bytes) };
-                Ok(Cow::from(data))
-            }
-            QStorage::Metal(_) | QStorage::Cuda(_) => {
-                crate::bail!("not implemented");
-            }
-        }
-    }
-}
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum GgmlDType {
    F32,
@ -188,25 +77,6 @@ impl GgmlDType {
        }
    }

-    /// The block dtype
-    pub fn cpu_zeros(&self, elem_count: usize) -> Box<dyn QuantizedType> {
-        match self {
-            Self::F32 => Box::new(vec![f32::zeros(); elem_count]),
-            Self::F16 => Box::new(vec![f16::zeros(); elem_count]),
-            Self::Q4_0 => Box::new(vec![BlockQ4_0::zeros(); elem_count / BlockQ4_0::BLCK_SIZE]),
-            Self::Q4_1 => Box::new(vec![BlockQ4_1::zeros(); elem_count / BlockQ4_1::BLCK_SIZE]),
-            Self::Q5_0 => Box::new(vec![BlockQ5_0::zeros(); elem_count / BlockQ5_0::BLCK_SIZE]),
-            Self::Q5_1 => Box::new(vec![BlockQ5_1::zeros(); elem_count / BlockQ5_1::BLCK_SIZE]),
-            Self::Q8_0 => Box::new(vec![BlockQ8_0::zeros(); elem_count / BlockQ8_0::BLCK_SIZE]),
-            Self::Q8_1 => Box::new(vec![BlockQ8_1::zeros(); elem_count / BlockQ8_1::BLCK_SIZE]),
-            Self::Q2K => Box::new(vec![BlockQ2K::zeros(); elem_count / BlockQ2K::BLCK_SIZE]),
-            Self::Q3K => Box::new(vec![BlockQ3K::zeros(); elem_count / BlockQ3K::BLCK_SIZE]),
-            Self::Q4K => Box::new(vec![BlockQ4K::zeros(); elem_count / BlockQ4K::BLCK_SIZE]),
-            Self::Q5K => Box::new(vec![BlockQ5K::zeros(); elem_count / BlockQ5K::BLCK_SIZE]),
-            Self::Q6K => Box::new(vec![BlockQ6K::zeros(); elem_count / BlockQ6K::BLCK_SIZE]),
-            Self::Q8K => Box::new(vec![BlockQ8K::zeros(); elem_count / BlockQ8K::BLCK_SIZE]),
-        }
-    }
    /// The type size for blocks in bytes.
    pub fn type_size(&self) -> usize {
        use k_quants::*;
@ -230,7 +100,7 @@ impl GgmlDType {
    }

    /// The block size, i.e. the number of elements stored in each block.
-    pub fn block_size(&self) -> usize {
+    pub fn blck_size(&self) -> usize {
        match self {
            Self::F32 => 1,
            Self::F16 => 1,
@ -249,13 +119,9 @@ impl GgmlDType {
 pub trait QuantizedType: Send + Sync {
    fn dtype(&self) -> GgmlDType;
    fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()>;
-    fn dequantize(&self, elem_count: usize) -> Result<CpuStorage>;
+    fn to_float(&self, ys: &mut [f32]) -> Result<()>;
    fn storage_size_in_bytes(&self) -> usize;
    fn as_ptr(&self) -> *const u8;
-    fn block_size(&self) -> usize;
-    #[allow(clippy::wrong_self_convention)]
-    fn from_float(&mut self, xs: &[f32]) -> Result<()>;
-    fn size(&self) -> usize;
 }

 impl<T: k_quants::GgmlType + Send + Sync> QuantizedType for Vec<T> {
@ -263,26 +129,12 @@ impl<T: k_quants::GgmlType + Send + Sync> QuantizedType for Vec<T> {
        k_quants::matmul(mkn, lhs, self.as_slice(), dst)
    }

-    fn size(&self) -> usize {
-        self.len() * core::mem::size_of::<T>()
-    }
-
-    fn from_float(&mut self, xs: &[f32]) -> Result<()> {
-        T::from_float(xs, self)
-    }
-
    fn dtype(&self) -> GgmlDType {
        T::DTYPE
    }

-    fn block_size(&self) -> usize {
-        T::BLCK_SIZE
-    }
-
-    fn dequantize(&self, elem_count: usize) -> Result<CpuStorage> {
-        let mut ys = vec![0.0f32; elem_count];
-        T::to_float(self.as_slice(), &mut ys)?;
-        Ok(CpuStorage::F32(ys))
+    fn to_float(&self, ys: &mut [f32]) -> Result<()> {
+        T::to_float(self.as_slice(), ys)
    }

    fn storage_size_in_bytes(&self) -> usize {
@ -300,53 +152,56 @@ impl std::fmt::Debug for QTensor {
    }
 }

-fn check_shape(shape: &Shape, block_size: usize) -> Result<()> {
+fn check_shape<T: k_quants::GgmlType>(shape: &Shape) -> Result<()> {
    let dims = shape.dims();
    if dims.is_empty() {
        crate::bail!("scalar tensor cannot be quantized {shape:?}")
    }
-    if dims[dims.len() - 1] % block_size != 0 {
+    if dims[dims.len() - 1] % T::BLCK_SIZE != 0 {
        crate::bail!(
            "quantized tensor must have their last dim divisible by block size {shape:?} {}",
-            block_size
+            T::BLCK_SIZE
        )
    }
    Ok(())
 }

 impl QTensor {
-    pub fn new<S: Into<Shape>>(storage: QStorage, shape: S) -> Result<Self> {
+    pub fn new<S: Into<Shape>, T: k_quants::GgmlType + Send + Sync + 'static>(
+        data: Vec<T>,
+        shape: S,
+    ) -> Result<Self> {
        let shape = shape.into();
-        check_shape(&shape, storage.block_size())?;
-        Ok(Self { storage, shape })
+        check_shape::<T>(&shape)?;
+        Ok(Self {
+            data: Box::new(data),
+            shape,
+        })
    }

-    pub fn quantize(src: &Tensor, dtype: GgmlDType) -> Result<Self> {
+    pub fn quantize<T: k_quants::GgmlType + Send + Sync + 'static>(src: &Tensor) -> Result<Self> {
        let shape = src.shape();
-        let block_size = dtype.block_size();
-        check_shape(shape, block_size)?;
-        let src = src.to_dtype(crate::DType::F32)?.flatten_all()?;
-        let elem_count = shape.elem_count();
-        if elem_count % block_size != 0 {
+        check_shape::<T>(shape)?;
+        let src = src
+            .to_dtype(crate::DType::F32)?
+            .flatten_all()?
+            .to_vec1::<f32>()?;
+        if src.len() % T::BLCK_SIZE != 0 {
            crate::bail!(
                "tensor size ({shape:?}) is not divisible by block size {}",
-                block_size
+                T::BLCK_SIZE
            )
        }
-        let mut storage = src.device().qzeros(elem_count, dtype)?;
-        storage.quantize(&src.storage())?;
+        let mut data = vec![T::zeros(); src.len() / T::BLCK_SIZE];
+        T::from_float(&src, &mut data)?;
        Ok(Self {
-            storage,
+            data: Box::new(data),
            shape: shape.clone(),
        })
    }

    pub fn dtype(&self) -> GgmlDType {
-        self.storage.dtype()
-    }
-
-    pub fn device(&self) -> Device {
-        self.storage.device()
+        self.data.dtype()
    }

    pub fn rank(&self) -> usize {
@ -358,19 +213,21 @@ impl QTensor {
    }

    pub fn dequantize(&self, device: &Device) -> Result<Tensor> {
-        let storage = self.storage.dequantize(self.shape.elem_count())?;
-        let none = crate::op::BackpropOp::none();
-        let is_variable = false;
-        crate::tensor::from_storage(storage, self.shape.clone(), none, is_variable)
-            .to_device(device)
+        let mut f32_data = vec![0f32; self.shape.elem_count()];
+        self.data.to_float(&mut f32_data)?;
+        Tensor::from_vec(f32_data, &self.shape, device)
+    }
+
+    pub fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()> {
+        self.data.matmul_t(mkn, lhs, dst)
    }

    pub fn storage_size_in_bytes(&self) -> usize {
-        self.storage.size_in_bytes()
+        self.data.storage_size_in_bytes()
    }

-    pub fn data(&self) -> Result<Cow<'_, [u8]>> {
-        self.storage.data()
+    pub fn as_ptr(&self) -> *const u8 {
+        self.data.as_ptr()
    }
 }

@ -398,7 +255,7 @@ impl QMatMul {
            _ => DEQUANTIZE_ALL.with(|b| *b),
        };
        let t = if dequantize {
-            let tensor = qtensor.dequantize(&qtensor.device())?;
+            let tensor = qtensor.dequantize(&Device::Cpu)?;
            Self::Tensor(tensor)
        } else {
            Self::QTensor(qtensor)
@ -437,45 +294,21 @@ impl crate::CustomOp1 for QTensor {
        }
        dst_shape.push(n);
        let dst_shape = Shape::from(dst_shape);
-        #[allow(clippy::infallible_destructuring_match)]
-        let self_storage = match &self.storage {
-            QStorage::Cpu(storage) => storage,
-            QStorage::Metal(_) | QStorage::Cuda(_) => crate::bail!("Invalid storage"),
-        };
-        let slice = storage.as_slice::<f32>()?;
-        let slice = &slice[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
+        let storage = storage.as_slice::<f32>()?;
+        let storage =
+            &storage[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
        let mut dst_storage = vec![0f32; dst_shape.elem_count()];
-        self_storage.matmul_t((dst_shape.elem_count() / n, k, n), slice, &mut dst_storage)?;
+        self.matmul_t(
+            (dst_shape.elem_count() / n, k, n),
+            storage,
+            &mut dst_storage,
+        )?;
        Ok((crate::CpuStorage::F32(dst_storage), dst_shape))
    }
-
-    fn metal_fwd(
-        &self,
-        storage: &crate::MetalStorage,
-        layout: &crate::Layout,
-    ) -> Result<(crate::MetalStorage, Shape)> {
-        let self_storage = match &self.storage {
-            QStorage::Metal(metal) => metal,
-            _ => unreachable!("Cannot call metal matmul on non metal QTensor"),
-        };
-        self_storage.fwd(&self.shape, storage, layout)
-    }
-
-    fn cuda_fwd(
-        &self,
-        storage: &crate::CudaStorage,
-        layout: &crate::Layout,
-    ) -> Result<(crate::CudaStorage, Shape)> {
-        let self_storage = match &self.storage {
-            QStorage::Cuda(cuda) => cuda,
-            _ => unreachable!("Cannot call cuda matmul on non cuda QTensor"),
-        };
-        self_storage.fwd(&self.shape, storage, layout)
-    }
 }

-impl crate::Module for QMatMul {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+impl QMatMul {
+    pub fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        match self {
            Self::QTensor(t) => xs.apply_op1_no_bwd(t.as_ref()),
            Self::Tensor(w) => {
--- a/candle-core/src/quantized/neon.rs
+++ b/candle-core/src/quantized/neon.rs
@ -12,14 +12,6 @@ use core::arch::arm::*;
 #[cfg(target_arch = "aarch64")]
 use core::arch::aarch64::*;

-#[inline(always)]
-unsafe fn vdotq_s32(a: int8x16_t, b: int8x16_t) -> int32x4_t {
-    // TODO: dotprod
-    let p0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
-    let p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-    vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))
-}
-
 #[inline(always)]
 pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
    let qk = QK8_0;
@ -51,8 +43,15 @@ pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) ->
            let v1_0l = vld1q_s8(y0.qs.as_ptr());
            let v1_0h = vld1q_s8(y0.qs.as_ptr().add(16));

-            let pl0 = vdotq_s32(v0_0ls, v1_0l);
-            let ph0 = vdotq_s32(v0_0hs, v1_0h);
+            // TODO: Support dotprod when it's available outside of nightly.
+            let pl0l = vmull_s8(vget_low_s8(v0_0ls), vget_low_s8(v1_0l));
+            let pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
+            let ph0l = vmull_s8(vget_low_s8(v0_0hs), vget_low_s8(v1_0h));
+            let ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
+
+            let pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
+            let ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
+
            sumv0 = vmlaq_n_f32(
                sumv0,
                vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
@ -83,8 +82,14 @@ pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) ->
            let y0_0 = vld1q_s8(y0.qs.as_ptr());
            let y0_1 = vld1q_s8(y0.qs.as_ptr().add(16));

-            let p0 = vdotq_s32(x0_0, y0_0);
-            let p1 = vdotq_s32(x0_1, y0_1);
+            // TODO dotprod once this is the intrinsics are.
+            let p0_0 = vmull_s8(vget_low_s8(x0_0), vget_low_s8(y0_0));
+            let p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
+            let p0_2 = vmull_s8(vget_low_s8(x0_1), vget_low_s8(y0_1));
+            let p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
+
+            let p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
+            let p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));

            sumv0 = vmlaq_n_f32(
                sumv0,
@ -113,7 +118,10 @@ pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Res
            for i in (0..QK_K).step_by(16) {
                let xs = vld1q_s8(xs.add(i));
                let ys = vld1q_s8(ys.add(i));
-                let xy = vdotq_s32(xs, ys);
+                let xy_lo = vmull_s8(vget_low_s8(xs), vget_low_s8(ys));
+                let xy_up = vmull_s8(vget_high_s8(xs), vget_high_s8(ys));
+
+                let xy = vaddq_s32(vpaddlq_s16(xy_lo), vpaddlq_s16(xy_up));
                sum_i = vaddq_s32(sum_i, xy)
            }
            sumf += vaddvq_s32(sum_i) as f32 * scale
@ -183,16 +191,30 @@ pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Res
                let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.2, m4b), q6h_2));
                let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.3, m4b), q6h_3));

-                let p0 = vdotq_s32(q6bytes_0, q8bytes.0);
-                let p1 = vdotq_s32(q6bytes_1, q8bytes.1);
+                // TODO: dotprod
+
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q6bytes_0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q6bytes_1), vget_high_s8(q8bytes.1)),
+                );
                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p0) * scale0 + vaddvq_s32(p1) * scale1;
+                isum += vaddvq_s16(p0) as i32 * scale0 + vaddvq_s16(p1) as i32 * scale1;
                scale = scale.add(2);

-                let p2 = vdotq_s32(q6bytes_2, q8bytes.2);
-                let p3 = vdotq_s32(q6bytes_3, q8bytes.3);
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_2), vget_low_s8(q8bytes.2)),
+                    vmull_s8(vget_high_s8(q6bytes_2), vget_high_s8(q8bytes.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_3), vget_low_s8(q8bytes.3)),
+                    vmull_s8(vget_high_s8(q6bytes_3), vget_high_s8(q8bytes.3)),
+                );
                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p2) * scale0 + vaddvq_s32(p3) * scale1;
+                isum += vaddvq_s16(p2) as i32 * scale0 + vaddvq_s16(p3) as i32 * scale1;
                scale = scale.add(2);

                let q8bytes = vld1q_s8_x4(q8);
@ -212,16 +234,29 @@ pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Res
                let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.2, 4), q6h_2));
                let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.3, 4), q6h_3));

-                let p0 = vdotq_s32(q6bytes_0, q8bytes.0);
-                let p1 = vdotq_s32(q6bytes_1, q8bytes.1);
+                // TODO: dotprod case.
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q6bytes_0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q6bytes_1), vget_high_s8(q8bytes.1)),
+                );
                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p0) * scale0 + vaddvq_s32(p1) * scale1;
+                isum += vaddvq_s16(p0) as i32 * scale0 + vaddvq_s16(p1) as i32 * scale1;
                scale = scale.add(2);

-                let p2 = vdotq_s32(q6bytes_2, q8bytes.2);
-                let p3 = vdotq_s32(q6bytes_3, q8bytes.3);
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_2), vget_low_s8(q8bytes.2)),
+                    vmull_s8(vget_high_s8(q6bytes_2), vget_high_s8(q8bytes.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_3), vget_low_s8(q8bytes.3)),
+                    vmull_s8(vget_high_s8(q6bytes_3), vget_high_s8(q8bytes.3)),
+                );
                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p2) * scale0 + vaddvq_s32(p3) * scale1;
+                isum += vaddvq_s16(p2) as i32 * scale0 + vaddvq_s16(p3) as i32 * scale1;
                scale = scale.add(2);
            }
            sum += d_all * y.d * ((isum - 32 * isum_mins) as f32);
@ -298,14 +333,28 @@ pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Res
                let q5bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.0, 4), q5h_2));
                let q5bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.1, 4), q5h_3));

-                let p0 = vdotq_s32(q5bytes_0, q8bytes.0);
-                let p1 = vdotq_s32(q5bytes_1, q8bytes.1);
-                sumi += vaddvq_s32(vaddq_s32(p0, p1)) * *scales as i32;
+                // TODO: dotprod
+
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q5bytes_0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q5bytes_1), vget_high_s8(q8bytes.1)),
+                );
+                sumi += vaddvq_s16(vaddq_s16(p0, p1)) as i32 * *scales as i32;
                scales = scales.add(1);

-                let p2 = vdotq_s32(q5bytes_2, q8bytes.2);
-                let p3 = vdotq_s32(q5bytes_3, q8bytes.3);
-                sumi += vaddvq_s32(vaddq_s32(p2, p3)) * *scales as i32;
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_2), vget_low_s8(q8bytes.2)),
+                    vmull_s8(vget_high_s8(q5bytes_2), vget_high_s8(q8bytes.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_3), vget_low_s8(q8bytes.3)),
+                    vmull_s8(vget_high_s8(q5bytes_3), vget_high_s8(q8bytes.3)),
+                );
+                sumi += vaddvq_s16(vaddq_s16(p2, p3)) as i32 * *scales as i32;
                scales = scales.add(1);
            }
            sumf += d * sumi as f32 - dmin * sumi_mins as f32;
@ -368,15 +417,22 @@ pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Res
            for j in 0..QK_K / 64 {
                let q4bits = vld1q_u8_x2(q4);
                q4 = q4.add(32);
+                // TODO: dotprod
                let q8bytes = vld1q_s8_x2(q8);
                q8 = q8.add(32);
                let q4bytes = int8x16x2_t(
                    vreinterpretq_s8_u8(vandq_u8(q4bits.0, m4b)),
                    vreinterpretq_s8_u8(vandq_u8(q4bits.1, m4b)),
                );
-                let p0 = vdotq_s32(q4bytes.0, q8bytes.0);
-                let p1 = vdotq_s32(q4bytes.1, q8bytes.1);
-                sumi1 += vaddvq_s32(vaddq_s32(p0, p1)) * scales[2 * j] as i32;
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q4bytes.0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q4bytes.1), vget_high_s8(q8bytes.1)),
+                );
+                sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) as i32 * scales[2 * j] as i32;

                let q8bytes = vld1q_s8_x2(q8);
                q8 = q8.add(32);
@ -384,9 +440,15 @@ pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Res
                    vreinterpretq_s8_u8(vshrq_n_u8(q4bits.0, 4)),
                    vreinterpretq_s8_u8(vshrq_n_u8(q4bits.1, 4)),
                );
-                let p2 = vdotq_s32(q4bytes.0, q8bytes.0);
-                let p3 = vdotq_s32(q4bytes.1, q8bytes.1);
-                sumi2 += vaddvq_s32(vaddq_s32(p2, p3)) * scales[2 * j + 1] as i32;
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q4bytes.0), vget_high_s8(q8bytes.0)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q4bytes.1), vget_high_s8(q8bytes.1)),
+                );
+                sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) as i32 * scales[2 * j + 1] as i32;
            }
            sumf += d * (sumi1 + sumi2) as f32;
        }
@ -464,14 +526,27 @@ pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Res
                    vreinterpretq_s8_u8(q3h_3),
                );

-                let p0 = vdotq_s32(q3bytes_0, q8bytes_1.0);
-                let p1 = vdotq_s32(q3bytes_1, q8bytes_1.1);
-                let p2 = vdotq_s32(q3bytes_2, q8bytes_1.2);
-                let p3 = vdotq_s32(q3bytes_3, q8bytes_1.3);
-                isum += vaddvq_s32(p0) * *scale as i32
-                    + vaddvq_s32(p1) * *scale.add(1) as i32
-                    + vaddvq_s32(p2) * *scale.add(2) as i32
-                    + vaddvq_s32(p3) * *scale.add(3) as i32;
+                // TODO: dotprod
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_0), vget_low_s8(q8bytes_1.0)),
+                    vmull_s8(vget_high_s8(q3bytes_0), vget_high_s8(q8bytes_1.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_1), vget_low_s8(q8bytes_1.1)),
+                    vmull_s8(vget_high_s8(q3bytes_1), vget_high_s8(q8bytes_1.1)),
+                );
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_2), vget_low_s8(q8bytes_1.2)),
+                    vmull_s8(vget_high_s8(q3bytes_2), vget_high_s8(q8bytes_1.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_3), vget_low_s8(q8bytes_1.3)),
+                    vmull_s8(vget_high_s8(q3bytes_3), vget_high_s8(q8bytes_1.3)),
+                );
+                isum += vaddvq_s16(p0) as i32 * *scale as i32
+                    + vaddvq_s16(p1) as i32 * *scale.add(1) as i32
+                    + vaddvq_s16(p2) as i32 * *scale.add(2) as i32
+                    + vaddvq_s16(p3) as i32 * *scale.add(3) as i32;
                scale = scale.add(4);

                let q3h_0 = vbicq_u8(m2, qhbits.0);
@ -496,14 +571,27 @@ pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Res
                    vreinterpretq_s8_u8(q3h_3),
                );

-                let p0 = vdotq_s32(q3bytes_0, q8bytes_2.0);
-                let p1 = vdotq_s32(q3bytes_1, q8bytes_2.1);
-                let p2 = vdotq_s32(q3bytes_2, q8bytes_2.2);
-                let p3 = vdotq_s32(q3bytes_3, q8bytes_2.3);
-                isum += vaddvq_s32(p0) * *scale as i32
-                    + vaddvq_s32(p1) * *scale.add(1) as i32
-                    + vaddvq_s32(p2) * *scale.add(2) as i32
-                    + vaddvq_s32(p3) * *scale.add(3) as i32;
+                // TODO: dotprod
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_0), vget_low_s8(q8bytes_2.0)),
+                    vmull_s8(vget_high_s8(q3bytes_0), vget_high_s8(q8bytes_2.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_1), vget_low_s8(q8bytes_2.1)),
+                    vmull_s8(vget_high_s8(q3bytes_1), vget_high_s8(q8bytes_2.1)),
+                );
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_2), vget_low_s8(q8bytes_2.2)),
+                    vmull_s8(vget_high_s8(q3bytes_2), vget_high_s8(q8bytes_2.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_3), vget_low_s8(q8bytes_2.3)),
+                    vmull_s8(vget_high_s8(q3bytes_3), vget_high_s8(q8bytes_2.3)),
+                );
+                isum += vaddvq_s16(p0) as i32 * *scale as i32
+                    + vaddvq_s16(p1) as i32 * *scale.add(1) as i32
+                    + vaddvq_s16(p2) as i32 * *scale.add(2) as i32
+                    + vaddvq_s16(p3) as i32 * *scale.add(3) as i32;
                scale = scale.add(4);

                if j == 0 {
@ -561,6 +649,7 @@ pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]) -> Res
            let mut is = 0usize;

            // TODO: dotprod
+
            for _j in 0..QK_K / 128 {
                let q2bits = vld1q_u8_x2(q2);
                q2 = q2.add(32);
@ -607,7 +696,14 @@ unsafe fn multiply_accum_with_scale(
    q2bytes: int8x16x2_t,
    q8bytes: int8x16x2_t,
 ) -> i32 {
-    let p1 = vdotq_s32(q2bytes.0, q8bytes.0);
-    let p2 = vdotq_s32(q2bytes.1, q8bytes.1);
-    vaddvq_s32(p1) * aux[is + index] as i32 + vaddvq_s32(p2) * aux[is + 1 + index] as i32
+    let p1 = vaddq_s16(
+        vmull_s8(vget_low_s8(q2bytes.0), vget_low_s8(q8bytes.0)),
+        vmull_s8(vget_high_s8(q2bytes.0), vget_high_s8(q8bytes.0)),
+    );
+    let p2 = vaddq_s16(
+        vmull_s8(vget_low_s8(q2bytes.1), vget_low_s8(q8bytes.1)),
+        vmull_s8(vget_high_s8(q2bytes.1), vget_high_s8(q8bytes.1)),
+    );
+    vaddvq_s16(p1) as i32 * aux[is + index] as i32
+        + vaddvq_s16(p2) as i32 * aux[is + 1 + index] as i32
 }
--- a/candle-core/src/shape.rs
+++ b/candle-core/src/shape.rs
@ -478,6 +478,23 @@ extract_dims!(
    (usize, usize, usize, usize, usize)
 );

+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn stride() {
+        let shape = Shape::from(());
+        assert_eq!(shape.stride_contiguous(), Vec::<usize>::new());
+        let shape = Shape::from(42);
+        assert_eq!(shape.stride_contiguous(), [1]);
+        let shape = Shape::from((42, 1337));
+        assert_eq!(shape.stride_contiguous(), [1337, 1]);
+        let shape = Shape::from((299, 792, 458));
+        assert_eq!(shape.stride_contiguous(), [458 * 792, 458, 1]);
+    }
+}
+
 pub trait ShapeWithOneHole {
    fn into_shape(self, el_count: usize) -> Result<Shape>;
 }
@ -610,20 +627,3 @@ impl ShapeWithOneHole for (usize, usize, usize, usize, ()) {
        Ok((d1, d2, d3, d4, d).into())
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn stride() {
-        let shape = Shape::from(());
-        assert_eq!(shape.stride_contiguous(), Vec::<usize>::new());
-        let shape = Shape::from(42);
-        assert_eq!(shape.stride_contiguous(), [1]);
-        let shape = Shape::from((42, 1337));
-        assert_eq!(shape.stride_contiguous(), [1337, 1]);
-        let shape = Shape::from((299, 792, 458));
-        assert_eq!(shape.stride_contiguous(), [458 * 792, 458, 1]);
-    }
-}
--- a/candle-core/src/storage.rs
+++ b/candle-core/src/storage.rs
@ -1,7 +1,6 @@
 use crate::backend::BackendStorage;
-use crate::op::{self, CmpOp, ReduceOp};
+use crate::op::{self, CmpOp, CustomOp1, CustomOp2, CustomOp3, ReduceOp};
 use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage, Result, Shape};
-use crate::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3};

 // We do not want to implement Clone on Storage as cloning may fail because of
 // out of memory. Instead try_clone should be used.
@ -253,51 +252,6 @@ impl Storage {
        }
    }

-    pub(crate) fn inplace_op1(&mut self, l: &Layout, c: &dyn InplaceOp1) -> Result<()> {
-        match self {
-            Self::Cpu(storage) => c.cpu_fwd(storage, l),
-            Self::Cuda(storage) => c.cuda_fwd(storage, l),
-            Self::Metal(storage) => c.metal_fwd(storage, l),
-        }
-    }
-
-    pub(crate) fn inplace_op2(
-        &mut self,
-        l1: &Layout,
-        t2: &Self,
-        l2: &Layout,
-        c: &dyn InplaceOp2,
-    ) -> Result<()> {
-        self.same_device(t2, c.name())?;
-        match (self, t2) {
-            (Self::Cpu(s1), Self::Cpu(s2)) => c.cpu_fwd(s1, l1, s2, l2),
-            (Self::Cuda(s1), Self::Cuda(s2)) => c.cuda_fwd(s1, l1, s2, l2),
-            (Self::Metal(s1), Self::Metal(s2)) => c.metal_fwd(s1, l1, s2, l2),
-            _ => unreachable!(),
-        }
-    }
-
-    pub(crate) fn inplace_op3(
-        &mut self,
-        l1: &Layout,
-        t2: &Self,
-        l2: &Layout,
-        t3: &Self,
-        l3: &Layout,
-        c: &dyn InplaceOp3,
-    ) -> Result<()> {
-        self.same_device(t2, c.name())?;
-        self.same_device(t3, c.name())?;
-        match (self, t2, t3) {
-            (Self::Cpu(s1), Self::Cpu(s2), Self::Cpu(s3)) => c.cpu_fwd(s1, l1, s2, l2, s3, l3),
-            (Self::Cuda(s1), Self::Cuda(s2), Self::Cuda(s3)) => c.cuda_fwd(s1, l1, s2, l2, s3, l3),
-            (Self::Metal(s1), Self::Metal(s2), Self::Metal(s3)) => {
-                c.metal_fwd(s1, l1, s2, l2, s3, l3)
-            }
-            _ => unreachable!(),
-        }
-    }
-
    pub(crate) fn unary_impl<B: op::UnaryOpT>(&self, layout: &Layout) -> Result<Self> {
        match self {
            Storage::Cpu(storage) => {
@ -398,10 +352,6 @@ impl Storage {
                let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
                Ok(Self::Cuda(s))
            }
-            (Storage::Metal(inp), Storage::Metal(kernel)) => {
-                let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
-                Ok(Self::Metal(s))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -747,32 +697,4 @@ impl Storage {
            .bt()),
        }
    }
-
-    #[allow(clippy::too_many_arguments)]
-    pub(crate) fn copy2d(
-        &self,
-        dst: &mut Self,
-        d1: usize,
-        d2: usize,
-        src_s: usize,
-        dst_s: usize,
-        src_o: usize,
-        dst_o: usize,
-    ) -> Result<()> {
-        match (self, dst) {
-            (Self::Cpu(src), Self::Cpu(dst)) => src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o),
-            (Self::Cuda(src), Self::Cuda(dst)) => {
-                Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?)
-            }
-            (Self::Metal(src), Self::Metal(dst)) => {
-                Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?)
-            }
-            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
-                lhs: lhs.device().location(),
-                rhs: rhs.device().location(),
-                op: "copy2d",
-            }
-            .bt()),
-        }
-    }
 }
--- a/candle-core/src/tensor.rs
+++ b/candle-core/src/tensor.rs
@ -1,7 +1,9 @@
-//! Tensors are N-dimensional matrixes of elements using a single data type.
+//! Tensors are N-dimenional matrixes of elements using a single data type.
 #![allow(clippy::redundant_closure_call)]
 use crate::backend::{BackendDevice, BackendStorage};
-use crate::op::{BackpropOp, BinaryOp, CmpOp, Op, ReduceOp, UnaryOp};
+use crate::op::{
+    BackpropOp, BinaryOp, CmpOp, CustomOp1, CustomOp2, CustomOp3, Op, ReduceOp, UnaryOp,
+};
 use crate::scalar::TensorOrScalar;
 use crate::shape::{Dim, Dims};
 use crate::{bail, storage::Storage, DType, Device, Error, Layout, Result, Shape};
@ -359,16 +361,6 @@ impl Tensor {
        Self::new_impl(array, shape, device, false)
    }

-    /// Returns a new tensor with all the elements having the same specified value. Note that
-    /// the tensor is not contiguous so you would have to call `.contiguous()` on it if needed.
-    pub fn full<D: crate::WithDType, S: Into<Shape>>(
-        value: D,
-        shape: S,
-        device: &Device,
-    ) -> Result<Self> {
-        Self::from_vec_impl(vec![value], (), device, false)?.broadcast_as(shape)
-    }
-
    /// Creates a new 1D tensor from an iterator.
    pub fn from_iter<D: crate::WithDType>(
        iter: impl IntoIterator<Item = D>,
@ -394,7 +386,7 @@ impl Tensor {
        device: &Device,
    ) -> Result<Self> {
        if D::is_zero(&step) {
-            bail!("step cannot be zero")
+            crate::bail!("step cannot be zero")
        }
        let mut data = vec![];
        let mut current = start;
@ -506,7 +498,6 @@ impl Tensor {
    unary_op!(gelu_erf, GeluErf);
    unary_op!(erf, Erf);
    unary_op!(relu, Relu);
-    unary_op!(silu, Silu);
    unary_op!(ceil, Ceil);
    unary_op!(floor, Floor);
    unary_op!(round, Round);
@ -664,7 +655,7 @@ impl Tensor {
        Ok(from_storage(storage, self.shape(), op, false))
    }

-    pub(crate) fn check_dim(&self, dim: usize, op: &'static str) -> Result<()> {
+    fn check_dim(&self, dim: usize, op: &'static str) -> Result<()> {
        if dim >= self.dims().len() {
            Err(Error::DimOutOfRange {
                shape: self.shape().clone(),
@ -678,7 +669,7 @@ impl Tensor {
    }

    /// Split a tensor into the specified number of chunks, this may return less chunks than
-    /// specified.
+    /// specificed.
    pub fn chunk<D: Dim>(&self, chunks: usize, dim: D) -> Result<Vec<Self>> {
        let dim = dim.to_index(self.shape(), "chunk")?;
        let size = self.dim(dim)?;
@ -803,35 +794,6 @@ impl Tensor {
        }
    }

-    /// Roll the tensor input along the given dimension.
-    /// Elements that are shifted beyond the last position are re-introduced at the first position.
-    ///
-    /// ```rust
-    /// # use candle_core::{Tensor, Device};
-    /// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
-    /// let tensor = tensor.roll(1, 0)?;
-    /// assert_eq!(tensor.to_vec2::<f32>()?, &[[4., 5.], [0., 1.], [2., 3.]]);
-    /// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
-    /// let tensor = tensor.roll(-1, 0)?;
-    /// assert_eq!(tensor.to_vec2::<f32>()?, &[[2., 3.], [4., 5.], [0., 1.]]);
-    /// # Ok::<(), candle_core::Error>(())
-    /// ```
-    pub fn roll<D>(&self, shift: i32, dim: D) -> Result<Self>
-    where
-        D: Dim + Clone,
-    {
-        let dim = dim.to_index(self.shape(), "roll")?;
-        let dim_size = self.dim(dim)?;
-        let shift = shift.rem_euclid(dim_size as i32) as usize;
-        if shift == 0 {
-            Ok(self.clone())
-        } else {
-            let a = self.narrow(dim, 0, dim_size - shift)?;
-            let b = self.narrow(dim, dim_size - shift, shift)?;
-            Tensor::cat(&[&b, &a], dim)
-        }
-    }
-
    /// Returns the sum of all elements in the input tensor. The sum is performed over all the
    /// input dimensions.
    ///
@ -1013,7 +975,7 @@ impl Tensor {
    /// tensor also has three dimensions, `(batch, channels, target_size)`.
    pub fn interpolate1d(&self, target_size: usize) -> Result<Self> {
        let (n, c, _l) = self.dims3()?;
-        let op = BackpropOp::new1(self, |arg| Op::UpsampleNearest1D { arg, target_size });
+        let op = BackpropOp::new1(self, Op::UpsampleNearest1D);
        let storage = self
            .storage()
            .upsample_nearest1d(self.layout(), target_size)?;
@ -1032,11 +994,7 @@ impl Tensor {
    /// tensor also has four dimensions, `(batch, channels, target_h, target_w)`.
    pub fn interpolate2d(&self, target_h: usize, target_w: usize) -> Result<Self> {
        let (n, c, _h, _w) = self.dims4()?;
-        let op = BackpropOp::new1(self, |arg| Op::UpsampleNearest2D {
-            arg,
-            target_h,
-            target_w,
-        });
+        let op = BackpropOp::new1(self, Op::UpsampleNearest2D);
        let storage = self
            .storage()
            .upsample_nearest2d(self.layout(), target_h, target_w)?;
@ -1069,9 +1027,6 @@ impl Tensor {
        let kernel_size = kernel_size.to_usize2();
        let stride = stride.to_usize2();
        let (n, c, h, w) = self.dims4()?;
-        if h < kernel_size.0 || w < kernel_size.1 {
-            bail!("kernel-size {kernel_size:?} is larger than the input size {h},{w}")
-        }
        // https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html#torch.nn.AvgPool2d
        let h_out = (h - kernel_size.0) / stride.0 + 1;
        let w_out = (w - kernel_size.1) / stride.1 + 1;
@ -1107,9 +1062,6 @@ impl Tensor {
        let kernel_size = kernel_size.to_usize2();
        let stride = stride.to_usize2();
        let (n, c, h, w) = self.dims4()?;
-        if h < kernel_size.0 || w < kernel_size.1 {
-            bail!("kernel-size {kernel_size:?} is larger than the input size {h},{w}")
-        }
        // https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html#torch.nn.MaxPool2d
        let h_out = (h - kernel_size.0) / stride.0 + 1;
        let w_out = (w - kernel_size.1) / stride.1 + 1;
@ -1349,7 +1301,7 @@ impl Tensor {
            }
            .bt())?
        }
-        let mut storage = unsafe { self.device().alloc_uninit(self.shape(), self.dtype())? };
+        let mut storage = self.device().zeros(self.shape(), self.dtype())?;
        self.storage()
            .copy_strided_src(&mut storage, 0, self.layout())?;
        let offset = start * src.dims()[1..].iter().product::<usize>();
@ -1832,7 +1784,7 @@ impl Tensor {
        let is_permutation =
            dims.len() == self.rank() && (0..dims.len()).all(|i| dims.contains(&i));
        if !is_permutation {
-            bail!(
+            crate::bail!(
                "dimension mismatch in permute, tensor {:?}, dims: {:?}",
                self.dims(),
                dims
@ -1881,9 +1833,9 @@ impl Tensor {
    /// this new node. The storage of this tensor is shared with the initial tensor.
    ///
    /// If the tensor is already detached from the computation graph, the same tensor is returned.
-    pub fn detach(&self) -> Tensor {
+    pub fn detach(&self) -> Result<Tensor> {
        if self.op.is_none() && !self.is_variable {
-            self.clone()
+            Ok(self.clone())
        } else {
            let tensor_ = Tensor_ {
                id: TensorId::new(),
@ -1894,7 +1846,7 @@ impl Tensor {
                dtype: self.dtype,
                device: self.device.clone(),
            };
-            Tensor(Arc::new(tensor_))
+            Ok(Tensor(Arc::new(tensor_)))
        }
    }

@ -1911,7 +1863,10 @@ impl Tensor {
                    Storage::Metal(metal.storage_from_cpu_storage(storage)?)
                }
                (Storage::Cuda(storage), Device::Cpu) => Storage::Cpu(storage.to_cpu_storage()?),
-                (Storage::Metal(storage), Device::Cpu) => Storage::Cpu(storage.to_cpu_storage()?),
+                (Storage::Metal(storage), Device::Cpu) => {
+                    println!("{storage:?} - {:?}", storage.to_cpu_storage()?);
+                    Storage::Cpu(storage.to_cpu_storage()?)
+                }
                (Storage::Cuda(storage), Device::Cuda(cuda)) => {
                    // TODO: Avoid passing through the cpu storage here, especially if the gpu ids
                    // are the same.
@ -1999,7 +1954,7 @@ impl Tensor {
            Ok(self.clone())
        } else {
            let shape = self.shape();
-            let mut storage = unsafe { self.device().alloc_uninit(shape, self.dtype())? };
+            let mut storage = self.device().zeros(shape, self.dtype())?;
            self.storage()
                .copy_strided_src(&mut storage, 0, self.layout())?;
            let op = BackpropOp::new1(self, Op::Copy);
@ -2011,7 +1966,7 @@ impl Tensor {
    /// copied.
    pub(crate) fn make_var(&self) -> Result<Tensor> {
        let shape = self.shape().clone();
-        let mut storage = unsafe { self.device().alloc_uninit(&shape, self.dtype())? };
+        let mut storage = self.device().zeros(&shape, self.dtype())?;
        self.storage()
            .copy_strided_src(&mut storage, 0, self.layout())?;
        Ok(from_storage(storage, shape, BackpropOp::none(), true))
@ -2064,7 +2019,7 @@ impl Tensor {
            };
            Ok(Tensor(Arc::new(tensor_)))
        } else {
-            let mut storage = unsafe { self.device().alloc_uninit(&shape, self.dtype())? };
+            let mut storage = self.device().zeros(&shape, self.dtype())?;
            self.storage()
                .copy_strided_src(&mut storage, 0, self.layout())?;
            Ok(from_storage(storage, shape, op, false))
@ -2091,19 +2046,8 @@ impl Tensor {
        let dim = dim.to_index(self.shape(), "squeeze")?;
        if dims[dim] == 1 {
            let mut dims = dims.to_vec();
-            let mut strides = self.stride().to_vec();
            dims.remove(dim);
-            strides.remove(dim);
-            let tensor_ = Tensor_ {
-                id: TensorId::new(),
-                storage: self.storage.clone(),
-                layout: Layout::new(dims.into(), strides, self.layout.start_offset()),
-                op: BackpropOp::new1(self, Op::Reshape),
-                is_variable: false,
-                dtype: self.dtype,
-                device: self.device.clone(),
-            };
-            Ok(Tensor(Arc::new(tensor_)))
+            self.reshape(dims)
        } else {
            Ok(self.clone())
        }
@ -2124,24 +2068,10 @@ impl Tensor {
    /// ```
    pub fn unsqueeze<D: Dim>(&self, dim: D) -> Result<Self> {
        let mut dims = self.dims().to_vec();
-        let mut strides = self.stride().to_vec();
        let dim = dim.to_index_plus_one(self.shape(), "unsqueeze")?;
        // Cannot panic because to_index_plus_one already checks dimensions
        dims.insert(dim, 1);
-        // Any stride would work here, but we pick one so as to maximize the probability to remain
-        // C contiguous.
-        let stride = if dim < strides.len() { strides[dim] } else { 1 };
-        strides.insert(dim, stride);
-        let tensor_ = Tensor_ {
-            id: TensorId::new(),
-            storage: self.storage.clone(),
-            layout: Layout::new(dims.into(), strides, self.layout.start_offset()),
-            op: BackpropOp::new1(self, Op::Reshape),
-            is_variable: false,
-            dtype: self.dtype,
-            device: self.device.clone(),
-        };
-        Ok(Tensor(Arc::new(tensor_)))
+        self.reshape(dims)
    }

    /// Stacks two or more tensors along a particular dimension.
@ -2172,6 +2102,152 @@ impl Tensor {
        Self::cat(&args, dim)
    }

+    /// Concatenates two or more tensors along a particular dimension.
+    ///
+    /// All tensors must of the same rank, and the output will have
+    /// the same rank
+    ///
+    /// ```rust
+    /// # use candle_core::{Tensor, DType, Device};
+    /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
+    /// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
+    ///
+    /// let c = Tensor::cat(&[&a, &b], 0)?;
+    /// assert_eq!(c.shape().dims(), &[4, 3]);
+    ///
+    /// let c = Tensor::cat(&[&a, &b], 1)?;
+    /// assert_eq!(c.shape().dims(), &[2, 6]);
+    /// # Ok::<(), candle_core::Error>(())
+    /// ```
+    pub fn cat<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
+        if args.is_empty() {
+            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
+        }
+        let arg0 = args[0].as_ref();
+        if args.len() == 1 {
+            return Ok(arg0.clone());
+        }
+        let dim = dim.to_index(arg0.shape(), "cat")?;
+        for arg in args {
+            arg.as_ref().check_dim(dim, "cat")?;
+        }
+        for (arg_idx, arg) in args.iter().enumerate() {
+            let arg = arg.as_ref();
+            if arg0.rank() != arg.rank() {
+                Err(Error::UnexpectedNumberOfDims {
+                    expected: arg0.rank(),
+                    got: arg.rank(),
+                    shape: arg.shape().clone(),
+                }
+                .bt())?
+            }
+            for (dim_idx, (v1, v2)) in arg0
+                .shape()
+                .dims()
+                .iter()
+                .zip(arg.shape().dims().iter())
+                .enumerate()
+            {
+                if dim_idx != dim && v1 != v2 {
+                    Err(Error::ShapeMismatchCat {
+                        dim: dim_idx,
+                        first_shape: arg0.shape().clone(),
+                        n: arg_idx + 1,
+                        nth_shape: arg.shape().clone(),
+                    }
+                    .bt())?
+                }
+            }
+        }
+        if dim == 0 {
+            Self::cat0(args)
+        } else {
+            // TODO: Avoid these transpositions and have an implementation that works
+            // for dim != 0...
+            let args: Vec<Tensor> = args
+                .iter()
+                .map(|a| a.as_ref().transpose(0, dim))
+                .collect::<Result<Vec<_>>>()?;
+            let cat = Self::cat0(&args)?;
+            cat.transpose(0, dim)
+        }
+    }
+
+    fn cat0<A: AsRef<Tensor>>(args: &[A]) -> Result<Self> {
+        if args.is_empty() {
+            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
+        }
+        let arg0 = args[0].as_ref();
+        if args.len() == 1 {
+            return Ok(arg0.clone());
+        }
+        let rank = arg0.rank();
+        let device = arg0.device();
+        let dtype = arg0.dtype();
+        let first_dims = arg0.shape().dims();
+        let mut cat_dims = first_dims.to_vec();
+        cat_dims[0] = 0;
+        let mut offsets = vec![0usize];
+        for (arg_idx, arg) in args.iter().enumerate() {
+            let arg = arg.as_ref();
+            if arg.dtype() != dtype {
+                Err(Error::DTypeMismatchBinaryOp {
+                    lhs: dtype,
+                    rhs: arg.dtype(),
+                    op: "cat",
+                }
+                .bt())?
+            }
+            if arg.device().location() != device.location() {
+                Err(Error::DeviceMismatchBinaryOp {
+                    lhs: device.location(),
+                    rhs: arg.device().location(),
+                    op: "cat",
+                }
+                .bt())?
+            }
+            if rank != arg.rank() {
+                Err(Error::UnexpectedNumberOfDims {
+                    expected: rank,
+                    got: arg.rank(),
+                    shape: arg.shape().clone(),
+                }
+                .bt())?
+            }
+            for (dim_idx, (v1, v2)) in arg0
+                .shape()
+                .dims()
+                .iter()
+                .zip(arg.shape().dims().iter())
+                .enumerate()
+            {
+                if dim_idx == 0 {
+                    cat_dims[0] += v2;
+                }
+                if dim_idx != 0 && v1 != v2 {
+                    Err(Error::ShapeMismatchCat {
+                        dim: dim_idx,
+                        first_shape: arg0.shape().clone(),
+                        n: arg_idx + 1,
+                        nth_shape: arg.shape().clone(),
+                    }
+                    .bt())?
+                }
+            }
+            let next_offset = offsets.last().unwrap() + arg.elem_count();
+            offsets.push(next_offset);
+        }
+        let shape = Shape::from(cat_dims);
+        let op = BackpropOp::new(args, |args| Op::Cat(args, 0));
+        let mut storage = device.zeros(&shape, dtype)?;
+        for (arg, &offset) in args.iter().zip(offsets.iter()) {
+            let arg = arg.as_ref();
+            arg.storage()
+                .copy_strided_src(&mut storage, offset, arg.layout())?;
+        }
+        Ok(from_storage(storage, shape, op, false))
+    }
+
    /// Pad the input tensor using 0s along dimension `dim`. This adds `left` elements before the
    /// input tensor values and `right` elements after.
    pub fn pad_with_zeros<D: Dim>(&self, dim: D, left: usize, right: usize) -> Result<Self> {
@ -2206,7 +2282,7 @@ impl Tensor {
        if left == 0 && right == 0 {
            Ok(self.clone())
        } else if self.elem_count() == 0 {
-            bail!("cannot use pad_with_same on an empty tensor")
+            crate::bail!("cannot use pad_with_same on an empty tensor")
        } else if left == 0 {
            let dim = dim.to_index(self.shape(), "pad_with_same")?;
            let r = self.narrow(dim, self.dim(dim)? - 1, 1)?;
@ -2254,10 +2330,6 @@ impl Tensor {
        self.storage.read().unwrap()
    }

-    pub(crate) fn storage_mut(&self) -> std::sync::RwLockWriteGuard<'_, Storage> {
-        self.storage.write().unwrap()
-    }
-
    // If we extend the visibility of this function to be usable outside of this crate, we should
    // make it unsafe.
    pub(crate) fn storage_mut_and_layout(
@ -2279,141 +2351,112 @@ impl Tensor {
        std::ptr::eq(lhs, rhs)
    }

+    /// Applies a unary custom op without backward support
+    pub fn apply_op1_no_bwd<C: CustomOp1>(&self, c: &C) -> Result<Self> {
+        let (storage, shape) = self.storage().apply_op1(self.layout(), c)?;
+        Ok(from_storage(storage, shape, BackpropOp::none(), false))
+    }
+
+    /// Applies a binary custom op without backward support
+    pub fn apply_op2_no_bwd<C: CustomOp2>(&self, rhs: &Self, c: &C) -> Result<Self> {
+        let (storage, shape) =
+            self.storage()
+                .apply_op2(self.layout(), &rhs.storage(), rhs.layout(), c)?;
+        Ok(from_storage(storage, shape, BackpropOp::none(), false))
+    }
+
+    /// Applies a ternary custom op without backward support
+    pub fn apply_op3_no_bwd<C: CustomOp3>(&self, t2: &Self, t3: &Self, c: &C) -> Result<Self> {
+        let (storage, shape) = self.storage().apply_op3(
+            self.layout(),
+            &t2.storage(),
+            t2.layout(),
+            &t3.storage(),
+            t3.layout(),
+            c,
+        )?;
+        Ok(from_storage(storage, shape, BackpropOp::none(), false))
+    }
+
+    /// Applies a unary custom op.
+    pub fn apply_op1_arc(&self, c: Arc<Box<dyn CustomOp1 + Send + Sync>>) -> Result<Self> {
+        let (storage, shape) = self
+            .storage()
+            .apply_op1(self.layout(), c.as_ref().as_ref())?;
+        let op = BackpropOp::new1(self, |s| Op::CustomOp1(s, c.clone()));
+        Ok(from_storage(storage, shape, op, false))
+    }
+
+    pub fn apply_op1<C: 'static + CustomOp1 + Send + Sync>(&self, c: C) -> Result<Self> {
+        self.apply_op1_arc(Arc::new(Box::new(c)))
+    }
+
+    /// Applies a binary custom op.
+    pub fn apply_op2_arc(
+        &self,
+        rhs: &Self,
+        c: Arc<Box<dyn CustomOp2 + Send + Sync>>,
+    ) -> Result<Self> {
+        let (storage, shape) = self.storage().apply_op2(
+            self.layout(),
+            &rhs.storage(),
+            rhs.layout(),
+            c.as_ref().as_ref(),
+        )?;
+        let op = BackpropOp::new2(self, rhs, |t1, t2| Op::CustomOp2(t1, t2, c.clone()));
+        Ok(from_storage(storage, shape, op, false))
+    }
+
+    pub fn apply_op2<C: 'static + CustomOp2 + Send + Sync>(&self, r: &Self, c: C) -> Result<Self> {
+        self.apply_op2_arc(r, Arc::new(Box::new(c)))
+    }
+
+    /// Applies a ternary custom op.
+    pub fn apply_op3_arc(
+        &self,
+        t2: &Self,
+        t3: &Self,
+        c: Arc<Box<dyn CustomOp3 + Send + Sync>>,
+    ) -> Result<Self> {
+        let (storage, shape) = self.storage().apply_op3(
+            self.layout(),
+            &t2.storage(),
+            t2.layout(),
+            &t3.storage(),
+            t3.layout(),
+            c.as_ref().as_ref(),
+        )?;
+        let op = BackpropOp::new3(self, t2, t3, |t1, t2, t3| {
+            Op::CustomOp3(t1, t2, t3, c.clone())
+        });
+        Ok(from_storage(storage, shape, op, false))
+    }
+
+    pub fn apply_op3<C: 'static + CustomOp3 + Send + Sync>(
+        &self,
+        t2: &Self,
+        t3: &Self,
+        c: C,
+    ) -> Result<Self> {
+        self.apply_op3_arc(t2, t3, Arc::new(Box::new(c)))
+    }
+
    /// Normalize a 'relative' axis value: positive values are kept, negative
    /// values means counting the dimensions from the back.
    pub fn normalize_axis(&self, axis: i64) -> Result<usize> {
        let rank = self.rank() as i64;
        if rank <= axis {
-            bail!("axis {axis} is too large, tensor rank {rank}")
+            crate::bail!("axis {axis} is too large, tensor rank {rank}")
        } else if 0 <= axis {
            Ok(axis as usize)
        } else {
            let naxis = rank + axis;
            if naxis < 0 {
-                bail!("axis {axis} is too small, tensor rank {rank}")
+                crate::bail!("axis {axis} is too small, tensor rank {rank}")
            }
            Ok(naxis as usize)
        }
    }
-
-    /// Returns a lower triangular matrix of ones of size n by n.
-    pub fn tril2(n: usize, dtype: DType, device: &Device) -> Result<Self> {
-        let t = Tensor::arange(0u32, n as u32, device)?;
-        let t1 = t.reshape((1, n))?.broadcast_as((n, n))?;
-        let t2 = t.reshape((n, 1))?.broadcast_as((n, n))?;
-        t1.le(&t2)?.to_dtype(dtype)
-    }
-
-    /// Returns an upper triangular matrix of ones of size n by n.
-    pub fn triu2(n: usize, dtype: DType, device: &Device) -> Result<Self> {
-        let t = Tensor::arange(0u32, n as u32, device)?;
-        let t1 = t.reshape((1, n))?.broadcast_as((n, n))?;
-        let t2 = t.reshape((n, 1))?.broadcast_as((n, n))?;
-        t1.ge(&t2)?.to_dtype(dtype)
-    }
-
-    /// Returns a matrix with a diagonal of ones of size n by n.
-    pub fn eye(n: usize, dtype: DType, device: &Device) -> Result<Self> {
-        let t = Tensor::arange(0u32, n as u32, device)?;
-        let t1 = t.reshape((1, n))?.broadcast_as((n, n))?;
-        let t2 = t.reshape((n, 1))?.broadcast_as((n, n))?;
-        t1.eq(&t2)?.to_dtype(dtype)
-    }
-
-    /// Returns the cumulative sum of elements of the input tensor summed over the specified
-    /// dimension.
-    ///
-    /// This operation is most efficient when dim is the last dimension of the tensor.
-    pub fn cumsum<D: Dim>(&self, dim: D) -> Result<Self> {
-        let dim = dim.to_index(self.shape(), "cumsum")?;
-        let rank = self.rank();
-        if rank == 0 {
-            return Ok(self.clone());
-        }
-        let n_axis = self.dim(dim)?;
-        let triu = Tensor::triu2(n_axis, self.dtype(), self.device())?;
-        if rank == 1 {
-            self.unsqueeze(0)?.matmul(&triu)?.squeeze(0)
-        } else {
-            let last = rank - 1;
-            let t = self.transpose(dim, last)?;
-            let t = t.broadcast_matmul(&triu)?;
-            t.transpose(dim, last)
-        }
-    }
-
-    /// Returns a copy of `self` where the values within `ranges` have been replaced with the
-    /// content of `src`.
-    pub fn slice_assign<D: std::ops::RangeBounds<usize>>(
-        &self,
-        ranges: &[D],
-        src: &Tensor,
-    ) -> Result<Self> {
-        let src_dims = src.dims();
-        let self_dims = self.dims();
-        if self_dims.len() != src_dims.len() {
-            bail!(
-                "slice-assign requires input with the same rank {} <> {}",
-                self_dims.len(),
-                src_dims.len()
-            )
-        }
-        if self_dims.len() != ranges.len() {
-            bail!(
-                "slice-assign requires input with the same rank as there are ranges {} <> {}",
-                self_dims.len(),
-                ranges.len()
-            )
-        }
-        let mut src = src.clone();
-        let mut mask = Self::ones(src.shape(), DType::U8, src.device())?;
-        for (i, range) in ranges.iter().enumerate() {
-            let start_included = match range.start_bound() {
-                std::ops::Bound::Unbounded => 0,
-                std::ops::Bound::Included(v) => *v,
-                std::ops::Bound::Excluded(v) => *v + 1,
-            };
-            let end_excluded = match range.end_bound() {
-                std::ops::Bound::Unbounded => self_dims[i],
-                std::ops::Bound::Included(v) => *v + 1,
-                std::ops::Bound::Excluded(v) => *v,
-            };
-            if end_excluded <= start_included {
-                bail!("slice-assign: empty range for dim {i}, {start_included} {end_excluded}")
-            }
-            if self_dims[i] < end_excluded {
-                bail!(
-                    "slice-assign: upper bound is out of range for dim {i}, {end_excluded} {}",
-                    self_dims[i]
-                )
-            }
-            if end_excluded - start_included != src_dims[i] {
-                bail!(
-                    "slice-assign: the range for dim {i} ({start_included}..{end_excluded}) does not match the size of src {}", src_dims[i]
-                )
-            }
-            src = src.pad_with_zeros(i, start_included, self_dims[i] - end_excluded)?;
-            mask = mask.pad_with_zeros(i, start_included, self_dims[i] - end_excluded)?
-        }
-        mask.where_cond(/* on_true= */ &src, /* on_false= */ self)
-    }
-
-    /// Returns log(sum(exp(tensor), dim)).
-    pub fn log_sum_exp<D: Dims>(&self, sum_dims: D) -> Result<Self> {
-        let exp = self.exp()?;
-        let sum = exp.sum(sum_dims)?;
-        sum.log()
-    }
-
-    /// Pointwise pow operation.
-    pub fn pow(&self, rhs: &Tensor) -> Result<Self> {
-        rhs.mul(&self.log()?)?.exp()
-    }
-
-    /// Broadcasting version of `pow`.
-    pub fn broadcast_pow(&self, rhs: &Tensor) -> Result<Self> {
-        rhs.broadcast_mul(&self.log()?)?.exp()
-    }
 }

 macro_rules! bin_trait {
--- a/candle-core/src/tensor_cat.rs
+++ b/candle-core/src/tensor_cat.rs
@ -1,240 +0,0 @@
-use crate::{shape::Dim, Error, Result, Shape, Tensor};
-
-impl Tensor {
-    /// Concatenates two or more tensors along a particular dimension.
-    ///
-    /// All tensors must of the same rank, and the output will have
-    /// the same rank
-    ///
-    /// ```rust
-    /// # use candle_core::{Tensor, DType, Device};
-    /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
-    /// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
-    ///
-    /// let c = Tensor::cat(&[&a, &b], 0)?;
-    /// assert_eq!(c.shape().dims(), &[4, 3]);
-    ///
-    /// let c = Tensor::cat(&[&a, &b], 1)?;
-    /// assert_eq!(c.shape().dims(), &[2, 6]);
-    /// # Ok::<(), candle_core::Error>(())
-    /// ```
-    pub fn cat<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
-        if args.is_empty() {
-            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
-        }
-        let arg0 = args[0].as_ref();
-        if args.len() == 1 {
-            return Ok(arg0.clone());
-        }
-        let dim = dim.to_index(arg0.shape(), "cat")?;
-        for arg in args {
-            arg.as_ref().check_dim(dim, "cat")?;
-        }
-        for (arg_idx, arg) in args.iter().enumerate() {
-            let arg = arg.as_ref();
-            if arg0.rank() != arg.rank() {
-                Err(Error::UnexpectedNumberOfDims {
-                    expected: arg0.rank(),
-                    got: arg.rank(),
-                    shape: arg.shape().clone(),
-                }
-                .bt())?
-            }
-            for (dim_idx, (v1, v2)) in arg0
-                .shape()
-                .dims()
-                .iter()
-                .zip(arg.shape().dims().iter())
-                .enumerate()
-            {
-                if dim_idx != dim && v1 != v2 {
-                    Err(Error::ShapeMismatchCat {
-                        dim: dim_idx,
-                        first_shape: arg0.shape().clone(),
-                        n: arg_idx + 1,
-                        nth_shape: arg.shape().clone(),
-                    }
-                    .bt())?
-                }
-            }
-        }
-        if dim == 0 {
-            Self::cat0(args)
-        } else {
-            let all_contiguous = args.iter().all(|v| v.as_ref().is_contiguous());
-            if all_contiguous {
-                Self::cat_contiguous(args, dim)
-            } else {
-                let args: Vec<Tensor> = args
-                    .iter()
-                    .map(|a| a.as_ref().transpose(0, dim))
-                    .collect::<Result<Vec<_>>>()?;
-                let cat = Self::cat0(&args)?;
-                cat.transpose(0, dim)
-            }
-        }
-    }
-
-    fn cat0<A: AsRef<Tensor>>(args: &[A]) -> Result<Self> {
-        if args.is_empty() {
-            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
-        }
-        let arg0 = args[0].as_ref();
-        if args.len() == 1 {
-            return Ok(arg0.clone());
-        }
-        let rank = arg0.rank();
-        let device = arg0.device();
-        let dtype = arg0.dtype();
-        let first_dims = arg0.shape().dims();
-        let mut cat_dims = first_dims.to_vec();
-        cat_dims[0] = 0;
-        let mut offsets = vec![0usize];
-        for (arg_idx, arg) in args.iter().enumerate() {
-            let arg = arg.as_ref();
-            if arg.dtype() != dtype {
-                Err(Error::DTypeMismatchBinaryOp {
-                    lhs: dtype,
-                    rhs: arg.dtype(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if arg.device().location() != device.location() {
-                Err(Error::DeviceMismatchBinaryOp {
-                    lhs: device.location(),
-                    rhs: arg.device().location(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if rank != arg.rank() {
-                Err(Error::UnexpectedNumberOfDims {
-                    expected: rank,
-                    got: arg.rank(),
-                    shape: arg.shape().clone(),
-                }
-                .bt())?
-            }
-            for (dim_idx, (v1, v2)) in arg0
-                .shape()
-                .dims()
-                .iter()
-                .zip(arg.shape().dims().iter())
-                .enumerate()
-            {
-                if dim_idx == 0 {
-                    cat_dims[0] += v2;
-                }
-                if dim_idx != 0 && v1 != v2 {
-                    Err(Error::ShapeMismatchCat {
-                        dim: dim_idx,
-                        first_shape: arg0.shape().clone(),
-                        n: arg_idx + 1,
-                        nth_shape: arg.shape().clone(),
-                    }
-                    .bt())?
-                }
-            }
-            let next_offset = offsets.last().unwrap() + arg.elem_count();
-            offsets.push(next_offset);
-        }
-        let shape = Shape::from(cat_dims);
-        let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, 0));
-        let mut storage = unsafe { device.alloc_uninit(&shape, dtype)? };
-        for (arg, &offset) in args.iter().zip(offsets.iter()) {
-            let arg = arg.as_ref();
-            arg.storage()
-                .copy_strided_src(&mut storage, offset, arg.layout())?;
-        }
-        Ok(crate::tensor::from_storage(storage, shape, op, false))
-    }
-
-    fn cat_contiguous<A: AsRef<Tensor>>(args: &[A], dim: usize) -> Result<Self> {
-        if args.is_empty() {
-            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
-        }
-        let arg0 = args[0].as_ref();
-        if args.len() == 1 {
-            return Ok(arg0.clone());
-        }
-        let rank = arg0.rank();
-        let device = arg0.device();
-        let dtype = arg0.dtype();
-        let first_dims = arg0.shape().dims();
-        let mut cat_dims = first_dims.to_vec();
-        cat_dims[dim] = 0;
-        for (arg_idx, arg) in args.iter().enumerate() {
-            let arg = arg.as_ref();
-            if arg.dtype() != dtype {
-                Err(Error::DTypeMismatchBinaryOp {
-                    lhs: dtype,
-                    rhs: arg.dtype(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if arg.device().location() != device.location() {
-                Err(Error::DeviceMismatchBinaryOp {
-                    lhs: device.location(),
-                    rhs: arg.device().location(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if rank != arg.rank() {
-                Err(Error::UnexpectedNumberOfDims {
-                    expected: rank,
-                    got: arg.rank(),
-                    shape: arg.shape().clone(),
-                }
-                .bt())?
-            }
-            for (dim_idx, (v1, v2)) in arg0
-                .shape()
-                .dims()
-                .iter()
-                .zip(arg.shape().dims().iter())
-                .enumerate()
-            {
-                if dim_idx == dim {
-                    cat_dims[dim] += v2;
-                }
-                if dim_idx != dim && v1 != v2 {
-                    Err(Error::ShapeMismatchCat {
-                        dim: dim_idx,
-                        first_shape: arg0.shape().clone(),
-                        n: arg_idx + 1,
-                        nth_shape: arg.shape().clone(),
-                    }
-                    .bt())?
-                }
-            }
-        }
-        let cat_target_dim_len = cat_dims[dim];
-        let block_size: usize = cat_dims.iter().skip(1 + dim).product();
-        let shape = Shape::from(cat_dims);
-        let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, dim));
-        let mut storage = unsafe { device.alloc_uninit(&shape, dtype)? };
-        let mut dst_o = 0;
-        for arg in args.iter() {
-            let arg = arg.as_ref();
-            let arg_dims = arg.shape().dims();
-            let d1: usize = arg_dims.iter().take(dim).product();
-            let d2 = block_size * arg_dims[dim];
-            let dst_s = block_size * cat_target_dim_len;
-            let src_o = arg.layout().start_offset();
-            arg.storage().copy2d(
-                &mut storage,
-                d1,
-                d2,
-                /* src_s */ d2,
-                dst_s,
-                src_o,
-                dst_o,
-            )?;
-            dst_o += d2;
-        }
-        Ok(crate::tensor::from_storage(storage, shape, op, false))
-    }
-}
--- a/candle-core/src/variable.rs
+++ b/candle-core/src/variable.rs
@ -107,10 +107,6 @@ impl Var {
        Ok(Self(inner))
    }

-    pub fn as_detached_tensor(&self) -> Tensor {
-        self.0.detach()
-    }
-
    pub fn as_tensor(&self) -> &Tensor {
        &self.0
    }
--- a/candle-core/tests/conv_tests.rs
+++ b/candle-core/tests/conv_tests.rs
@ -18,9 +18,6 @@ w_t = w.transpose(0, 1)
 res = torch.nn.functional.conv_transpose1d(t, w_t)
 print(res.shape)
 print(res)
-res = torch.nn.functional.conv_transpose1d(t, w_t, groups=2)
-print(res.shape)
-print(res)
 */
 fn conv1d(dev: &Device) -> Result<()> {
    let t = Tensor::new(
@ -53,11 +50,8 @@ fn conv1d(dev: &Device) -> Result<()> {
        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
        [2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352]
    );
-
-    let w = w.transpose(0, 1)?;
-    // The CPU kernels applied in the contiguous and non contiguous cases are different.
-    for w in [w.clone(), w.contiguous()?] {
-        let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 1)?;
+    if dev.is_cpu() {
+        let res = t.conv_transpose1d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
        assert_eq!(res.dims(), [1, 2, 7]);
        assert_eq!(
            test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
@ -66,17 +60,6 @@ fn conv1d(dev: &Device) -> Result<()> {
                4.7076, -5.9745, -0.8276, 1.621
            ],
        );
-        let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 2)?;
-        assert_eq!(res.dims(), [1, 4, 7]);
-        assert_eq!(
-            test_utils::to_vec2_round(&res.squeeze(0)?, 4)?,
-            [
-                [-1.5596, -1.8099, 2.0407, 4.8764, -0.1743, -0.735, -0.7819],
-                [0.7816, 3.8152, -0.5926, 2.2515, -5.1844, -0.3157, 1.4721],
-                [1.6295, 0.52, 6.2611, 0.7109, 2.6315, -1.8793, 0.7113],
-                [1.0949, 1.0166, 1.7464, 2.4561, -0.79, -0.5119, 0.1488]
-            ]
-        );
    }
    Ok(())
 }
@ -135,7 +118,7 @@ fn conv2d(dev: &Device) -> Result<()> {
            0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712,
            0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790,
            -0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006,
-            -0.8, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
+            -0.8000, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
        ],
        dev,
    )?;
@ -163,9 +146,7 @@ fn conv2d(dev: &Device) -> Result<()> {
            10.389, 3.6023, -4.2808, 0.2672, 5.3646, -5.2023, -2.1955, -9.4075
        ]
    );
-
    let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
-
    assert_eq!(res.dims(), [1, 2, 7, 7]);
    assert_eq!(
        test_utils::to_vec3_round(&res.i(0)?, 4)?,
@ -190,7 +171,6 @@ fn conv2d(dev: &Device) -> Result<()> {
            ]
        ]
    );
-
    // Dilations.
    let res = t.conv2d(&w, 0, 1, 2, 1)?;
    assert_eq!(res.dims(), [1, 2, 1, 1]);
@ -229,7 +209,6 @@ fn conv2d(dev: &Device) -> Result<()> {
            ]
        ]
    );
-
    Ok(())
 }

@ -276,13 +255,13 @@ fn conv2d_small(dev: &Device) -> Result<()> {
    assert_eq!(
        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
        [
-            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1640,
-            -0.0111, -0.1742, 0.0, 0.0, 0.0, 0.0, 2.6437, -2.0268, 1.1823, 0.0, 0.0, 0.0, 0.0,
-            3.2855, -1.0324, 0.2539, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-            0.0, 0.0, 0.0, 0.0
+            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
+            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1640, -0.0111, -0.1742, 0.0000, 0.0000,
+            0.0000, 0.0000, 2.6437, -2.0268, 1.1823, 0.0000, 0.0000, 0.0000, 0.0000, 3.2855,
+            -1.0324, 0.2539, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
+            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000
        ]
    );
-
    let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
    assert_eq!(res.dims(), [1, 1, 3, 3]);
    assert_eq!(
@ -384,7 +363,6 @@ print(w.grad.shape)
 print(w.grad[0])
 */
 fn conv2d_grad(dev: &Device) -> Result<()> {
-    // conv-transposes are not implemented for metal
    use candle_core::Var;
    let t = Var::from_slice(
        &[
@ -397,7 +375,7 @@ fn conv2d_grad(dev: &Device) -> Result<()> {
            0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712,
            0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790,
            -0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006,
-            -0.8, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
+            -0.8000, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
        ],
        (1, 4, 5, 5),
        dev,
@ -582,154 +560,6 @@ fn conv2d_grad(dev: &Device) -> Result<()> {
        ]
    );

-    // Conv Transpose 2d Test
-    //tested against following python
-
-    // import torch
-    // torch.manual_seed(4242)
-    // padding = 4
-    // outpadding = 2
-    // dilation = 3
-    // stride = 3
-    // input = torch.randn((1, 4, 7, 5), requires_grad=True)
-    // kernel = torch.randn((4, 2, 3, 5), requires_grad=True)
-    // print("input", input.flatten())
-    // print("kernel", kernel.flatten())
-    // res = torch.nn.functional.conv_transpose2d(
-    //     input,
-    //     kernel,
-    //     stride=stride,
-    //     padding=padding,
-    //     dilation=dilation,
-    //     output_padding=outpadding,
-    // )
-    // res.retain_grad()
-    // print(res.shape)
-    // loss = (res**2).sum()
-    // print(loss)
-    // loss.backward()
-    // print(input.grad.shape)
-    // print("input grad", torch.round(input.grad, decimals=1))
-    // print(kernel.grad.shape)
-    // print("kernel grad", torch.round(kernel.grad.flatten(), decimals=1))
-
-    let padding = 4;
-    let outpadding = 2;
-    let dilation = 3;
-    let stride = 3;
-
-    let t = Var::from_slice(
-        &[
-            0.4056_f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997,
-            3.0616, 1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699, 0.0823, 0.3526, 0.6843,
-            0.2395, 1.2279, -0.9287, -1.7030, 0.1370, 0.6047, 0.3770, -0.6266, 0.3529, 2.2013,
-            -0.6836, 0.2477, 1.3127, -0.2260, 0.2622, -1.2974, -0.8140, -0.8404, -0.3490, 0.0130,
-            1.3123, 1.7569, -0.3956, -1.8255, 0.1727, -0.3538, 2.6941, 1.0529, 0.4219, -0.2071,
-            1.1586, 0.4717, 0.3865, -0.5690, -0.5010, -0.1310, 0.7796, 0.6630, -0.2021, 2.6090,
-            0.2049, 0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323,
-            -1.3712, 0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742,
-            0.3790, -0.4431, -0.4720, -0.7890, 0.2620, 0.5411, -1.1715, -2.4997, 2.3249, -0.8912,
-            -0.4733, -0.5701, -2.8888, -1.4112, -0.5471, -0.9234, -1.1660, 0.4189, -0.7465,
-            -0.6473, 0.1402, 0.7875, 0.5377, -0.6779, -0.8088, -0.4864, -0.2312, 0.9279, 0.1264,
-            1.5480, 0.8265, -0.1025, 0.5138, -0.2512, 0.1576, 1.2705, 0.3641, -0.9325, 0.6451,
-            -0.8537, 0.2378, 0.1794, 0.2752, -0.3687, -1.1149, -0.1410, -0.5829, -0.0892, 1.4258,
-            -2.2789, 0.5270, 0.1825, 1.7007, -0.5263, -0.2954, 0.4440, 0.5537, 0.3492, 0.6186,
-            1.6475, 0.2219,
-        ],
-        (1, 4, 7, 5),
-        dev,
-    )?;
-
-    #[rustfmt::skip]
-    let w = Var::from_slice(
-        &[
-            -1.1744_f32, 0.3266, 2.5893, 1.0142, 0.1763, 0.7752, 0.6604, 0.2029, -0.2145, 0.7234,
-            -0.3441, -1.5400, -0.6333, 0.6613, 0.2083, 0.6230, -1.7002, 0.3393, 0.4049, 1.0762,
-            0.2723, 1.4181, 0.0029, -0.2122, 1.7668, 1.4168, 0.3320, -0.2719, 0.7932, -0.7204,
-            0.4447, 0.1211, 0.5908, 1.0089, -0.1646, 1.8033, -0.6286, 0.2016, -0.3370, 1.2555,
-            0.8009, -0.6488, -0.4652, -1.5685, 1.5860, 0.5583, 0.4623, 0.6026, 0.8828, 2.4990,
-            0.6811, -0.3369, 1.3320, 1.7669, -1.1067, 1.2958, -0.9415, -0.9655, -0.4462, 0.7181,
-            0.5181, -1.1658, -1.8467, -0.7763, 1.2769, 0.8651, 0.9890, 1.5092, 0.7207, -0.8481,
-            0.7417, 0.3375, -1.2685, 1.4572, 1.0915, 0.1093, -0.8550, -0.5831, -0.6309, -0.2509,
-            0.5220, -0.0914, 0.7900, 0.1096, 0.3258, 0.2723, -1.0942, -0.3393, -0.1653, 0.5732,
-            -0.8014, 1.8194, -1.9023, 0.2127, 1.8636, -0.8979, 0.1927, -0.2778, 0.3105, 0.0071,
-            -1.1823, 0.2476, -0.7178, -1.3821, 1.0769, -0.4376, -0.9967, -0.1227, 1.6197, -1.0604,
-            0.1372, 0.8141, -0.6163, 0.7304, -0.8285, 2.0636, -0.7176, 0.2495, -0.2581, -0.4478,
-        ],
-        (4, 2, 3, 5),
-        dev,
-    )?;
-    let res = t.conv_transpose2d(&w, padding, outpadding, stride, dilation)?;
-    let loss = res.sqr()?.sum_all()?;
-    assert_eq!(test_utils::to_vec0_round(&loss, 0)?, 2904.0);
-    let grads = loss.backward()?;
-
-    let grad_t = grads.get(&t).unwrap();
-    let grad_w = grads.get(&w).unwrap();
-    assert_eq!(grad_t.dims(), [1, 4, 7, 5]);
-    assert_eq!(grad_w.dims(), [4, 2, 3, 5]);
-
-    assert_eq!(
-        test_utils::to_vec1_round(&grad_w.flatten_all()?, 1)?,
-        [
-            // torch gets 89.1
-            -89.0, -135.3, 136.7, 102.0, -53.4, 117.9, 118.6, -43.9, -218.0, -58.5, -114.3, -150.0,
-            -15.6, 172.1, 66.3, -64.3, -27.9, -19.8, 31.7, 62.1, 5.5, 92.6, 28.2, -29.6, 55.9,
-            52.7, -72.7, -119.8, 53.8, -25.5, 128.8, 19.3, 68.0, 190.9, -64.1, -86.2, -111.2,
-            106.6, -67.7, 37.8, 115.9, 50.4, -77.7, -54.9, 22.3, -4.6, 89.8, 61.7, 122.4, 192.6,
-            -27.8, -104.6, 57.0, 166.4, 27.1, 6.1, 18.7, -93.2, 31.5, 168.2, -3.7, -99.5, -55.5,
-            -10.8, 17.5, 20.8, 16.9, 43.8, 42.0, -89.2, 18.8, -9.6, -84.1, 212.6, 19.7, -50.0,
-            -52.0, -40.0, -166.6, -73.2, -10.8, -73.3, 31.5, -23.4, -79.3, -27.0, -84.4, -42.9,
-            -20.3, 51.8, -16.7, 76.3, -120.5, -65.8, 96.5, -10.7, -45.9, -88.1, 65.4, -7.0, -1.5,
-            92.8, -25.1, -114.2, -5.8, -14.8, -51.2, -20.7, 54.2, -79.8, 47.7, -29.2, -8.8, 53.5,
-            -28.4, 85.0, -18.3, 107.0, 28.3, -71.8
-        ]
-    );
-
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_t.i(0)?, 1)?,
-        [
-            [
-                [32.3, -41.6, -24.0, 14.1, 17.6],
-                [-11.8, 72.5, 87.6, 46.4, 61.5],
-                [115.0, 108.5, -48.6, -63.4, -50.0],
-                [51.3, 5.4, 31.3, 91.1, -30.9],
-                [52.7, 92.8, -68.0, -47.0, 83.0],
-                // pytorch gets -107.1
-                [-10.2, -107.0, -5.4, 213.1, -31.4],
-                [-2.4, 65.1, 9.2, -146.2, -24.2]
-            ],
-            [
-                [-72.6, -63.9, -61.9, 45.3, 33.0],
-                [79.3, -0.5, -26.2, 78.2, 42.7],
-                [90.9, 141.6, 40.1, -62.7, 37.0],
-                [32.8, 198.2, -0.8, -31.1, 27.3],
-                // torch gets 48.0
-                [34.5, 34.9, -47.9, 127.6, -12.3],
-                [-61.4, -3.2, -2.9, -10.9, -16.6],
-                [74.6, 60.1, -68.9, 34.5, -50.4]
-            ],
-            [
-                [37.5, -56.9, -43.6, -13.5, -9.9],
-                [40.0, 97.3, 28.6, 14.2, -30.1],
-                [-22.3, -126.3, -68.8, -8.2, 26.1],
-                [-32.9, 37.3, 108.5, -54.8, 29.6],
-                [34.9, -176.9, -125.0, -28.3, -13.9],
-                [-54.9, 142.6, 62.1, -80.4, -65.6],
-                [7.4, -91.1, -67.6, 35.0, 39.7]
-            ],
-            [
-                [-57.2, -40.9, -10.1, 32.6, 29.4],
-                [18.7, -18.0, 29.5, -1.2, 59.2],
-                [-14.0, -74.4, 19.8, -117.0, 58.2],
-                [-21.8, 163.5, -71.1, -99.0, 80.9],
-                [-58.9, -10.9, 93.8, -139.6, 98.0],
-                // torch gets 54.5
-                [-54.4, 135.3, 6.0, -79.1, 134.6],
-                [27.5, -76.0, 43.4, -2.8, -7.8]
-            ]
-        ]
-    );
    Ok(())
 }

--- a/candle-core/tests/custom_op_tests.rs
+++ b/candle-core/tests/custom_op_tests.rs
@ -112,34 +112,3 @@ fn custom_op1_with_backward() -> Result<()> {

    Ok(())
 }
-
-impl candle_core::InplaceOp1 for Elu {
-    fn name(&self) -> &'static str {
-        "elu"
-    }
-
-    fn cpu_fwd(&self, s: &mut CpuStorage, _l: &Layout) -> Result<()> {
-        let alpha = self.alpha;
-        match s {
-            CpuStorage::BF16(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            CpuStorage::F16(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            CpuStorage::F32(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            CpuStorage::F64(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            _ => candle_core::bail!("unsupported dtype for inplace elu"),
-        }
-        Ok(())
-    }
-}
-
-#[test]
-fn inplace_op1() -> Result<()> {
-    let cpu = &Device::Cpu;
-    let t = Tensor::arange(0u32, 12u32, cpu)?.to_dtype(DType::F32)?;
-    let t = (t - 5.)?;
-    t.inplace_op1(&Elu { alpha: 1. })?;
-    assert_eq!(
-        to_vec1_round(&t, 4)?,
-        &[-0.9933, -0.9817, -0.9502, -0.8647, -0.6321, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
-    );
-    Ok(())
-}
--- a/candle-core/tests/fortran_tensor_3d.pth
+++ b/candle-core/tests/fortran_tensor_3d.pth
--- a/candle-core/tests/grad_tests.rs
+++ b/candle-core/tests/grad_tests.rs
@ -1,4 +1,3 @@
-#![allow(clippy::approx_constant)]
 use anyhow::{Context, Result};
 use candle_core::{test_device, test_utils, Device, Shape, Tensor, Var};

@ -97,24 +96,24 @@ fn unary_grad(device: &Device) -> Result<()> {
    let grads = y.backward()?;
    let grad_x = grads.get(x).context("no grad for x")?;
    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [20.0855, 2.7183, 54.5982, 1.1618]
+        y.to_vec1::<f32>()?,
+        [20.085537, 2.7182817, 54.59815, 1.1618342]
    );
    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [20.0855, 2.7183, 54.5982, 1.1618]
+        grad_x.to_vec1::<f32>()?,
+        [20.085537, 2.7182817, 54.59815, 1.1618342]
    );
    let y = x.exp()?.sqr()?;
    let grads = y.backward()?;
    let grad_x = grads.get(x).context("no grad for x")?;
    assert_eq!(
-        test_utils::to_vec1_round(&y, 3)?,
-        [403.429, 7.389, 2980.958, 1.35]
+        y.to_vec1::<f32>()?,
+        [403.4288, 7.3890557, 2980.9578, 1.3498588]
    );
    // exp(x)^2 = exp(2*x)
    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 2)?,
-        [806.86, 14.78, 5961.92, 2.7]
+        grad_x.to_vec1::<f32>()?,
+        [806.8576, 14.778111, 5961.9155, 2.6997175]
    );
    let y = x.sin()?;
    let grads = y.backward()?;
@ -262,7 +261,6 @@ fn unary_grad(device: &Device) -> Result<()> {
    let y = elu_x.elu(2.)?;
    let grads = y.backward()?;
    let grad_x = grads.get(&elu_x).context("no grad for x")?;
-
    assert_eq!(
        test_utils::to_vec1_round(&y, 4)?,
        [-1.2642, 0.0000, -1.7293, 3.0000]
@ -272,194 +270,6 @@ fn unary_grad(device: &Device) -> Result<()> {
        [0.7358, 2.0000, 0.2707, 1.0000]
    );

-    // testing compared to pytorch nn.Silu()
-    let y = x.silu()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [2.8577, 0.7311, 3.9281, 0.0806]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [1.0881, 0.9277, 1.0527, 0.5747],
-    );
-
-    if device.is_cpu() {
-        let x = Var::new(&[[[1f32, 2., 3.], [4., 5., 6.], [7., 8., 9.]]], device)?;
-        let y = x.interpolate1d(12)?.reshape(36)?;
-
-        let z = Tensor::new(
-            &[
-                1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16.,
-                17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
-                33., 34., 35., 36.,
-            ],
-            device,
-        )?;
-
-        let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
-        let grads = loss.backward()?;
-        let grad_x = grads.get(&x).context("no grad for x")?;
-
-        assert_eq!(
-            test_utils::to_vec3_round(grad_x, 4)?,
-            [[[10_f32, 26., 42.], [58., 74., 90.], [106., 122., 138.]]]
-        );
-    }
-
-    // manually checked: see comments
-    let x = Var::new(&[[[[1f32, 2., 3.], [4., 5., 6.], [7., 8., 9.]]]], device)?;
-    let y = x.interpolate2d(6, 6)?.reshape(36)?;
-
-    let z = Tensor::new(
-        &[
-            1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16., 17.,
-            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34.,
-            35., 36.,
-        ],
-        device,
-    )?;
-    // gradient should be
-    // row 1
-    // 1+2+7+8 = 18
-    // 3+4+9+10 = 26
-    // 5+6+11+12 = 34
-    // row 2
-    // 13+14+19+20 = 66
-    // 15+16+21+22 = 74
-    // 17+18+23+24 = 82
-    // row 3
-    // 25+26+31+32 = 114
-    // 27+28+33+34 = 122
-    // 29+30+35+36 = 130
-    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
-
-    let grads = loss.backward()?;
-
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec2_round(&grad_x.flatten(0, 2)?, 4)?,
-        [[18_f32, 26., 34.], [66., 74., 82.], [114., 122., 130.]]
-    );
-
-    // manually checked: see comments
-    let x = Var::new(&[[[[1f32, 2.], [4., 5.]]]], device)?;
-    let y = x.interpolate2d(6, 6)?.reshape(36)?;
-
-    let z = Tensor::new(
-        &[
-            1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16., 17.,
-            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34.,
-            35., 36.,
-        ],
-        device,
-    )?;
-    // gradient should be
-    // row 1
-    // 1+2+3+7+8+9+13+14+15 = 72
-    // 4+5+6+10+11+12+16+17+18 = 99
-    // row 2
-    // 19+20+21+25+26+27+31+32+33 = 234
-    // 22+23+24+28+29+30+34+35+36 = 243
-    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
-
-    let grads = loss.backward()?;
-
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec2_round(&grad_x.flatten(0, 2)?, 4)?,
-        [[72_f32, 99.], [234., 261.]]
-    );
-
-    // manually checked: see comments
-    let x = Var::new(&[[[[1f32, 2.], [4., 5.]], [[6f32, 7.], [8., 9.]]]], device)?;
-
-    let y = x.interpolate2d(4, 4)?.reshape(32)?;
-
-    #[rustfmt::skip]
-    let z = Tensor::new(
-        &[
-            1_f32, 02., 03., 04.,
-            05.,   06., 07., 08.,
-            09.,   10., 11., 12.,
-            13.,   14., 15., 16.,
-            17.,   18., 19., 20.,
-            21.,   22., 23., 24.,
-            25.,   26., 27., 28.,
-            29.,   30., 31., 32.
-        ],
-        device,
-    )?;
-    // gradient should be
-    // m1r1
-    // 1+2+5+6=14
-    // 3+4+7+8=22
-    // m1r2
-    // 9+10+13+14=46
-    // 11+12+15+16=54
-    // m2r1
-    // 17+18+21+22=78
-    // 19+20+23+24=86
-    // m2r2
-    // 25+26+29+30=110
-    // 27+28+31+32=118
-    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
-
-    let grads = loss.backward()?;
-
-    let grad_x = grads.get(&x).context("no grad for x")?;
-
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_x.flatten(0, 1)?, 4)?,
-        [[[14_f32, 22.], [46., 54.]], [[78., 86.], [110., 118.]]]
-    );
-
-    // manually checked: see comments
-    let x = Var::new(
-        &[[[[1f32, 2.], [4., 5.]]], [[[6f32, 7.], [8., 9.]]]],
-        device,
-    )?;
-
-    let y = x.interpolate2d(4, 4)?.reshape(32)?;
-
-    #[rustfmt::skip]
-       let z = Tensor::new(
-           &[
-               1_f32, 02., 03., 04.,
-               05.,   06., 07., 08.,
-               09.,   10., 11., 12.,
-               13.,   14., 15., 16.,
-               17.,   18., 19., 20.,
-               21.,   22., 23., 24.,
-               25.,   26., 27., 28.,
-               29.,   30., 31., 32.
-           ],
-           device,
-       )?;
-    // gradient should be
-    // m1r1
-    // 1+2+5+6=14
-    // 3+4+7+8=22
-    // m1r2
-    // 9+10+13+14=46
-    // 11+12+15+16=54
-    // m2r1
-    // 17+18+21+22=78
-    // 19+20+23+24=86
-    // m2r2
-    // 25+26+29+30=110
-    // 27+28+31+32=118
-    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
-
-    let grads = loss.backward()?;
-
-    let grad_x = grads.get(&x).context("no grad for x")?;
-
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_x.flatten(0, 1)?, 4)?,
-        [[[14_f32, 22.], [46., 54.]], [[78., 86.], [110., 118.]]]
-    );
    Ok(())
 }

--- a/candle-core/tests/indexing_tests.rs
+++ b/candle-core/tests/indexing_tests.rs
@ -91,32 +91,3 @@ fn index_3d() -> Result<()> {
    assert_eq!(tensor.i((1, .., 3))?.to_vec1::<u32>()?, &[15, 19, 23]);
    Ok(())
 }
-
-#[test]
-fn slice_assign() -> Result<()> {
-    let dev = Device::Cpu;
-
-    let tensor = Tensor::arange(0u32, 4 * 5, &dev)?.reshape((4, 5))?;
-    let src = Tensor::arange(0u32, 2 * 3, &dev)?.reshape((3, 2))?;
-    let out = tensor.slice_assign(&[1..4, 3..5], &src)?;
-    assert_eq!(
-        out.to_vec2::<u32>()?,
-        &[
-            [0, 1, 2, 3, 4],
-            [5, 6, 7, 0, 1],
-            [10, 11, 12, 2, 3],
-            [15, 16, 17, 4, 5]
-        ]
-    );
-    let out = tensor.slice_assign(&[0..3, 0..2], &src)?;
-    assert_eq!(
-        out.to_vec2::<u32>()?,
-        &[
-            [0, 1, 2, 3, 4],
-            [2, 3, 7, 8, 9],
-            [4, 5, 12, 13, 14],
-            [15, 16, 17, 18, 19]
-        ]
-    );
-    Ok(())
-}
--- a/candle-core/tests/layout_tests.rs
+++ b/candle-core/tests/layout_tests.rs
@ -88,7 +88,7 @@ fn strided_blocks() -> Result<()> {
        }
    };
    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
-    let tensor = tensor.i((.., 1))?.contiguous()?;
+    let tensor = tensor.i((.., 1))?;
    match tensor.strided_blocks() {
        candle::StridedBlocks::SingleBlock { start_offset, len } => {
            assert_eq!(start_offset, 0);
@ -100,20 +100,6 @@ fn strided_blocks() -> Result<()> {
        }
    };
    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
-    let tensor = tensor.i((.., 1))?;
-    match tensor.strided_blocks() {
-        candle::StridedBlocks::SingleBlock { .. } => {
-            panic!("unexpected block structure")
-        }
-        candle::StridedBlocks::MultipleBlocks {
-            block_len,
-            block_start_index,
-        } => {
-            assert_eq!(block_len, 4);
-            assert_eq!(block_start_index.collect::<Vec<_>>(), &[4, 16])
-        }
-    };
-    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
    match tensor.t()?.strided_blocks() {
        candle::StridedBlocks::SingleBlock { .. } => {
            panic!("unexpected block structure")
--- a/candle-core/tests/pool_tests.rs
+++ b/candle-core/tests/pool_tests.rs
@ -43,9 +43,6 @@ res = torch.nn.functional.avg_pool2d(t, 2)
 print(res)
 */
 fn avg_pool2d_pytorch(dev: &Device) -> Result<()> {
-    if dev.is_metal() {
-        return Ok(());
-    }
    let t = Tensor::new(
        &[
            0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616,
--- a/candle-core/tests/pth.py
+++ b/candle-core/tests/pth.py
@ -1,37 +0,0 @@
-import torch
-from collections import OrderedDict
-
-# Write a trivial tensor to a pt file
-a= torch.tensor([[1,2,3,4], [5,6,7,8]])
-o = OrderedDict()
-o["test"] = a
-
-# Write a trivial tensor to a pt file
-torch.save(o, "test.pt")
-
-############################################################################################################
-# Write a trivial tensor to a pt file with a key
-torch.save({"model_state_dict": o}, "test_with_key.pt")
-
-############################################################################################################
-# Create a tensor with fortran contiguous memory layout
-import numpy as np
-
-# Step 1: Create a 3D NumPy array with Fortran order using a range of numbers
-# For example, creating a 2x3x4 array
-array_fortran = np.asfortranarray(np.arange(1, 2*3*4 + 1).reshape(2, 3, 4))
-
-# Verify the memory order
-print("Is Fortran contiguous (F order):", array_fortran.flags['F_CONTIGUOUS'])  # Should be True
-print("Is C contiguous (C order):", array_fortran.flags['C_CONTIGUOUS'])  # Should be False
-
-# Step 2: Convert the NumPy array to a PyTorch tensor
-tensor_fortran = torch.from_numpy(array_fortran)
-
-# Verify the tensor layout
-print("Tensor stride:", tensor_fortran.stride())  # Stride will reflect the Fortran memory layout
-
-# Step 3: Save the PyTorch tensor to a .pth file
-torch.save({"tensor_fortran": tensor_fortran}, 'fortran_tensor_3d.pth')
-
-print("3D Tensor saved with Fortran layout.")
--- a/candle-core/tests/pth_tests.rs
+++ b/candle-core/tests/pth_tests.rs
@ -1,31 +0,0 @@
-/// Regression test for pth files not loading on Windows.
-#[test]
-fn test_pth() {
-    let tensors = candle_core::pickle::PthTensors::new("tests/test.pt", None).unwrap();
-    tensors.get("test").unwrap().unwrap();
-}
-
-#[test]
-fn test_pth_with_key() {
-    let tensors =
-        candle_core::pickle::PthTensors::new("tests/test_with_key.pt", Some("model_state_dict"))
-            .unwrap();
-    tensors.get("test").unwrap().unwrap();
-}
-
-#[test]
-fn test_pth_fortran_congiguous() {
-    let tensors =
-        candle_core::pickle::PthTensors::new("tests/fortran_tensor_3d.pth", None).unwrap();
-    let tensor = tensors.get("tensor_fortran").unwrap().unwrap();
-
-    assert_eq!(tensor.dims3().unwrap(), (2, 3, 4));
-
-    assert_eq!(
-        tensor.to_vec3::<i64>().unwrap(),
-        [
-            [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
-            [[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]
-        ]
-    );
-}
--- a/candle-core/tests/quantized_tests.rs
+++ b/candle-core/tests/quantized_tests.rs
@ -1,9 +1,7 @@
 use candle_core::{
-    bail,
    quantized::{self, GgmlDType},
-    test_device,
    test_utils::to_vec2_round,
-    Device, Module, Result, Tensor,
+    Device, Result, Tensor,
 };
 use quantized::{k_quants, GgmlType};
 use rand::prelude::*;
@ -15,48 +13,16 @@ const GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS: f32 = 0.0075;
 const GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS: f32 = 0.0040;
 const GGML_MAX_DOT_PRODUCT_ERROR: f32 = 0.02;

-fn test_matmul(
-    device: &Device,
-    (b, m, n, k): (usize, usize, usize, usize),
-    dtype: GgmlDType,
-) -> Result<()> {
-    let lhs = (0..(m * k))
-        .map(|v| v as f32 / (m * k) as f32)
-        .collect::<Vec<_>>();
-    let rhs = (0..(k * n))
-        .map(|v| v as f32 / (n * k) as f32)
-        .collect::<Vec<_>>();
-
-    let lhs = Tensor::from_slice(&lhs, (m, k), device)?;
-    let rhs = Tensor::from_slice(&rhs, (k, n), device)?;
-    let mm = lhs.matmul(&rhs)?;
-    let qtensor = quantized::QTensor::quantize(&rhs.t()?, dtype)?;
-    let matmul = quantized::QMatMul::from_qtensor(qtensor)?;
-    let res = matmul.forward(&lhs)?;
-
-    let error: f32 = ((&mm - &res)?.abs()? / &mm.abs()?)?
-        .sum_all()?
-        .to_scalar()?;
-    let error = error / (b * m * n) as f32;
-    assert!(
-        error <= 0.02,
-        "Error {error} is too big. \nExpected:\n {mm} \nFound:\n {res}\n for {dtype:?}"
-    );
-
-    Ok(())
-}
-
-fn quantized_matmul(device: &Device) -> Result<()> {
-    // TODO Enable this later when we enable cuda.
-    if device.is_cuda() {
-        return Ok(());
-    }
+#[test]
+fn quantized_matmul() -> Result<()> {
+    let cpu = &Device::Cpu;
    let (m, k, n) = (3, 64, 4);
    let lhs = (0..(m * k)).map(|v| v as f32).collect::<Vec<_>>();
-    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), device)?;
+    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), cpu)?;
    let mut dst = vec![42.; 3 * 4];
    let mut rhs_t = vec![k_quants::BlockQ4_0::zeros(); 8];
    let rhs = (0..(k * n)).map(|v| v as f32).collect::<Vec<_>>();
+    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), cpu)?.t()?;
    k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_t)?;
    k_quants::matmul((m, k, n), &lhs, &rhs_t, &mut dst)?;
    assert_eq!(
@ -66,7 +32,6 @@ fn quantized_matmul(device: &Device) -> Result<()> {
            341876.0, 994283.0, 1655709.0, 2301518.0
        ]
    );
-    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), device)?.t()?;
    let mm = tensor_lhs.matmul(&tensor_rhs)?;
    assert_eq!(
        mm.to_vec2::<f32>()?,
@ -77,49 +42,35 @@ fn quantized_matmul(device: &Device) -> Result<()> {
        ]
    );

-    let qtensor = quantized::QTensor::quantize(&tensor_rhs.t()?, GgmlDType::Q4_0)?;
+    let qtensor = quantized::QTensor::new(rhs_t, (4, 64))?;
    let matmul = quantized::QMatMul::from_qtensor(qtensor)?;
    let res = matmul.forward(&tensor_lhs)?;
-    match device {
-        Device::Metal(_) => assert_eq!(
-            to_vec2_round(&res, 0)?,
-            &[
-                [84946.0, 214126.0, 344757.0, 473798.0],
-                [213458.0, 604350.0, 1000469.0, 1387990.0],
-                [341970.0, 994574.0, 1656181.0, 2302182.0]
-            ]
-        ),
-        _ => assert_eq!(
-            to_vec2_round(&res, 0)?,
-            &[
-                [85120.0, 214562.0, 345455.0, 474748.0],
-                [213475.0, 604465.0, 1000686.0, 1388317.0],
-                [341876.0, 994283.0, 1655709.0, 2301518.0]
-            ]
-        ),
-    }
-
-    test_matmul(device, (1, 3, 4, 256), GgmlDType::Q4_0)?;
+    assert_eq!(
+        to_vec2_round(&res, 0)?,
+        &[
+            [85120.0, 214562.0, 345455.0, 474748.0],
+            [213475.0, 604465.0, 1000686.0, 1388317.0],
+            [341876.0, 994283.0, 1655709.0, 2301518.0]
+        ]
+    );

    Ok(())
 }

-fn quantized_matmul_neg(device: &Device) -> Result<()> {
-    // TODO Enable this later when we enable cuda.
-    if device.is_cuda() {
-        return Ok(());
-    }
+#[test]
+fn quantized_matmul_neg() -> Result<()> {
+    let cpu = &Device::Cpu;
    let (m, k, n) = (3, 64, 4);
    let lhs = (0..(m * k))
        .map(|v| v as f32 - (m * k) as f32 / 2.0)
        .collect::<Vec<_>>();
-    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), device)?;
+    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), cpu)?;
    let mut dst = vec![42.; 3 * 4];
    let mut rhs_t = vec![k_quants::BlockQ4_0::zeros(); 8];
    let rhs = (0..k * n)
        .map(|v| v as f32 - (k * n) as f32 / 3.0)
        .collect::<Vec<_>>();
-    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), device)?.t()?;
+    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), cpu)?.t()?;
    k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_t)?;
    k_quants::matmul((m, k, n), &lhs, &rhs_t, &mut dst)?;
    assert_eq!(
@ -139,52 +90,32 @@ fn quantized_matmul_neg(device: &Device) -> Result<()> {
        ]
    );

-    let qtensor = quantized::QTensor::quantize(&tensor_rhs.t()?, GgmlDType::Q4_0)?;
+    let qtensor = quantized::QTensor::new(rhs_t, (4, 64))?;
    let matmul = quantized::QMatMul::from_qtensor(qtensor)?;
    let res = matmul.forward(&tensor_lhs)?;
-    match device {
-        Device::Metal(_) => assert_eq!(
-            to_vec2_round(&res, 0)?,
-            &[
-                [243666.0, -19714.0, -285433.0, -550453.0],
-                [23782.0, 21654.0, 19400.0, 18369.0],
-                [-196102.0, 63022.0, 324233.0, 587191.0]
-            ]
-        ),
-        _ => assert_eq!(
-            to_vec2_round(&res, 0)?,
-            &[
-                [243524.0, -19596.0, -285051.0, -549815.0],
-                [23777.0, 21651.0, 19398.0, 18367.0],
-                [-196472.0, 63012.0, 324585.0, 587902.0]
-            ]
-        ),
-    }
+    assert_eq!(
+        to_vec2_round(&res, 0)?,
+        &[
+            [243524.0, -19596.0, -285051.0, -549815.0],
+            [23777.0, 21651.0, 19398.0, 18367.0],
+            [-196472.0, 63012.0, 324585.0, 587902.0]
+        ]
+    );

    Ok(())
 }

-test_device!(
-    quantized_matmul,
-    quantized_matmul_cpu,
-    quantized_matmul_cuda,
-    quantized_matmul_metal
-);
-test_device!(
-    quantized_matmul_neg,
-    quantized_matmul_neg_cpu,
-    quantized_matmul_neg_cuda,
-    quantized_matmul_neg_metal
-);
+#[test]
+fn quantize_q4_0() -> Result<()> {
+    use k_quants::BlockQ4_0;

-fn quantize_q4_0(device: &Device) -> Result<()> {
    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-
-    let src = Tensor::from_slice(&src, (32 * 4,), device)?;
-    let quant = quantized::QTensor::quantize(&src, GgmlDType::Q4_0)?;
-    let dst = quant.dequantize(device)?;
+    let mut dst = vec![0f32; 32 * 4];
+    let mut quant = vec![BlockQ4_0::zeros(); 4];
+    BlockQ4_0::from_float(&src, &mut quant)?;
+    BlockQ4_0::to_float(&quant, dst.as_mut_slice())?;
    assert_eq!(
-        dst.to_vec1::<f32>()?,
+        dst,
        &[
            -0.0, -0.0, 3.875, 3.875, 3.875, 3.875, 7.75, 7.75, 7.75, 7.75, 11.625, 11.625, 11.625,
            11.625, 15.5, 15.5, 15.5, 15.5, 19.375, 19.375, 19.375, 19.375, 23.25, 23.25, 23.25,
@ -200,17 +131,21 @@ fn quantize_q4_0(device: &Device) -> Result<()> {
            127.0, 127.0
        ]
    );
-    ggml_quantization_error_test(GgmlDType::Q4_0, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ4_0>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
    Ok(())
 }

-fn quantize_q4_1(device: &Device) -> Result<()> {
+#[test]
+fn quantize_q4_1() -> Result<()> {
+    use k_quants::BlockQ4_1;
+
    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-    let src = Tensor::from_slice(&src, (32 * 4,), device)?;
-    let quant = quantized::QTensor::quantize(&src, GgmlDType::Q4_1)?;
-    let dst = quant.dequantize(device)?;
+    let mut dst = vec![0f32; 32 * 4];
+    let mut quant = vec![BlockQ4_1::zeros(); 4];
+    BlockQ4_1::from_float(&src, &mut quant)?;
+    BlockQ4_1::to_float(&quant, dst.as_mut_slice())?;
    assert_eq!(
-        round_vector(&dst.to_vec1::<f32>()?),
+        round_vector(&dst),
        &[
            0.0, 0.0, 2.066, 2.066, 4.133, 4.133, 6.199, 6.199, 8.266, 8.266, 10.332, 10.332,
            12.398, 12.398, 14.465, 14.465, 16.531, 16.531, 18.598, 18.598, 20.664, 20.664, 22.73,
@ -226,17 +161,21 @@ fn quantize_q4_1(device: &Device) -> Result<()> {
            118.73, 118.73, 120.797, 120.797, 122.863, 122.863, 124.93, 124.93, 126.996, 126.996
        ]
    );
-    ggml_quantization_error_test(GgmlDType::Q4_1, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ4_1>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
    Ok(())
 }

-fn quantize_q5_0(device: &Device) -> Result<()> {
+#[test]
+fn quantize_q5_0() -> Result<()> {
+    use k_quants::BlockQ5_0;
+
    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-    let src = Tensor::from_slice(&src, (32 * 4,), device)?;
-    let quant = quantized::QTensor::quantize(&src, GgmlDType::Q5_0)?;
-    let dst = quant.dequantize(device)?;
+    let mut dst = vec![0f32; 32 * 4];
+    let mut quant = vec![BlockQ5_0::zeros(); 4];
+    BlockQ5_0::from_float(&src, &mut quant)?;
+    BlockQ5_0::to_float(&quant, dst.as_mut_slice())?;
    assert_eq!(
-        round_vector(&dst.to_vec1::<f32>()?),
+        round_vector(&dst),
        &[
            -0.0, 1.938, 1.938, 3.875, 3.875, 5.813, 5.813, 7.75, 7.75, 9.688, 9.688, 11.625,
            11.625, 13.563, 13.563, 15.5, 15.5, 17.438, 17.438, 19.375, 19.375, 21.313, 21.313,
@ -252,17 +191,21 @@ fn quantize_q5_0(device: &Device) -> Result<()> {
            119.063, 119.063, 119.063, 119.063, 127.0, 127.0, 127.0, 127.0
        ]
    );
-    ggml_quantization_error_test(GgmlDType::Q5_0, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ5_0>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
    Ok(())
 }

-fn quantize_q5_1(device: &Device) -> Result<()> {
+#[test]
+fn quantize_q5_1() -> Result<()> {
+    use k_quants::BlockQ5_1;
+
    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-    let src = Tensor::from_slice(&src, (32 * 4,), device)?;
-    let quant = quantized::QTensor::quantize(&src, GgmlDType::Q5_1)?;
-    let dst = quant.dequantize(device)?;
+    let mut dst = vec![0f32; 32 * 4];
+    let mut quant = vec![BlockQ5_1::zeros(); 4];
+    BlockQ5_1::from_float(&src, &mut quant)?;
+    BlockQ5_1::to_float(&quant, dst.as_mut_slice())?;
    assert_eq!(
-        round_vector(&dst.to_vec1::<f32>()?),
+        dst,
        &[
            0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
            16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0,
@ -276,11 +219,13 @@ fn quantize_q5_1(device: &Device) -> Result<()> {
            124.0, 125.0, 126.0, 127.0
        ]
    );
-    ggml_quantization_error_test(GgmlDType::Q5_1, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+
+    ggml_quantization_error_test::<BlockQ5_1>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
    Ok(())
 }

-fn get_test_vector2(bound: f32, size: usize, device: &Device) -> Result<Tensor> {
+/// Generates a small test vector ranging from -`bound` to `bound` with `size` steps
+fn get_test_vector(bound: f32, size: usize) -> (Vec<f32>, Vec<f32>) {
    assert!(
        size % crate::quantized::k_quants::QK_K == 0,
        "size must be a multiple of {}",
@ -290,8 +235,10 @@ fn get_test_vector2(bound: f32, size: usize, device: &Device) -> Result<Tensor>
    let src = (0..size)
        .map(|v| (v as f32 - size as f32 / 2.) * bound / (size as f32 / 2.))
        .collect::<Vec<_>>();
+
+    let dst = vec![0f32; size];
    assert_eq!([src[0], src[size / 2]], [-bound, 0.0]);
-    Tensor::from_vec(src, (size,), device)
+    (src, dst)
 }

 /// Round a vector
@ -318,8 +265,7 @@ fn compare_with_error(values: &[f32], expected: &[f32], tolerance: f32) {
    }
 }

-/// Creates a vector similar to the ones used in GGML unit tests:
-/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L26-L30
+/// Creates a vector simillarly to the one used in GGML unit tests: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L26-L30
 fn create_ggml_like_vector(offset: f32) -> Vec<f32> {
    (0..GGML_TEST_SIZE)
        .map(|i| 0.1 + 2.0 * (i as f32 + offset).cos())
@ -338,16 +284,14 @@ fn calculate_rmse(a: &[f32], b: &[f32]) -> f32 {
    sum / a.len() as f32
 }

-/// Similar to the GGML quantization unit test:
-/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L43-L50
-fn ggml_quantization_error_test(dtype: GgmlDType, device: &Device, max_error: f32) -> Result<()> {
+/// Mirrores the GGML quanitzation unit test: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L43-L50
+fn ggml_quantization_error_test<T: GgmlType>(max_error: f32) -> Result<()> {
    let src = create_ggml_like_vector(0.0);
-    let src = Tensor::from_slice(&src, (GGML_TEST_SIZE,), device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
-    let error = calculate_rmse(&src.to_vec1::<f32>()?, &dst.to_vec1::<f32>()?);
+    let mut dst = vec![0.0; GGML_TEST_SIZE];
+    let _quant = quantize_roundtrip::<T>(src.as_slice(), dst.as_mut_slice())?;
+    let error = calculate_rmse(src.as_slice(), dst.as_slice());
    if error > max_error {
-        bail!(
+        candle_core::bail!(
            "Quantization error {} exceeds max error {}",
            error,
            max_error
@ -356,15 +300,19 @@ fn ggml_quantization_error_test(dtype: GgmlDType, device: &Device, max_error: f3
    Ok(())
 }

-fn quantize_q2k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q2K;
+fn quantize_roundtrip<T: GgmlType>(src: &[f32], dst: &mut [f32]) -> Result<Vec<T>> {
+    let mut quant = vec![T::zeros(); src.len() / T::BLCK_SIZE];
+    T::from_float(src, &mut quant)?;
+    T::to_float(&quant, dst)?;
+    Ok(quant)
+}

-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q2k() -> Result<()> {
+    use k_quants::BlockQ2K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ2K>(src.as_slice(), dst.as_mut_slice())?;
    compare_with_error(dst.as_slice(), src.as_slice(), 0.1);

    // Test some specific values
@ -378,26 +326,20 @@ fn quantize_q2k(device: &Device) -> Result<()> {
        [-0.499, -0.366, -0.249, 0.0, 0.295, 0.492]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ2K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 6.0);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS)?;
+    ggml_quantization_error_test::<BlockQ2K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS)?;
    Ok(())
 }

-fn quantize_q3k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q3K;
-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q3k() -> Result<()> {
+    use k_quants::BlockQ3K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ3K>(src.as_slice(), dst.as_mut_slice())?;
    compare_with_error(dst.as_slice(), src.as_slice(), 0.03);

    // Test some specific values
@ -411,26 +353,20 @@ fn quantize_q3k(device: &Device) -> Result<()> {
        [-0.493, -0.37, -0.243, -0.0, 0.292, 0.492]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ3K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 3.5);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS)?;
+    ggml_quantization_error_test::<BlockQ3K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS)?;
    Ok(())
 }

-fn quantize_q4k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q4K;
-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q4k() -> Result<()> {
+    use k_quants::BlockQ4K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ4K>(src.as_slice(), dst.as_mut_slice())?;
    compare_with_error(dst.as_slice(), src.as_slice(), 0.017);

    // Test some specific values
@ -444,27 +380,21 @@ fn quantize_q4k(device: &Device) -> Result<()> {
        [-0.5, -0.373, -0.25, 0.0, 0.288, 0.498]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ4K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 4.5);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ4K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
    Ok(())
 }

-fn quantize_q5k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q5K;
-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q5k() -> Result<()> {
+    use k_quants::BlockQ5K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
-    compare_with_error(dst.as_slice(), src.as_slice(), 0.009);
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ5K>(src.as_slice(), dst.as_mut_slice())?;
+    compare_with_error(dst.as_slice(), src.as_slice(), 0.008);

    // Test some specific values
    assert_eq!(
@ -474,29 +404,24 @@ fn quantize_q5k(device: &Device) -> Result<()> {
    let dst = round_vector(&dst);
    assert_eq!(
        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
-        [-0.5, -0.373, -0.25, 0.0, 0.279, 0.499]
+        [-0.499, -0.372, -0.249, 0.001, 0.279, 0.499]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ5K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 2.5);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ5K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+
    Ok(())
 }

-fn quantize_q6k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q6K;
-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q6k() -> Result<()> {
+    use k_quants::BlockQ6K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ6K>(src.as_slice(), dst.as_mut_slice())?;
    compare_with_error(dst.as_slice(), src.as_slice(), 0.008);

    // Test some specific values
@ -510,27 +435,22 @@ fn quantize_q6k(device: &Device) -> Result<()> {
        [-0.497, -0.372, -0.25, -0.0, 0.284, 0.5]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ6K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 2.0);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ6K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+
    Ok(())
 }

-fn quantize_q8k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q8K;
-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q8k() -> Result<()> {
+    use k_quants::BlockQ8K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
-    compare_with_error(dst.as_slice(), src.as_slice(), 0.008);
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ8K>(src.as_slice(), dst.as_mut_slice())?;
+    compare_with_error(dst.as_slice(), src.as_slice(), 0.003);

    // Test some specific values
    assert_eq!(
@ -543,79 +463,15 @@ fn quantize_q8k(device: &Device) -> Result<()> {
        [-0.5, -0.375, -0.25, -0.0, 0.281, 0.499]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ8K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 0.6);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ8K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+
    Ok(())
 }

-test_device!(
-    quantize_q4_0,
-    quantize_q4_0_cpu,
-    quantize_q4_0_cuda,
-    quantize_q4_0_metal
-);
-test_device!(
-    quantize_q4_1,
-    quantize_q4_1_cpu,
-    quantize_q4_1_cuda,
-    quantize_q4_1_metal
-);
-test_device!(
-    quantize_q5_0,
-    quantize_q5_0_cpu,
-    quantize_q5_0_cuda,
-    quantize_q5_0_metal
-);
-test_device!(
-    quantize_q5_1,
-    quantize_q5_1_cpu,
-    quantize_q5_1_cuda,
-    quantize_q5_1_metal
-);
-test_device!(
-    quantize_q2k,
-    quantize_q2k_cpu,
-    quantize_q2k_cuda,
-    quantize_q2k_metal
-);
-test_device!(
-    quantize_q3k,
-    quantize_q3k_cpu,
-    quantize_q3k_cuda,
-    quantize_q3k_metal
-);
-test_device!(
-    quantize_q4k,
-    quantize_q4k_cpu,
-    quantize_q4k_cuda,
-    quantize_q4k_metal
-);
-test_device!(
-    quantize_q5k,
-    quantize_q5k_cpu,
-    quantize_q5k_cuda,
-    quantize_q5k_metal
-);
-test_device!(
-    quantize_q6k,
-    quantize_q6k_cpu,
-    quantize_q6k_cuda,
-    quantize_q6k_metal
-);
-test_device!(
-    quantize_q8k,
-    quantize_q8k_cpu,
-    quantize_q8k_cuda,
-    quantize_q8k_metal
-);
-
 /// Very simple dot product implementation
 fn vec_dot_reference(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(a, b)| a * b).sum()
@ -631,66 +487,54 @@ fn ggml_reference_matmul_error(dtype: GgmlDType) -> Result<f32> {
        GgmlDType::Q5K => 0.000740,
        GgmlDType::Q6K => 0.000952,
        GgmlDType::Q4_0 => 0.001143,
-        GgmlDType::Q4_1 => 0.008,
+        GgmlDType::Q4_1 => 0.007784,
        GgmlDType::Q5_0 => 0.001353,
-        GgmlDType::Q5_1 => 0.00149,
+        GgmlDType::Q5_1 => 0.001363,
        GgmlDType::Q8_0 => 0.000092,

        // Not from the ggml repo.
        GgmlDType::Q8K => 0.00065,
-        _ => bail!("No GGML results for quantization type {dtype:?}",),
+        _ => candle_core::bail!("No GGML results for quantization type {dtype:?}",),
    };
    Ok(err)
 }

-/// Similar to the GGML matmul unit test:
-/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L76-L91
+/// Mirrores the GGML matmul unit test: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L76-L91
 fn ggml_matmul_error_test<T: GgmlType>() -> Result<()> {
    let a = create_ggml_like_vector(0.0);
    let b = create_ggml_like_vector(1.0);
-    ggml_matmul_error_test_::<T>(a.as_slice(), b.as_slice(), 1.0)?;
-    // Another example that is more likely to trigger the overflow reported in #1526
-    let a = (0..GGML_TEST_SIZE)
-        .map(|i| i as f32 / GGML_TEST_SIZE as f32)
-        .collect::<Vec<_>>();
-    let b = (0..GGML_TEST_SIZE)
-        .map(|i| i as f32 / GGML_TEST_SIZE as f32)
-        .collect::<Vec<_>>();
-    ggml_matmul_error_test_::<T>(a.as_slice(), b.as_slice(), 2.0)?;
-    Ok(())
-}
-
-fn ggml_matmul_error_test_<T: GgmlType>(a: &[f32], b: &[f32], err_m: f32) -> Result<()> {
    let length = a.len();

    let mut a_quant = vec![T::zeros(); length / T::BLCK_SIZE];
    let mut b_quant = vec![T::VecDotType::zeros(); length / T::VecDotType::BLCK_SIZE];
-    T::from_float(a, &mut a_quant)?;
-    T::VecDotType::from_float(b, &mut b_quant)?;
+    T::from_float(&a, &mut a_quant)?;
+    T::VecDotType::from_float(&b, &mut b_quant)?;

    let result = T::vec_dot(length, &a_quant, &b_quant)?;
    let result_unopt = T::vec_dot_unopt(length, &a_quant, &b_quant)?;
-    let reference_result = vec_dot_reference(a, b);
+    let reference_result = vec_dot_reference(&a, &b);

    if (result - result_unopt).abs() / length as f32 > 1e-6 {
-        bail!(
+        candle_core::bail!(
            "the opt and unopt vec-dot returned different values, opt {result}, unopt {result_unopt}"
        )
    }

    let error = (result - reference_result).abs() / length as f32;

-    let ggml_error = ggml_reference_matmul_error(T::DTYPE)? * err_m;
+    let ggml_error = ggml_reference_matmul_error(T::DTYPE)?;

    if !error.is_finite() || error > GGML_MAX_DOT_PRODUCT_ERROR {
-        bail!("Dot product error {error} exceeds max error {GGML_MAX_DOT_PRODUCT_ERROR}",);
+        candle_core::bail!(
+            "Dot product error {error} exceeds max error {GGML_MAX_DOT_PRODUCT_ERROR}",
+        );
    }

    // We diverge slightly due to different rounding behavior / f16 to f32 conversions in GGML
    // => we use a slightly higher error threshold
    const ERROR_LENIENCY: f32 = 0.00001;
    if error - ERROR_LENIENCY > ggml_error {
-        bail!(
+        candle_core::bail!(
            "Dot product error {} exceeds ggml reference error {}",
            error,
            ggml_error
@ -699,16 +543,6 @@ fn ggml_matmul_error_test_<T: GgmlType>(a: &[f32], b: &[f32], err_m: f32) -> Res
    Ok(())
 }

-#[test]
-fn quantized_mm() -> Result<()> {
-    ggml_matmul_error_test::<k_quants::BlockQ4_0>()?;
-    ggml_matmul_error_test::<k_quants::BlockQ4_1>()?;
-    ggml_matmul_error_test::<k_quants::BlockQ5_0>()?;
-    ggml_matmul_error_test::<k_quants::BlockQ5_1>()?;
-    ggml_matmul_error_test::<k_quants::BlockQ8_0>()?;
-    Ok(())
-}
-
 /// generates random tensors of size `m x k` and `n x k` and calculates their expected matrix multiplication result.
 fn get_random_tensors(
    m: usize,
@ -732,108 +566,6 @@ fn get_random_tensors(
    Ok((lhs, rhs, mm))
 }

-#[macro_export]
-macro_rules! quantized_matmul {
-    // TODO: Switch to generating the two last arguments automatically once concat_idents is
-    // stable. https://github.com/rust-lang/rust/issues/29599
-    ($fn_name: ident, $fn_name_cpu: ident, $fn_name_cuda: ident, $fn_name_metal: ident, $dtype: expr) => {
-        fn $fn_name(device: &Device) -> Result<()> {
-            test_matmul(device, (1, 3, 4, 256), $dtype)?;
-            Ok(())
-        }
-
-        test_device!($fn_name, $fn_name_cpu, $fn_name_cuda, $fn_name_metal);
-    };
-}
-
-quantized_matmul!(
-    quantized_matmul_q4_0_bis,
-    quantized_matmul_q4_0_cpu,
-    quantized_matmul_q4_0_cuda,
-    quantized_matmul_q4_0_metal,
-    GgmlDType::Q4_0
-);
-quantized_matmul!(
-    quantized_matmul_q4_1_bis,
-    quantized_matmul_q4_1_cpu,
-    quantized_matmul_q4_1_cuda,
-    quantized_matmul_q4_1_metal,
-    GgmlDType::Q4_1
-);
-quantized_matmul!(
-    quantized_matmul_q5_0_bis,
-    quantized_matmul_q5_0_cpu,
-    quantized_matmul_q5_0_cuda,
-    quantized_matmul_q5_0_metal,
-    GgmlDType::Q5_0
-);
-quantized_matmul!(
-    quantized_matmul_q5_1_bis,
-    quantized_matmul_q5_1_cpu,
-    quantized_matmul_q5_1_cuda,
-    quantized_matmul_q5_1_metal,
-    GgmlDType::Q5_1
-);
-quantized_matmul!(
-    quantized_matmul_q8_0_bis,
-    quantized_matmul_q8_0_cpu,
-    quantized_matmul_q8_0_cuda,
-    quantized_matmul_q8_0_metal,
-    GgmlDType::Q8_0
-);
-// Not implemented in Ggml
-// quantized_matmul!(
-//     quantized_matmul_q8_1_bis,
-//     quantized_matmul_q8_1_cpu,
-//     quantized_matmul_q8_1_cuda,
-//     quantized_matmul_q8_1_metal,
-//     GgmlDType::Q8_1
-// );
-// TODO This is bugged (also bugged in GGML
-quantized_matmul!(
-    quantized_matmul_q2k_bis,
-    quantized_matmul_q2k_cpu,
-    quantized_matmul_q2k_cuda,
-    quantized_matmul_q2k_metal,
-    GgmlDType::Q2K
-);
-quantized_matmul!(
-    quantized_matmul_q3k_bis,
-    quantized_matmul_q3k_cpu,
-    quantized_matmul_q3k_cuda,
-    quantized_matmul_q3k_metal,
-    GgmlDType::Q3K
-);
-quantized_matmul!(
-    quantized_matmul_q4k_bis,
-    quantized_matmul_q4k_cpu,
-    quantized_matmul_q4k_cuda,
-    quantized_matmul_q4k_metal,
-    GgmlDType::Q4K
-);
-quantized_matmul!(
-    quantized_matmul_q5k_bis,
-    quantized_matmul_q5k_cpu,
-    quantized_matmul_q5k_cuda,
-    quantized_matmul_q5k_metal,
-    GgmlDType::Q5K
-);
-quantized_matmul!(
-    quantized_matmul_q6k_bis,
-    quantized_matmul_q6k_cpu,
-    quantized_matmul_q6k_cuda,
-    quantized_matmul_q6k_metal,
-    GgmlDType::Q6K
-);
-// Not implemented on metal
-// quantized_matmul!(
-//     quantized_matmul_q8k_bis,
-//     quantized_matmul_q8k_cpu,
-//     quantized_matmul_q8k_cuda,
-//     quantized_matmul_q8k_metal,
-//     GgmlDType::Q8K
-// );
-
 #[test]
 fn quantized_matmul_q2k() -> Result<()> {
    use k_quants::BlockQ2K;
@ -846,7 +578,7 @@ fn quantized_matmul_q2k() -> Result<()> {
    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);

-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q2K)?;
+    let rhs = quantized::QTensor::quantize::<BlockQ2K>(&rhs)?;
    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
    let mm = rhs.forward(&lhs)?;

@ -872,7 +604,7 @@ fn quantized_matmul_q3k() -> Result<()> {
    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);

-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q3K)?;
+    let rhs = quantized::QTensor::quantize::<BlockQ3K>(&rhs)?;
    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
    let mm = rhs.forward(&lhs)?;

@ -898,7 +630,7 @@ fn quantized_matmul_q4k() -> Result<()> {
    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);

-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q4K)?;
+    let rhs = quantized::QTensor::quantize::<BlockQ4K>(&rhs)?;
    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
    let mm = rhs.forward(&lhs)?;

@ -924,7 +656,7 @@ fn quantized_matmul_q5k() -> Result<()> {
    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);

-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q5K)?;
+    let rhs = quantized::QTensor::quantize::<BlockQ5K>(&rhs)?;
    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
    let mm = rhs.forward(&lhs)?;

@ -951,7 +683,7 @@ fn quantized_matmul_q6k() -> Result<()> {
    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);

-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q6K)?;
+    let rhs = quantized::QTensor::quantize::<BlockQ6K>(&rhs)?;
    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
    let mm = rhs.forward(&lhs)?;

@ -976,7 +708,7 @@ fn quantized_matmul_q8k() -> Result<()> {
    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);

-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q8K)?;
+    let rhs = quantized::QTensor::quantize::<BlockQ8K>(&rhs)?;
    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
    let mm = rhs.forward(&lhs)?;

--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@ -1,4 +1,4 @@
-use candle_core::{test_device, test_utils, DType, Device, IndexOp, Result, Tensor, D};
+use candle_core::{test_device, test_utils, DType, Device, IndexOp, Result, Tensor};

 fn zeros(device: &Device) -> Result<()> {
    let tensor = Tensor::zeros((5, 2), DType::F32, device)?;
@ -32,14 +32,6 @@ fn ones(device: &Device) -> Result<()> {
    Ok(())
 }

-fn full(device: &Device) -> Result<()> {
-    assert_eq!(
-        Tensor::full(42u32, (2, 3), device)?.to_vec2::<u32>()?,
-        [[42, 42, 42], [42, 42, 42]],
-    );
-    Ok(())
-}
-
 fn arange(device: &Device) -> Result<()> {
    assert_eq!(
        Tensor::arange(0u8, 5u8, device)?.to_vec1::<u8>()?,
@ -120,13 +112,6 @@ fn unary_op(device: &Device) -> Result<()> {
            [0.9999, -0.9891, -0.3079, 0.9891, 0.9999]
        ]
    );
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.silu()?, 4)?,
-        [
-            [-0.1423, 0.7311, 3.9281, -0.0475, 0.3112],
-            [2.53, -0.2553, -0.1205, 1.5447, 2.6395]
-        ]
-    );
    assert_eq!(
        test_utils::to_vec2_round(&tensor.ceil()?, 4)?,
        [[-3.0, 1.0, 4.0, -0.0, 1.0], [3.0, -1.0, -0.0, 2.0, 3.0]]
@ -672,31 +657,6 @@ fn cat(device: &Device) -> Result<()> {
            [2.0, 7.0, 1.0, 8.0, 2.0, 2.0, 7.0, 1.0, 8.0, 2.0]
        ]
    );
-
-    // 3D
-    let t1 = Tensor::arange(0, 48i64, device)?.reshape((2, 6, 4))?;
-    let t2 = Tensor::arange(100, 124i64, device)?.reshape((2, 3, 4))?;
-    let t3 = Tensor::arange(10000, 10032i64, device)?.reshape((2, 4, 4))?;
-
-    let t_cat = Tensor::cat(&[&t1, &t2, &t3], 1)?;
-
-    let t1 = t1.t()?.contiguous()?.t()?;
-    let t2 = t2.t()?.contiguous()?.t()?;
-    let t3 = t3.t()?.contiguous()?.t()?;
-    let t_cat2 = Tensor::cat(&[&t1, &t2, &t3], 1)?;
-
-    let diff = t_cat.eq(&t_cat2)?.to_dtype(DType::F32)?.sum_all()?;
-    assert_eq!(diff.to_vec0::<f32>()?, 104.0);
-    assert_eq!(t_cat.i((0, 0, 0))?.to_vec0::<i64>()?, 0);
-    assert_eq!(t_cat.i((0, 4, 0))?.to_vec0::<i64>()?, 16);
-    assert_eq!(t_cat.i((0, 5, 0))?.to_vec0::<i64>()?, 20);
-    assert_eq!(t_cat.i((1, 5, 0))?.to_vec0::<i64>()?, 44);
-    assert_eq!(t_cat.i((0, 6, 0))?.to_vec0::<i64>()?, 100);
-    assert_eq!(t_cat.i((1, 6, 0))?.to_vec0::<i64>()?, 112);
-    assert_eq!(t_cat.i((0, 6, 1))?.to_vec0::<i64>()?, 101);
-    assert_eq!(t_cat.i((0, 7, 1))?.to_vec0::<i64>()?, 105);
-    assert_eq!(t_cat.i((0, 12, 1))?.to_vec0::<i64>()?, 10013);
-    assert_eq!(t_cat.i((1, 12, 3))?.to_vec0::<i64>()?, 10031);
    Ok(())
 }

@ -1105,39 +1065,13 @@ fn broadcasting(device: &Device) -> Result<()> {
 fn randn(device: &Device) -> Result<()> {
    let tensor = Tensor::randn(0f32, 1f32, (5, 3), device)?;
    assert_eq!(tensor.dims(), [5, 3]);
-    // Check that the seed gets updated by checking that
-    // a new series of numbers is generated each time
-    let tensor2 = Tensor::randn(0f32, 1f32, (5, 3), device)?;
-    assert_ne!(tensor.to_vec2::<f32>()?, tensor2.to_vec2::<f32>()?);
    let tensor = Tensor::rand(0f32, 1f32, (5, 3), device)?;
    assert_eq!(tensor.dims(), [5, 3]);
-    // Check that the seed gets updated by checking that
-    // a new series of numbers is generated each time
-    let tensor2 = Tensor::rand(0f32, 1f32, (5, 3), device)?;
-    assert_ne!(tensor.to_vec2::<f32>()?, tensor2.to_vec2::<f32>()?);
-    // We do not expect deterministic elements at any index.
-    // There once was a bug that had a deterministic zero element in evenly sized tensors.
-    const N: usize = 2;
-    let v = (0..100)
-        .map(|_| Tensor::randn(0f32, 1f32, N, device).and_then(|t| t.to_vec1::<f32>()))
-        .collect::<Result<Vec<_>>>()?;
-    assert!(
-        (0..N).all(|i| v.windows(2).any(|pair| pair[0][i] != pair[1][i])),
-        "There are deterministic values in the randn tensors"
-    );
-    let v = (0..100)
-        .map(|_| Tensor::rand(0f32, 1f32, N, device).and_then(|t| t.to_vec1::<f32>()))
-        .collect::<Result<Vec<_>>>()?;
-    assert!(
-        (0..N).all(|i| v.windows(2).any(|pair| pair[0][i] != pair[1][i])),
-        "There are deterministic values in the rand tensors"
-    );
    Ok(())
 }

 test_device!(zeros, zeros_cpu, zeros_gpu, zeros_metal);
 test_device!(ones, ones_cpu, ones_gpu, ones_metal);
-test_device!(full, full_cpu, full_gpu, full_metal);
 test_device!(arange, arange_cpu, arange_gpu, arange_metal);
 test_device!(add_mul, add_mul_cpu, add_mul_gpu, add_mul_metal);
 test_device!(tensor_2d, tensor_2d_cpu, tensor_2d_gpu, tensor_2d_metal);
@ -1225,100 +1159,3 @@ fn i64_abs() -> Result<()> {
    assert_eq!(t.to_vec1::<i64>()?, [42, 1337]);
    Ok(())
 }
-
-#[test]
-fn tril_triu_eye() -> Result<()> {
-    let t = Tensor::tril2(4, DType::F32, &Device::Cpu)?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        [
-            [1.0, 0.0, 0.0, 0.0],
-            [1.0, 1.0, 0.0, 0.0],
-            [1.0, 1.0, 1.0, 0.0],
-            [1.0, 1.0, 1.0, 1.0]
-        ],
-    );
-    let t = Tensor::triu2(4, DType::F32, &Device::Cpu)?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        [
-            [1.0, 1.0, 1.0, 1.0],
-            [0.0, 1.0, 1.0, 1.0],
-            [0.0, 0.0, 1.0, 1.0],
-            [0.0, 0.0, 0.0, 1.0]
-        ]
-    );
-    let t = Tensor::eye(4, DType::F32, &Device::Cpu)?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        [
-            [1.0, 0.0, 0.0, 0.0],
-            [0.0, 1.0, 0.0, 0.0],
-            [0.0, 0.0, 1.0, 0.0],
-            [0.0, 0.0, 0.0, 1.0]
-        ]
-    );
-    Ok(())
-}
-
-#[test]
-fn cumsum() -> Result<()> {
-    let t = &[3f32, 1., 4., 1., 5.];
-    let t = Tensor::new(t, &Device::Cpu)?;
-    assert_eq!(t.cumsum(0)?.to_vec1::<f32>()?, [3., 4., 8., 9., 14.]);
-    let t = t.unsqueeze(1)?;
-    assert_eq!(
-        t.cumsum(0)?.to_vec2::<f32>()?,
-        [[3.0], [4.0], [8.0], [9.0], [14.0]]
-    );
-    assert_eq!(
-        t.cumsum(1)?.to_vec2::<f32>()?,
-        [[3.0], [1.0], [4.0], [1.0], [5.0]]
-    );
-    let t = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
-    let t = Tensor::new(t, &Device::Cpu)?;
-    assert_eq!(
-        t.cumsum(1)?.to_vec2::<f32>()?,
-        [[3.0, 4.0, 8.0, 9.0, 14.0], [2.0, 3.0, 10.0, 18.0, 20.0]],
-    );
-    assert_eq!(
-        t.cumsum(0)?.to_vec2::<f32>()?,
-        [[3.0, 1.0, 4.0, 1.0, 5.0], [5.0, 2.0, 11.0, 9.0, 7.0]]
-    );
-    Ok(())
-}
-
-/// A helper function for floating point comparison. Both a and b must be 1D Tensor and contains the same amount of data.
-/// Assertion passes if the difference of all pairs of a and b is smaller than epsilon.
-fn assert_close(a: &Tensor, b: &Tensor, epsilon: f64) -> Result<()> {
-    let a_vec: Vec<f64> = a.to_vec1()?;
-    let b_vec: Vec<f64> = b.to_vec1()?;
-
-    assert_eq!(a_vec.len(), b_vec.len());
-    for (a, b) in a_vec.iter().zip(b_vec.iter()) {
-        assert!((a - b).abs() < epsilon);
-    }
-    Ok(())
-}
-
-#[test]
-fn log_sum_exp() -> Result<()> {
-    let input = Tensor::new(&[[1f64, 2., 3.], [4., 5., 6.]], &Device::Cpu)?;
-    let output = input.log_sum_exp(D::Minus1)?;
-    // The expectations obtained from pytorch.
-    let expected = Tensor::new(&[3.4076, 6.4076], &Device::Cpu)?;
-    assert_close(&output, &expected, 0.00001)?;
-    Ok(())
-}
-
-#[test]
-fn pow() -> Result<()> {
-    let lhs = Tensor::new(&[[1f32, 2., 3.], [4., 5., 6.]], &Device::Cpu)?;
-    let rhs = (&lhs - 2.)?;
-    let res = lhs.pow(&rhs)?;
-    assert_eq!(
-        test_utils::to_vec2_round(&res, 4)?,
-        [[1.0, 1.0, 3.0], [16.0, 125.0, 1296.0001]]
-    );
-    Ok(())
-}
--- a/candle-core/tests/test.pt
+++ b/candle-core/tests/test.pt
--- a/candle-core/tests/test_with_key.pt
+++ b/candle-core/tests/test_with_key.pt
--- a/candle-datasets/Cargo.toml
+++ b/candle-datasets/Cargo.toml
@ -11,8 +11,8 @@ readme = "README.md"

 [dependencies]
 byteorder = { workspace = true }
-candle = { workspace = true }
-candle-nn = { workspace = true }
+candle = { path = "../candle-core", version = "0.3.0", package = "candle-core" }
+candle-nn = { path = "../candle-nn", version = "0.3.0" }
 hf-hub = { workspace = true}
 intel-mkl-src = { workspace = true, optional = true }
 memmap2 = { workspace = true }
--- a/candle-examples/Cargo.toml
+++ b/candle-examples/Cargo.toml
@ -11,60 +11,53 @@ readme = "README.md"

 [dependencies]
 accelerate-src = { workspace = true, optional = true }
-candle = { workspace = true }
-candle-datasets = { workspace = true, optional = true }
-candle-nn = { workspace = true }
-candle-transformers = { workspace = true }
-candle-flash-attn = { workspace = true, optional = true }
-candle-onnx = { workspace = true, optional = true }
-
-csv = "1.3.0"
+candle = { path = "../candle-core", version = "0.3.0", package = "candle-core" }
+candle-datasets = { path = "../candle-datasets", version = "0.3.0" }
+candle-nn = { path = "../candle-nn", version = "0.3.0" }
+candle-transformers = { path = "../candle-transformers", version = "0.3.0" }
+candle-flash-attn = { path = "../candle-flash-attn", version = "0.3.0", optional = true }
+candle-onnx = { path = "../candle-onnx", version = "0.3.0", optional = true }
 cudarc = { workspace = true, optional = true }
 half = { workspace = true, optional = true }
-hf-hub = { workspace = true, features = ["tokio"] }
 image = { workspace = true }
 intel-mkl-src = { workspace = true, optional = true }
 num-traits = { workspace = true }
 pyo3 = { version = "0.20.0", features = ["auto-initialize"], optional = true }
 rayon = { workspace = true }
-rubato = { version = "0.15.0", optional = true }
 safetensors = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
-symphonia = { version = "0.5.3", features = ["all"], optional = true }
 tokenizers = { workspace = true, features = ["onig"] }
-cpal= { version = "0.15.2", optional = true }

 [dev-dependencies]
 anyhow = { workspace = true }
 byteorder = { workspace = true }
 clap = { workspace = true }
+hf-hub = { workspace = true, features=["tokio"]}
 imageproc = { workspace = true }
 memmap2 = { workspace = true }
 rand = { workspace = true }
-ab_glyph = { workspace = true }
+rusttype = { workspace = true }
 tracing = { workspace = true }
 tracing-chrome = { workspace = true }
 tracing-subscriber = { workspace = true }
+wav = { workspace = true }
 # Necessary to disambiguate with tokio in wasm examples which are 1.28.1
 tokio = "1.29.1"

 [build-dependencies]
 anyhow = { workspace = true }
-bindgen_cuda = { version = "0.1.1", optional = true }

 [features]
 default = []
 accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
-cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda", "dep:bindgen_cuda"]
+cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
 cudnn = ["candle/cudnn"]
 flash-attn = ["cuda", "candle-transformers/flash-attn", "dep:candle-flash-attn"]
 mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl", "candle-transformers/mkl"]
 nccl = ["cuda", "cudarc/nccl", "dep:half"]
 onnx = ["candle-onnx"]
 metal = ["candle/metal", "candle-nn/metal"]
-microphone = ["cpal"]
-encodec = ["cpal", "symphonia", "rubato"]

 [[example]]
 name = "llama_multiprocess"
@ -81,23 +74,3 @@ required-features = ["onnx"]
 [[example]]
 name = "onnx_basics"
 required-features = ["onnx"]
-
-[[example]]
-name = "whisper"
-required-features = ["symphonia"]
-
-[[example]]
-name = "whisper-microphone"
-required-features = ["microphone"]
-
-[[example]]
-name = "mnist-training"
-required-features = ["candle-datasets"]
-
-[[example]]
-name = "llama2-c"
-required-features = ["candle-datasets"]
-
-[[example]]
-name = "encodec"
-required-features = ["encodec"]
--- a/candle-examples/build.rs
+++ b/candle-examples/build.rs
@ -4,28 +4,235 @@ use std::io::Write;
 use std::path::PathBuf;

 struct KernelDirectories {
-    kernel_glob: &'static str,
+    kernel_dir: &'static str,
    rust_target: &'static str,
    include_dirs: &'static [&'static str],
 }

-const KERNEL_DIRS: [KernelDirectories; 1] = [KernelDirectories {
-    kernel_glob: "examples/custom-ops/kernels/*.cu",
+const DIRS: [KernelDirectories; 1] = [KernelDirectories {
+    kernel_dir: "examples/custom-ops/kernels/",
    rust_target: "examples/custom-ops/cuda_kernels.rs",
    include_dirs: &[],
 }];

+impl KernelDirectories {
+    fn maybe_build_ptx(
+        &self,
+        cu_file: &std::path::Path,
+        ptx_file: &std::path::Path,
+        compute_cap: usize,
+    ) -> Result<()> {
+        let should_compile = if ptx_file.exists() {
+            let ptx_modified = ptx_file.metadata()?.modified()?;
+            let cu_modified = cu_file.metadata()?.modified()?;
+            cu_modified.duration_since(ptx_modified).is_ok()
+        } else {
+            true
+        };
+        if should_compile {
+            #[cfg(feature = "cuda")]
+            {
+                let mut command = std::process::Command::new("nvcc");
+                let out_dir = ptx_file.parent().context("no parent for ptx file")?;
+                let include_dirs: Vec<String> =
+                    self.include_dirs.iter().map(|c| format!("-I{c}")).collect();
+                command
+                    .arg(format!("--gpu-architecture=sm_{compute_cap}"))
+                    .arg("--ptx")
+                    .args(["--default-stream", "per-thread"])
+                    .args(["--output-directory", out_dir.to_str().unwrap()])
+                    .arg(format!("-I/{}", self.kernel_dir))
+                    .args(include_dirs)
+                    .arg(cu_file);
+                let output = command
+                    .spawn()
+                    .context("failed spawning nvcc")?
+                    .wait_with_output()?;
+                if !output.status.success() {
+                    anyhow::bail!(
+                    "nvcc error while compiling {cu_file:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
+                    String::from_utf8_lossy(&output.stdout),
+                    String::from_utf8_lossy(&output.stderr)
+                )
+                }
+            }
+            #[cfg(not(feature = "cuda"))]
+            std::fs::OpenOptions::new()
+                .create(true)
+                .write(true)
+                .open(ptx_file)?;
+        }
+        Ok(())
+    }
+    fn process(&self, out_dir: &std::path::Path, compute_cap: usize) -> Result<()> {
+        println!("cargo:rerun-if-changed={}", self.kernel_dir);
+        let kernel_dir = PathBuf::from(self.kernel_dir);
+        let out_dir = out_dir.join(self.kernel_dir);
+        if !out_dir.exists() {
+            std::fs::create_dir_all(&out_dir)?;
+        }
+        let mut cu_files = vec![];
+        let mut cuh_files = vec![];
+        for file in std::fs::read_dir(kernel_dir)?.flatten() {
+            let file = file.path();
+            match file.extension().and_then(|v| v.to_str()) {
+                Some("cu") => cu_files.push(file),
+                Some("cuh") => cuh_files.push(file),
+                _ => {}
+            }
+        }
+
+        let mut ptx_paths = vec![];
+        for cu_file in cu_files.iter() {
+            let file_stem = cu_file
+                .file_stem()
+                .with_context(|| format!("no stem {cu_file:?}"))?;
+            let file_stem = file_stem.to_string_lossy().into_owned();
+            let ptx_file = out_dir.join(&format!("{file_stem}.ptx"));
+            self.maybe_build_ptx(cu_file, &ptx_file, compute_cap)?;
+            ptx_paths.push(ptx_file);
+        }
+
+        let regenerate_rs_file = true;
+        if regenerate_rs_file {
+            let mut file = std::fs::File::create(self.rust_target)?;
+            for ptx_path in ptx_paths {
+                let name = ptx_path
+                    .file_stem()
+                    .context("empty stem")?
+                    .to_string_lossy();
+                file.write_all(b"#[rustfmt::skip]\n")?;
+                let const_definition = format!(
+                    r#"pub const {}: &str = include_str!(concat!(env!("OUT_DIR"), "/{}/{name}.ptx"));"#,
+                    name.to_uppercase().replace('.', "_"),
+                    self.kernel_dir,
+                );
+                file.write_all(const_definition.as_bytes())?;
+                file.write_all(b"\n")?;
+            }
+        }
+        Ok(())
+    }
+}
+
 fn main() -> Result<()> {
    println!("cargo:rerun-if-changed=build.rs");

+    let out_dir = std::env::var("OUT_DIR").context("OUT_DIR not set")?;
+    let out_dir = PathBuf::from(out_dir);
    #[cfg(feature = "cuda")]
-    {
-        for kdir in KERNEL_DIRS.iter() {
-            let builder = bindgen_cuda::Builder::default().kernel_paths_glob(kdir.kernel_glob);
-            println!("cargo:info={builder:?}");
-            let bindings = builder.build_ptx().unwrap();
-            bindings.write(kdir.rust_target).unwrap()
-        }
+    set_cuda_include_dir()?;
+    #[cfg(feature = "cuda")]
+    let compute_cap = compute_cap()?;
+    #[cfg(not(feature = "cuda"))]
+    let compute_cap = 0;
+    for d in DIRS {
+        d.process(&out_dir, compute_cap)?
    }
    Ok(())
 }
+
+fn set_cuda_include_dir() -> Result<()> {
+    // NOTE: copied from cudarc build.rs.
+    let env_vars = [
+        "CUDA_PATH",
+        "CUDA_ROOT",
+        "CUDA_TOOLKIT_ROOT_DIR",
+        "CUDNN_LIB",
+    ];
+    let env_vars = env_vars
+        .into_iter()
+        .map(std::env::var)
+        .filter_map(Result::ok)
+        .map(Into::<PathBuf>::into);
+
+    let roots = [
+        "/usr",
+        "/usr/local/cuda",
+        "/opt/cuda",
+        "/usr/lib/cuda",
+        "C:/Program Files/NVIDIA GPU Computing Toolkit",
+        "C:/CUDA",
+    ];
+    let roots = roots.into_iter().map(Into::<PathBuf>::into);
+    let root = env_vars
+        .chain(roots)
+        .find(|path| path.join("include").join("cuda.h").is_file())
+        .context("cannot find include/cuda.h")?;
+    println!(
+        "cargo:rustc-env=CUDA_INCLUDE_DIR={}",
+        root.join("include").display()
+    );
+    Ok(())
+}
+
+#[allow(unused)]
+fn compute_cap() -> Result<usize> {
+    // Grab compute code from nvidia-smi
+    let mut compute_cap = {
+        let out = std::process::Command::new("nvidia-smi")
+                    .arg("--query-gpu=compute_cap")
+                    .arg("--format=csv")
+                    .output()
+                    .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
+        let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
+        let mut lines = out.lines();
+        assert_eq!(
+            lines.next().context("missing line in stdout")?,
+            "compute_cap"
+        );
+        let cap = lines
+            .next()
+            .context("missing line in stdout")?
+            .replace('.', "");
+        cap.parse::<usize>()
+            .with_context(|| format!("cannot parse as int {cap}"))?
+    };
+
+    // Grab available GPU codes from nvcc and select the highest one
+    let max_nvcc_code = {
+        let out = std::process::Command::new("nvcc")
+                    .arg("--list-gpu-code")
+                    .output()
+                    .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
+        let out = std::str::from_utf8(&out.stdout).unwrap();
+
+        let out = out.lines().collect::<Vec<&str>>();
+        let mut codes = Vec::with_capacity(out.len());
+        for code in out {
+            let code = code.split('_').collect::<Vec<&str>>();
+            if !code.is_empty() && code.contains(&"sm") {
+                if let Ok(num) = code[1].parse::<usize>() {
+                    codes.push(num);
+                }
+            }
+        }
+        codes.sort();
+        if !codes.contains(&compute_cap) {
+            anyhow::bail!(
+                "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}."
+            );
+        }
+        *codes.last().unwrap()
+    };
+
+    // If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
+    // then choose the highest gpu code in nvcc
+    if compute_cap > max_nvcc_code {
+        println!(
+            "cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
+        );
+        compute_cap = max_nvcc_code;
+    }
+
+    println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
+
+    if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
+        compute_cap = compute_cap_str
+            .parse::<usize>()
+            .with_context(|| format!("cannot parse as usize '{compute_cap_str}'"))?;
+        println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
+    }
+    println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
+    Ok(compute_cap)
+}
--- a/candle-examples/examples/bert/README.md
+++ b/candle-examples/examples/bert/README.md
@ -2,10 +2,10 @@

 Bert is a general large language model. In this example it can be used for two
 different tasks:
-
 - Compute sentence embeddings for a prompt.
 - Compute similarities between a set of sentences.

+
 ## Sentence embeddings

 Bert is used to compute the sentence embeddings for a prompt. The model weights
@ -24,48 +24,6 @@ cargo run --example bert --release -- --prompt "Here is a test sentence"
 > Tensor[[1, 7, 384], f32]
 ```

-### Custom models
-
-You can specify different models, such as BGE, with the `--model-id` flag:
-
-```bash
-cargo run  --example bert --release -- \
--model-id BAAI/bge-large-zh-v1.5 \
--prompt "Here is a test sentence"
-Loaded and encoded 435.70775ms
-[[[ 3.0944e-1, -7.8455e-5,  -1.2768e0, ...,  1.3755e-2, -3.2371e-1,  2.3819e-1],
-  [-2.8506e-1,  1.9953e-1,  -1.3076e0, ...,  6.9819e-2,  1.0833e-2,  -1.1512e0],
-  [ 3.9892e-1,  2.0000e-1, -9.3178e-1, ..., -4.1393e-1, -4.9644e-2, -3.3786e-1],
-  ...
-  [ 6.0345e-1,  3.5744e-1,  -1.2672e0, ..., -6.9165e-1, -3.4973e-3, -8.4214e-1],
-  [ 3.9218e-1, -3.2735e-1,  -1.3123e0, ..., -4.9318e-1, -5.1334e-1, -3.6391e-1],
-  [ 3.0978e-1,  2.5662e-4,  -1.2773e0, ...,  1.3357e-2, -3.2390e-1,  2.3858e-1]]]
-Tensor[[1, 9, 1024], f32]
-Took 176.744667ms
-```
-
-### Gelu approximation
-
-You can get a speedup by using an approximation of the gelu activation, with a
-small loss of precision, by passing the `--approximate-gelu` flag:
-
-```bash
-$ cargo run  --example bert --release -- \
--model-id BAAI/bge-large-zh-v1.5 \
--prompt "Here is a test sentence" \
--approximate-gelu
-Loaded and encoded 244.388042ms
-[[[ 3.1048e-1, -6.0339e-4,  -1.2758e0, ...,  1.3718e-2, -3.2362e-1,  2.3775e-1],
-  [-2.8354e-1,  1.9984e-1,  -1.3077e0, ...,  6.9390e-2,  9.9681e-3,  -1.1531e0],
-  [ 3.9947e-1,  1.9917e-1, -9.3178e-1, ..., -4.1301e-1, -5.0719e-2, -3.3955e-1],
-  ...
-  [ 6.0499e-1,  3.5664e-1,  -1.2642e0, ..., -6.9134e-1, -3.4581e-3, -8.4471e-1],
-  [ 3.9311e-1, -3.2812e-1,  -1.3105e0, ..., -4.9291e-1, -5.1270e-1, -3.6543e-1],
-  [ 3.1082e-1, -2.6737e-4,  -1.2762e0, ...,  1.3319e-2, -3.2381e-1,  2.3815e-1]]]
-Tensor[[1, 9, 1024], f32]
-Took 116.840791ms
-```
-
 ## Similarities

 In this example, Bert is used to compute the sentence embeddings for a set of
--- a/candle-examples/examples/bert/main.rs
+++ b/candle-examples/examples/bert/main.rs
@ -3,7 +3,7 @@ extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
 extern crate accelerate_src;
-use candle_transformers::models::bert::{BertModel, Config, HiddenAct, DTYPE};
+use candle_transformers::models::bert::{BertModel, Config, DTYPE};

 use anyhow::{Error as E, Result};
 use candle::Tensor;
@ -45,10 +45,6 @@ struct Args {
    /// L2 normalization for embeddings.
    #[arg(long, default_value = "true")]
    normalize_embeddings: bool,
-
-    /// Use tanh based approximation for Gelu instead of erf implementation.
-    #[arg(long, default_value = "false")]
-    approximate_gelu: bool,
 }

 impl Args {
@ -77,7 +73,7 @@ impl Args {
            (config, tokenizer, weights)
        };
        let config = std::fs::read_to_string(config_filename)?;
-        let mut config: Config = serde_json::from_str(&config)?;
+        let config: Config = serde_json::from_str(&config)?;
        let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;

        let vb = if self.use_pth {
@ -85,9 +81,6 @@ impl Args {
        } else {
            unsafe { VarBuilder::from_mmaped_safetensors(&[weights_filename], DTYPE, &device)? }
        };
-        if self.approximate_gelu {
-            config.hidden_act = HiddenAct::GeluApproximate;
-        }
        let model = BertModel::load(vb, &config)?;
        Ok((model, tokenizer))
    }
--- a/candle-examples/examples/blip/main.rs
+++ b/candle-examples/examples/blip/main.rs
@ -106,17 +106,17 @@ pub fn main() -> anyhow::Result<()> {

    let config = blip::Config::image_captioning_large();

-    let device = candle_examples::device(args.cpu)?;
    let (image_embeds, device, mut model) = if args.quantized {
        let device = Device::Cpu;
        let image = load_image(args.image)?.to_device(&device)?;
        println!("loaded image {image:?}");

-        let vb = quantized_blip::VarBuilder::from_gguf(model_file, &device)?;
+        let vb = quantized_blip::VarBuilder::from_gguf(model_file)?;
        let model = quantized_blip::BlipForConditionalGeneration::new(&config, vb)?;
        let image_embeds = image.unsqueeze(0)?.apply(model.vision_model())?;
        (image_embeds, device, Model::Q(model))
    } else {
+        let device = candle_examples::device(args.cpu)?;
        let image = load_image(args.image)?.to_device(&device)?;
        println!("loaded image {image:?}");

--- a/candle-examples/examples/chatglm/main.rs
+++ b/candle-examples/examples/chatglm/main.rs
@ -1,237 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use anyhow::{Error as E, Result};
-use clap::Parser;
-
-use candle_transformers::models::chatglm::{Config, Model};
-
-use candle::{DType, Device, Tensor};
-use candle_nn::VarBuilder;
-use candle_transformers::generation::LogitsProcessor;
-use hf_hub::{api::sync::Api, Repo, RepoType};
-use tokenizers::Tokenizer;
-
-struct TextGeneration {
-    model: Model,
-    device: Device,
-    tokenizer: Tokenizer,
-    logits_processor: LogitsProcessor,
-    repeat_penalty: f32,
-    repeat_last_n: usize,
-    verbose_prompt: bool,
-}
-
-impl TextGeneration {
-    #[allow(clippy::too_many_arguments)]
-    fn new(
-        model: Model,
-        tokenizer: Tokenizer,
-        seed: u64,
-        temp: Option<f64>,
-        top_p: Option<f64>,
-        repeat_penalty: f32,
-        repeat_last_n: usize,
-        verbose_prompt: bool,
-        device: &Device,
-    ) -> Self {
-        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
-        Self {
-            model,
-            tokenizer,
-            logits_processor,
-            repeat_penalty,
-            repeat_last_n,
-            verbose_prompt,
-            device: device.clone(),
-        }
-    }
-
-    fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
-        use std::io::Write;
-        println!("starting the inference loop");
-        let tokens = self.tokenizer.encode(prompt, true).map_err(E::msg)?;
-        if tokens.is_empty() {
-            anyhow::bail!("Empty prompts are not supported in the chatglm model.")
-        }
-        if self.verbose_prompt {
-            for (token, id) in tokens.get_tokens().iter().zip(tokens.get_ids().iter()) {
-                let token = token.replace('▁', " ").replace("<0x0A>", "\n");
-                println!("{id:7} -> '{token}'");
-            }
-        }
-        let mut tokens = tokens.get_ids().to_vec();
-        let mut generated_tokens = 0usize;
-        let eos_token = match self.tokenizer.get_vocab(true).get("</s>") {
-            Some(token) => *token,
-            None => anyhow::bail!("cannot find the endoftext token"),
-        };
-        print!("{prompt}");
-        std::io::stdout().flush()?;
-        let start_gen = std::time::Instant::now();
-        for index in 0..sample_len {
-            let context_size = if index > 0 { 1 } else { tokens.len() };
-            let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
-            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
-            let logits = self.model.forward(&input)?;
-            let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
-            let logits = if self.repeat_penalty == 1. {
-                logits
-            } else {
-                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
-                candle_transformers::utils::apply_repeat_penalty(
-                    &logits,
-                    self.repeat_penalty,
-                    &tokens[start_at..],
-                )?
-            };
-
-            let next_token = self.logits_processor.sample(&logits)?;
-            tokens.push(next_token);
-            generated_tokens += 1;
-            if next_token == eos_token {
-                break;
-            }
-            let token = self.tokenizer.decode(&[next_token], true).map_err(E::msg)?;
-            print!("{token}");
-            std::io::stdout().flush()?;
-        }
-        let dt = start_gen.elapsed();
-        println!(
-            "\n{generated_tokens} tokens generated ({:.2} token/s)",
-            generated_tokens as f64 / dt.as_secs_f64(),
-        );
-        Ok(())
-    }
-}
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Enable tracing (generates a trace-timestamp.json file).
-    #[arg(long)]
-    tracing: bool,
-
-    /// Display the token for the specified prompt.
-    #[arg(long)]
-    verbose_prompt: bool,
-
-    #[arg(long)]
-    prompt: String,
-
-    /// The temperature used to generate samples.
-    #[arg(long)]
-    temperature: Option<f64>,
-
-    /// Nucleus sampling probability cutoff.
-    #[arg(long)]
-    top_p: Option<f64>,
-
-    /// The seed to use when generating random samples.
-    #[arg(long, default_value_t = 299792458)]
-    seed: u64,
-
-    /// The length of the sample to generate (in tokens).
-    #[arg(long, short = 'n', default_value_t = 5000)]
-    sample_len: usize,
-
-    #[arg(long)]
-    model_id: Option<String>,
-
-    #[arg(long)]
-    revision: Option<String>,
-
-    #[arg(long)]
-    weight_file: Option<String>,
-
-    #[arg(long)]
-    tokenizer: Option<String>,
-
-    /// Penalty to be applied for repeating tokens, 1. means no penalty.
-    #[arg(long, default_value_t = 1.1)]
-    repeat_penalty: f32,
-
-    /// The context size to consider for the repeat penalty.
-    #[arg(long, default_value_t = 64)]
-    repeat_last_n: usize,
-}
-
-fn main() -> Result<()> {
-    use tracing_chrome::ChromeLayerBuilder;
-    use tracing_subscriber::prelude::*;
-
-    let args = Args::parse();
-    let _guard = if args.tracing {
-        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
-        tracing_subscriber::registry().with(chrome_layer).init();
-        Some(guard)
-    } else {
-        None
-    };
-    println!(
-        "avx: {}, neon: {}, simd128: {}, f16c: {}",
-        candle::utils::with_avx(),
-        candle::utils::with_neon(),
-        candle::utils::with_simd128(),
-        candle::utils::with_f16c()
-    );
-    println!(
-        "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
-        args.temperature.unwrap_or(0.),
-        args.repeat_penalty,
-        args.repeat_last_n
-    );
-
-    let start = std::time::Instant::now();
-    let api = Api::new()?;
-    let model_id = match args.model_id {
-        Some(model_id) => model_id.to_string(),
-        None => "THUDM/chatglm3-6b".to_string(),
-    };
-    let revision = match args.revision {
-        Some(rev) => rev.to_string(),
-        None => "main".to_string(),
-    };
-    let repo = api.repo(Repo::with_revision(model_id, RepoType::Model, revision));
-    let tokenizer_filename = match args.tokenizer {
-        Some(file) => std::path::PathBuf::from(file),
-        None => api
-            .model("lmz/candle-chatglm".to_string())
-            .get("chatglm-tokenizer.json")?,
-    };
-    let filenames = match args.weight_file {
-        Some(weight_file) => vec![std::path::PathBuf::from(weight_file)],
-        None => candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?,
-    };
-    println!("retrieved the files in {:?}", start.elapsed());
-    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
-
-    let start = std::time::Instant::now();
-    let config = Config::glm3_6b();
-    let device = candle_examples::device(args.cpu)?;
-    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, DType::F32, &device)? };
-    let model = Model::new(&config, vb)?;
-
-    println!("loaded the model in {:?}", start.elapsed());
-
-    let mut pipeline = TextGeneration::new(
-        model,
-        tokenizer,
-        args.seed,
-        args.temperature,
-        args.top_p,
-        args.repeat_penalty,
-        args.repeat_last_n,
-        args.verbose_prompt,
-        &device,
-    );
-    pipeline.run(&args.prompt, args.sample_len)?;
-    Ok(())
-}
--- a/candle-examples/examples/convmixer/main.rs
+++ b/candle-examples/examples/convmixer/main.rs
@ -28,7 +28,7 @@ pub fn main() -> anyhow::Result<()> {

    let device = candle_examples::device(args.cpu)?;

-    let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
+    let image = candle_examples::imagenet::load_image224(args.image)?;
    println!("loaded image {image:?}");

    let model_file = match args.model {
--- a/candle-examples/examples/convnext/README.md
+++ b/candle-examples/examples/convnext/README.md
@ -1,23 +0,0 @@
-# candle-convnext
-
-[A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) and
-[ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808).
-
-This candle implementation uses a pre-trained ConvNeXt network for inference. The
-classification head has been trained on the ImageNet dataset and returns the
-probabilities for the top-5 classes.
-
-## Running an example
-
-```
-$ cargo run --example convnext --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg --which tiny
-
-loaded image Tensor[dims 3, 224, 224; f32]
-model built
-mountain bike, all-terrain bike, off-roader: 84.09%
-bicycle-built-for-two, tandem bicycle, tandem: 4.15%
-maillot                 : 0.74%
-crash helmet            : 0.54%
-unicycle, monocycle     : 0.44%
-
-```
--- a/candle-examples/examples/convnext/main.rs
+++ b/candle-examples/examples/convnext/main.rs
@ -1,126 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use clap::{Parser, ValueEnum};
-
-use candle::{DType, IndexOp, D};
-use candle_nn::{Module, VarBuilder};
-use candle_transformers::models::convnext;
-
-#[derive(Clone, Copy, Debug, ValueEnum)]
-enum Which {
-    Atto,
-    Femto,
-    Pico,
-    Nano,
-    Tiny,
-    Small,
-    Base,
-    Large,
-    AttoV2,
-    FemtoV2,
-    PicoV2,
-    NanoV2,
-    TinyV2,
-    BaseV2,
-    LargeV2,
-    XLarge,
-    Huge,
-}
-
-impl Which {
-    fn model_filename(&self) -> String {
-        let name = match self {
-            Self::Atto => "convnext_atto.d2_in1k",
-            Self::Femto => "convnext_femto.d1_in1k",
-            Self::Pico => "convnext_pico.d1_in1k",
-            Self::Nano => "convnext_nano.d1h_in1k",
-            Self::Tiny => "convnext_tiny.fb_in1k",
-            Self::Small => "convnext_small.fb_in1k",
-            Self::Base => "convnext_base.fb_in1k",
-            Self::Large => "convnext_large.fb_in1k",
-            Self::AttoV2 => "convnextv2_atto.fcmae_ft_in1k",
-            Self::FemtoV2 => "convnextv2_femto.fcmae_ft_in1k",
-            Self::PicoV2 => "convnextv2_pico.fcmae_ft_in1k",
-            Self::NanoV2 => "convnextv2_nano.fcmae_ft_in1k",
-            Self::TinyV2 => "convnextv2_tiny.fcmae_ft_in1k",
-            Self::BaseV2 => "convnextv2_base.fcmae_ft_in1k",
-            Self::LargeV2 => "convnextv2_large.fcmae_ft_in1k",
-            Self::XLarge => "convnext_xlarge.fb_in22k_ft_in1k",
-            Self::Huge => "convnextv2_huge.fcmae_ft_in1k",
-        };
-
-        format!("timm/{name}")
-    }
-
-    fn config(&self) -> convnext::Config {
-        match self {
-            Self::Atto | Self::AttoV2 => convnext::Config::atto(),
-            Self::Femto | Self::FemtoV2 => convnext::Config::femto(),
-            Self::Pico | Self::PicoV2 => convnext::Config::pico(),
-            Self::Nano | Self::NanoV2 => convnext::Config::nano(),
-            Self::Tiny | Self::TinyV2 => convnext::Config::tiny(),
-            Self::Small => convnext::Config::small(),
-            Self::Base | Self::BaseV2 => convnext::Config::base(),
-            Self::Large | Self::LargeV2 => convnext::Config::large(),
-            Self::XLarge => convnext::Config::xlarge(),
-            Self::Huge => convnext::Config::huge(),
-        }
-    }
-}
-
-#[derive(Parser)]
-struct Args {
-    #[arg(long)]
-    model: Option<String>,
-
-    #[arg(long)]
-    image: String,
-
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    #[arg(value_enum, long, default_value_t=Which::Tiny)]
-    which: Which,
-}
-
-pub fn main() -> anyhow::Result<()> {
-    let args = Args::parse();
-
-    let device = candle_examples::device(args.cpu)?;
-
-    let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
-    println!("loaded image {image:?}");
-
-    let model_file = match args.model {
-        None => {
-            let model_name = args.which.model_filename();
-            let api = hf_hub::api::sync::Api::new()?;
-            let api = api.model(model_name);
-            api.get("model.safetensors")?
-        }
-        Some(model) => model.into(),
-    };
-
-    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
-    let model = convnext::convnext(&args.which.config(), 1000, vb)?;
-    println!("model built");
-    let logits = model.forward(&image.unsqueeze(0)?)?;
-    let prs = candle_nn::ops::softmax(&logits, D::Minus1)?
-        .i(0)?
-        .to_vec1::<f32>()?;
-    let mut prs = prs.iter().enumerate().collect::<Vec<_>>();
-    prs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
-    for &(category_idx, pr) in prs.iter().take(5) {
-        println!(
-            "{:24}: {:.2}%",
-            candle_examples::imagenet::CLASSES[category_idx],
-            100. * pr
-        );
-    }
-    Ok(())
-}
--- a/candle-examples/examples/custom-ops/cuda_kernels.rs
+++ b/candle-examples/examples/custom-ops/cuda_kernels.rs
@ -1 +1,2 @@
-pub const LAYERNORM_KERNELS: &str = include_str!(concat!(env!("OUT_DIR"), "/layernorm_kernels.ptx"));
+#[rustfmt::skip]
+pub const LAYERNORM_KERNELS: &str = include_str!(concat!(env!("OUT_DIR"), "/examples/custom-ops/kernels//layernorm_kernels.ptx"));
--- a/candle-examples/examples/custom-ops/main.rs
+++ b/candle-examples/examples/custom-ops/main.rs
@ -6,8 +6,7 @@
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;

-#[rustfmt::skip]
-#[cfg(feature = "cuda")]
+#[allow(unused)]
 mod cuda_kernels;

 use clap::Parser;
--- a/candle-examples/examples/dinov2/main.rs
+++ b/candle-examples/examples/dinov2/main.rs
@ -31,7 +31,7 @@ pub fn main() -> anyhow::Result<()> {

    let device = candle_examples::device(args.cpu)?;

-    let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
+    let image = candle_examples::imagenet::load_image224(args.image)?;
    println!("loaded image {image:?}");

    let model_file = match args.model {
--- a/candle-examples/examples/distilbert/README.md
+++ b/candle-examples/examples/distilbert/README.md
@ -1,22 +0,0 @@
-# candle-distilbert
-
-DistilBert is a distiled version of the Bert model.
-
-## Sentence embeddings
-
-DistilBert is used to compute the sentence embeddings for a prompt. The model weights
-are downloaded from the hub on the first run.
-
-```bash
-cargo run --example distilbert --release -- --prompt "Here is a test sentence"
-
-> [[[ 0.5109,  0.1280, -0.2635, ...,  0.3462, -1.0434,  0.1441],
->   [ 0.1735,  0.0818, -0.5549, ...,  0.3472, -0.8264, -0.0244],
->   [ 0.0702, -0.1311, -0.4914, ...,  0.3483, -0.6194,  0.1829],
->   ...
->   [ 0.2993, -0.0106, -0.4640, ...,  0.2844, -0.6732,  0.0042],
->   [ 0.1066, -0.0081, -0.4299, ...,  0.3435, -0.7729,  0.0190],
->   [ 0.8903,  0.2055, -0.2541, ...,  0.3208, -0.6585,  0.0586]]]
-> Tensor[[1, 7, 768], f32]
-
-```
--- a/candle-examples/examples/distilbert/main.rs
+++ b/candle-examples/examples/distilbert/main.rs
@ -1,135 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-use candle_transformers::models::distilbert::{Config, DistilBertModel, DTYPE};
-
-use anyhow::{Error as E, Result};
-use candle::{Device, Tensor};
-use candle_nn::VarBuilder;
-use clap::Parser;
-use hf_hub::{api::sync::Api, Repo, RepoType};
-use tokenizers::Tokenizer;
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Enable tracing (generates a trace-timestamp.json file).
-    #[arg(long)]
-    tracing: bool,
-
-    /// The model to use, check out available models: https://huggingface.co/models?library=sentence-transformers&sort=trending
-    #[arg(long)]
-    model_id: Option<String>,
-
-    #[arg(long)]
-    revision: Option<String>,
-
-    /// When set, compute embeddings for this prompt.
-    #[arg(long)]
-    prompt: String,
-
-    /// Use the pytorch weights rather than the safetensors ones
-    #[arg(long)]
-    use_pth: bool,
-
-    /// The number of times to run the prompt.
-    #[arg(long, default_value = "1")]
-    n: usize,
-
-    /// L2 normalization for embeddings.
-    #[arg(long, default_value = "true")]
-    normalize_embeddings: bool,
-}
-
-impl Args {
-    fn build_model_and_tokenizer(&self) -> Result<(DistilBertModel, Tokenizer)> {
-        let device = candle_examples::device(self.cpu)?;
-        let default_model = "distilbert-base-uncased".to_string();
-        let default_revision = "main".to_string();
-        let (model_id, revision) = match (self.model_id.to_owned(), self.revision.to_owned()) {
-            (Some(model_id), Some(revision)) => (model_id, revision),
-            (Some(model_id), None) => (model_id, "main".to_string()),
-            (None, Some(revision)) => (default_model, revision),
-            (None, None) => (default_model, default_revision),
-        };
-
-        let repo = Repo::with_revision(model_id, RepoType::Model, revision);
-        let (config_filename, tokenizer_filename, weights_filename) = {
-            let api = Api::new()?;
-            let api = api.repo(repo);
-            let config = api.get("config.json")?;
-            let tokenizer = api.get("tokenizer.json")?;
-            let weights = if self.use_pth {
-                api.get("pytorch_model.bin")?
-            } else {
-                api.get("model.safetensors")?
-            };
-            (config, tokenizer, weights)
-        };
-        let config = std::fs::read_to_string(config_filename)?;
-        let config: Config = serde_json::from_str(&config)?;
-        let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
-
-        let vb = if self.use_pth {
-            VarBuilder::from_pth(&weights_filename, DTYPE, &device)?
-        } else {
-            unsafe { VarBuilder::from_mmaped_safetensors(&[weights_filename], DTYPE, &device)? }
-        };
-        let model = DistilBertModel::load(vb, &config)?;
-        Ok((model, tokenizer))
-    }
-}
-
-fn get_mask(size: usize, device: &Device) -> Tensor {
-    let mask: Vec<_> = (0..size)
-        .flat_map(|i| (0..size).map(move |j| u8::from(j > i)))
-        .collect();
-    Tensor::from_slice(&mask, (size, size), device).unwrap()
-}
-
-fn main() -> Result<()> {
-    use tracing_chrome::ChromeLayerBuilder;
-    use tracing_subscriber::prelude::*;
-
-    let args = Args::parse();
-    let _guard = if args.tracing {
-        println!("tracing...");
-        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
-        tracing_subscriber::registry().with(chrome_layer).init();
-        Some(guard)
-    } else {
-        None
-    };
-    let (model, mut tokenizer) = args.build_model_and_tokenizer()?;
-    let device = &model.device;
-
-    let tokenizer = tokenizer
-        .with_padding(None)
-        .with_truncation(None)
-        .map_err(E::msg)?;
-    let tokens = tokenizer
-        .encode(args.prompt, true)
-        .map_err(E::msg)?
-        .get_ids()
-        .to_vec();
-    let token_ids = Tensor::new(&tokens[..], device)?.unsqueeze(0)?;
-    let mask = get_mask(tokens.len(), device);
-
-    println!("token_ids: {:?}", token_ids.to_vec2::<u32>());
-    println!("mask: {:?}", mask.to_vec2::<u8>());
-
-    let ys = model.forward(&token_ids, &mask)?;
-    println!("{ys}");
-
-    Ok(())
-}
-
-pub fn normalize_l2(v: &Tensor) -> Result<Tensor> {
-    Ok(v.broadcast_div(&v.sqr()?.sum_keepdim(1)?.sqrt()?)?)
-}
--- a/candle-examples/examples/efficientnet/main.rs
+++ b/candle-examples/examples/efficientnet/main.rs
@ -47,7 +47,7 @@ pub fn main() -> anyhow::Result<()> {

    let device = candle_examples::device(args.cpu)?;

-    let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
+    let image = candle_examples::imagenet::load_image224(args.image)?;
    println!("loaded image {image:?}");

    let model_file = match args.model {
--- a/candle-examples/examples/efficientvit/README.md
+++ b/candle-examples/examples/efficientvit/README.md
@ -1,20 +0,0 @@
-# candle-efficientvit
-
-[EfﬁcientViT: Memory Efﬁcient Vision Transformer with Cascaded Group Attention](https://arxiv.org/abs/2305.07027).
-
-This candle implementation uses a pre-trained EfficientViT (from Microsoft Research Asia) network for inference.
-The classification head has been trained on the ImageNet dataset and returns the probabilities for the top-5 classes.
-
-## Running an example
-
-```
-$ cargo run --example efficientvit --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg --which m1
-
-loaded image Tensor[dims 3, 224, 224; f32]
-model built
-mountain bike, all-terrain bike, off-roader: 69.80%
-unicycle, monocycle     : 13.03%
-bicycle-built-for-two, tandem bicycle, tandem: 9.28%
-crash helmet            : 2.25%
-alp                     : 0.46%
-```
--- a/candle-examples/examples/efficientvit/main.rs
+++ b/candle-examples/examples/efficientvit/main.rs
@ -1,99 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use clap::{Parser, ValueEnum};
-
-use candle::{DType, IndexOp, D};
-use candle_nn::{Module, VarBuilder};
-use candle_transformers::models::efficientvit;
-
-#[derive(Clone, Copy, Debug, ValueEnum)]
-enum Which {
-    M0,
-    M1,
-    M2,
-    M3,
-    M4,
-    M5,
-}
-
-impl Which {
-    fn model_filename(&self) -> String {
-        let name = match self {
-            Self::M0 => "m0",
-            Self::M1 => "m1",
-            Self::M2 => "m2",
-            Self::M3 => "m3",
-            Self::M4 => "m4",
-            Self::M5 => "m5",
-        };
-        format!("timm/efficientvit_{}.r224_in1k", name)
-    }
-
-    fn config(&self) -> efficientvit::Config {
-        match self {
-            Self::M0 => efficientvit::Config::m0(),
-            Self::M1 => efficientvit::Config::m1(),
-            Self::M2 => efficientvit::Config::m2(),
-            Self::M3 => efficientvit::Config::m3(),
-            Self::M4 => efficientvit::Config::m4(),
-            Self::M5 => efficientvit::Config::m5(),
-        }
-    }
-}
-
-#[derive(Parser)]
-struct Args {
-    #[arg(long)]
-    model: Option<String>,
-
-    #[arg(long)]
-    image: String,
-
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    #[arg(value_enum, long, default_value_t=Which::M0)]
-    which: Which,
-}
-
-pub fn main() -> anyhow::Result<()> {
-    let args = Args::parse();
-
-    let device = candle_examples::device(args.cpu)?;
-
-    let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
-    println!("loaded image {image:?}");
-
-    let model_file = match args.model {
-        None => {
-            let model_name = args.which.model_filename();
-            let api = hf_hub::api::sync::Api::new()?;
-            let api = api.model(model_name);
-            api.get("model.safetensors")?
-        }
-        Some(model) => model.into(),
-    };
-
-    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
-    let model = efficientvit::efficientvit(&args.which.config(), 1000, vb)?;
-    println!("model built");
-    let logits = model.forward(&image.unsqueeze(0)?)?;
-    let prs = candle_nn::ops::softmax(&logits, D::Minus1)?
-        .i(0)?
-        .to_vec1::<f32>()?;
-    let mut prs = prs.iter().enumerate().collect::<Vec<_>>();
-    prs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
-    for &(category_idx, pr) in prs.iter().take(5) {
-        println!(
-            "{:24}: {:.2}%",
-            candle_examples::imagenet::CLASSES[category_idx],
-            100. * pr
-        );
-    }
-    Ok(())
-}
--- a/candle-examples/examples/encodec/README.md
+++ b/candle-examples/examples/encodec/README.md
@ -1,25 +0,0 @@
-# candle-endocec
-
-[EnCodec](https://huggingface.co/facebook/encodec_24khz) is a high-quality audio
-compression model using an encoder/decoder architecture with residual vector
-quantization.
-
-## Running one example
-
-```bash
-cargo run --example encodec --features symphonia --release -- code-to-audio \
-    candle-examples/examples/encodec/jfk-codes.safetensors \
-    jfk.wav
-```
-
-This decodes the EnCodec tokens stored in `jfk-codes.safetensors` and generates
-an output wav file containing the audio data.
-
-Instead of `code-to-audio` one can use:
- `audio-to-audio in.mp3 out.wav`: encodes the input audio file then decodes it to a wav file.
- `audio-to-code in.mp3 out.safetensors`: generates a safetensors file
-  containing EnCodec tokens for the input audio file.
-
-If the audio output file name is set to `-`, the audio content directly gets
-played on default audio output device. If the audio input file is set to `-`, the audio
-gets recorded from the default audio input.
--- a/candle-examples/examples/encodec/audio_io.rs
+++ b/candle-examples/examples/encodec/audio_io.rs
@ -1,275 +0,0 @@
-#![allow(unused)]
-use anyhow::{Context, Result};
-use std::sync::{Arc, Mutex};
-
-pub const SAMPLE_RATE: usize = 24_000;
-
-pub(crate) struct AudioOutputData_ {
-    resampled_data: std::collections::VecDeque<f32>,
-    resampler: rubato::FastFixedIn<f32>,
-    output_buffer: Vec<f32>,
-    input_buffer: Vec<f32>,
-    input_len: usize,
-}
-
-impl AudioOutputData_ {
-    pub(crate) fn new(input_sample_rate: usize, output_sample_rate: usize) -> Result<Self> {
-        use rubato::Resampler;
-
-        let resampled_data = std::collections::VecDeque::with_capacity(output_sample_rate * 10);
-        let resample_ratio = output_sample_rate as f64 / input_sample_rate as f64;
-        let resampler = rubato::FastFixedIn::new(
-            resample_ratio,
-            f64::max(resample_ratio, 1.0),
-            rubato::PolynomialDegree::Septic,
-            1024,
-            1,
-        )?;
-        let input_buffer = resampler.input_buffer_allocate(true).remove(0);
-        let output_buffer = resampler.output_buffer_allocate(true).remove(0);
-        Ok(Self {
-            resampled_data,
-            resampler,
-            input_buffer,
-            output_buffer,
-            input_len: 0,
-        })
-    }
-
-    pub fn reset(&mut self) {
-        use rubato::Resampler;
-        self.output_buffer.fill(0.);
-        self.input_buffer.fill(0.);
-        self.resampler.reset();
-        self.resampled_data.clear();
-    }
-
-    pub(crate) fn take_all(&mut self) -> Vec<f32> {
-        let mut data = Vec::with_capacity(self.resampled_data.len());
-        while let Some(elem) = self.resampled_data.pop_back() {
-            data.push(elem);
-        }
-        data
-    }
-
-    pub(crate) fn is_empty(&self) -> bool {
-        self.resampled_data.is_empty()
-    }
-
-    // Assumes that the input buffer is large enough.
-    fn push_input_buffer(&mut self, samples: &[f32]) {
-        self.input_buffer[self.input_len..self.input_len + samples.len()].copy_from_slice(samples);
-        self.input_len += samples.len()
-    }
-
-    pub(crate) fn push_samples(&mut self, samples: &[f32]) -> Result<()> {
-        use rubato::Resampler;
-
-        let mut pos_in = 0;
-        loop {
-            let rem = self.input_buffer.len() - self.input_len;
-            let pos_end = usize::min(pos_in + rem, samples.len());
-            self.push_input_buffer(&samples[pos_in..pos_end]);
-            pos_in = pos_end;
-            if self.input_len < self.input_buffer.len() {
-                break;
-            }
-            let (_, out_len) = self.resampler.process_into_buffer(
-                &[&self.input_buffer],
-                &mut [&mut self.output_buffer],
-                None,
-            )?;
-            for &elem in self.output_buffer[..out_len].iter() {
-                self.resampled_data.push_front(elem)
-            }
-            self.input_len = 0;
-        }
-        Ok(())
-    }
-}
-
-type AudioOutputData = Arc<Mutex<AudioOutputData_>>;
-
-pub(crate) fn setup_output_stream() -> Result<(cpal::Stream, AudioOutputData)> {
-    use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
-
-    println!("Setup audio output stream!");
-    let host = cpal::default_host();
-    let device = host
-        .default_output_device()
-        .context("no output device available")?;
-    let mut supported_configs_range = device.supported_output_configs()?;
-    let config_range = match supported_configs_range.find(|c| c.channels() == 1) {
-        // On macOS, it's commonly the case that there are only stereo outputs.
-        None => device
-            .supported_output_configs()?
-            .next()
-            .context("no audio output available")?,
-        Some(config_range) => config_range,
-    };
-    let sample_rate = cpal::SampleRate(SAMPLE_RATE as u32).clamp(
-        config_range.min_sample_rate(),
-        config_range.max_sample_rate(),
-    );
-    let config: cpal::StreamConfig = config_range.with_sample_rate(sample_rate).into();
-    let channels = config.channels as usize;
-    println!(
-        "cpal device: {} {} {config:?}",
-        device.name().unwrap_or_else(|_| "unk".to_string()),
-        config.sample_rate.0
-    );
-    let audio_data = Arc::new(Mutex::new(AudioOutputData_::new(
-        SAMPLE_RATE,
-        config.sample_rate.0 as usize,
-    )?));
-    let ad = audio_data.clone();
-    let stream = device.build_output_stream(
-        &config,
-        move |data: &mut [f32], _: &cpal::OutputCallbackInfo| {
-            data.fill(0.);
-            let mut ad = ad.lock().unwrap();
-            let mut last_elem = 0f32;
-            for (idx, elem) in data.iter_mut().enumerate() {
-                if idx % channels == 0 {
-                    match ad.resampled_data.pop_back() {
-                        None => break,
-                        Some(v) => {
-                            last_elem = v;
-                            *elem = v
-                        }
-                    }
-                } else {
-                    *elem = last_elem
-                }
-            }
-        },
-        move |err| eprintln!("cpal error: {err}"),
-        None, // None=blocking, Some(Duration)=timeout
-    )?;
-    stream.play()?;
-    Ok((stream, audio_data))
-}
-
-pub(crate) fn setup_input_stream() -> Result<(cpal::Stream, AudioOutputData)> {
-    use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
-
-    println!("Setup audio input stream!");
-    let host = cpal::default_host();
-    let device = host
-        .default_input_device()
-        .context("no input device available")?;
-    let mut supported_configs_range = device.supported_input_configs()?;
-    let config_range = supported_configs_range
-        .find(|c| c.channels() == 1)
-        .context("no audio input available")?;
-    let sample_rate = cpal::SampleRate(SAMPLE_RATE as u32).clamp(
-        config_range.min_sample_rate(),
-        config_range.max_sample_rate(),
-    );
-    let config: cpal::StreamConfig = config_range.with_sample_rate(sample_rate).into();
-    println!(
-        "cpal device: {} {} {config:?}",
-        device.name().unwrap_or_else(|_| "unk".to_string()),
-        config.sample_rate.0
-    );
-    let audio_data = Arc::new(Mutex::new(AudioOutputData_::new(
-        config.sample_rate.0 as usize,
-        SAMPLE_RATE,
-    )?));
-    let ad = audio_data.clone();
-    let stream = device.build_input_stream(
-        &config,
-        move |data: &[f32], _: &cpal::InputCallbackInfo| {
-            let mut ad = ad.lock().unwrap();
-            if let Err(err) = ad.push_samples(data) {
-                eprintln!("error processing audio input {err:?}")
-            }
-        },
-        move |err| eprintln!("cpal error: {err}"),
-        None, // None=blocking, Some(Duration)=timeout
-    )?;
-    stream.play()?;
-    Ok((stream, audio_data))
-}
-
-fn conv<T>(samples: &mut Vec<f32>, data: std::borrow::Cow<symphonia::core::audio::AudioBuffer<T>>)
-where
-    T: symphonia::core::sample::Sample,
-    f32: symphonia::core::conv::FromSample<T>,
-{
-    use symphonia::core::audio::Signal;
-    use symphonia::core::conv::FromSample;
-    samples.extend(data.chan(0).iter().map(|v| f32::from_sample(*v)))
-}
-
-pub(crate) fn pcm_decode<P: AsRef<std::path::Path>>(path: P) -> Result<(Vec<f32>, u32)> {
-    use symphonia::core::audio::{AudioBufferRef, Signal};
-
-    let src = std::fs::File::open(path)?;
-    let mss = symphonia::core::io::MediaSourceStream::new(Box::new(src), Default::default());
-    let hint = symphonia::core::probe::Hint::new();
-    let meta_opts: symphonia::core::meta::MetadataOptions = Default::default();
-    let fmt_opts: symphonia::core::formats::FormatOptions = Default::default();
-    let probed = symphonia::default::get_probe().format(&hint, mss, &fmt_opts, &meta_opts)?;
-    let mut format = probed.format;
-    let track = format
-        .tracks()
-        .iter()
-        .find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL)
-        .expect("no supported audio tracks");
-    let mut decoder = symphonia::default::get_codecs()
-        .make(&track.codec_params, &Default::default())
-        .expect("unsupported codec");
-    let track_id = track.id;
-    let sample_rate = track.codec_params.sample_rate.unwrap_or(0);
-    let mut pcm_data = Vec::new();
-    while let Ok(packet) = format.next_packet() {
-        while !format.metadata().is_latest() {
-            format.metadata().pop();
-        }
-        if packet.track_id() != track_id {
-            continue;
-        }
-        match decoder.decode(&packet)? {
-            AudioBufferRef::F32(buf) => pcm_data.extend(buf.chan(0)),
-            AudioBufferRef::U8(data) => conv(&mut pcm_data, data),
-            AudioBufferRef::U16(data) => conv(&mut pcm_data, data),
-            AudioBufferRef::U24(data) => conv(&mut pcm_data, data),
-            AudioBufferRef::U32(data) => conv(&mut pcm_data, data),
-            AudioBufferRef::S8(data) => conv(&mut pcm_data, data),
-            AudioBufferRef::S16(data) => conv(&mut pcm_data, data),
-            AudioBufferRef::S24(data) => conv(&mut pcm_data, data),
-            AudioBufferRef::S32(data) => conv(&mut pcm_data, data),
-            AudioBufferRef::F64(data) => conv(&mut pcm_data, data),
-        }
-    }
-    Ok((pcm_data, sample_rate))
-}
-
-pub(crate) fn resample(pcm_in: &[f32], sr_in: usize, sr_out: usize) -> Result<Vec<f32>> {
-    use rubato::Resampler;
-
-    let mut pcm_out =
-        Vec::with_capacity((pcm_in.len() as f64 * sr_out as f64 / sr_in as f64) as usize + 1024);
-
-    let mut resampler = rubato::FftFixedInOut::<f32>::new(sr_in, sr_out, 1024, 1)?;
-    let mut output_buffer = resampler.output_buffer_allocate(true);
-    let mut pos_in = 0;
-    while pos_in + resampler.input_frames_next() < pcm_in.len() {
-        let (in_len, out_len) =
-            resampler.process_into_buffer(&[&pcm_in[pos_in..]], &mut output_buffer, None)?;
-        pos_in += in_len;
-        pcm_out.extend_from_slice(&output_buffer[0][..out_len]);
-    }
-
-    if pos_in < pcm_in.len() {
-        let (_in_len, out_len) = resampler.process_partial_into_buffer(
-            Some(&[&pcm_in[pos_in..]]),
-            &mut output_buffer,
-            None,
-        )?;
-        pcm_out.extend_from_slice(&output_buffer[0][..out_len]);
-    }
-
-    Ok(pcm_out)
-}
--- a/candle-examples/examples/encodec/jfk-codes.safetensors
+++ b/candle-examples/examples/encodec/jfk-codes.safetensors
--- a/candle-examples/examples/encodec/main.rs
+++ b/candle-examples/examples/encodec/main.rs
@ -1,131 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use anyhow::Result;
-use candle::{DType, IndexOp, Tensor};
-use candle_nn::VarBuilder;
-use candle_transformers::models::encodec::{Config, Model};
-use clap::{Parser, ValueEnum};
-use hf_hub::api::sync::Api;
-
-mod audio_io;
-
-#[derive(Clone, Debug, Copy, PartialEq, Eq, ValueEnum)]
-enum Action {
-    AudioToAudio,
-    AudioToCode,
-    CodeToAudio,
-}
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// The action to be performed, specifies the format for the input and output data.
-    action: Action,
-
-    /// The input file, either an audio file or some encodec tokens stored as safetensors.
-    in_file: String,
-
-    /// The output file, either a wave audio file or some encodec tokens stored as safetensors.
-    out_file: String,
-
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// The model weight file, in safetensor format.
-    #[arg(long)]
-    model: Option<String>,
-}
-
-fn main() -> Result<()> {
-    let args = Args::parse();
-    let device = candle_examples::device(args.cpu)?;
-    let model = match args.model {
-        Some(model) => std::path::PathBuf::from(model),
-        None => Api::new()?
-            .model("facebook/encodec_24khz".to_string())
-            .get("model.safetensors")?,
-    };
-    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model], DType::F32, &device)? };
-    let config = Config::default();
-    let model = Model::new(&config, vb)?;
-
-    let codes = match args.action {
-        Action::CodeToAudio => {
-            let codes = candle::safetensors::load(args.in_file, &device)?;
-            codes.get("codes").expect("no codes in input file").clone()
-        }
-        Action::AudioToCode | Action::AudioToAudio => {
-            let pcm = if args.in_file == "-" {
-                println!(">>>> RECORDING AUDIO, PRESS ENTER ONCE DONE <<<<");
-                let (stream, input_audio) = audio_io::setup_input_stream()?;
-                let mut pcms = vec![];
-                let stdin = std::thread::spawn(|| {
-                    let mut s = String::new();
-                    std::io::stdin().read_line(&mut s)
-                });
-                while !stdin.is_finished() {
-                    let input = input_audio.lock().unwrap().take_all();
-                    if input.is_empty() {
-                        std::thread::sleep(std::time::Duration::from_millis(100));
-                        continue;
-                    }
-                    pcms.push(input)
-                }
-                drop(stream);
-                pcms.concat()
-            } else {
-                let (pcm, sample_rate) = audio_io::pcm_decode(args.in_file)?;
-                if sample_rate != 24_000 {
-                    println!("WARNING: encodec uses a 24khz sample rate, input uses {sample_rate}, resampling...");
-                    audio_io::resample(&pcm, sample_rate as usize, 24_000)?
-                } else {
-                    pcm
-                }
-            };
-            let pcm_len = pcm.len();
-            let pcm = Tensor::from_vec(pcm, (1, 1, pcm_len), &device)?;
-            println!("input pcm shape: {:?}", pcm.shape());
-            model.encode(&pcm)?
-        }
-    };
-    println!("codes shape: {:?}", codes.shape());
-
-    match args.action {
-        Action::AudioToCode => {
-            codes.save_safetensors("codes", &args.out_file)?;
-        }
-        Action::AudioToAudio | Action::CodeToAudio => {
-            let pcm = model.decode(&codes)?;
-            println!("output pcm shape: {:?}", pcm.shape());
-            let pcm = pcm.i(0)?.i(0)?;
-            let pcm = candle_examples::audio::normalize_loudness(&pcm, 24_000, true)?;
-            let pcm = pcm.to_vec1::<f32>()?;
-            if args.out_file == "-" {
-                let (stream, ad) = audio_io::setup_output_stream()?;
-                {
-                    let mut ad = ad.lock().unwrap();
-                    ad.push_samples(&pcm)?;
-                }
-                loop {
-                    let ad = ad.lock().unwrap();
-                    if ad.is_empty() {
-                        break;
-                    }
-                    // That's very weird, calling thread::sleep here triggers the stream to stop
-                    // playing (the callback doesn't seem to be called anymore).
-                    // std::thread::sleep(std::time::Duration::from_millis(100));
-                }
-                drop(stream)
-            } else {
-                let mut output = std::fs::File::create(&args.out_file)?;
-                candle_examples::wav::write_pcm_as_wav(&mut output, &pcm, 24_000)?;
-            }
-        }
-    }
-    Ok(())
-}
--- a/candle-examples/examples/falcon/main.rs
+++ b/candle-examples/examples/falcon/main.rs
@ -165,7 +165,14 @@ fn main() -> Result<()> {
        args.revision,
    ));
    let tokenizer_filename = repo.get("tokenizer.json")?;
-    let filenames = candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?;
+    let mut filenames = vec![];
+    for rfilename in [
+        "model-00001-of-00002.safetensors",
+        "model-00002-of-00002.safetensors",
+    ] {
+        let filename = repo.get(rfilename)?;
+        filenames.push(filename);
+    }
    println!("retrieved the files in {:?}", start.elapsed());
    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;

--- a/candle-examples/examples/gemma/README.md
+++ b/candle-examples/examples/gemma/README.md
@ -1,27 +0,0 @@
-# candle-gemma: 2b and 7b LLMs from Google DeepMind
-
-[Gemma](https://ai.google.dev/gemma/docs) is a collection of lightweight open
-models published by Google Deepmind with a 2b and a 7b variant.
-
-In order to use the example below, you have to accept the license on the
-[HuggingFace Hub Gemma repo](https://huggingface.co/google/gemma-7b) and set up
-your access token via the [HuggingFace cli login
-command](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-login).
-
-## Running the example
-
-```bash
-$ cargo run --example gemma --release -- --prompt "fn count_primes(max_n: usize)"
-fn count_primes(max_n: usize) -> usize {
-    let mut primes = vec![true; max_n];
-    for i in 2..=max_n {
-        if primes[i] {
-            for j in i * i..max_n {
-                primes[j] = false;
-             }
-         }
-    }
-    primes.len()
-}
-```
-
--- a/candle-examples/examples/gemma/main.rs
+++ b/candle-examples/examples/gemma/main.rs
@ -1,256 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use anyhow::{Error as E, Result};
-use clap::Parser;
-
-use candle_transformers::models::gemma::{Config, Model};
-
-use candle::{DType, Device, Tensor};
-use candle_examples::token_output_stream::TokenOutputStream;
-use candle_nn::VarBuilder;
-use candle_transformers::generation::LogitsProcessor;
-use hf_hub::{api::sync::Api, Repo, RepoType};
-use tokenizers::Tokenizer;
-
-struct TextGeneration {
-    model: Model,
-    device: Device,
-    tokenizer: TokenOutputStream,
-    logits_processor: LogitsProcessor,
-    repeat_penalty: f32,
-    repeat_last_n: usize,
-}
-
-impl TextGeneration {
-    #[allow(clippy::too_many_arguments)]
-    fn new(
-        model: Model,
-        tokenizer: Tokenizer,
-        seed: u64,
-        temp: Option<f64>,
-        top_p: Option<f64>,
-        repeat_penalty: f32,
-        repeat_last_n: usize,
-        device: &Device,
-    ) -> Self {
-        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
-        Self {
-            model,
-            tokenizer: TokenOutputStream::new(tokenizer),
-            logits_processor,
-            repeat_penalty,
-            repeat_last_n,
-            device: device.clone(),
-        }
-    }
-
-    fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
-        use std::io::Write;
-        self.tokenizer.clear();
-        let mut tokens = self
-            .tokenizer
-            .tokenizer()
-            .encode(prompt, true)
-            .map_err(E::msg)?
-            .get_ids()
-            .to_vec();
-        for &t in tokens.iter() {
-            if let Some(t) = self.tokenizer.next_token(t)? {
-                print!("{t}")
-            }
-        }
-        std::io::stdout().flush()?;
-
-        let mut generated_tokens = 0usize;
-        let eos_token = match self.tokenizer.get_token("<eos>") {
-            Some(token) => token,
-            None => anyhow::bail!("cannot find the <eos> token"),
-        };
-        let start_gen = std::time::Instant::now();
-        for index in 0..sample_len {
-            let context_size = if index > 0 { 1 } else { tokens.len() };
-            let start_pos = tokens.len().saturating_sub(context_size);
-            let ctxt = &tokens[start_pos..];
-            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
-            let logits = self.model.forward(&input, start_pos)?;
-            let logits = logits.squeeze(0)?.squeeze(0)?.to_dtype(DType::F32)?;
-            let logits = if self.repeat_penalty == 1. {
-                logits
-            } else {
-                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
-                candle_transformers::utils::apply_repeat_penalty(
-                    &logits,
-                    self.repeat_penalty,
-                    &tokens[start_at..],
-                )?
-            };
-
-            let next_token = self.logits_processor.sample(&logits)?;
-            tokens.push(next_token);
-            generated_tokens += 1;
-            if next_token == eos_token {
-                break;
-            }
-            if let Some(t) = self.tokenizer.next_token(next_token)? {
-                print!("{t}");
-                std::io::stdout().flush()?;
-            }
-        }
-        let dt = start_gen.elapsed();
-        if let Some(rest) = self.tokenizer.decode_rest().map_err(E::msg)? {
-            print!("{rest}");
-        }
-        std::io::stdout().flush()?;
-        println!(
-            "\n{generated_tokens} tokens generated ({:.2} token/s)",
-            generated_tokens as f64 / dt.as_secs_f64(),
-        );
-        Ok(())
-    }
-}
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Enable tracing (generates a trace-timestamp.json file).
-    #[arg(long)]
-    tracing: bool,
-
-    #[arg(long)]
-    prompt: String,
-
-    /// The temperature used to generate samples.
-    #[arg(long)]
-    temperature: Option<f64>,
-
-    /// Nucleus sampling probability cutoff.
-    #[arg(long)]
-    top_p: Option<f64>,
-
-    /// The seed to use when generating random samples.
-    #[arg(long, default_value_t = 299792458)]
-    seed: u64,
-
-    /// The length of the sample to generate (in tokens).
-    #[arg(long, short = 'n', default_value_t = 10000)]
-    sample_len: usize,
-
-    #[arg(long)]
-    model_id: Option<String>,
-
-    #[arg(long, default_value = "main")]
-    revision: String,
-
-    #[arg(long)]
-    tokenizer_file: Option<String>,
-
-    #[arg(long)]
-    config_file: Option<String>,
-
-    #[arg(long)]
-    weight_files: Option<String>,
-
-    /// Penalty to be applied for repeating tokens, 1. means no penalty.
-    #[arg(long, default_value_t = 1.1)]
-    repeat_penalty: f32,
-
-    /// The context size to consider for the repeat penalty.
-    #[arg(long, default_value_t = 64)]
-    repeat_last_n: usize,
-}
-
-fn main() -> Result<()> {
-    use tracing_chrome::ChromeLayerBuilder;
-    use tracing_subscriber::prelude::*;
-
-    let args = Args::parse();
-    let _guard = if args.tracing {
-        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
-        tracing_subscriber::registry().with(chrome_layer).init();
-        Some(guard)
-    } else {
-        None
-    };
-    println!(
-        "avx: {}, neon: {}, simd128: {}, f16c: {}",
-        candle::utils::with_avx(),
-        candle::utils::with_neon(),
-        candle::utils::with_simd128(),
-        candle::utils::with_f16c()
-    );
-    println!(
-        "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
-        args.temperature.unwrap_or(0.),
-        args.repeat_penalty,
-        args.repeat_last_n
-    );
-
-    let start = std::time::Instant::now();
-    let api = Api::new()?;
-    let model_id = match &args.model_id {
-        Some(model_id) => match model_id.as_str() {
-            "7b-it" => "google/gemma-7b-it".to_string(),
-            "7b" => "google/gemma-7b".to_string(),
-            "2b-it" => "google/gemma-2b-it".to_string(),
-            "2b" => "google/gemma-2b".to_string(),
-            _ => model_id.to_string(),
-        },
-        None => "google/gemma-2b".to_string(),
-    };
-    let repo = api.repo(Repo::with_revision(
-        model_id,
-        RepoType::Model,
-        args.revision,
-    ));
-    let tokenizer_filename = match args.tokenizer_file {
-        Some(file) => std::path::PathBuf::from(file),
-        None => repo.get("tokenizer.json")?,
-    };
-    let config_filename = match args.config_file {
-        Some(file) => std::path::PathBuf::from(file),
-        None => repo.get("config.json")?,
-    };
-    let filenames = match args.weight_files {
-        Some(files) => files
-            .split(',')
-            .map(std::path::PathBuf::from)
-            .collect::<Vec<_>>(),
-        None => candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?,
-    };
-    println!("retrieved the files in {:?}", start.elapsed());
-    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
-    let config: Config = serde_json::from_reader(std::fs::File::open(config_filename)?)?;
-
-    let start = std::time::Instant::now();
-    let device = candle_examples::device(args.cpu)?;
-    let dtype = if device.is_cuda() {
-        DType::BF16
-    } else {
-        DType::F32
-    };
-    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
-    let model = Model::new(&config, vb)?;
-
-    println!("loaded the model in {:?}", start.elapsed());
-
-    let mut pipeline = TextGeneration::new(
-        model,
-        tokenizer,
-        args.seed,
-        args.temperature,
-        args.top_p,
-        args.repeat_penalty,
-        args.repeat_last_n,
-        &device,
-    );
-    pipeline.run(&args.prompt, args.sample_len)?;
-    Ok(())
-}
--- a/candle-examples/examples/llama/main.rs
+++ b/candle-examples/examples/llama/main.rs
@ -13,7 +13,7 @@ extern crate accelerate_src;
 extern crate intel_mkl_src;

 use anyhow::{bail, Error as E, Result};
-use clap::{Parser, ValueEnum};
+use clap::Parser;

 use candle::{DType, Tensor};
 use candle_nn::VarBuilder;
@ -22,21 +22,11 @@ use hf_hub::{api::sync::Api, Repo, RepoType};
 use std::io::Write;

 use candle_transformers::models::llama as model;
-use model::{Llama, LlamaConfig};
+use model::{Config, Llama, LlamaConfig};

 const EOS_TOKEN: &str = "</s>";
 const DEFAULT_PROMPT: &str = "My favorite theorem is ";

-#[derive(Clone, Debug, Copy, PartialEq, Eq, ValueEnum)]
-enum Which {
-    V1,
-    V2,
-    #[value(name = "solar-10.7b")]
-    Solar10_7B,
-    #[value(name = "tiny-llama-1.1b-chat")]
-    TinyLlama1_1BChat,
-}
-
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
 struct Args {
@ -44,6 +34,10 @@ struct Args {
    #[arg(long)]
    cpu: bool,

+    /// Use npy instead of safetensors
+    #[arg(long)]
+    npy: Option<String>,
+
    /// The temperature used to generate samples.
    #[arg(long)]
    temperature: Option<f64>,
@ -57,7 +51,7 @@ struct Args {
    seed: u64,

    /// The length of the sample to generate (in tokens).
-    #[arg(long, default_value_t = 10000)]
+    #[arg(long, default_value_t = 100)]
    sample_len: usize,

    /// Disable the key-value cache.
@ -82,13 +76,17 @@ struct Args {
    #[arg(long)]
    revision: Option<String>,

-    /// The model size to use.
-    #[arg(long, default_value = "v2")]
-    which: Which,
+    #[arg(long)]
+    v1: bool,

    #[arg(long)]
    use_flash_attn: bool,

+    /// The folder name that contains safetensor weights and json files
+    /// (same structure as huggingface online)
+    #[arg(long)]
+    local_weights: Option<String>,
+
    /// Penalty to be applied for repeating tokens, 1. means no penalty.
    #[arg(long, default_value_t = 1.0)]
    repeat_penalty: f32,
@ -120,33 +118,65 @@ fn main() -> Result<()> {
        Some(dtype) => bail!("Unsupported dtype {dtype}"),
        None => DType::F16,
    };
-    let (llama, tokenizer_filename, mut cache) = {
-        let api = Api::new()?;
-        let model_id = args.model_id.unwrap_or_else(|| match args.which {
-            Which::V1 => "Narsil/amall-7b".to_string(),
-            Which::V2 => "meta-llama/Llama-2-7b-hf".to_string(),
-            Which::Solar10_7B => "upstage/SOLAR-10.7B-v1.0".to_string(),
-            Which::TinyLlama1_1BChat => "TinyLlama/TinyLlama-1.1B-Chat-v1.0".to_string(),
-        });
-        println!("loading the model weights from {model_id}");
-        let revision = args.revision.unwrap_or("main".to_string());
-        let api = api.repo(Repo::with_revision(model_id, RepoType::Model, revision));
+    let (llama, tokenizer_filename, cache) = match args.npy {
+        Some(filename) => {
+            let config = if args.v1 {
+                Config::config_7b_v1(args.use_flash_attn)
+            } else {
+                Config::config_7b_v2(args.use_flash_attn)
+            };
+            let cache = model::Cache::new(!args.no_kv_cache, dtype, &config, &device)?;
+            let vb = VarBuilder::from_npz(filename, dtype, &device)?;
+            let tokenizer = std::path::PathBuf::from("llama-tokenizer.json");
+            (Llama::load(vb, &cache, &config)?, tokenizer, cache)
+        }
+        None => {
+            let api = Api::new()?;
+            let model_id = args.model_id.unwrap_or_else(|| {
+                if args.v1 {
+                    "Narsil/amall-7b".to_string()
+                } else {
+                    "meta-llama/Llama-2-7b-hf".to_string()
+                }
+            });
+            println!("loading the model weights from {model_id}");
+            let revision = args.revision.unwrap_or("main".to_string());
+            let api = api.repo(Repo::with_revision(model_id, RepoType::Model, revision));

-        let tokenizer_filename = api.get("tokenizer.json")?;
-        let config_filename = api.get("config.json")?;
-        let config: LlamaConfig = serde_json::from_slice(&std::fs::read(config_filename)?)?;
-        let config = config.into_config(args.use_flash_attn);
+            let tokenizer_filename = match &args.local_weights {
+                Some(path) => (path.to_owned() + "tokenizer.json").into(),
+                _ => api.get("tokenizer.json")?,
+            };

-        let filenames = match args.which {
-            Which::V1 | Which::V2 | Which::Solar10_7B => {
-                candle_examples::hub_load_safetensors(&api, "model.safetensors.index.json")?
+            let config_filename = match &args.local_weights {
+                Some(path) => (path.to_owned() + "config.json").into(),
+                _ => api.get("config.json")?,
+            };
+            let config: LlamaConfig = serde_json::from_slice(&std::fs::read(config_filename)?)?;
+            let config = config.into_config(args.use_flash_attn);
+
+            let mut filenames = vec![];
+            for rfilename in [
+                "model-00001-of-00002.safetensors",
+                "model-00002-of-00002.safetensors",
+            ] {
+                match &args.local_weights {
+                    Some(path) => {
+                        filenames.push((path.to_owned() + rfilename).into());
+                    }
+                    _ => {
+                        let filename = api.get(rfilename)?;
+                        filenames.push(filename);
+                    }
+                };
            }
-            Which::TinyLlama1_1BChat => vec![api.get("model.safetensors")?],
-        };
-        let cache = model::Cache::new(!args.no_kv_cache, dtype, &config, &device)?;

-        let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
-        (Llama::load(vb, &config)?, tokenizer_filename, cache)
+            println!("building the model");
+            let cache = model::Cache::new(!args.no_kv_cache, dtype, &config, &device)?;
+
+            let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
+            (Llama::load(vb, &cache, &config)?, tokenizer_filename, cache)
+        }
    };
    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
    let eos_token_id = tokenizer.token_to_id(EOS_TOKEN);
@ -156,7 +186,6 @@ fn main() -> Result<()> {
        .map_err(E::msg)?
        .get_ids()
        .to_vec();
-    let mut tokenizer = candle_examples::token_output_stream::TokenOutputStream::new(tokenizer);

    println!("starting the inference loop");
    print!("{prompt}");
@ -165,14 +194,14 @@ fn main() -> Result<()> {
    let mut index_pos = 0;
    let mut token_generated = 0;
    for index in 0..args.sample_len {
-        let (context_size, context_index) = if cache.use_kv_cache && index > 0 {
-            (1, index_pos)
+        let context_size = if cache.use_kv_cache && index > 0 {
+            1
        } else {
-            (tokens.len(), 0)
+            tokens.len()
        };
        let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
        let input = Tensor::new(ctxt, &device)?.unsqueeze(0)?;
-        let logits = llama.forward(&input, context_index, &mut cache)?;
+        let logits = llama.forward(&input, index_pos)?;
        let logits = logits.squeeze(0)?;
        let logits = if args.repeat_penalty == 1. {
            logits
@ -190,16 +219,18 @@ fn main() -> Result<()> {
        token_generated += 1;
        tokens.push(next_token);

+        // Extracting the last token as a string is complicated, here we just apply some simple
+        // heuristics as it seems to work well enough for this example. See the following for more
+        // details:
+        // https://github.com/huggingface/tokenizers/issues/1141#issuecomment-1562644141
+        if let Some(text) = tokenizer.id_to_token(next_token) {
+            let text = text.replace('▁', " ").replace("<0x0A>", "\n");
+            print!("{text}");
+            std::io::stdout().flush()?;
+        }
        if Some(next_token) == eos_token_id {
            break;
        }
-        if let Some(t) = tokenizer.next_token(next_token)? {
-            print!("{t}");
-            std::io::stdout().flush()?;
-        }
-    }
-    if let Some(rest) = tokenizer.decode_rest().map_err(E::msg)? {
-        print!("{rest}");
    }
    let dt = start_gen.elapsed();
    println!(
--- a/candle-examples/examples/llama2-c/main.rs
+++ b/candle-examples/examples/llama2-c/main.rs
@ -19,7 +19,7 @@ use candle_transformers::generation::LogitsProcessor;
 use std::io::Write;
 use tokenizers::Tokenizer;

-use model::{Cache, Config, Llama};
+use model::{Config, Llama};
 use qmodel::QLlama;
 use weights::TransformerWeights;

@ -160,10 +160,10 @@ enum Model {
 }

 impl Model {
-    fn forward(&self, xs: &Tensor, pos: usize, cache: &mut Cache) -> anyhow::Result<Tensor> {
+    fn forward(&self, xs: &Tensor, pos: usize) -> anyhow::Result<Tensor> {
        match self {
-            Self::Llama(l) => Ok(l.forward(xs, pos, cache)?),
-            Self::QLlama(l) => Ok(l.forward(xs, pos, cache)?),
+            Self::Llama(l) => Ok(l.forward(xs, pos)?),
+            Self::QLlama(l) => Ok(l.forward(xs, pos)?),
        }
    }
 }
@ -188,8 +188,8 @@ fn run_eval(args: &EvaluationCmd, common_args: &Args) -> Result<()> {
    let config = Config::from_reader(&mut file)?;
    let weights = TransformerWeights::from_reader(&mut file, &config, &device)?;
    let vb = weights.var_builder(&config, &device)?;
-    let mut cache = Cache::new(false, &config, vb.pp("rot"))?;
-    let model = Llama::load(vb, config)?;
+    let cache = model::Cache::new(false, &config, vb.pp("rot"))?;
+    let model = Llama::load(vb, &cache, config)?;

    let tokens = match &args.pretokenized_dir {
        None => {
@ -235,7 +235,7 @@ fn run_eval(args: &EvaluationCmd, common_args: &Args) -> Result<()> {
    let batch_iter = candle_datasets::Batcher::new_r2(iter).batch_size(args.batch_size);
    for inp_tgt in batch_iter {
        let (inp, tgt) = inp_tgt?;
-        let logits = model.forward(&inp, 0, &mut cache)?;
+        let logits = model.forward(&inp, 0)?;
        let loss = candle_nn::loss::cross_entropy(&logits.flatten_to(1)?, &tgt.flatten_to(1)?)?;
        println!("{}", loss.to_vec0::<f32>()?);
    }
@ -261,8 +261,8 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {
    let is_safetensors = config_path
        .extension()
        .map_or(false, |v| v == "safetensors");
-    let (model, config, mut cache) = if is_gguf {
-        let vb = qmodel::VarBuilder::from_gguf(config_path, &device)?;
+    let (model, config) = if is_gguf {
+        let vb = qmodel::VarBuilder::from_gguf(config_path)?;
        let (_vocab_size, dim) = vb
            .get_no_shape("model.embed_tokens.weight")?
            .shape()
@ -279,13 +279,13 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {
                (config.seq_len, config.head_size() / 2),
                "rot.freq_cis_real",
            )?
-            .dequantize(&device)?;
+            .dequantize(&candle::Device::Cpu)?;
        let freq_cis_imag = vb
            .get(
                (config.seq_len, config.head_size() / 2),
                "rot.freq_cis_imag",
            )?
-            .dequantize(&device)?;
+            .dequantize(&candle::Device::Cpu)?;

        let fake_vb = candle_nn::VarBuilder::from_tensors(
            [
@ -295,18 +295,18 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {
            .into_iter()
            .collect(),
            candle::DType::F32,
-            &device,
+            &candle::Device::Cpu,
        );
        let cache = model::Cache::new(true, &config, fake_vb)?;
-        let model = Model::QLlama(QLlama::load(vb, config.clone())?);
-        (model, config, cache)
+        let model = Model::QLlama(QLlama::load(vb, &cache, config.clone())?);
+        (model, config)
    } else if is_safetensors {
        let config = Config::tiny_15m();
        let tensors = candle::safetensors::load(config_path, &device)?;
        let vb = candle_nn::VarBuilder::from_tensors(tensors, candle::DType::F32, &device);
        let cache = model::Cache::new(true, &config, vb.pp("rot"))?;
-        let model = Model::Llama(Llama::load(vb, config.clone())?);
-        (model, config, cache)
+        let model = Model::Llama(Llama::load(vb, &cache, config.clone())?);
+        (model, config)
    } else {
        let mut file = std::fs::File::open(config_path)?;
        let config = Config::from_reader(&mut file)?;
@ -314,8 +314,8 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {
        let weights = TransformerWeights::from_reader(&mut file, &config, &device)?;
        let vb = weights.var_builder(&config, &device)?;
        let cache = model::Cache::new(true, &config, vb.pp("rot"))?;
-        let model = Model::Llama(Llama::load(vb, config.clone())?);
-        (model, config, cache)
+        let model = Model::Llama(Llama::load(vb, &cache, config.clone())?);
+        (model, config)
    };

    println!("starting the inference loop");
@ -328,7 +328,6 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {
        .map_err(E::msg)?
        .get_ids()
        .to_vec();
-    let mut tokenizer = candle_examples::token_output_stream::TokenOutputStream::new(tokenizer);

    let start_gen = std::time::Instant::now();
    for index in 0.. {
@ -338,7 +337,7 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {
        let context_size = if index > 0 { 1 } else { tokens.len() };
        let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
        let input = Tensor::new(ctxt, &device)?.unsqueeze(0)?;
-        let logits = model.forward(&input, index_pos, &mut cache)?;
+        let logits = model.forward(&input, index_pos)?;
        let logits = logits.i((0, logits.dim(1)? - 1))?;
        let logits = if common_args.repeat_penalty == 1. || tokens.is_empty() {
            logits
@ -354,14 +353,16 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {

        let next_token = logits_processor.sample(&logits)?;
        tokens.push(next_token);
-        if let Some(t) = tokenizer.next_token(next_token)? {
-            print!("{t}");
+        // Extracting the last token as a string is complicated, here we just apply some simple
+        // heuristics as it seems to work well enough for this example. See the following for more
+        // details:
+        // https://github.com/huggingface/tokenizers/issues/1141#issuecomment-1562644141
+        if let Some(text) = tokenizer.id_to_token(next_token) {
+            let text = text.replace('▁', " ").replace("<0x0A>", "\n");
+            print!("{text}");
            std::io::stdout().flush()?;
        }
    }
-    if let Some(rest) = tokenizer.decode_rest().map_err(E::msg)? {
-        print!("{rest}");
-    }
    let dt = start_gen.elapsed();
    println!(
        "\n{} tokens generated ({:.2} token/s)\n",
--- a/candle-examples/examples/llama2-c/training.rs
+++ b/candle-examples/examples/llama2-c/training.rs
@ -8,7 +8,6 @@ fn valid_loss(
    model: &Llama,
    args: &crate::TrainingCmd,
    device: &Device,
-    cache: &mut Cache,
 ) -> Result<f64> {
    let iter = DatasetRandomIter::new(dataset, true, model.config.seq_len, device.clone());
    let batch_iter = candle_datasets::Batcher::new_r2(iter).batch_size(args.batch_size);
@ -16,7 +15,7 @@ fn valid_loss(
    let mut cnt = 0usize;
    for inp_tgt in batch_iter.take(50) {
        let (inp, tgt) = inp_tgt?;
-        let logits = model.forward(&inp, 0, cache)?;
+        let logits = model.forward(&inp, 0)?;
        let loss = candle_nn::loss::cross_entropy(&logits.flatten_to(1)?, &tgt.flatten_to(1)?)?;
        sum_ce += loss.to_vec0::<f32>()? as f64;
        cnt += 1;
@ -38,8 +37,8 @@ pub fn run(args: &crate::TrainingCmd, common_args: &crate::Args) -> Result<()> {
    let iter = DatasetRandomIter::new(&dataset, false, config.seq_len, device.clone());
    let batch_iter = candle_datasets::Batcher::new_r2(iter).batch_size(args.batch_size);

-    let mut cache = Cache::new(false, &config, vb.pp("rot"))?;
-    let model = Llama::load(vb, config)?;
+    let cache = Cache::new(false, &config, vb.pp("rot"))?;
+    let model = Llama::load(vb, &cache, config)?;
    let params = candle_nn::ParamsAdamW {
        lr: args.learning_rate,
        ..Default::default()
@ -47,14 +46,14 @@ pub fn run(args: &crate::TrainingCmd, common_args: &crate::Args) -> Result<()> {
    let mut opt = candle_nn::AdamW::new(varmap.all_vars(), params)?;
    for (batch_index, batch) in batch_iter.enumerate() {
        let (inp, tgt) = batch?;
-        let logits = model.forward(&inp, 0, &mut cache)?;
+        let logits = model.forward(&inp, 0)?;
        let loss = candle_nn::loss::cross_entropy(&logits.flatten_to(1)?, &tgt.flatten_to(1)?)?;
        opt.backward_step(&loss)?;

        if batch_index > 0 && batch_index % 100 == 0 {
            // TODO: Add a way to deactivate the backprop graph tracking when computing the
            // validation loss.
-            let loss = valid_loss(&dataset, &model, args, &device, &mut cache)?;
+            let loss = valid_loss(&dataset, &model, args, &device)?;
            println!("{batch_index} {loss}");
        }
        if batch_index > 0 && batch_index % 1000 == 0 {
--- a/candle-examples/examples/llama_multiprocess/main.rs
+++ b/candle-examples/examples/llama_multiprocess/main.rs
@ -143,7 +143,14 @@ fn main() -> Result<()> {
    let config_filename = api.get("config.json")?;
    let config: Config = serde_json::from_slice(&std::fs::read(config_filename)?)?;
    let tokenizer_filename = api.get("tokenizer.json")?;
-    let filenames = candle_examples::hub_load_safetensors(&api, "model.safetensors.index.json")?;
+    let mut filenames = vec![];
+    for rfilename in [
+        "model-00001-of-00002.safetensors",
+        "model-00002-of-00002.safetensors",
+    ] {
+        let filename = api.get(rfilename)?;
+        filenames.push(filename);
+    }

    if args.rank.is_none() {
        let children: Vec<_> = (0..args.num_shards)
--- a/candle-examples/examples/mamba-minimal/README.md
+++ b/candle-examples/examples/mamba-minimal/README.md
@ -1,15 +0,0 @@
-# candle-mamba-minimal: minimal implementation of Mamba
-
-This is based on [mamba-minimal](https://github.com/johnma2006/mamba-minimal).
-
-Compared to the mamba example, this version can handle training but is much
-slower.
-
-## Running the example
-
-```bash
-$ cargo run --example mamba-minimal --release -- --prompt "Mamba is the"
-Mamba is the most popular and best-selling game in the world. It has been downloaded more than 1,000 times by over 1 million people worldwide since its release on March 18th 2016.
-
-The Mamba series of games are a collection that combines elements from all genres including action, adventure, strategy & puzzle games with some unique gameplay features such as stealth and survival. The game is also known for its innovative graphics and the ability to play in a variety of different modes like single player or multiplayer.
-```
--- a/candle-examples/examples/mamba-minimal/main.rs
+++ b/candle-examples/examples/mamba-minimal/main.rs
@ -1,287 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use anyhow::{Error as E, Result};
-use clap::{Parser, ValueEnum};
-
-mod model;
-use model::{Config, Model};
-
-use candle::{DType, Device, Module, Tensor};
-use candle_examples::token_output_stream::TokenOutputStream;
-use candle_nn::VarBuilder;
-use candle_transformers::generation::LogitsProcessor;
-use hf_hub::{api::sync::Api, Repo, RepoType};
-use tokenizers::Tokenizer;
-
-struct TextGeneration {
-    model: Model,
-    device: Device,
-    tokenizer: TokenOutputStream,
-    logits_processor: LogitsProcessor,
-    repeat_penalty: f32,
-    repeat_last_n: usize,
-}
-
-impl TextGeneration {
-    #[allow(clippy::too_many_arguments)]
-    fn new(
-        model: Model,
-        tokenizer: Tokenizer,
-        seed: u64,
-        temp: Option<f64>,
-        top_p: Option<f64>,
-        repeat_penalty: f32,
-        repeat_last_n: usize,
-        device: &Device,
-    ) -> Self {
-        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
-        Self {
-            model,
-            tokenizer: TokenOutputStream::new(tokenizer),
-            logits_processor,
-            repeat_penalty,
-            repeat_last_n,
-            device: device.clone(),
-        }
-    }
-
-    fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
-        use std::io::Write;
-        self.tokenizer.clear();
-        let mut tokens = self
-            .tokenizer
-            .tokenizer()
-            .encode(prompt, true)
-            .map_err(E::msg)?
-            .get_ids()
-            .to_vec();
-        for &t in tokens.iter() {
-            if let Some(t) = self.tokenizer.next_token(t)? {
-                print!("{t}")
-            }
-        }
-        std::io::stdout().flush()?;
-
-        let mut generated_tokens = 0usize;
-        let eos_token = match self.tokenizer.get_token("<|endoftext|>") {
-            Some(token) => token,
-            None => anyhow::bail!("cannot find the </s> token"),
-        };
-        let start_gen = std::time::Instant::now();
-        for _ in 0..sample_len {
-            let input = Tensor::new(tokens.as_slice(), &self.device)?.unsqueeze(0)?;
-            let logits = self.model.forward(&input)?;
-            let logits = logits.squeeze(0)?.squeeze(0)?.to_dtype(DType::F32)?;
-            let logits = if self.repeat_penalty == 1. {
-                logits
-            } else {
-                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
-                candle_transformers::utils::apply_repeat_penalty(
-                    &logits,
-                    self.repeat_penalty,
-                    &tokens[start_at..],
-                )?
-            };
-
-            let next_token = self.logits_processor.sample(&logits)?;
-            tokens.push(next_token);
-            generated_tokens += 1;
-            if next_token == eos_token {
-                break;
-            }
-            if let Some(t) = self.tokenizer.next_token(next_token)? {
-                print!("{t}");
-                std::io::stdout().flush()?;
-            }
-        }
-        let dt = start_gen.elapsed();
-        if let Some(rest) = self.tokenizer.decode_rest().map_err(E::msg)? {
-            print!("{rest}");
-        }
-        std::io::stdout().flush()?;
-        println!(
-            "\n{generated_tokens} tokens generated ({:.2} token/s)",
-            generated_tokens as f64 / dt.as_secs_f64(),
-        );
-        Ok(())
-    }
-}
-
-#[derive(Parser, ValueEnum, Clone, Copy, PartialEq, Eq, Debug)]
-enum Which {
-    Mamba130m,
-    Mamba370m,
-    Mamba790m,
-    Mamba1_4b,
-    Mamba2_8b,
-    Mamba2_8bSlimPj,
-}
-
-impl std::fmt::Display for Which {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", self)
-    }
-}
-
-impl Which {
-    fn model_id(&self) -> &'static str {
-        match self {
-            Self::Mamba130m => "state-spaces/mamba-130m",
-            Self::Mamba370m => "state-spaces/mamba-370m",
-            Self::Mamba790m => "state-spaces/mamba-790m",
-            Self::Mamba1_4b => "state-spaces/mamba-1.4b",
-            Self::Mamba2_8b => "state-spaces/mamba-2.8b",
-            Self::Mamba2_8bSlimPj => "state-spaces/mamba-2.8b-slimpj'",
-        }
-    }
-
-    fn revision(&self) -> &'static str {
-        match self {
-            Self::Mamba130m
-            | Self::Mamba370m
-            | Self::Mamba790m
-            | Self::Mamba1_4b
-            | Self::Mamba2_8bSlimPj => "refs/pr/1",
-            Self::Mamba2_8b => "refs/pr/4",
-        }
-    }
-}
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Enable tracing (generates a trace-timestamp.json file).
-    #[arg(long)]
-    tracing: bool,
-
-    #[arg(long)]
-    prompt: String,
-
-    /// The temperature used to generate samples.
-    #[arg(long)]
-    temperature: Option<f64>,
-
-    /// Nucleus sampling probability cutoff.
-    #[arg(long)]
-    top_p: Option<f64>,
-
-    /// The seed to use when generating random samples.
-    #[arg(long, default_value_t = 299792458)]
-    seed: u64,
-
-    /// The length of the sample to generate (in tokens).
-    #[arg(long, short = 'n', default_value_t = 5000)]
-    sample_len: usize,
-
-    #[arg(long, default_value = "mamba130m")]
-    which: Which,
-
-    #[arg(long)]
-    model_id: Option<String>,
-
-    #[arg(long)]
-    revision: Option<String>,
-
-    #[arg(long)]
-    tokenizer_file: Option<String>,
-
-    #[arg(long)]
-    weight_files: Option<String>,
-
-    #[arg(long)]
-    config_file: Option<String>,
-
-    /// Penalty to be applied for repeating tokens, 1. means no penalty.
-    #[arg(long, default_value_t = 1.1)]
-    repeat_penalty: f32,
-
-    /// The context size to consider for the repeat penalty.
-    #[arg(long, default_value_t = 64)]
-    repeat_last_n: usize,
-}
-
-fn main() -> Result<()> {
-    use tracing_chrome::ChromeLayerBuilder;
-    use tracing_subscriber::prelude::*;
-
-    let args = Args::parse();
-    let _guard = if args.tracing {
-        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
-        tracing_subscriber::registry().with(chrome_layer).init();
-        Some(guard)
-    } else {
-        None
-    };
-    println!(
-        "avx: {}, neon: {}, simd128: {}, f16c: {}",
-        candle::utils::with_avx(),
-        candle::utils::with_neon(),
-        candle::utils::with_simd128(),
-        candle::utils::with_f16c()
-    );
-    println!(
-        "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
-        args.temperature.unwrap_or(0.),
-        args.repeat_penalty,
-        args.repeat_last_n
-    );
-
-    let start = std::time::Instant::now();
-    let api = Api::new()?;
-    let repo = api.repo(Repo::with_revision(
-        args.model_id
-            .unwrap_or_else(|| args.which.model_id().to_string()),
-        RepoType::Model,
-        args.revision
-            .unwrap_or_else(|| args.which.revision().to_string()),
-    ));
-    let tokenizer_filename = match args.tokenizer_file {
-        Some(file) => std::path::PathBuf::from(file),
-        None => api
-            .model("EleutherAI/gpt-neox-20b".to_string())
-            .get("tokenizer.json")?,
-    };
-    let config_filename = match args.config_file {
-        Some(file) => std::path::PathBuf::from(file),
-        None => repo.get("config.json")?,
-    };
-    let filenames = match args.weight_files {
-        Some(files) => files
-            .split(',')
-            .map(std::path::PathBuf::from)
-            .collect::<Vec<_>>(),
-        None => {
-            vec![repo.get("model.safetensors")?]
-        }
-    };
-    println!("retrieved the files in {:?}", start.elapsed());
-    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
-
-    let start = std::time::Instant::now();
-    let config: Config = serde_json::from_slice(&std::fs::read(config_filename)?)?;
-    let device = candle_examples::device(args.cpu)?;
-    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, DType::F32, &device)? };
-    let model = Model::new(&config, vb.pp("backbone"))?;
-    println!("loaded the model in {:?}", start.elapsed());
-
-    let mut pipeline = TextGeneration::new(
-        model,
-        tokenizer,
-        args.seed,
-        args.temperature,
-        args.top_p,
-        args.repeat_penalty,
-        args.repeat_last_n,
-        &device,
-    );
-    pipeline.run(&args.prompt, args.sample_len)?;
-    Ok(())
-}
--- a/candle-examples/examples/mamba-minimal/model.rs
+++ b/candle-examples/examples/mamba-minimal/model.rs
@ -1,204 +0,0 @@
-/// This follows the lines of:
-/// https://github.com/johnma2006/mamba-minimal/blob/master/model.py
-/// Simple, minimal implementation of Mamba in one file of PyTorch.
-use candle::{IndexOp, Module, Result, Tensor, D};
-use candle_nn::{RmsNorm, VarBuilder};
-
-use candle_transformers::models::with_tracing::{linear, linear_no_bias, Linear};
-
-#[derive(Debug, Clone, serde::Deserialize)]
-pub struct Config {
-    d_model: usize,
-    n_layer: usize,
-    vocab_size: usize,
-    pad_vocab_size_multiple: usize,
-}
-
-impl Config {
-    fn vocab_size(&self) -> usize {
-        let pad = self.pad_vocab_size_multiple;
-        (self.vocab_size + pad - 1) / pad * pad
-    }
-
-    fn dt_rank(&self) -> usize {
-        (self.d_model + 15) / 16
-    }
-
-    fn d_conv(&self) -> usize {
-        4
-    }
-
-    fn d_state(&self) -> usize {
-        16
-    }
-
-    fn d_inner(&self) -> usize {
-        self.d_model * 2
-    }
-}
-
-// https://github.com/johnma2006/mamba-minimal/blob/61f01953ca153f8c4a850d7111beecbf4be9cee1/model.py#L177
-#[derive(Clone, Debug)]
-pub struct MambaBlock {
-    in_proj: Linear,
-    conv1d: candle_nn::Conv1d,
-    x_proj: Linear,
-    dt_proj: Linear,
-    a_log: Tensor,
-    d: Tensor,
-    out_proj: Linear,
-    dt_rank: usize,
-}
-
-impl MambaBlock {
-    pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
-        let d_inner = cfg.d_inner();
-        let d_conv = cfg.d_conv();
-        let d_state = cfg.d_state();
-        let dt_rank = cfg.dt_rank();
-        let in_proj = linear_no_bias(cfg.d_model, d_inner * 2, vb.pp("in_proj"))?;
-        let conv_cfg = candle_nn::Conv1dConfig {
-            groups: d_inner,
-            padding: d_conv - 1,
-            ..Default::default()
-        };
-        let conv1d = candle_nn::conv1d(d_inner, d_inner, d_conv, conv_cfg, vb.pp("conv1d"))?;
-        let x_proj = linear_no_bias(d_inner, dt_rank + d_state * 2, vb.pp("x_proj"))?;
-        let dt_proj = linear(dt_rank, d_inner, vb.pp("dt_proj"))?;
-        let a_log = vb.get((d_inner, d_state), "A_log")?;
-        let d = vb.get(d_inner, "D")?;
-        let out_proj = linear_no_bias(d_inner, cfg.d_model, vb.pp("out_proj"))?;
-        Ok(Self {
-            in_proj,
-            conv1d,
-            x_proj,
-            dt_proj,
-            a_log,
-            d,
-            out_proj,
-            dt_rank,
-        })
-    }
-
-    fn ssm(&self, xs: &Tensor) -> Result<Tensor> {
-        let (_d_in, n) = self.a_log.dims2()?;
-        let a = self.a_log.to_dtype(candle::DType::F32)?.exp()?.neg()?;
-        let d = self.d.to_dtype(candle::DType::F32)?;
-        let x_dbl = xs.apply(&self.x_proj)?;
-        let delta = x_dbl.narrow(D::Minus1, 0, self.dt_rank)?;
-        let b = x_dbl.narrow(D::Minus1, self.dt_rank, n)?;
-        let c = x_dbl.narrow(D::Minus1, self.dt_rank + n, n)?;
-        let delta = delta.contiguous()?.apply(&self.dt_proj)?;
-        // softplus without threshold
-        let delta = (delta.exp()? + 1.)?.log()?;
-        let ss = selective_scan(xs, &delta, &a, &b, &c, &d)?;
-        Ok(ss)
-    }
-}
-
-// https://github.com/johnma2006/mamba-minimal/blob/61f01953ca153f8c4a850d7111beecbf4be9cee1/model.py#L275
-fn selective_scan(
-    u: &Tensor,
-    delta: &Tensor,
-    a: &Tensor,
-    b: &Tensor,
-    c: &Tensor,
-    d: &Tensor,
-) -> Result<Tensor> {
-    let (b_sz, l, d_in) = u.dims3()?;
-    let n = a.dim(1)?;
-    let delta = delta.t()?.reshape((b_sz, d_in, l, 1))?; // b d_in l 1
-    let delta_a = delta.broadcast_mul(&a.reshape((1, d_in, 1, n))?)?.exp()?;
-    let delta_b_u = delta
-        .broadcast_mul(&b.reshape((b_sz, 1, l, n))?)?
-        .broadcast_mul(&u.t()?.reshape((b_sz, d_in, l, 1))?)?;
-    let mut xs = Tensor::zeros((b_sz, d_in, n), delta_a.dtype(), delta_a.device())?;
-    let mut ys = Vec::with_capacity(l);
-    for i in 0..l {
-        xs = ((delta_a.i((.., .., i))? * xs)? + delta_b_u.i((.., .., i))?)?;
-        let y = xs.matmul(&c.i((.., i, ..))?.unsqueeze(2)?)?.squeeze(2)?;
-        ys.push(y)
-    }
-    let ys = Tensor::stack(ys.as_slice(), 1)?;
-    ys + u.broadcast_mul(d)
-}
-
-impl Module for MambaBlock {
-    // https://github.com/johnma2006/mamba-minimal/blob/61f01953ca153f8c4a850d7111beecbf4be9cee1/model.py#L206
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        let (_b_sz, seq_len, _dim) = xs.dims3()?;
-        let xs_and_res = xs.apply(&self.in_proj)?.chunk(2, D::Minus1)?;
-        let (xs, res) = (&xs_and_res[0], &xs_and_res[1]);
-        let xs = xs
-            .t()?
-            .apply(&self.conv1d)?
-            .narrow(D::Minus1, 0, seq_len)?
-            .t()?;
-        let xs = candle_nn::ops::silu(&xs)?;
-        let ys = (self.ssm(&xs)? * candle_nn::ops::silu(res))?;
-        ys.apply(&self.out_proj)
-    }
-}
-
-// https://github.com/johnma2006/mamba-minimal/blob/61f01953ca153f8c4a850d7111beecbf4be9cee1/model.py#L143
-#[derive(Clone, Debug)]
-pub struct ResidualBlock {
-    mixer: MambaBlock,
-    norm: RmsNorm,
-}
-
-impl ResidualBlock {
-    pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
-        let norm = candle_nn::rms_norm(cfg.d_model, 1e-5, vb.pp("norm"))?;
-        let mixer = MambaBlock::new(cfg, vb.pp("mixer"))?;
-        Ok(Self { mixer, norm })
-    }
-}
-
-impl Module for ResidualBlock {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        xs.apply(&self.norm)?.apply(&self.mixer)? + xs
-    }
-}
-
-// https://github.com/johnma2006/mamba-minimal/blob/61f01953ca153f8c4a850d7111beecbf4be9cee1/model.py#L56
-#[derive(Clone, Debug)]
-pub struct Model {
-    embedding: candle_nn::Embedding,
-    layers: Vec<ResidualBlock>,
-    norm_f: RmsNorm,
-    lm_head: Linear,
-}
-
-impl Model {
-    pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
-        let embedding = candle_nn::embedding(cfg.vocab_size(), cfg.d_model, vb.pp("embedding"))?;
-        let mut layers = Vec::with_capacity(cfg.n_layer);
-        let vb_l = vb.pp("layers");
-        for layer_idx in 0..cfg.n_layer {
-            let layer = ResidualBlock::new(cfg, vb_l.pp(layer_idx))?;
-            layers.push(layer)
-        }
-        let norm_f = candle_nn::rms_norm(cfg.d_model, 1e-5, vb.pp("norm_f"))?;
-        let lm_head = Linear::from_weights(embedding.embeddings().clone(), None);
-        Ok(Self {
-            embedding,
-            layers,
-            norm_f,
-            lm_head,
-        })
-    }
-}
-
-impl Module for Model {
-    fn forward(&self, input_ids: &Tensor) -> Result<Tensor> {
-        let (_b_size, seq_len) = input_ids.dims2()?;
-        let mut xs = self.embedding.forward(input_ids)?;
-        for layer in self.layers.iter() {
-            xs = layer.forward(&xs)?
-        }
-        xs.narrow(1, seq_len - 1, 1)?
-            .apply(&self.norm_f)?
-            .apply(&self.lm_head)
-    }
-}
--- a/candle-examples/examples/mamba/README.md
+++ b/candle-examples/examples/mamba/README.md
@ -1,17 +0,0 @@
-# candle-mamba: Mamba implementation
-
-Candle implementation of *Mamba* [1] inference only. Mamba is an alternative to
-the transformer architecture. It leverages State Space Models (SSMs) with the
-goal of being computationally efficient on long sequences. The implementation is
-based on [mamba.rs](https://github.com/LaurentMazare/mamba.rs).
-
- [1]. [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752).
-
-Compared to the mamba-minimal example, this version is far more efficient but
-would only work for inference.
-## Running the example
-
-```bash
-$ cargo run --example mamba-minimal --release -- --prompt "Mamba is the"
-```
-
--- a/candle-examples/examples/mamba/main.rs
+++ b/candle-examples/examples/mamba/main.rs
@ -1,299 +0,0 @@
-#[cfg(feature = "mkl")]
-extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-use anyhow::{Error as E, Result};
-use clap::{Parser, ValueEnum};
-
-use candle_transformers::models::mamba::{Config, Model, State};
-
-use candle::{DType, Device, Tensor};
-use candle_examples::token_output_stream::TokenOutputStream;
-use candle_nn::VarBuilder;
-use candle_transformers::generation::LogitsProcessor;
-use hf_hub::{api::sync::Api, Repo, RepoType};
-use tokenizers::Tokenizer;
-
-struct TextGeneration {
-    model: Model,
-    config: Config,
-    device: Device,
-    tokenizer: TokenOutputStream,
-    logits_processor: LogitsProcessor,
-    repeat_penalty: f32,
-    repeat_last_n: usize,
-}
-
-impl TextGeneration {
-    #[allow(clippy::too_many_arguments)]
-    fn new(
-        model: Model,
-        config: Config,
-        tokenizer: Tokenizer,
-        seed: u64,
-        temp: Option<f64>,
-        top_p: Option<f64>,
-        repeat_penalty: f32,
-        repeat_last_n: usize,
-        device: &Device,
-    ) -> Self {
-        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
-        Self {
-            model,
-            config,
-            tokenizer: TokenOutputStream::new(tokenizer),
-            logits_processor,
-            repeat_penalty,
-            repeat_last_n,
-            device: device.clone(),
-        }
-    }
-
-    fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
-        use std::io::Write;
-        self.tokenizer.clear();
-        let mut tokens = self
-            .tokenizer
-            .tokenizer()
-            .encode(prompt, true)
-            .map_err(E::msg)?
-            .get_ids()
-            .to_vec();
-        let mut generated_tokens = 0usize;
-        let eos_token = match self.tokenizer.get_token("<|endoftext|>") {
-            Some(token) => token,
-            None => anyhow::bail!("cannot find the </s> token"),
-        };
-        let mut state = State::new(1, &self.config, &self.device)?;
-        let mut next_logits = None;
-        for &t in tokens.iter() {
-            let input = Tensor::new(&[t], &self.device)?;
-            let logits = self.model.forward(&input, &mut state)?;
-            next_logits = Some(logits);
-            if let Some(t) = self.tokenizer.next_token(t)? {
-                print!("{t}")
-            }
-        }
-        std::io::stdout().flush()?;
-
-        let start_gen = std::time::Instant::now();
-        for _ in 0..sample_len {
-            let logits = match next_logits.as_ref() {
-                Some(logits) => logits,
-                None => anyhow::bail!("cannot work on an empty prompt"),
-            };
-            let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
-            let logits = if self.repeat_penalty == 1. {
-                logits
-            } else {
-                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
-                candle_transformers::utils::apply_repeat_penalty(
-                    &logits,
-                    self.repeat_penalty,
-                    &tokens[start_at..],
-                )?
-            };
-            let next_token = self.logits_processor.sample(&logits)?;
-            tokens.push(next_token);
-            generated_tokens += 1;
-            if next_token == eos_token {
-                break;
-            }
-            if let Some(t) = self.tokenizer.next_token(next_token)? {
-                print!("{t}");
-                std::io::stdout().flush()?;
-            }
-
-            let input = Tensor::new(&[next_token], &self.device)?;
-            next_logits = Some(self.model.forward(&input, &mut state)?)
-        }
-        let dt = start_gen.elapsed();
-        if let Some(rest) = self.tokenizer.decode_rest().map_err(E::msg)? {
-            print!("{rest}");
-        }
-        std::io::stdout().flush()?;
-        println!(
-            "\n{generated_tokens} tokens generated ({:.2} token/s)",
-            generated_tokens as f64 / dt.as_secs_f64(),
-        );
-        Ok(())
-    }
-}
-
-#[derive(Parser, ValueEnum, Clone, Copy, PartialEq, Eq, Debug)]
-enum Which {
-    Mamba130m,
-    Mamba370m,
-    Mamba790m,
-    Mamba1_4b,
-    Mamba2_8b,
-    Mamba2_8bSlimPj,
-}
-
-impl std::fmt::Display for Which {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", self)
-    }
-}
-
-impl Which {
-    fn model_id(&self) -> &'static str {
-        match self {
-            Self::Mamba130m => "state-spaces/mamba-130m",
-            Self::Mamba370m => "state-spaces/mamba-370m",
-            Self::Mamba790m => "state-spaces/mamba-790m",
-            Self::Mamba1_4b => "state-spaces/mamba-1.4b",
-            Self::Mamba2_8b => "state-spaces/mamba-2.8b",
-            Self::Mamba2_8bSlimPj => "state-spaces/mamba-2.8b-slimpj'",
-        }
-    }
-
-    fn revision(&self) -> &'static str {
-        match self {
-            Self::Mamba130m
-            | Self::Mamba370m
-            | Self::Mamba790m
-            | Self::Mamba1_4b
-            | Self::Mamba2_8bSlimPj => "refs/pr/1",
-            Self::Mamba2_8b => "refs/pr/4",
-        }
-    }
-}
-
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// Run on CPU rather than on GPU.
-    #[arg(long)]
-    cpu: bool,
-
-    /// Enable tracing (generates a trace-timestamp.json file).
-    #[arg(long)]
-    tracing: bool,
-
-    #[arg(long)]
-    prompt: String,
-
-    /// The temperature used to generate samples.
-    #[arg(long)]
-    temperature: Option<f64>,
-
-    /// Nucleus sampling probability cutoff.
-    #[arg(long)]
-    top_p: Option<f64>,
-
-    /// The seed to use when generating random samples.
-    #[arg(long, default_value_t = 299792458)]
-    seed: u64,
-
-    /// The length of the sample to generate (in tokens).
-    #[arg(long, short = 'n', default_value_t = 5000)]
-    sample_len: usize,
-
-    #[arg(long, default_value = "mamba130m")]
-    which: Which,
-
-    #[arg(long)]
-    model_id: Option<String>,
-
-    #[arg(long)]
-    revision: Option<String>,
-
-    #[arg(long)]
-    tokenizer_file: Option<String>,
-
-    #[arg(long)]
-    weight_files: Option<String>,
-
-    #[arg(long)]
-    config_file: Option<String>,
-
-    /// Penalty to be applied for repeating tokens, 1. means no penalty.
-    #[arg(long, default_value_t = 1.1)]
-    repeat_penalty: f32,
-
-    /// The context size to consider for the repeat penalty.
-    #[arg(long, default_value_t = 64)]
-    repeat_last_n: usize,
-}
-
-fn main() -> Result<()> {
-    use tracing_chrome::ChromeLayerBuilder;
-    use tracing_subscriber::prelude::*;
-
-    let args = Args::parse();
-    let _guard = if args.tracing {
-        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
-        tracing_subscriber::registry().with(chrome_layer).init();
-        Some(guard)
-    } else {
-        None
-    };
-    println!(
-        "avx: {}, neon: {}, simd128: {}, f16c: {}",
-        candle::utils::with_avx(),
-        candle::utils::with_neon(),
-        candle::utils::with_simd128(),
-        candle::utils::with_f16c()
-    );
-    println!(
-        "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
-        args.temperature.unwrap_or(0.),
-        args.repeat_penalty,
-        args.repeat_last_n
-    );
-
-    let start = std::time::Instant::now();
-    let api = Api::new()?;
-    let repo = api.repo(Repo::with_revision(
-        args.model_id
-            .unwrap_or_else(|| args.which.model_id().to_string()),
-        RepoType::Model,
-        args.revision
-            .unwrap_or_else(|| args.which.revision().to_string()),
-    ));
-    let tokenizer_filename = match args.tokenizer_file {
-        Some(file) => std::path::PathBuf::from(file),
-        None => api
-            .model("EleutherAI/gpt-neox-20b".to_string())
-            .get("tokenizer.json")?,
-    };
-    let config_filename = match args.config_file {
-        Some(file) => std::path::PathBuf::from(file),
-        None => repo.get("config.json")?,
-    };
-    let filenames = match args.weight_files {
-        Some(files) => files
-            .split(',')
-            .map(std::path::PathBuf::from)
-            .collect::<Vec<_>>(),
-        None => {
-            vec![repo.get("model.safetensors")?]
-        }
-    };
-    println!("retrieved the files in {:?}", start.elapsed());
-    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
-
-    let start = std::time::Instant::now();
-    let config: Config = serde_json::from_slice(&std::fs::read(config_filename)?)?;
-    let device = candle_examples::device(args.cpu)?;
-    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, DType::F32, &device)? };
-    let model = Model::new(&config, vb.pp("backbone"))?;
-    println!("loaded the model in {:?}", start.elapsed());
-
-    let mut pipeline = TextGeneration::new(
-        model,
-        config,
-        tokenizer,
-        args.seed,
-        args.temperature,
-        args.top_p,
-        args.repeat_penalty,
-        args.repeat_last_n,
-        &device,
-    );
-    pipeline.run(&args.prompt, args.sample_len)?;
-    Ok(())
-}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Nicolas Patry	c65f68e988	Tmp gemm.	2023-11-19 20:43:59 +01:00
Nicolas Patry	eed1631ee2	Reuse buffers on our own reference counts.	2023-11-18 23:28:59 +01:00
Nicolas Patry	251c65f9f1	Metal operational.	2023-11-18 00:52:38 +01:00
Nicolas Patry	a0010898cc	Better batched matmul.	2023-11-17 10:36:57 +01:00
Nicolas Patry	2801541e5f	new_owned -> new()..to_owned().	2023-11-16 11:07:56 +01:00
Nicolas Patry	4289984d32	Remove some prints.	2023-11-13 14:51:40 +01:00
Nicolas Patry	1471f98f0b	BF16 metal fix.	2023-11-13 14:44:20 +01:00
Nicolas Patry	dd4a40f1c0	Fixes + cache compute_pipeline_state.	2023-11-13 14:33:16 +01:00
Nicolas Patry	79845bd93b	Working version for llama2-c.	2023-11-13 12:36:27 +01:00
Nicolas Patry	6071797450	Add erf.	2023-11-11 18:22:16 +01:00
Nicolas Patry	b58b247323	Putting back f16 index select.	2023-11-11 17:43:35 +01:00
Nicolas Patry	3900091e75	All tests are panicking instead of random failure.	2023-11-11 17:43:35 +01:00
Nicolas Patry	54355ff997	Adding some half kernels.	2023-11-11 17:43:35 +01:00
Nicolas Patry	e02f1912bb	Reusing a single buffer (for now) to speed things up.	2023-11-11 17:43:35 +01:00
Nicolas Patry	a52b71686b	Going back on remote metal-rs.	2023-11-11 17:43:35 +01:00
Nicolas Patry	7adfb70dff	Few fixes.	2023-11-11 17:43:35 +01:00
Nicolas Patry	3ad02147e4	Starting to fix some tests.	2023-11-11 17:43:34 +01:00
Nicolas Patry	4f39695465	Missing new test.	2023-11-11 17:42:53 +01:00
Nicolas Patry	4cf4844c9d	Adding the test scaffolding.	2023-11-11 17:27:19 +01:00
Nicolas Patry	d840838e95	Cleanup fixed a few ops removed debugging scaffolding.	2023-11-11 17:18:00 +01:00
Nicolas Patry	61a070fdd1	Debugging rope.	2023-11-11 17:18:00 +01:00
Nicolas Patry	e35669647d	Fixed matmul (display still broken without casting back to CPU first? )	2023-11-11 17:18:00 +01:00
Nicolas Patry	53e8b7ee3e	Tmp state.	2023-11-11 17:18:00 +01:00
Nicolas Patry	cc26cce23c	Fixing the kernels + launches to make them faster. Cool work by @ivarflakstad Co-authored-by: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com>	2023-11-11 17:18:00 +01:00
Nicolas Patry	02c2ec2c71	Adding indexing. Co-authored-by: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com>	2023-11-11 17:18:00 +01:00
Nicolas Patry	9a2784b8ab	Refactor to simplify our lives for settings the params in the encoder.	2023-11-11 17:18:00 +01:00
Nicolas Patry	0f652f0e3d	Adding the actual backend	2023-11-11 17:18:00 +01:00
Nicolas Patry	ddee9dc1dd	Remove tracing.	2023-11-11 17:18:00 +01:00
Nicolas Patry	fc9bb7784a	Metal part 1 - Scaffolding for metal.	2023-11-11 17:18:00 +01:00