CustomOp for einsum.

2025-06-16 18:48:51 +00:00 · 2023-09-08 20:46:30 +01:00
561 changed files with 8290 additions and 87960 deletions
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -1,7 +0,0 @@
-version: 2
-updates:
-  - package-ecosystem: "cargo"
-    directory: "/"
-    schedule:
-      interval: "weekly"
-    open-pull-requests-limit: 5
--- a/.github/workflows/ci_cuda.yaml
+++ b/.github/workflows/ci_cuda.yaml
@ -5,15 +5,47 @@ on:
  pull_request:

 jobs:
+  start-runner:
+    name: Start self-hosted EC2 runner
+    runs-on: ubuntu-latest
+    env:
+      AWS_REGION: us-east-1
+      EC2_AMI_ID: ami-03cfed9ea28f4b002
+      EC2_INSTANCE_TYPE: g5.xlarge
+      EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc
+      EC2_SECURITY_GROUP: sg-030175c435ac141d6
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ env.AWS_REGION }}
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: philschmid/philschmid-ec2-github-runner@main
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ${{ env.EC2_AMI_ID }}
+          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
+          subnet-id: ${{ env.EC2_SUBNET_ID }}
+          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
+          aws-resource-tags: > # optional, requires additional permissions
+            [
+              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
+            ]
+
  test-cuda:
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
-    container:
-      image: nvidia/cuda:12.3.1-devel-ubuntu22.04
-      options: --gpus 0 
-    if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }}
+    needs: start-runner # required to start the main job when the runner is ready
+    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
    permissions:
      contents: write
      packages: write
@ -24,10 +56,32 @@ jobs:
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
-      - name: Install dependencies
-        run: apt-get update && apt install curl build-essential libssl-dev protobuf-compiler pkg-config -y
      - name: Install Rust Stable
-        uses: actions-rust-lang/setup-rust-toolchain@v1
+        run: curl https://sh.rustup.rs -sSf | sh -s -- -y
      - uses: Swatinem/rust-cache@v2
+      - run: apt-get update -y && apt-get install libssl-dev -y
      - name: Test (cuda)
-        run: cargo test --features cuda
+        run: PATH=$PATH:/usr/local/cuda-11.8/bin/ /root/.cargo/bin/cargo test --features cuda
+  stop-runner:
+    name: Stop self-hosted EC2 runner
+    needs:
+      - start-runner
+      - test-cuda
+    runs-on: ubuntu-latest
+    env:
+      AWS_REGION: us-east-1
+    if: ${{ always() }} # required to stop the runner even if an error occurred in a previous job
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ env.AWS_REGION }}
+      - name: Stop EC2 runner
+        uses: philschmid/philschmid-ec2-github-runner@main
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
--- a/.github/workflows/maturin.yml
+++ b/.github/workflows/maturin.yml
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@ -1,68 +0,0 @@
-name: PyO3-CI
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - main
-    paths:
-      - candle-pyo3/**
-  pull_request:
-    paths:
-      - candle-pyo3/**
-
-jobs:
-  build_and_test:
-    name: Check everything builds & tests
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest] # For now, only test on Linux
-    steps: 
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: Install Rust
-        uses: actions-rs/toolchain@v1
-        with:
-          toolchain: stable
-
-      - name: Install Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: 3.11
-          architecture: "x64"
-
-      - name: Cache Cargo Registry
-        uses: actions/cache@v1
-        with:
-          path: ~/.cargo/registry
-          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
-
-      - name: Install Protoc
-        uses: arduino/setup-protoc@v2
-        with:
-            version: "25.0"
-            repo-token: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Install
-        working-directory: ./candle-pyo3
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          pip install -U pip
-          pip install pytest maturin black
-          python -m maturin develop -r --features onnx
-
-      - name: Check style
-        working-directory: ./candle-pyo3
-        run: |
-          source .env/bin/activate
-          python stub.py --check
-          black --check .
-
-      - name: Run tests
-        working-directory: ./candle-pyo3
-        run: |
-          source .env/bin/activate
-          python -m pytest -s -v tests
--- a/.gitignore
+++ b/.gitignore
@ -23,16 +23,14 @@ flamegraph.svg
 *.dylib
 *.so
 *.swp
-*.swo
 trace-*.json

 candle-wasm-examples/*/build
 candle-wasm-examples/*/*.bin
 candle-wasm-examples/*/*.jpeg
-candle-wasm-examples/*/audios/*.wav
-candle-wasm-examples/**/*.safetensors
-candle-wasm-examples/**/*.gguf
+candle-wasm-examples/*/*.wav
+candle-wasm-examples/*/*.safetensors
 candle-wasm-examples/*/package-lock.json
-candle-wasm-examples/**/config*.json
+
 .DS_Store
 .idea/*
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -1,11 +0,0 @@
-{
-    "[python]": {
-        "editor.defaultFormatter": "ms-python.black-formatter"
-    },
-    "python.formatting.provider": "none",
-    "python.testing.pytestArgs": [
-        "candle-pyo3"
-    ],
-    "python.testing.unittestEnabled": false,
-    "python.testing.pytestEnabled": true
-}
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,84 +1,13 @@
 # Changelog
 This documents the main changes to the `candle` crate.

-## v0.3.1 - Unreleased
+## v0.2.1 - Unreleased

 ### Added

-### Modified
-
-## v0.3.0 - 2023-10-01
-
-### Added
-
- Added the Mistral 7b v0.1 model
-  [983](https://github.com/huggingface/candle/pull/983).
- Quantized version of the Mistral model
-  [1009](https://github.com/huggingface/candle/pull/1009).
- Add the gelu-erf op and activation function
-  [969](https://github.com/huggingface/candle/pull/969).
- Add the mixformer/phi-v1.5 model
-  [930](https://github.com/huggingface/candle/pull/930).
- Add the sclice-scatter op
-  [927](https://github.com/huggingface/candle/pull/927).
- Add the Wuerstchen diffusion model
-  [911](https://github.com/huggingface/candle/pull/911).
-
-### Modified
-
- Support for simd128 intrinsics in some quantized vecdots
-  [982](https://github.com/huggingface/candle/pull/982).
- Optimize the index-select cuda kernel
-  [976](https://github.com/huggingface/candle/pull/976).
- Self-contained safetensor wrappers
-  [946](https://github.com/huggingface/candle/pull/946).
-
-## v0.2.2 - 2023-09-18
-
-### Added
- Support for `top_p` sampling
-  [819](https://github.com/huggingface/candle/pull/819).
- T5 model including decoding
-  [864](https://github.com/huggingface/candle/pull/864).
- 1-d upsampling
-  [839](https://github.com/huggingface/candle/pull/839).
-
-### Modified
- Bugfix for conv2d
-  [820](https://github.com/huggingface/candle/pull/820).
- Support tensor based indexing using `.i`
-  [842](https://github.com/huggingface/candle/pull/842).
-
-## v0.2.1 - 2023-09-11
-
-### Added
- Add some RNNs (GRU and LSTM) in `candle-nn`
-  [674](https://github.com/huggingface/candle/pull/674),
-  [688](https://github.com/huggingface/candle/pull/688).
- gguf v2 support
-  [725](https://github.com/huggingface/candle/pull/725).
- Quantized llama example in Python using the pyo3 api
-  [716](https://github.com/huggingface/candle/pull/716).
- `candle-nn` layer for conv2d-transposed
-  [760](https://github.com/huggingface/candle/pull/760).
- Add the Segment-Anything Model (SAM) as an example
-  [773](https://github.com/huggingface/candle/pull/773).
- TinyViT backbone for the segment anything example
-  [787](https://github.com/huggingface/candle/pull/787).
- Shape with holes support
-  [770](https://github.com/huggingface/candle/pull/770).
-
 ### Modified
 - Dilations are now supported in conv-transpose2d.
  [671](https://github.com/huggingface/candle/pull/671).
- Interactive mode for the quantized model
-  [690](https://github.com/huggingface/candle/pull/690).
- Faster softmax operation
-  [747](https://github.com/huggingface/candle/pull/747).
- Faster convolution operations on CPU and CUDA via im2col
-  [802](https://github.com/huggingface/candle/pull/802).
- Moving some models to a more central location
-  [796](https://github.com/huggingface/candle/pull/796).

 ## v0.2.0 - 2023-08-30

--- a/Cargo.toml
+++ b/Cargo.toml
@ -7,20 +7,18 @@ members = [
    "candle-nn",
    "candle-pyo3",
    "candle-transformers",
-    "candle-wasm-examples/*",
-    "candle-wasm-tests",
-    "tensor-tools",
+    "candle-wasm-examples/llama2-c",
+    "candle-wasm-examples/whisper",
+    "candle-wasm-examples/yolo",
 ]
 exclude = [
-   "candle-flash-attn",
-   "candle-kernels",
-   "candle-metal-kernels",
-   "candle-onnx",
+    "candle-flash-attn",
+    "candle-kernels",
 ]
 resolver = "2"

 [workspace.package]
-version = "0.5.0"
+version = "0.2.1"
 edition = "2021"
 description = "Minimalist ML framework."
 repository = "https://github.com/huggingface/candle"
@ -29,50 +27,38 @@ categories = ["science"]
 license = "MIT OR Apache-2.0"

 [workspace.dependencies]
-ab_glyph = "0.2.23"
 accelerate-src = { version = "0.3.2" }
 anyhow = { version = "1", features = ["backtrace"] }
 byteorder = "1.4.3"
-candle = { path = "./candle-core", package = "candle-core", version = "0.5.0" }
-candle-datasets = { path = "./candle-datasets", version = "0.5.0" }
-candle-flash-attn = { path = "./candle-flash-attn", version = "0.5.0" }
-candle-kernels = { path = "./candle-kernels", version = "0.5.0" }
-candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.5.0" }
-candle-nn = { path = "./candle-nn", version = "0.5.0" }
-candle-onnx = { path = "./candle-onnx", version = "0.5.0" }
-candle-transformers = { path = "./candle-transformers", version = "0.5.0" }
 clap = { version = "4.2.4", features = ["derive"] }
-criterion = { version = "0.5.1", default-features=false }
-cudarc = { version = "0.10.0", features = ["f16"] }
-fancy-regex = "0.13.0"
-gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] }
+cudarc = { version = "0.9.14", features = ["f16"] }
+# TODO: Switch back to the official gemm implementation once it has caught up.
+gemm = { version = "0.15.6", package = "candle-gemm" }
 hf-hub = "0.3.0"
 half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
-image = { version = "0.25.0", default-features = false, features = ["jpeg", "png"] }
-imageproc = { version = "0.24.0", default-features = false }
+image = { version = "0.24.7", default-features = false, features = ["jpeg", "png"] }
+imageproc = { version = "0.23.0", default-features = false }
 intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
 libc = { version = "0.2.147" }
 log = "0.4"
-memmap2 = { version = "0.9.3", features = ["stable_deref_trait"] }
+memmap2 = "0.7.1"
 num_cpus = "1.15.0"
 num-traits = "0.2.15"
-parquet = { version = "51.0.0" }
 rand = "0.8.5"
 rand_distr = "0.4.3"
 rayon = "1.7.0"
-safetensors = "0.4.1"
+rusttype = { version = "0.9", default-features = false }
+safetensors = "0.3.1"
 serde = { version = "1.0.171", features = ["derive"] }
-serde_plain = "1.0.2"
 serde_json = "1.0.99"
 thiserror = "1"
-tokenizers = { version = "0.15.0", default-features = false }
+tokenizers = { version = "0.13.4", default-features = false }
 tracing = "0.1.37"
 tracing-chrome = "0.7.1"
 tracing-subscriber = "0.3.7"
 wav = "1.0.0"
-yoke = { version = "0.7.2", features = ["derive"] }
 zip = { version = "0.6.6", default-features = false }
-metal = { version = "0.27.0", features = ["mps"]}
+parquet = { version = "45.0.0" }

 [profile.release-with-debug]
 inherits = "release"
--- a/README.md
+++ b/README.md
@ -8,10 +8,7 @@ Candle is a minimalist ML framework for Rust with a focus on performance (includ
 and ease of use. Try our online demos: 
 [whisper](https://huggingface.co/spaces/lmz/candle-whisper),
 [LLaMA2](https://huggingface.co/spaces/lmz/candle-llama2),
-[T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm),
-[yolo](https://huggingface.co/spaces/lmz/candle-yolo),
-[Segment
-Anything](https://huggingface.co/spaces/radames/candle-segment-anything-wasm).
+[yolo](https://huggingface.co/spaces/lmz/candle-yolo).

 ## Get started

@ -48,95 +45,40 @@ For more advanced examples, please have a look at the following section.

 ## Check out our examples

-These online demos run entirely in your browser:
- [yolo](https://huggingface.co/spaces/lmz/candle-yolo): pose estimation and
-  object recognition.
- [whisper](https://huggingface.co/spaces/lmz/candle-whisper): speech recognition.
- [LLaMA2](https://huggingface.co/spaces/lmz/candle-llama2): text generation.
- [T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm): text generation.
- [Phi-1.5, and Phi-2](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm): text generation.
- [Segment Anything Model](https://huggingface.co/spaces/radames/candle-segment-anything-wasm): Image segmentation.
- [BLIP](https://huggingface.co/spaces/radames/Candle-BLIP-Image-Captioning): image captioning.
+Check out our [examples](./candle-examples/examples/):

-We also provide a some command line based examples using state of the art models:
-
- [LLaMA and LLaMA-v2](./candle-examples/examples/llama/): general LLM, includes
-  the SOLAR-10.7B variant.
- [Falcon](./candle-examples/examples/falcon/): general LLM.
- [Gemma](./candle-examples/examples/gemma/): 2b and 7b general LLMs from Google
-  Deepmind.
- [Phi-1, Phi-1.5, and Phi-2](./candle-examples/examples/phi/): 1.3b and 2.7b general LLMs with performance on par with LLaMA-v2 7b.
- [StableLM-3B-4E1T](./candle-examples/examples/stable-lm/): a 3b general LLM
-  pre-trained on 1T tokens of English and code datasets. Also supports
-  StableLM-2, a 1.6b LLM trained on 2T tokens, as well as the code variants.
- [Mamba](./candle-examples/examples/mamba/): an inference only
-  implementation of the Mamba state space model.
- [Mistral7b-v0.1](./candle-examples/examples/mistral/): a 7b general LLM with
-  better performance than all publicly available 13b models as of 2023-09-28.
- [Mixtral8x7b-v0.1](./candle-examples/examples/mixtral/): a sparse mixture of
-  experts 8x7b general LLM with better performance than a Llama 2 70B model with
-  much faster inference.
- [StarCoder](./candle-examples/examples/bigcode/) and
-  [StarCoder2](./candle-examples/examples/starcoder2/): LLM specialized to code generation.
- [Qwen1.5](./candle-examples/examples/qwen/): Bilingual (English/Chinese) LLMs.
- [RWKV v5 and v6](./candle-examples/examples/rwkv/): An RNN with transformer level LLM
-  performance.
- [Replit-code-v1.5](./candle-examples/examples/replit-code/): a 3.3b LLM specialized for code completion.
- [Yi-6B / Yi-34B](./candle-examples/examples/yi/): two bilingual
-  (English/Chinese) general LLMs with 6b and 34b parameters.
- [Quantized LLaMA](./candle-examples/examples/quantized/): quantized version of
-  the LLaMA model using the same quantization techniques as
-  [llama.cpp](https://github.com/ggerganov/llama.cpp).
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/quantized/assets/aoc.gif" width="600">
-  
- [Stable Diffusion](./candle-examples/examples/stable-diffusion/): text to
-  image generative model, support for the 1.5, 2.1, SDXL 1.0 and Turbo versions.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg" width="200">
-
- [Wuerstchen](./candle-examples/examples/wuerstchen/): another text to
-  image generative model.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/wuerstchen/assets/cat.jpg" width="200">
-
- [yolo-v3](./candle-examples/examples/yolo-v3/) and
-  [yolo-v8](./candle-examples/examples/yolo-v8/): object detection and pose
-  estimation models.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/yolo-v8/assets/bike.od.jpg" width="200"><img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/yolo-v8/assets/bike.pose.jpg" width="200">
- [segment-anything](./candle-examples/examples/segment-anything/): image
-  segmentation model with prompt.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/segment-anything/assets/sam_merged.jpg" width="200">
-
- [SegFormer](./candle-examples/examples/segformer/): transformer based semantic segmantation model.
 - [Whisper](./candle-examples/examples/whisper/): speech recognition model.
- [EnCodec](./candle-examples/examples/encodec/): high-quality audio compression
-  model using residual vector quantization.
- [MetaVoice](./candle-examples/examples/metavoice/): foundational model for
-  text-to-speech.
- [T5](./candle-examples/examples/t5), [Bert](./candle-examples/examples/bert/),
-  [JinaBert](./candle-examples/examples/jina-bert/) : useful for sentence embeddings.
+- [LLaMA and LLaMA-v2](./candle-examples/examples/llama/): general LLM.
+- [Falcon](./candle-examples/examples/falcon/): general LLM.
+- [Bert](./candle-examples/examples/bert/): useful for sentence embeddings.
+- [StarCoder](./candle-examples/examples/bigcode/): LLM specialized to code
+  generation.
+- [Stable Diffusion](./candle-examples/examples/stable-diffusion/): text to
+  image generative model, support for the 1.5, 2.1, and SDXL 1.0 versions.
 - [DINOv2](./candle-examples/examples/dinov2/): computer vision model trained
  using self-supervision (can be used for imagenet classification, depth
  evaluation, segmentation).
- [VGG](./candle-examples/examples/vgg/),
-  [RepVGG](./candle-examples/examples/repvgg): computer vision models.
- [BLIP](./candle-examples/examples/blip/): image to text model, can be used to
-  generate captions for an image.
- [CLIP](./candle-examples/examples/clip/): multi-model vision and language
-  model.
- [TrOCR](./candle-examples/examples/trocr/): a transformer OCR model, with
-  dedicated submodels for hand-writing and printed recognition.
- [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation
-  model, generates the translated text from the input text.
- [Moondream](./candle-examples/examples/moondream/): tiny computer-vision model 
-  that can answer real-world questions about images.
-
-Run them using commands like:
+- [Quantized LLaMA](./candle-examples/examples/quantized/): quantized version of
+  the LLaMA model using the same quantization techniques as
+  [llama.cpp](https://github.com/ggerganov/llama.cpp).
+- [yolo-v3](./candle-examples/examples/yolo-v3/) and
+  [yolo-v8](./candle-examples/examples/yolo-v8/): object detection and pose
+  estimation models.
+- [segment-anything](./candle-examples/examples/segment-anything/): image
+  segmentation model with prompt.
+Run them using the following commands:
 ```
+cargo run --example whisper --release
+cargo run --example llama --release
+cargo run --example falcon --release
+cargo run --example bert --release
+cargo run --example bigcode --release
+cargo run --example stable-diffusion --release -- --prompt "a rusty robot holding a fire torch"
+cargo run --example dinov2 --release -- --image path/to/myinput.jpg
 cargo run --example quantized --release
+cargo run --example yolo-v3 --release -- myimage.jpg
+cargo run --example yolo-v8 --release -- myimage.jpg # for pose estimation, add --task pose 
+cargo run --example segment-anything --release -- --image myimage.jpg
 ```

 In order to use **CUDA** add `--features cuda` to the example command line. If
@ -146,10 +88,7 @@ There are also some wasm examples for whisper and
 [llama2.c](https://github.com/karpathy/llama2.c). You can either build them with
 `trunk` or try them online:
 [whisper](https://huggingface.co/spaces/lmz/candle-whisper),
-[llama2](https://huggingface.co/spaces/lmz/candle-llama2),
-[T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm),
-[Phi-1.5, and Phi-2](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm),
-[Segment Anything Model](https://huggingface.co/spaces/radames/candle-segment-anything-wasm).
+[llama2](https://huggingface.co/spaces/lmz/candle-llama2).

 For LLaMA2, run the following command to retrieve the weight files and start a
 test server:
@ -162,30 +101,6 @@ trunk serve --release --port 8081
 And then head over to
 [http://localhost:8081/](http://localhost:8081/).

-<!--- ANCHOR: useful_libraries --->
-
-## Useful External Resources
- [`candle-tutorial`](https://github.com/ToluClassics/candle-tutorial): A
-  very detailed tutorial showing how to convert a PyTorch model to Candle.
- [`candle-lora`](https://github.com/EricLBuehler/candle-lora): Efficient and
-  ergonomic LoRA implementation for Candle. `candle-lora` has      
-  out-of-the-box LoRA support for many models from Candle, which can be found
-  [here](https://github.com/EricLBuehler/candle-lora/tree/master/candle-lora-transformers/examples).
- [`optimisers`](https://github.com/KGrewal1/optimisers): A collection of optimisers
-  including SGD with momentum, AdaGrad, AdaDelta, AdaMax, NAdam, RAdam, and RMSprop.
- [`candle-vllm`](https://github.com/EricLBuehler/candle-vllm): Efficient platform for inference and
-  serving local LLMs including an OpenAI compatible API server.
- [`candle-ext`](https://github.com/mokeyish/candle-ext): An extension library to Candle that provides PyTorch functions not currently available in Candle.
- [`candle-coursera-ml`](https://github.com/vishpat/candle-coursera-ml): Implementation of ML algorithms from Coursera's [Machine Learning Specialization](https://www.coursera.org/specializations/machine-learning-introduction) course.
- [`kalosm`](https://github.com/floneum/floneum/tree/master/interfaces/kalosm): A multi-modal meta-framework in Rust for interfacing with local pre-trained models with support for controlled generation, custom samplers, in-memory vector databases, audio transcription, and more.
- [`candle-sampling`](https://github.com/EricLBuehler/candle-sampling): Sampling techniques for Candle.
- [`gpt-from-scratch-rs`](https://github.com/jeroenvlek/gpt-from-scratch-rs): A port of Andrej Karpathy's _Let's build GPT_ tutorial on YouTube showcasing the Candle API on a toy problem.
- [`candle-einops`](https://github.com/tomsanbear/candle-einops): A pure rust implementation of the python [einops](https://github.com/arogozhnikov/einops) library.
-
-If you have an addition to this list, please submit a pull request.
-
-<!--- ANCHOR_END: useful_libraries --->
-
 <!--- ANCHOR: features --->

 ## Features
@ -198,46 +113,10 @@ If you have an addition to this list, please submit a pull request.
    - CUDA backend for efficiently running on GPUs, multiple GPU distribution via NCCL.
    - WASM support, run your models in a browser.
 - Included models.
-    - Language Models.
-        - LLaMA v1 and v2 with variants such as SOLAR-10.7B.
-        - Falcon.
-        - StarCoder, StarCoder2.
-        - Phi 1, 1.5, and 2.
-        - Mamba, Minimal Mamba
-        - Gemma 2b and 7b.
-        - Mistral 7b v0.1.
-        - Mixtral 8x7b v0.1.
-        - StableLM-3B-4E1T, StableLM-2-1.6B, Stable-Code-3B.
-        - Replit-code-v1.5-3B.
-        - Bert.
-        - Yi-6B and Yi-34B.
-        - Qwen1.5, Qwen1.5 MoE.
-        - RWKV v5 and v6.
-    - Quantized LLMs.
-        - Llama 7b, 13b, 70b, as well as the chat and code variants.
-        - Mistral 7b, and 7b instruct.
-        - Mixtral 8x7b.
-        - Zephyr 7b a and b (Mistral-7b based).
-        - OpenChat 3.5 (Mistral-7b based).
-    - Text to text.
-        - T5 and its variants: FlanT5, UL2, MADLAD400 (translation), CoEdit (Grammar correction).
-        - Marian MT (Machine Translation).
-    - Text to image.
-        - Stable Diffusion v1.5, v2.1, XL v1.0.
-        - Wurstchen v2.
-    - Image to text.
-        - BLIP.
-        - TrOCR.
-    - Audio.
-        - Whisper, multi-lingual speech-to-text.
-        - EnCodec, audio compression model.
-        - MetaVoice-1B, text-to-speech model.
-    - Computer Vision Models.
-        - DINOv2, ConvMixer, EfficientNet, ResNet, ViT, VGG, RepVGG, ConvNeXT,
-          ConvNeXTv2, MobileOne, EfficientVit (MSRA).
-        - yolo-v3, yolo-v8.
-        - Segment-Anything Model (SAM).
-        - SegFormer.
+    - LLMs: LLaMA v1 and v2, Falcon, StarCoder.
+    - Whisper (multi-lingual support).
+    - Stable Diffusion.
+    - Computer Vision: DINOv2, EfficientNet, yolo-v3, yolo-v8.
 - File formats: load models from safetensors, npz, ggml, or PyTorch files.
 - Serverless (on CPU), small and fast deployments.
 - Quantization support using the llama.cpp quantized types.
@ -274,7 +153,6 @@ Cheatsheet:
 - [candle-datasets](./candle-datasets/): Datasets and data loaders.
 - [candle-transformers](./candle-transformers): transformers-related utilities.
 - [candle-flash-attn](./candle-flash-attn): Flash attention v2 layer.
- [candle-onnx](./candle-onnx/): ONNX model evaluation.

 ## FAQ

@ -379,29 +257,6 @@ This is a bug in gcc-11 triggered by the Cuda compiler. To fix this, install a d
 env CANDLE_NVCC_CCBIN=/usr/lib/gcc/x86_64-linux-gnu/10 cargo ...
 ```

-#### Linking error on windows when running rustdoc or mdbook tests
-
-```
-Couldn't compile the test.
---- .\candle-book\src\inference\hub.md - Using_the_hub::Using_in_a_real_model_ (line 50) stdout ----
-error: linking with `link.exe` failed: exit code: 1181
-//very long chain of linking
- = note: LINK : fatal error LNK1181: cannot open input file 'windows.0.48.5.lib'
-```
-
-Make sure you link all native libraries that might be located outside a project target, e.g., to run mdbook tests, you should run:
-
-```
-mdbook test candle-book -L .\target\debug\deps\ `
-L native=$env:USERPROFILE\.cargo\registry\src\index.crates.io-6f17d22bba15001f\windows_x86_64_msvc-0.42.2\lib `
-L native=$env:USERPROFILE\.cargo\registry\src\index.crates.io-6f17d22bba15001f\windows_x86_64_msvc-0.48.5\lib
-```
-
-#### Extremely slow model load time with WSL
-
-This may be caused by the models being loaded from `/mnt/c`, more details on
-[stackoverflow](https://stackoverflow.com/questions/68972448/why-is-wsl-extremely-slow-when-compared-with-native-windows-npm-yarn-processing).
-
 #### Tracking down errors

 You can set `RUST_BACKTRACE=1` to be provided with backtraces when a candle
--- a/candle-book/Cargo.toml
+++ b/candle-book/Cargo.toml
@ -11,11 +11,11 @@ readme = "README.md"

 [dependencies]
 accelerate-src = { workspace = true, optional = true }
-candle = { workspace = true }
-candle-datasets = { workspace = true }
-candle-nn = { workspace = true }
-candle-transformers = { workspace = true }
-candle-flash-attn = { workspace = true, optional = true }
+candle = { path = "../candle-core", version = "0.2.1", package = "candle-core" }
+candle-datasets = { path = "../candle-datasets", version = "0.2.1" }
+candle-nn = { path = "../candle-nn", version = "0.2.1" }
+candle-transformers = { path = "../candle-transformers", version = "0.2.1" }
+candle-flash-attn = { path = "../candle-flash-attn", version = "0.2.1", optional = true }
 safetensors = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
@ -24,10 +24,9 @@ intel-mkl-src = { workspace = true, optional = true }
 cudarc = { workspace = true, optional = true }
 half = { workspace = true, optional = true }
 image = { workspace = true, optional = true }
-anyhow = { workspace = true }
-tokio = "1.29.1"

 [dev-dependencies]
+anyhow = { workspace = true }
 byteorder = { workspace = true }
 hf-hub = { workspace = true, features=["tokio"]}
 clap = { workspace = true }
@ -39,6 +38,7 @@ tracing-chrome = { workspace = true }
 tracing-subscriber = { workspace = true }
 wav = { workspace = true }
 # Necessary to disambiguate with tokio in wasm examples which are 1.28.1
+tokio = "1.29.1"
 parquet = { workspace = true }
 image = { workspace = true }

--- a/candle-book/src/SUMMARY.md
+++ b/candle-book/src/SUMMARY.md
@ -10,11 +10,10 @@

 # Reference Guide

- [Running a model](inference/inference.md)
+- [Running a model](inference/README.md)
    - [Using the hub](inference/hub.md)
 - [Error management](error_manage.md)
- [Training](training/training.md)
-    - [Simplified](training/simplified.md)
+- [Training](training/README.md)
    - [MNIST](training/mnist.md)
    - [Fine-tuning]()
    - [Serialization]()
--- a/candle-book/src/apps/dekstop.md
+++ b/candle-book/src/apps/dekstop.md
--- a/candle-book/src/error_manage.md
+++ b/candle-book/src/error_manage.md
@ -29,7 +29,7 @@ After adding `RUST_BACKTRACE=1`:
 Error: WithBacktrace { inner: ShapeMismatchBinaryOp { lhs: [1, 784], rhs: [1, 784], op: "matmul" }, backtrace: Backtrace [{ fn: "candle::error::Error::bt", file: "/home/nicolas/.cargo/git/checkouts/candle-5bb8ef7e0626d693/f291065/candle-core/src/error.rs", line: 200 }, { fn: "candle::tensor::Tensor::matmul", file: "/home/nicolas/.cargo/git/checkouts/candle-5bb8ef7e0626d693/f291065/candle-core/src/tensor.rs", line: 816 }, { fn: "myapp::main", file: "./src/main.rs", line: 29 }, { fn: "core::ops::function::FnOnce::call_once", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs", line: 250 }, { fn: "std::sys_common::backtrace::__rust_begin_short_backtrace", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/sys_common/backtrace.rs", line: 135 }, { fn: "std::rt::lang_start::{{closure}}", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 166 }, { fn: "core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs", line: 284 }, { fn: "std::panicking::try::do_call", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 500 }, { fn: "std::panicking::try", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 464 }, { fn: "std::panic::catch_unwind", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panic.rs", line: 142 }, { fn: "std::rt::lang_start_internal::{{closure}}", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 148 }, { fn: "std::panicking::try::do_call", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 500 }, { fn: "std::panicking::try", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 464 }, { fn: "std::panic::catch_unwind", file: 
"/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panic.rs", line: 142 }, { fn: "std::rt::lang_start_internal", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 148 }, { fn: "std::rt::lang_start", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 165 }, { fn: "main" }, { fn: "__libc_start_main" }, { fn: "_start" }] }
 ```

-Not super pretty at the moment, but we can see error occurred on `{ fn: "myapp::main", file: "./src/main.rs", line: 29 }`
+Not super pretty at the moment, but we can see error occured on `{ fn: "myapp::main", file: "./src/main.rs", line: 29 }`


 Another thing to note is that, since Rust is compiled, it is not necessarily as easy to recover proper stack traces
--- a/candle-book/src/guide/hello_world.md
+++ b/candle-book/src/guide/hello_world.md
@ -6,7 +6,7 @@ Open `src/main.rs` and fill in this content:

 ```rust
 # extern crate candle_core;
-use candle_core::{Device, Result, Tensor};
+use candle_core::{DType, Device, Result, Tensor};

 struct Model {
    first: Tensor,
@ -25,11 +25,11 @@ fn main() -> Result<()> {
    // Use Device::new_cuda(0)?; to use the GPU.
    let device = Device::Cpu;

-    let first = Tensor::randn(0f32, 1.0, (784, 100), &device)?;
-    let second = Tensor::randn(0f32, 1.0, (100, 10), &device)?;
+    let first = Tensor::zeros((784, 100), DType::F32, &device)?;
+    let second = Tensor::zeros((100, 10), DType::F32, &device)?;
    let model = Model { first, second };

-    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
+    let dummy_image = Tensor::zeros((1, 784), DType::F32, &device)?;

    let digit = model.forward(&dummy_image)?;
    println!("Digit {digit:?} digit");
@ -50,7 +50,7 @@ the classical `Linear` layer. We can do as such

 ```rust
 # extern crate candle_core;
-# use candle_core::{Device, Result, Tensor};
+# use candle_core::{DType, Device, Result, Tensor};
 struct Linear{
    weight: Tensor,
    bias: Tensor,
@ -80,7 +80,7 @@ This will change the model running code into a new function

 ```rust
 # extern crate candle_core;
-# use candle_core::{Device, Result, Tensor};
+# use candle_core::{DType, Device, Result, Tensor};
 # struct Linear{
 #     weight: Tensor,
 #     bias: Tensor,
@ -110,15 +110,15 @@ fn main() -> Result<()> {
    let device = Device::cuda_if_available(0)?;

    // Creating a dummy model
-    let weight = Tensor::randn(0f32, 1.0, (784, 100), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (100, ), &device)?;
+    let weight = Tensor::zeros((784, 100), DType::F32, &device)?;
+    let bias = Tensor::zeros((100, ), DType::F32, &device)?;
    let first = Linear{weight, bias};
-    let weight = Tensor::randn(0f32, 1.0, (100, 10), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (10, ), &device)?;
+    let weight = Tensor::zeros((100, 10), DType::F32, &device)?;
+    let bias = Tensor::zeros((10, ), DType::F32, &device)?;
    let second = Linear{weight, bias};
    let model = Model { first, second };

-    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
+    let dummy_image = Tensor::zeros((1, 784), DType::F32, &device)?;

    // Inference on the model
    let digit = model.forward(&dummy_image)?;
@ -146,7 +146,7 @@ And rewrite our examples using it
 ```rust
 # extern crate candle_core;
 # extern crate candle_nn;
-use candle_core::{Device, Result, Tensor};
+use candle_core::{DType, Device, Result, Tensor};
 use candle_nn::{Linear, Module};

 struct Model {
@ -167,15 +167,15 @@ fn main() -> Result<()> {
    let device = Device::Cpu;

    // This has changed (784, 100) -> (100, 784) !
-    let weight = Tensor::randn(0f32, 1.0, (100, 784), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (100, ), &device)?;
+    let weight = Tensor::zeros((100, 784), DType::F32, &device)?;
+    let bias = Tensor::zeros((100, ), DType::F32, &device)?;
    let first = Linear::new(weight, Some(bias));
-    let weight = Tensor::randn(0f32, 1.0, (10, 100), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (10, ), &device)?;
+    let weight = Tensor::zeros((10, 100), DType::F32, &device)?;
+    let bias = Tensor::zeros((10, ), DType::F32, &device)?;
    let second = Linear::new(weight, Some(bias));
    let model = Model { first, second };

-    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
+    let dummy_image = Tensor::zeros((1, 784), DType::F32, &device)?;

    let digit = model.forward(&dummy_image)?;
    println!("Digit {digit:?} digit");
@ -188,8 +188,8 @@ Feel free to modify this example to use `Conv2d` to create a classical convnet i

 Now that we have the running dummy code we can get to more advanced topics:

- [For PyTorch users](../guide/cheatsheet.md)
- [Running existing models](../inference/inference.md)
- [Training models](../training/training.md)
+- [For PyTorch users](./guide/cheatsheet.md)
+- [Running existing models](./inference/README.md)
+- [Training models](./training/README.md)


--- a/candle-book/src/guide/installation.md
+++ b/candle-book/src/guide/installation.md
@ -12,9 +12,6 @@ compute_cap
 8.9
 ```

-You can also compile the Cuda kernels for a specific compute cap using the 
-`CUDA_COMPUTE_CAP=<compute cap>` environment variable.
-
 If any of the above commands errors out, please make sure to update your Cuda version.

 2. Create a new app and add [`candle-core`](https://github.com/huggingface/candle/tree/main/candle-core) with Cuda support.
--- a/candle-book/src/inference/inference.md
+++ b/candle-book/src/inference/inference.md
--- a/candle-book/src/lib.rs
+++ b/candle-book/src/lib.rs
@ -1,6 +1,3 @@
-#[cfg(test)]
-pub mod simplified;
-
 #[cfg(test)]
 mod tests {
    use anyhow::Result;
@ -28,7 +25,6 @@ let weights = candle::safetensors::load(weights_filename, &Device::Cpu).unwrap()
    #[rustfmt::skip]
    #[test]
    fn book_hub_2() {
-        {
 // ANCHOR: book_hub_2
 use candle::Device;
 use hf_hub::api::sync::Api;
@ -46,10 +42,9 @@ let weights = candle::safetensors::load_buffer(&mmap[..], &Device::Cpu).unwrap()
        assert_eq!(weights.len(), 206);
    }

-    // #[rustfmt::skip]
-    // #[test]
-    // fn book_hub_3() {
-    {
+    #[rustfmt::skip]
+    #[test]
+    fn book_hub_3() {
 // ANCHOR: book_hub_3
 use candle::{DType, Device, Tensor};
 use hf_hub::api::sync::Api;
@ -104,7 +99,6 @@ let tp_tensor = Tensor::from_raw_buffer(&raw, dtype, &tp_shape, &Device::Cpu).un
        assert_eq!(view.shape(), &[768, 768]);
        assert_eq!(tp_tensor.dims(), &[192, 768]);
    }
-}

    #[rustfmt::skip]
    #[test]
--- a/candle-book/src/simplified.rs
+++ b/candle-book/src/simplified.rs
@ -1,196 +0,0 @@
-//! # A simplified example in Rust of training a neural network and then using it, based on the Candle framework by Hugging Face.
-//! Author: Evgeny Igumnov 2023 igumnovnsk@gmail.com
-//! This program implements a neural network to predict the winner of the second round of elections based on the results of the first round.
-//!
-//! ## Key points:
-//!
-//! A multilayer perceptron with two hidden layers is used. The first hidden layer has 4 neurons, the second has 2 neurons.
-//! The input is a vector of 2 numbers - the percentage of votes for the first and second candidates in the first stage.
-//! The output is the number 0 or 1, where 1 means that the first candidate will win in the second stage, 0 means that he will lose.
-//! For training, samples with real data on the results of the first and second stages of different elections are used.
-//! The model is trained by backpropagation using gradient descent and the cross-entropy loss function.
-//! Model parameters (weights of neurons) are initialized randomly, then optimized during training.
-//! After training, the model is tested on a held-out sample to evaluate the accuracy.
-//! If the accuracy on the test set is below 100%, the model is considered underfit and the learning process is repeated.
-//! Thus, this neural network learns to find hidden relationships between the results of the first and second rounds of voting in order to make predictions for new data.
-
-#[rustfmt::skip]
-mod tests {
-
-use candle::{DType, Result, Tensor, D, Device};
-use candle_nn::{loss, ops, Linear, Module, VarBuilder, VarMap, Optimizer};
-
-// ANCHOR: book_training_simplified1
-const VOTE_DIM: usize = 2; // Values per input sample (first-round votes for the two candidates).
-const RESULTS: usize = 1; // The output layer emits `RESULTS + 1` logits.
-const EPOCHS: usize = 10; // Upper bound on training epochs per `train` call.
-const LAYER1_OUT_SIZE: usize = 4; // Width of the first hidden layer.
-const LAYER2_OUT_SIZE: usize = 2; // Width of the second hidden layer.
-const LEARNING_RATE: f64 = 0.05; // Learning rate handed to the SGD optimizer.
-
-#[derive(Clone)] // Cloned for every training attempt in `simplified`.
-pub struct Dataset {
-    pub train_votes: Tensor, // Training inputs: one row of `VOTE_DIM` values per sample.
-    pub train_results: Tensor, // Training labels (0 or 1), one per training row.
-    pub test_votes: Tensor, // Test inputs, laid out like `train_votes`.
-    pub test_results: Tensor, // Test labels (0 or 1), one per test row.
-}
-
-struct MultiLevelPerceptron { // Three linear layers; the ReLUs between them are applied in `forward`.
-    ln1: Linear, // VOTE_DIM -> LAYER1_OUT_SIZE
-    ln2: Linear, // LAYER1_OUT_SIZE -> LAYER2_OUT_SIZE
-    ln3: Linear, // LAYER2_OUT_SIZE -> RESULTS + 1
-}
-
-impl MultiLevelPerceptron {
-    fn new(vs: VarBuilder) -> Result<Self> { // Creates the three layers under the prefixes "ln1", "ln2" and "ln3".
-        let ln1 = candle_nn::linear(VOTE_DIM, LAYER1_OUT_SIZE, vs.pp("ln1"))?;
-        let ln2 = candle_nn::linear(LAYER1_OUT_SIZE, LAYER2_OUT_SIZE, vs.pp("ln2"))?;
-        let ln3 = candle_nn::linear(LAYER2_OUT_SIZE, RESULTS + 1, vs.pp("ln3"))?; // RESULTS + 1 = 2 output logits.
-        Ok(Self { ln1, ln2, ln3 })
-    }
-
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> { // Returns the raw output of `ln3`; no softmax is applied here.
-        let xs = self.ln1.forward(xs)?;
-        let xs = xs.relu()?;
-        let xs = self.ln2.forward(&xs)?;
-        let xs = xs.relu()?;
-        self.ln3.forward(&xs) // No activation after the last layer.
-    }
-}
-
-// ANCHOR_END: book_training_simplified1
-
-
-
-// ANCHOR: book_training_simplified3
-#[tokio::test]
-async fn simplified() -> anyhow::Result<()> { // End to end: build the dataset, train until `train` succeeds, then predict one sample.
-
-    let dev = Device::cuda_if_available(0)?;
-
-    let train_votes_vec: Vec<u32> = vec![ // Flat list; every VOTE_DIM consecutive values form one sample.
-        15, 10,
-        10, 15,
-        5, 12,
-        30, 20,
-        16, 12,
-        13, 25,
-        6, 14,
-        31, 21,
-    ];
-    let train_votes_tensor = Tensor::from_vec(train_votes_vec.clone(), (train_votes_vec.len() / VOTE_DIM, VOTE_DIM), &dev)?.to_dtype(DType::F32)?; // Shape (samples, VOTE_DIM), converted to f32.
-
-    let train_results_vec: Vec<u32> = vec![ // One label per training sample, in the same order.
-        1,
-        0,
-        0,
-        1,
-        1,
-        0,
-        0,
-        1,
-    ];
-    let train_results_tensor = Tensor::from_vec(train_results_vec, train_votes_vec.len() / VOTE_DIM, &dev)?; // Length is derived from the votes vector; labels stay u32.
-
-    let test_votes_vec: Vec<u32> = vec![
-        13, 9,
-        8, 14,
-        3, 10,
-    ];
-    let test_votes_tensor = Tensor::from_vec(test_votes_vec.clone(), (test_votes_vec.len() / VOTE_DIM, VOTE_DIM), &dev)?.to_dtype(DType::F32)?;
-
-    let test_results_vec: Vec<u32> = vec![
-        1,
-        0,
-        0,
-    ];
-    let test_results_tensor = Tensor::from_vec(test_results_vec.clone(), test_results_vec.len(), &dev)?;
-
-    let m = Dataset {
-        train_votes: train_votes_tensor,
-        train_results: train_results_tensor,
-        test_votes: test_votes_tensor,
-        test_results: test_results_tensor,
-    };
-
-    let trained_model: MultiLevelPerceptron;
-    loop { // NOTE(review): no retry limit; this spins forever if `train` keeps returning Err.
-        println!("Trying to train neural network.");
-        match train(m.clone(), &dev) {
-            Ok(model) => {
-                trained_model = model;
-                break;
-            },
-            Err(e) => {
-                println!("Error: {}", e);
-                continue;
-            }
-        }
-
-    }
-
-    let real_world_votes: Vec<u32> = vec![
-        13, 22,
-    ];
-
-    let tensor_test_votes = Tensor::from_vec(real_world_votes.clone(), (1, VOTE_DIM), &dev)?.to_dtype(DType::F32)?; // A single sample of shape (1, VOTE_DIM).
-
-    let final_result = trained_model.forward(&tensor_test_votes)?;
-
-    let result = final_result
-        .argmax(D::Minus1)? // Index of the largest logit along the last dimension.
-        .to_dtype(DType::F32)?
-        .get(0).map(|x| x.to_scalar::<f32>())??; // First `?` is for `get`, second for `to_scalar`.
-    println!("real_life_votes: {:?}", real_world_votes);
-    println!("neural_network_prediction_result: {:?}", result);
-
-    Ok(())
-
-}
-// ANCHOR_END: book_training_simplified3
-
-// ANCHOR: book_training_simplified2
-fn train(m: Dataset, dev: &Device) -> anyhow::Result<MultiLevelPerceptron> { // Trains a fresh model; Err unless test accuracy reaches 100% within EPOCHS.
-    let train_results = m.train_results.to_device(dev)?;
-    let train_votes = m.train_votes.to_device(dev)?;
-    let varmap = VarMap::new();
-    let vs = VarBuilder::from_varmap(&varmap, DType::F32, dev);
-    let model = MultiLevelPerceptron::new(vs.clone())?;
-    let mut sgd = candle_nn::SGD::new(varmap.all_vars(), LEARNING_RATE)?; // Optimizes every variable registered in `varmap`.
-    let test_votes = m.test_votes.to_device(dev)?;
-    let test_results = m.test_results.to_device(dev)?;
-    let mut final_accuracy: f32 = 0.0;
-    for epoch in 1..EPOCHS + 1 { // Epochs are numbered from 1 for the log line below.
-        let logits = model.forward(&train_votes)?;
-        let log_sm = ops::log_softmax(&logits, D::Minus1)?;
-        let loss = loss::nll(&log_sm, &train_results)?; // NLL on log-softmax outputs against the training labels.
-        sgd.backward_step(&loss)?;
-
-        let test_logits = model.forward(&test_votes)?;
-        let sum_ok = test_logits
-            .argmax(D::Minus1)?
-            .eq(&test_results)?
-            .to_dtype(DType::F32)?
-            .sum_all()?
-            .to_scalar::<f32>()?; // Number of test samples whose argmax matches the label.
-        let test_accuracy = sum_ok / test_results.dims1()? as f32;
-        final_accuracy = 100. * test_accuracy; // Stored as a percentage.
-        println!("Epoch: {epoch:3} Train loss: {:8.5} Test accuracy: {:5.2}%",
-                 loss.to_scalar::<f32>()?,
-                 final_accuracy
-        );
-        if final_accuracy == 100.0 { // Stop early once every test sample is classified correctly.
-            break;
-        }
-    }
-    if final_accuracy < 100.0 {
-        Err(anyhow::Error::msg("The model is not trained well enough."))
-    } else {
-        Ok(model)
-    }
-}
-// ANCHOR_END: book_training_simplified2
-
-
-}
--- a/candle-book/src/training/training.md
+++ b/candle-book/src/training/training.md
--- a/candle-book/src/training/simplified.md
+++ b/candle-book/src/training/simplified.md
@ -1,45 +0,0 @@
-# Simplified
-
-## How it works
-
-This program implements a neural network to predict the winner of the second round of elections based on the results of the first round.
-
-Key points:
-
-1. A multilayer perceptron with two hidden layers is used. The first hidden layer has 4 neurons, the second has 2 neurons.
-2. The input is a vector of 2 numbers - the percentage of votes for the first and second candidates in the first stage.
-3. The output is the number 0 or 1, where 1 means that the first candidate will win in the second stage, 0 means that he will lose.
-4. For training, samples with real data on the results of the first and second stages of different elections are used.
-5. The model is trained by backpropagation using gradient descent and the cross-entropy loss function.
-6. Model parameters (weights of neurons) are initialized randomly, then optimized during training.
-7. After training, the model is tested on a held-out sample to evaluate the accuracy.
-8. If the accuracy on the test set is below 100%, the model is considered underfit and the learning process is repeated.
-
-Thus, this neural network learns to find hidden relationships between the results of the first and second rounds of voting in order to make predictions for new data.
-
-
-```rust,ignore
-{{#include ../simplified.rs:book_training_simplified1}}
-```
-
-```rust,ignore
-{{#include ../simplified.rs:book_training_simplified2}}
-```
-
-```rust,ignore
-{{#include ../simplified.rs:book_training_simplified3}}
-```
-
-
-## Example output
-
-```bash
-Trying to train neural network.
-Epoch:   1 Train loss:  4.42555 Test accuracy:  0.00%
-Epoch:   2 Train loss:  0.84677 Test accuracy: 33.33%
-Epoch:   3 Train loss:  2.54335 Test accuracy: 33.33%
-Epoch:   4 Train loss:  0.37806 Test accuracy: 33.33%
-Epoch:   5 Train loss:  0.36647 Test accuracy: 100.00%
-real_life_votes: [13, 22]
-neural_network_prediction_result: 0.0
-```
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@ -12,9 +12,7 @@ readme = "README.md"
 [dependencies]
 accelerate-src = { workspace = true, optional = true }
 byteorder = { workspace = true }
-candle-kernels = { workspace = true, optional = true }
-candle-metal-kernels = { workspace = true, optional = true }
-metal = { workspace = true, optional = true}
+candle-kernels = { path = "../candle-kernels", version = "0.2.1", optional = true }
 cudarc = { workspace = true, optional = true }
 gemm = { workspace = true }
 half = { workspace = true }
@ -28,14 +26,11 @@ rand_distr = { workspace = true }
 rayon = { workspace = true }
 safetensors = { workspace = true }
 thiserror = { workspace = true }
-yoke = { workspace = true }
 zip = { workspace = true }

 [dev-dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-criterion = { workspace = true }
-

 [features]
 default = []
@ -43,8 +38,3 @@ cuda = ["cudarc", "dep:candle-kernels"]
 cudnn = ["cuda", "cudarc/cudnn"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
 accelerate = ["dep:libc", "dep:accelerate-src"]
-metal = ["dep:metal", "dep:candle-metal-kernels"]
-
-[[bench]]
-name = "bench_main"
-harness = false
--- a/candle-core/benches/bench_main.rs
+++ b/candle-core/benches/bench_main.rs
@ -1,10 +0,0 @@
-mod benchmarks;
-
-use criterion::criterion_main;
-criterion_main!( // Expands to the bench binary's `main`, which runs each group listed here.
-    benchmarks::affine::benches,
-    benchmarks::matmul::benches,
-    benchmarks::random::benches,
-    benchmarks::where_cond::benches,
-    benchmarks::conv_transpose2d::benches,
-);
--- a/candle-core/benches/benchmarks/affine.rs
+++ b/candle-core/benches/benchmarks/affine.rs
@ -1,43 +0,0 @@
-use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
-use candle_core::{DType, Device, Tensor};
-use criterion::{black_box, criterion_group, Criterion, Throughput};
-use std::time::Instant;
-
-fn run(a: &Tensor) { // Operation under test: a single `affine` call whose result is dropped.
-    a.affine(12.34, 56.78).unwrap();
-}
-
-fn run_affine_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) { // Registers one affine benchmark group for `device` and `dtype`.
-    let b = 1;
-    let m = 1024;
-    let k = 1024;
-
-    let tensor = Tensor::zeros((b, m, k), dtype, &device).unwrap();
-
-    let flops = b * m * k * dtype.size_in_bytes(); // Despite the name, this is a byte count: elements * element size.
-
-    let mut group = c.benchmark_group(device.bench_name(name));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |b| {
-        b.iter_custom(|iters| { // Timed by hand so the device sync is inside the measurement.
-            let start = Instant::now();
-            for _i in 0..iters {
-                run(black_box(&tensor));
-            }
-            device.sync().unwrap(); // Synchronize the device before reading the elapsed time.
-            start.elapsed()
-        })
-    });
-    group.finish();
-}
-
-fn criterion_benchmark(c: &mut Criterion) { // Runs the affine benchmark for F32, F16 and BF16 on every available device.
-    let handler = BenchDeviceHandler::new().unwrap();
-    for device in handler.devices {
-        run_affine_benchmark(c, &device, DType::F32, "affine_f32");
-        run_affine_benchmark(c, &device, DType::F16, "affine_f16");
-        run_affine_benchmark(c, &device, DType::BF16, "affine_bf16");
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
--- a/candle-core/benches/benchmarks/conv_transpose2d.rs
+++ b/candle-core/benches/benchmarks/conv_transpose2d.rs
@ -1,59 +0,0 @@
-use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
-use candle_core::{DType, Device, Tensor};
-use criterion::{black_box, criterion_group, Criterion, Throughput};
-use std::time::Instant;
-
-fn run( // Operation under test: one `conv_transpose2d` call whose result is dropped.
-    x: &Tensor,
-    k: &Tensor,
-    padding: usize,
-    output_padding: usize,
-    stride: usize,
-    dilation: usize,
-) {
-    x.conv_transpose2d(k, padding, output_padding, stride, dilation)
-        .unwrap();
-}
-
-fn run_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) { // Registers one conv_transpose2d benchmark group for `device` and `dtype`.
-    let t = Tensor::arange(0.0f32, 10000.0, device) // 10_000 values, reshaped below to (1, 4, 50, 50).
-        .unwrap()
-        .reshape((1, 4, 50, 50))
-        .unwrap()
-        .to_dtype(dtype)
-        .unwrap();
-
-    let kernel = Tensor::arange(0.0f32, 100.0, device) // 100 values, reshaped below to (4, 1, 5, 5).
-        .unwrap()
-        .reshape((4, 1, 5, 5))
-        .unwrap()
-        .to_dtype(dtype)
-        .unwrap();
-
-    let flops = t.dims().iter().product::<usize>() * dtype.size_in_bytes(); // Byte size of the input tensor only, despite the name.
-
-    let mut group = c.benchmark_group(device.bench_name(name));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |b| {
-        b.iter_custom(|iters| { // Timed by hand so the device sync is inside the measurement.
-            let start = Instant::now();
-            for _i in 0..iters {
-                run(black_box(&t), black_box(&kernel), 1, 0, 1, 2); // padding=1, output_padding=0, stride=1, dilation=2
-            }
-            device.sync().unwrap(); // Synchronize the device before reading the elapsed time.
-            start.elapsed()
-        })
-    });
-    group.finish();
-}
-
-fn criterion_benchmark(c: &mut Criterion) { // Runs the conv_transpose2d benchmark for F32, F16 and BF16 on every available device.
-    let handler = BenchDeviceHandler::new().unwrap();
-    for device in handler.devices {
-        run_benchmark(c, &device, DType::F32, "conv_transpose2d_f32");
-        run_benchmark(c, &device, DType::F16, "conv_transpose2d_f16");
-        run_benchmark(c, &device, DType::BF16, "conv_transpose2d_bf16");
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
--- a/candle-core/benches/benchmarks/matmul.rs
+++ b/candle-core/benches/benchmarks/matmul.rs
@ -1,44 +0,0 @@
-use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
-use candle_core::{DType, Device, Tensor};
-use criterion::{black_box, criterion_group, Criterion, Throughput};
-use std::time::Instant;
-
-fn run(a: &Tensor, b: &Tensor) { // Operation under test: `a` times the transpose of `b`; the result is dropped.
-    a.matmul(&b.t().unwrap()).unwrap();
-}
-
-fn run_bench(c: &mut Criterion, device: &Device) { // Registers the F32 matmul benchmark group for `device`.
-    let b = 1;
-    let m = 1;
-    let n = 2048;
-    let k = 2048;
-
-    let dtype = DType::F32;
-    let lhs = Tensor::zeros((b, m, k), dtype, device).unwrap();
-    let rhs = Tensor::zeros((b, n, k), dtype, device).unwrap(); // Stored as (b, n, k); `run` transposes it before multiplying.
-
-    let flops = b * m * n * k; // NOTE(review): an element-product count, yet reported via `Throughput::Bytes` below.
-
-    let mut group = c.benchmark_group(device.bench_name("matmul"));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |b| {
-        b.iter_custom(|iters| { // Timed by hand so the device sync is inside the measurement.
-            let start = Instant::now();
-            for _i in 0..iters {
-                run(black_box(&lhs), black_box(&rhs));
-            }
-            device.sync().unwrap(); // Synchronize the device before reading the elapsed time.
-            start.elapsed()
-        })
-    });
-    group.finish();
-}
-
-fn criterion_benchmark(c: &mut Criterion) { // Runs the matmul benchmark on every available device.
-    let handler = BenchDeviceHandler::new().unwrap();
-    for device in handler.devices {
-        run_bench(c, &device);
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
--- a/candle-core/benches/benchmarks/mod.rs
+++ b/candle-core/benches/benchmarks/mod.rs
@ -1,67 +0,0 @@
-pub(crate) mod affine;
-pub(crate) mod conv_transpose2d;
-pub(crate) mod matmul;
-pub(crate) mod random;
-pub(crate) mod where_cond;
-
-use candle_core::{Device, Result};
-
-pub(crate) trait BenchDevice { // Helpers the benchmarks need from a device.
-    fn sync(&self) -> Result<()>; // Called by the benchmarks right before they stop the timer.
-
-    fn bench_name<S: Into<String>>(&self, name: S) -> String; // Builds the benchmark group name from `name`.
-}
-
-impl BenchDevice for Device {
-    fn sync(&self) -> Result<()> { // Delegates to the backend's synchronization call; nothing to do on CPU.
-        match self {
-            Device::Cpu => Ok(()),
-            Device::Cuda(device) => {
-                #[cfg(feature = "cuda")]
-                return Ok(device.synchronize()?);
-                #[cfg(not(feature = "cuda"))]
-                panic!("Cuda device without cuda feature enabled: {:?}", device)
-            }
-            Device::Metal(device) => {
-                #[cfg(feature = "metal")]
-                return Ok(device.wait_until_completed()?);
-                #[cfg(not(feature = "metal"))]
-                panic!("Metal device without metal feature enabled: {:?}", device)
-            }
-        }
-    }
-
-    fn bench_name<S: Into<String>>(&self, name: S) -> String { // Prefixes `name` with the backend: accelerate/mkl/cpu, cuda or metal.
-        match self {
-            Device::Cpu => {
-                let cpu_type = if cfg!(feature = "accelerate") { // `accelerate` is checked first, then `mkl`, else plain "cpu".
-                    "accelerate"
-                } else if cfg!(feature = "mkl") {
-                    "mkl"
-                } else {
-                    "cpu"
-                };
-                format!("{}_{}", cpu_type, name.into())
-            }
-            Device::Cuda(_) => format!("cuda_{}", name.into()),
-            Device::Metal(_) => format!("metal_{}", name.into()),
-        }
-    }
-}
-
-struct BenchDeviceHandler { // Holds the devices each benchmark iterates over.
-    devices: Vec<Device>, // Filled by `new`: an optional accelerator first, then the CPU.
-}
-
-impl BenchDeviceHandler {
-    pub fn new() -> Result<Self> { // Picks devices from the enabled cargo features.
-        let mut devices = Vec::new();
-        if cfg!(feature = "metal") { // Metal wins over CUDA when both features are enabled.
-            devices.push(Device::new_metal(0)?);
-        } else if cfg!(feature = "cuda") {
-            devices.push(Device::new_cuda(0)?);
-        }
-        devices.push(Device::Cpu); // The CPU is always included, after any accelerator.
-        Ok(Self { devices })
-    }
-}
--- a/candle-core/benches/benchmarks/random.rs
+++ b/candle-core/benches/benchmarks/random.rs
@ -1,63 +0,0 @@
-use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
-use candle_core::{DType, Device, Tensor};
-use criterion::{black_box, criterion_group, Criterion, Throughput};
-use std::time::Instant;
-
-fn rand_uniform(a: &Tensor) { // Operation under test: `rand_like(-1.0, 123.0)`; the result is dropped.
-    a.rand_like(-1.0, 123.0).unwrap();
-}
-
-fn rand_normal(a: &Tensor) { // Operation under test: `randn_like(100.0, 15.0)`; the result is dropped.
-    a.randn_like(100.0, 15.0).unwrap();
-}
-
-fn run_random_bench(c: &mut Criterion, device: &Device) { // Registers two groups for `device`: "random_uniform" and "random_normal".
-    let b = 1;
-
-    let rows = 2048;
-    let cols = 2048;
-
-    let dtype = DType::F32;
-    let tensor = Tensor::zeros((b, rows, cols), dtype, device).unwrap();
-
-    let flops = b * rows * cols * dtype.size_in_bytes(); // Despite the name, this is a byte count: elements * element size.
-
-    let mut group = c.benchmark_group(device.bench_name("random_uniform"));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |benches| { // `move` takes ownership of `tensor`, hence the new tensor below.
-        benches.iter_custom(|iters| {
-            let start = Instant::now();
-            for _i in 0..iters {
-                rand_uniform(black_box(&tensor));
-            }
-            device.sync().unwrap(); // Synchronize the device before reading the elapsed time.
-            start.elapsed()
-        })
-    });
-    group.finish();
-
-    let tensor = Tensor::zeros((b, rows, cols), dtype, device).unwrap(); // Fresh tensor for the second group.
-
-    let mut group = c.benchmark_group(device.bench_name("random_normal"));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |benches| {
-        benches.iter_custom(|iters| {
-            let start = Instant::now();
-            for _i in 0..iters {
-                rand_normal(black_box(&tensor));
-            }
-            device.sync().unwrap(); // Synchronize the device before reading the elapsed time.
-            start.elapsed()
-        })
-    });
-    group.finish();
-}
-
-fn criterion_benchmark(c: &mut Criterion) { // Runs both random benchmarks on every available device.
-    let handler = BenchDeviceHandler::new().unwrap();
-    for device in handler.devices {
-        run_random_bench(c, &device);
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
--- a/candle-core/benches/benchmarks/where_cond.rs
+++ b/candle-core/benches/benchmarks/where_cond.rs
@ -1,64 +0,0 @@
-use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
-use candle_core::{DType, Device, Tensor};
-use criterion::{black_box, criterion_group, Criterion, Throughput};
-use std::time::Instant;
-
-fn run(a: &Tensor, b: &Tensor, c: &Tensor) { // Operation under test: `a.where_cond(b, c)`; the result is dropped.
-    a.where_cond(b, c).unwrap();
-}
-
-const fn create_cond_arr<const N: usize>() -> [u8; N] { // Builds [0, 1, 0, 1, ...] of length N; usable in const context.
-    let mut arr = [0u8; N];
-    let mut i = 0;
-    while i < N { // `while` rather than `for`: `for` loops are not permitted in `const fn`.
-        arr[i] = (i % 2) as u8;
-        i += 1;
-    }
-    arr
-}
-
-const B: usize = 1; // First dimension of the benchmark tensors.
-const M: usize = 1024; // Second dimension of the benchmark tensors.
-const K: usize = 1024; // Third dimension of the benchmark tensors.
-const SIZE: usize = B * M * K; // Total element count of a (B, M, K) tensor.
-
-const DATA: [u8; SIZE] = create_cond_arr::<SIZE>(); // Alternating 0/1 values used as the condition tensor.
-
-fn run_where_cond_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) { // Registers one where_cond benchmark group for `device` and `dtype`.
-    let tensor = Tensor::from_slice(DATA.as_slice(), (B, M, K), &device).unwrap(); // u8 condition tensor built from `DATA`.
-    let on_true = Tensor::ones((B, M, K), dtype, &device).unwrap();
-    let on_false = Tensor::zeros((B, M, K), dtype, &device).unwrap();
-
-    let elements = B * M * K;
-    // E.g. 2 f32 tensors + 1 u8 tensor
-    let flops = (2 * elements * dtype.size_in_bytes()) + elements; // A byte count despite the name: two `dtype` tensors plus one u8 tensor.
-
-    let mut group = c.benchmark_group(device.bench_name(name));
-    group.throughput(Throughput::Bytes(flops as u64));
-    group.bench_function("iter", move |b| {
-        b.iter_custom(|iters| { // Timed by hand so the device sync is inside the measurement.
-            let start = Instant::now();
-            for _i in 0..iters {
-                run(
-                    black_box(&tensor),
-                    black_box(&on_true),
-                    black_box(&on_false),
-                );
-            }
-            device.sync().unwrap(); // Synchronize the device before reading the elapsed time.
-            start.elapsed()
-        })
-    });
-    group.finish();
-}
-
-fn criterion_benchmark(c: &mut Criterion) { // Runs the where_cond benchmark for F32, BF16 and F16 on every available device.
-    let device = BenchDeviceHandler::new().unwrap(); // Despite the name, this is the handler that owns the device list.
-    for d in device.devices {
-        run_where_cond_benchmark(c, &d, DType::F32, "where_cond_f32");
-        run_where_cond_benchmark(c, &d, DType::BF16, "where_cond_bf16");
-        run_where_cond_benchmark(c, &d, DType::F16, "where_cond_f16");
-    }
-}
-
-criterion_group!(benches, criterion_benchmark);
--- a/candle-core/examples/basics.rs
+++ b/candle-core/examples/basics.rs
@ -8,10 +8,11 @@ use anyhow::Result;
 use candle_core::{Device, Tensor};

 fn main() -> Result<()> {
-    let a = Tensor::new(&[[0.0f32, 1.0, 2.0], [3.0, 4.0, 5.0]], &Device::Cpu)?;
-    let b = Tensor::new(&[[88.0f32, 99.0]], &Device::Cpu)?;
-    let new_a = a.slice_scatter(&b, 1, 2)?;
-    assert_eq!(a.to_vec2::<f32>()?, [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
-    assert_eq!(new_a.to_vec2::<f32>()?, [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
+    let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?;
+    let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?;
+    let start = std::time::Instant::now();
+    let res = inp.conv2d(&w, 0, 1, 1, 1)?;
+    println!("{:?}", start.elapsed());
+    println!("{res:?}");
    Ok(())
 }
--- a/candle-core/examples/cuda_basics.rs
+++ b/candle-core/examples/cuda_basics.rs
@ -5,32 +5,25 @@ extern crate accelerate_src;
 extern crate intel_mkl_src;

 use anyhow::Result;
-use candle_core::{Device, Module, Tensor};
-
-use candle_core::quantized::{QMatMul, QTensor};
+use candle_core::{Device, Tensor};

 fn main() -> Result<()> {
    let device = Device::new_cuda(0)?;
-    let q = Tensor::randn(0f32, 1.0, (72, 256), &device)?;
-    let q_cpu = q.to_device(&Device::Cpu)?;
-    let q = QTensor::quantize(&q, candle_core::quantized::GgmlDType::Q8K)?;
-    let q = QMatMul::from_qtensor(q)?;
-    let x = Tensor::randn(0f32, 1.0, (5, 256), &device)?;
-    let res_q_cuda = q.forward(&x)?;
-    println!("{res_q_cuda}");
-
-    let q_cpu = QTensor::quantize(&q_cpu, candle_core::quantized::GgmlDType::Q8K)?;
-    let q_cpu_tensor = q_cpu.dequantize(&Device::Cpu)?;
-    let q_cpu = QMatMul::from_qtensor(q_cpu)?;
-    let x_cpu = x.to_device(&Device::Cpu)?;
-    let res_q_cpu = q_cpu.forward(&x_cpu)?;
-    println!("{res_q_cpu}");
-
-    let res_mm = x_cpu.matmul(&q_cpu_tensor.t()?)?;
-    let diff = (res_mm - res_q_cuda.to_device(&Device::Cpu))?
-        .abs()?
-        .flatten_all()?
-        .max(0)?;
+    let in_t = Tensor::rand(-1f32, 1f32, (1, 3, 12, 7), &device)?;
+    let k_t = Tensor::rand(-1f32, 1f32, (6, 3, 1, 1), &device)?;
+    let out_t = in_t.conv2d(&k_t, 0, 1, 1, 1)?;
+    println!("{out_t}");
+    let in_t = in_t.to_device(&Device::Cpu)?;
+    let k_t = k_t.to_device(&Device::Cpu)?;
+    let out_t2 = in_t.conv2d(&k_t, 0, 1, 1, 1)?;
+    let diff = (out_t.to_device(&Device::Cpu)? - out_t2)?
+        .sqr()?
+        .sum_all()?;
    println!("{diff}");
+
+    let t = Tensor::randn(0f32, 1f32, (2, 4, 96, 96), &device)?;
+    let w = Tensor::randn(0f32, 1f32, (320, 4, 3, 3), &device)?;
+    let res = t.conv2d(&w, 1, 1, 1, 1)?;
+    println!("{res:?}");
    Ok(())
 }
--- a/candle-core/examples/tensor-tools.rs
+++ b/candle-core/examples/tensor-tools.rs
@ -0,0 +1,299 @@
+use candle_core::quantized::{gguf_file, k_quants, QTensor};
+use candle_core::{Device, Result, Tensor};
+use clap::{Parser, Subcommand, ValueEnum};
+use rayon::prelude::*;
+
+// Strategy used to pick which tensors of a model get quantized. Maintainer notes are kept in
+// plain `//` comments since clap derives user-visible help text from `///` doc comments.
+#[derive(Clone, Debug, ValueEnum)]
+enum QuantizationMode {
+    /// The default quantization includes all 2d tensors, except the output tensor which always
+    /// uses Q6_K.
+    Llama,
+}
+
+impl QuantizationMode {
+    /// Re-quantizes `tensor` according to this mode.
+    ///
+    /// `name` is the tensor name, `default` is the quantization function applied to every
+    /// selected tensor that has no mode-specific override. Tensors that are not selected are
+    /// returned unchanged.
+    fn quantize(
+        &self,
+        name: &str,
+        tensor: QTensor,
+        default: fn(&Tensor) -> Result<QTensor>,
+    ) -> Result<QTensor> {
+        match self {
+            Self::Llama => {
+                // Same behavior as the llama.cpp quantization: only rank-2 tensors whose name
+                // ends in ".weight" are touched, everything else passes through as is.
+                if !name.ends_with(".weight") || tensor.rank() != 2 {
+                    return Ok(tensor);
+                }
+                let dequantized = tensor.dequantize(&Device::Cpu)?;
+                match name {
+                    // The output tensor always uses Q6_K, whatever the requested default is.
+                    "output.weight" => QTensor::quantize::<k_quants::BlockQ6K>(&dequantized),
+                    _ => default(&dequantized),
+                }
+            }
+        }
+    }
+}
+
+// Target quantization selectable from the command line; each variant is mapped to a concrete
+// `QTensor::quantize::<_>` instantiation in `run_quantize`.
+#[derive(Clone, Debug, ValueEnum)]
+enum Quantization {
+    // The variants below carry an explicit command line name.
+    #[value(name = "q4_0")]
+    Q4_0,
+    #[value(name = "q4_1")]
+    Q4_1,
+    #[value(name = "q5_0")]
+    Q5_0,
+    #[value(name = "q5_1")]
+    Q5_1,
+    #[value(name = "q8_0")]
+    Q8_0,
+    #[value(name = "q8_1")]
+    Q8_1,
+    // The remaining variants rely on the name clap derives from the identifier.
+    Q2k,
+    Q3k,
+    Q4k,
+    Q5k,
+    Q6k,
+    Q8k,
+    F16,
+    F32,
+}
+
+// On-disk tensor container formats that the `ls` subcommand knows how to list.
+#[derive(Clone, Debug, ValueEnum)]
+enum Format {
+    Safetensors,
+    Npz,
+    Ggml,
+    Gguf,
+    Pth,
+    Pickle,
+}
+
+impl Format {
+    /// Guesses the format from the extension of `p`.
+    ///
+    /// Returns `None` when the path has no extension, when the extension is not valid UTF-8,
+    /// or when the extension is not one of the recognized ones.
+    fn infer<P: AsRef<std::path::Path>>(p: P) -> Option<Self> {
+        let extension = p.as_ref().extension()?.to_str()?;
+        let format = match extension {
+            // We don't infer any format for .bin as it can be used for ggml/gguf or pytorch.
+            "safetensors" | "safetensor" => Self::Safetensors,
+            "npz" => Self::Npz,
+            "pth" | "pt" => Self::Pth,
+            "ggml" => Self::Ggml,
+            "gguf" => Self::Gguf,
+            _ => return None,
+        };
+        Some(format)
+    }
+}
+
+// Subcommands of the tool. Maintainer notes use plain `//` comments because clap turns `///`
+// doc comments into user-visible help text.
+#[derive(Clone, Debug, Subcommand)]
+enum Command {
+    // Lists the tensors contained in each of the given files, see `run_ls`.
+    Ls {
+        files: Vec<std::path::PathBuf>,
+
+        /// The file format to use, if unspecified infer from the file extension.
+        #[arg(long, value_enum)]
+        format: Option<Format>,
+
+        /// Enable verbose mode.
+        #[arg(short, long)]
+        verbose: bool,
+    },
+
+    // Re-quantizes a gguf file into another gguf file, see `run_quantize`.
+    Quantize {
+        /// The input file, in gguf format.
+        in_file: std::path::PathBuf,
+        /// The output file, in gguf format.
+        out_file: std::path::PathBuf,
+
+        /// The quantization schema to apply.
+        #[arg(long, value_enum)]
+        quantization: Quantization,
+
+        /// Which tensor to quantize.
+        #[arg(long, value_enum, default_value_t = QuantizationMode::Llama)]
+        mode: QuantizationMode,
+    },
+}
+
+// Top-level command line arguments: the tool only dispatches on a subcommand.
+#[derive(Clone, Debug, Parser)]
+struct Args {
+    #[command(subcommand)]
+    command: Command,
+}
+
+/// Prints the tensors found in `file`, one per line, sorted by name where the format has names.
+///
+/// `format` forces the container format; when it is `None` the format is inferred from the
+/// file extension, and a message is printed (without failing) if that is not possible.
+/// `verbose` adds the full tensor info for pth files and the metadata entries for gguf files.
+fn run_ls(file: &std::path::PathBuf, format: Option<Format>, verbose: bool) -> Result<()> {
+    // An explicitly requested format wins, the file extension is only a fallback.
+    let format = match format.or_else(|| Format::infer(file)) {
+        Some(format) => format,
+        None => {
+            println!(
+                "{file:?}: cannot infer format from file extension, use the --format flag"
+            );
+            return Ok(());
+        }
+    };
+    match format {
+        Format::Npz => {
+            let npz = candle_core::npy::NpzTensors::new(file)?;
+            let mut tensor_names = npz.names();
+            tensor_names.sort();
+            for name in tensor_names {
+                // A tensor whose header cannot be read is reported inline rather than aborting.
+                let shape_dtype = match npz.get_shape_and_dtype(name) {
+                    Err(err) => err.to_string(),
+                    Ok((shape, dtype)) => format!("[{shape:?}; {dtype:?}]"),
+                };
+                println!("{name}: {shape_dtype}")
+            }
+        }
+        Format::Safetensors => {
+            let mmap = unsafe { candle_core::safetensors::MmapedFile::new(file)? };
+            let safetensors = mmap.deserialize()?;
+            let mut views = safetensors.tensors();
+            views.sort_by(|a, b| a.0.cmp(&b.0));
+            for (name, view) in views.iter() {
+                let st_dtype = view.dtype();
+                // Prefer the candle dtype name, fall back to the safetensors one when there is
+                // no conversion.
+                let dtype = match candle_core::DType::try_from(st_dtype) {
+                    Ok(candle_dtype) => format!("{candle_dtype:?}"),
+                    Err(_) => format!("{st_dtype:?}"),
+                };
+                let shape = view.shape();
+                println!("{name}: [{shape:?}; {dtype}]")
+            }
+        }
+        Format::Pth => {
+            let mut infos = candle_core::pickle::read_pth_tensor_info(file, verbose)?;
+            infos.sort_by(|a, b| a.name.cmp(&b.name));
+            for info in infos.iter() {
+                println!(
+                    "{}: [{:?}; {:?}]",
+                    info.name,
+                    info.layout.shape(),
+                    info.dtype,
+                );
+                if verbose {
+                    println!("    {:?}", info);
+                }
+            }
+        }
+        Format::Pickle => {
+            // Raw pickle files are dumped object by object, in stack order.
+            let handle = std::fs::File::open(file)?;
+            let mut reader = std::io::BufReader::new(handle);
+            let mut stack = candle_core::pickle::Stack::empty();
+            stack.read_loop(&mut reader)?;
+            for (index, object) in stack.stack().iter().enumerate() {
+                println!("{index} {object:?}");
+            }
+        }
+        Format::Ggml => {
+            let mut handle = std::fs::File::open(file)?;
+            let content = candle_core::quantized::ggml_file::Content::read(&mut handle)?;
+            let mut entries = content.tensors.into_iter().collect::<Vec<_>>();
+            entries.sort_by(|a, b| a.0.cmp(&b.0));
+            for (name, qtensor) in entries.iter() {
+                println!("{name}: [{:?}; {:?}]", qtensor.shape(), qtensor.dtype());
+            }
+        }
+        Format::Gguf => {
+            let mut handle = std::fs::File::open(file)?;
+            let content = gguf_file::Content::read(&mut handle)?;
+            if verbose {
+                let mut entries = content.metadata.into_iter().collect::<Vec<_>>();
+                entries.sort_by(|a, b| a.0.cmp(&b.0));
+                println!("metadata entries ({})", entries.len());
+                for (key, value) in entries.iter() {
+                    println!("  {key}: {value:?}");
+                }
+            }
+            let mut infos = content.tensor_infos.into_iter().collect::<Vec<_>>();
+            infos.sort_by(|a, b| a.0.cmp(&b.0));
+            for (name, info) in infos.iter() {
+                println!("{name}: [{:?}; {:?}]", info.shape, info.ggml_dtype);
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Reads the gguf file `in_file`, re-quantizes its tensors and writes the result, together
+/// with the unchanged metadata, to `out_file` in gguf format.
+///
+/// `q` selects the quantization applied by default and `qmode` decides, tensor by tensor,
+/// whether to quantize and whether to override that default.
+///
+/// Fails with an invalid-input I/O error when `in_file` and `out_file` resolve to the same
+/// file, and propagates any error from opening, reading, quantizing or writing.
+fn run_quantize(
+    in_file: std::path::PathBuf,
+    out_file: std::path::PathBuf,
+    q: Quantization,
+    qmode: QuantizationMode,
+) -> Result<()> {
+    // `File::create` truncates its target, so quantizing a file onto itself would wipe the
+    // input before it has been read. `canonicalize` only succeeds on existing paths, which is
+    // enough here: an output path that does not exist yet cannot be the input file.
+    if let (Ok(src), Ok(dst)) = (in_file.canonicalize(), out_file.canonicalize()) {
+        if src == dst {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::InvalidInput,
+                format!("input and output refer to the same file {src:?}"),
+            )
+            .into());
+        }
+    }
+
+    // Open the out file early so as to fail directly on missing directories etc.
+    let mut out_file = std::fs::File::create(out_file)?;
+    let mut in_ = std::fs::File::open(&in_file)?;
+    let content = gguf_file::Content::read(&mut in_)?;
+    println!("tensors: {}", content.tensor_infos.len());
+
+    let quantize_fn = match q {
+        Quantization::Q4_0 => QTensor::quantize::<k_quants::BlockQ4_0>,
+        Quantization::Q4_1 => QTensor::quantize::<k_quants::BlockQ4_1>,
+        Quantization::Q5_0 => QTensor::quantize::<k_quants::BlockQ5_0>,
+        Quantization::Q5_1 => QTensor::quantize::<k_quants::BlockQ5_1>,
+        Quantization::Q8_0 => QTensor::quantize::<k_quants::BlockQ8_0>,
+        Quantization::Q8_1 => QTensor::quantize::<k_quants::BlockQ8_1>,
+        Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>,
+        Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>,
+        Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>,
+        Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>,
+        Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>,
+        Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>,
+        Quantization::F16 => QTensor::quantize::<half::f16>,
+        Quantization::F32 => QTensor::quantize::<f32>,
+    };
+
+    // Tensors are processed in parallel; each task opens its own handle on the input file so
+    // that reads do not share a file cursor.
+    let qtensors = content
+        .tensor_infos
+        .par_iter()
+        .map(|(name, _)| {
+            println!("  quantizing {name}");
+            let mut in_file = std::fs::File::open(&in_file)?;
+            let tensor = content.tensor(&mut in_file, name)?;
+            let tensor = qmode.quantize(name, tensor, quantize_fn)?;
+            Ok((name, tensor))
+        })
+        .collect::<Result<Vec<_>>>()?;
+    // The writer takes borrowed (name, value) pairs.
+    let qtensors = qtensors
+        .iter()
+        .map(|(k, v)| (k.as_str(), v))
+        .collect::<Vec<_>>();
+
+    let metadata = content
+        .metadata
+        .iter()
+        .map(|(k, v)| (k.as_str(), v))
+        .collect::<Vec<_>>();
+    gguf_file::write(&mut out_file, metadata.as_slice(), &qtensors)?;
+    Ok(())
+}
+
+/// Entry point: parses the command line and dispatches to the selected subcommand.
+fn main() -> anyhow::Result<()> {
+    match Args::parse().command {
+        Command::Ls {
+            files,
+            format,
+            verbose,
+        } => {
+            // A header per file is only useful when several files are listed.
+            let print_header = files.len() > 1;
+            for file in &files {
+                if print_header {
+                    println!("--- {file:?} ---");
+                }
+                run_ls(file, format.clone(), verbose)?;
+            }
+        }
+        Command::Quantize {
+            in_file,
+            out_file,
+            quantization,
+            mode,
+        } => run_quantize(in_file, out_file, quantization, mode)?,
+    }
+    Ok(())
+}
--- a/candle-core/src/accelerate.rs
+++ b/candle-core/src/accelerate.rs
@ -370,70 +370,6 @@ pub fn vd_sqr(a: &[f64], y: &mut [f64]) {
    y.iter_mut().zip(a.iter()).for_each(|(y, a)| *y = *a * *a)
 }

-#[inline]
-pub fn vs_tanh_inplace(y: &mut [f32]) {
-    unsafe { ffi::vvtanhf(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
-#[inline]
-pub fn vd_tanh_inplace(y: &mut [f64]) {
-    unsafe { ffi::vvtanh(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
-#[inline]
-pub fn vs_exp_inplace(y: &mut [f32]) {
-    unsafe { ffi::vvexpf(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
-#[inline]
-pub fn vd_exp_inplace(y: &mut [f64]) {
-    unsafe { ffi::vvexp(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
-#[inline]
-pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = (2.0f32 / std::f32::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)
-    }
-    vs_tanh_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = 0.5 * v * (1.0 + *y)
-    }
-}
-
-#[inline]
-pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = (2.0f64 / std::f64::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)
-    }
-    vd_tanh_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = 0.5 * v * (1.0 + *y)
-    }
-}
-
-#[inline]
-pub fn vs_silu(vs: &[f32], ys: &mut [f32]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vs_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
-#[inline]
-pub fn vd_silu(vs: &[f64], ys: &mut [f64]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vd_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
 macro_rules! binary_op {
    ($fn_name:ident, $ty:ty, $accelerate_name:ident) => {
        #[inline]
--- a/candle-core/src/backend.rs
+++ b/candle-core/src/backend.rs
@ -39,14 +39,6 @@ pub trait BackendStorage: Sized {
        _params: &crate::conv::ParamsConv1D,
    ) -> Result<Self>;

-    fn conv_transpose1d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self>;
-
    fn conv2d(
        &self,
        _l: &Layout,
@ -65,7 +57,6 @@ pub trait BackendStorage: Sized {

    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self>;
    fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self>;
-    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self>;
    fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self>;

    fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self>;
@ -98,19 +89,6 @@ pub trait BackendStorage: Sized {
    ) -> Result<Self>;

    fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()>;
-
-    #[allow(clippy::too_many_arguments)]
-    // Similar to cudaMemcpy2D, though values are in elements and not in bytes.
-    fn copy2d(
-        &self,
-        _: &mut Self,
-        _d1: usize,
-        _d2: usize,
-        _src_stride1: usize,
-        _dst_stride1: usize,
-        _src_offset: usize,
-        _dst_offset: usize,
-    ) -> Result<()>;
 }

 pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
@ -127,19 +105,9 @@ pub trait BackendDevice: Sized + std::fmt::Debug + Clone {

    fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;

-    /// # Safety
-    /// This function is unsafe as it doesn't initialize the underlying data store.
-    /// The caller should ensure that the data is properly initialized as early as possible
-    /// after this call.
-    unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;
-
    fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage>;

-    fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self::Storage>;
-
    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;

    fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;
-
-    fn set_seed(&self, _: u64) -> Result<()>;
 }
--- a/candle-core/src/backprop.rs
+++ b/candle-core/src/backprop.rs
@ -1,4 +1,3 @@
-/// Methods for backpropagation of gradients.
 use crate::op::{BinaryOp, Op, ReduceOp, UnaryOp};
 use crate::{Error, Result, Tensor, TensorId};
 use std::collections::HashMap;
@ -16,17 +15,6 @@ fn broadcast_back(arg: &Tensor, node: &Tensor, reduced_dims: &[usize]) -> Result
    }
 }

-thread_local! {
-    static CANDLE_GRAD_DO_NOT_DETACH: bool = {
-        match std::env::var("CANDLE_GRAD_DO_NOT_DETACH") {
-            Ok(s) => {
-                !s.is_empty() && s != "0"
-            },
-            Err(_) => false,
-        }
-    }
-}
-
 impl Tensor {
    /// Return all the nodes that lead to this value in a topologically sorted vec, the first
    /// elements having dependencies on the latter ones, e.g. the first element if any is the
@ -48,8 +36,6 @@ impl Tensor {
                // Do not call recursively on the "leaf" nodes.
                track_grad = true;
                nodes
-            } else if node.dtype().is_int() {
-                nodes
            } else if let Some(op) = node.op() {
                match op {
                    Op::IndexAdd(t1, t2, t3, _)
@ -69,11 +55,6 @@ impl Tensor {
                        kernel: rhs,
                        ..
                    }
-                    | Op::ConvTranspose1D {
-                        arg: lhs,
-                        kernel: rhs,
-                        ..
-                    }
                    | Op::Conv2D {
                        arg: lhs,
                        kernel: rhs,
@ -88,8 +69,7 @@ impl Tensor {
                    | Op::Binary(lhs, rhs, _)
                    | Op::Gather(lhs, rhs, _)
                    | Op::IndexSelect(lhs, rhs, _)
-                    | Op::Matmul(lhs, rhs)
-                    | Op::SliceScatter0(lhs, rhs, _) => {
+                    | Op::Matmul(lhs, rhs) => {
                        let (tg, nodes) = walk(lhs, nodes, already_seen);
                        track_grad |= tg;
                        let (tg, nodes) = walk(rhs, nodes, already_seen);
@ -110,19 +90,15 @@ impl Tensor {
                            nodes
                        }
                    }
-                    Op::Unary(_node, UnaryOp::Ceil)
-                    | Op::Unary(_node, UnaryOp::Floor)
-                    | Op::Unary(_node, UnaryOp::Round)
-                    | Op::Unary(_node, UnaryOp::Sign) => nodes,
                    Op::Reshape(node)
-                    | Op::UpsampleNearest1D { arg: node, .. }
-                    | Op::UpsampleNearest2D { arg: node, .. }
+                    | Op::UpsampleNearest2D(node)
                    | Op::AvgPool2D { arg: node, .. }
                    | Op::MaxPool2D { arg: node, .. }
                    | Op::Copy(node)
                    | Op::Broadcast(node)
                    | Op::Cmp(node, _)
-                    | Op::Reduce(node, ReduceOp::Min | ReduceOp::Sum | ReduceOp::Max, _)
+                    | Op::Reduce(node, _, _)
+                    | Op::ToDType(node)
                    | Op::ToDevice(node)
                    | Op::Transpose(node, _, _)
                    | Op::Permute(node, _)
@ -135,16 +111,6 @@ impl Tensor {
                        track_grad |= tg;
                        nodes
                    }
-                    Op::ToDType(node) => {
-                        if node.dtype().is_float() {
-                            let (tg, nodes) = walk(node, nodes, already_seen);
-                            track_grad |= tg;
-                            nodes
-                        } else {
-                            nodes
-                        }
-                    }
-                    Op::Reduce(_, ReduceOp::ArgMin | ReduceOp::ArgMax, _) => nodes,
                }
            } else {
                nodes
@ -168,16 +134,10 @@ impl Tensor {
            if node.is_variable() {
                continue;
            }
-            let grad = grads
-                .remove(node)
-                .expect("candle internal error - grad not populated");
-            // https://github.com/huggingface/candle/issues/1241
-            // Ideally, we would make these operations in place where possible to ensure that we
-            // do not have to allocate too often. Here we just call `.detach` to avoid computing
-            // the backprop graph of the backprop itself. This would be an issue for second order
-            // derivatives but these are out of scope at the moment.
-            let do_not_detach = CANDLE_GRAD_DO_NOT_DETACH.with(|b| *b);
-            let grad = if do_not_detach { grad } else { grad.detach() };
+            let grad = grads.remove(node).unwrap();
+            // TODO: We should perform all these operations in place (or at least not track the
+            // whole graph). The only drawback would be if we wanted to support grad of grad but
+            // this is out of scope.
            if let Some(op) = node.op() {
                match op {
                    Op::Binary(lhs, rhs, BinaryOp::Add) => {
@ -232,45 +192,7 @@ impl Tensor {
                        let f_grad = pred.where_cond(&zeros, &grad)?;
                        *f_sum_grad = f_sum_grad.add(&f_grad)?;
                    }
-                    Op::Conv1D {
-                        arg,
-                        kernel,
-                        padding,
-                        stride,
-                        dilation,
-                    } => {
-                        // The output height for conv_transpose1d is:
-                        // (l_in - 1) * stride - 2 * padding + dilation * (k_size - 1) + out_padding + 1
-                        let grad_l_in = grad.dim(2)?;
-                        let k_size = kernel.dim(2)?;
-                        let out_size =
-                            (grad_l_in - 1) * stride + dilation * (k_size - 1) + 1 - 2 * padding;
-                        let out_padding = arg.dim(2)? - out_size;
-                        let grad_arg = grad.conv_transpose1d(
-                            kernel,
-                            *padding,
-                            out_padding,
-                            *stride,
-                            *dilation,
-                            /* groups */ 1,
-                        )?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad_arg)?;
-
-                        let grad_kernel = arg
-                            .transpose(0, 1)?
-                            .conv1d(&grad.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
-                            .transpose(0, 1)?;
-                        let sum_grad = grads.or_insert(kernel)?;
-                        let (_, _, k0) = kernel.dims3()?;
-                        let (_, _, g_k0) = grad_kernel.dims3()?;
-                        let grad_kernel = if g_k0 != k0 {
-                            grad_kernel.narrow(2, 0, k0)?
-                        } else {
-                            grad_kernel
-                        };
-                        *sum_grad = sum_grad.add(&grad_kernel)?;
-                    }
+                    Op::Conv1D { .. } => Err(Error::BackwardNotSupported { op: "conv1d" })?,
                    Op::Conv2D {
                        arg,
                        kernel,
@ -300,44 +222,11 @@ impl Tensor {
                            .conv2d(&grad.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
                            .transpose(0, 1)?;
                        let sum_grad = grads.or_insert(kernel)?;
-                        let (_, _, k0, k1) = kernel.dims4()?;
-                        let (_, _, g_k0, g_k1) = grad_kernel.dims4()?;
-                        let grad_kernel = if g_k0 != k0 || g_k1 != k1 {
-                            grad_kernel.narrow(2, 0, k0)?.narrow(3, 0, k1)?
-                        } else {
-                            grad_kernel
-                        };
                        *sum_grad = sum_grad.add(&grad_kernel)?;
                    }
-                    Op::ConvTranspose1D { .. } => Err(Error::BackwardNotSupported {
-                        op: "conv-transpose1d",
+                    Op::ConvTranspose2D { .. } => Err(Error::BackwardNotSupported {
+                        op: "conv-transpose2d",
                    })?,
-                    Op::ConvTranspose2D {
-                        arg,
-                        kernel,
-                        padding,
-                        stride,
-                        dilation,
-                        output_padding: _output_padding,
-                    } => {
-                        let grad_arg = grad.conv2d(kernel, *padding, *dilation, *stride, 1)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad_arg)?;
-
-                        let grad_kernel = grad
-                            .transpose(0, 1)?
-                            .conv2d(&arg.transpose(0, 1)?, *padding, *stride, *dilation, 1)?
-                            .transpose(0, 1)?;
-                        let sum_grad = grads.or_insert(kernel)?;
-                        let (_, _, k0, k1) = kernel.dims4()?;
-                        let (_, _, g_k0, g_k1) = grad_kernel.dims4()?;
-                        let grad_kernel = if g_k0 != k0 || g_k1 != k1 {
-                            grad_kernel.narrow(2, 0, k0)?.narrow(3, 0, k1)?
-                        } else {
-                            grad_kernel
-                        };
-                        *sum_grad = sum_grad.add(&grad_kernel)?;
-                    }
                    Op::AvgPool2D {
                        arg,
                        kernel_size,
@ -373,48 +262,9 @@ impl Tensor {
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&grad_arg)?;
                    }
-                    Op::UpsampleNearest1D { arg, target_size } => {
-                        let (_n, c, size) = arg.dims3()?;
-                        if target_size % size != 0 {
-                            crate::bail!("backward not supported for non integer upscaling factors")
-                        }
-                        let scale = target_size / size;
-
-                        let kernel = Tensor::ones((c, 1, scale), arg.dtype(), arg.device())?;
-                        let conv_sum = grad.conv1d(&kernel, 0, scale, 1, c)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = conv_sum;
-                    }
-                    Op::UpsampleNearest2D {
-                        arg,
-                        target_h,
-                        target_w,
-                    } => {
-                        let (_n, c, h, w) = arg.dims4()?;
-                        if target_h % h != 0 || target_w % w != 0 {
-                            crate::bail!("backward not supported for non integer upscaling factors")
-                        }
-                        let scale_h = target_h / h;
-                        let scale_w = target_w / w;
-
-                        if scale_h != scale_w {
-                            crate::bail!("backward not supported for non uniform upscaling factors")
-                        };
-                        let kernel =
-                            Tensor::ones((c, 1, scale_h, scale_w), arg.dtype(), arg.device())?;
-                        let conv_sum = grad.conv2d(&kernel, 0, scale_h, 1, c)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = conv_sum;
-                    }
-                    Op::SliceScatter0(lhs, rhs, start_rhs) => {
-                        let rhs_sum_grad = grads.or_insert(rhs)?;
-                        let rhs_grad = grad.narrow(0, *start_rhs, rhs.dim(0)?)?;
-                        *rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
-
-                        let lhs_sum_grad = grads.or_insert(lhs)?;
-                        let lhs_grad = grad.slice_scatter0(&rhs.zeros_like()?, *start_rhs)?;
-                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?
-                    }
+                    Op::UpsampleNearest2D { .. } => Err(Error::BackwardNotSupported {
+                        op: "upsample-nearest2d",
+                    })?,
                    Op::Gather(arg, indexes, dim) => {
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.scatter_add(indexes, &grad, *dim)?;
@ -489,6 +339,7 @@ impl Tensor {
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&grad)?;
                    }
+                    Op::Cmp(_args, _) => {}
                    Op::Reduce(arg, ReduceOp::Max, reduced_dims) => {
                        let node = broadcast_back(arg, node, reduced_dims)?;
                        let grad = broadcast_back(arg, &grad, reduced_dims)?;
@ -505,7 +356,7 @@ impl Tensor {
                    }
                    Op::ToDType(arg) => {
                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad.to_dtype(arg.dtype())?)?
+                        *sum_grad = sum_grad.add(&grad.to_dtype(node.dtype())?)?
                    }
                    Op::Copy(arg) => {
                        let sum_grad = grads.or_insert(arg)?;
@ -578,66 +429,20 @@ impl Tensor {
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&arg_grad)?
                    }
-                    Op::Unary(_, UnaryOp::Floor)
-                    | Op::Unary(_, UnaryOp::Round)
-                    | Op::Reduce(_, ReduceOp::ArgMin, _)
-                    | Op::Reduce(_, ReduceOp::ArgMax, _)
-                    | Op::Unary(_, UnaryOp::Sign)
-                    | Op::Cmp(_, _) => {}
+                    Op::Reduce(_, ReduceOp::ArgMin, _) => {}
+                    Op::Reduce(_, ReduceOp::ArgMax, _) => {}
                    Op::Reshape(arg) => {
                        let arg_grad = grad.reshape(arg.dims())?;
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&arg_grad)?
                    }
-                    Op::Unary(_, UnaryOp::Ceil) => Err(Error::BackwardNotSupported { op: "ceil" })?,
-                    Op::Unary(arg, UnaryOp::Gelu) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        let cube = arg.powf(3.)?;
-                        let tanh = (0.0356774 * &cube + (0.797885 * arg)?)?.tanh()?;
-                        let gelu_grad = (((0.5 * &tanh)?
-                            + (0.0535161 * cube + (0.398942 * arg)?)? * (1. - tanh.powf(2.)?))?
-                            + 0.5)?;
-                        *sum_grad = sum_grad.add(&(&grad * gelu_grad)?)?
-                    }
-                    Op::Unary(arg, UnaryOp::Erf) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        // d/dx erf(x) = 2/sqrt(pi) * e^(-x^2)
-                        let erf_grad =
-                            (2. / std::f64::consts::PI.sqrt()) * (arg.sqr()?.neg()?).exp()?;
-                        *sum_grad = sum_grad.add(&(&grad * erf_grad)?)?
-                    }
-                    Op::Unary(arg, UnaryOp::GeluErf) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        // d/dx gelu_erf(x) = 0.5 + 0.398942 e^(-x^2/2) x + 0.5 erf(x/sqrt(2))
-                        let neg_half_square = (arg.sqr()?.neg()? / 2.)?;
-                        let scaled_exp_arg = (0.398942 * neg_half_square.exp()? * arg)?;
-                        let arg_scaled_sqrt = (arg / 2f64.sqrt())?;
-                        let erf_scaled_sqrt = (0.5 * arg_scaled_sqrt.erf()?)?;
-                        let gelu_erf_grad = (0.5 + scaled_exp_arg + erf_scaled_sqrt)?;
-                        *sum_grad = sum_grad.add(&(&grad * gelu_erf_grad)?)?;
-                    }
+                    Op::Unary(_, UnaryOp::Gelu) => Err(Error::BackwardNotSupported { op: "gelu" })?,
                    Op::Unary(arg, UnaryOp::Relu) => {
                        let sum_grad = grads.or_insert(arg)?;
                        let relu_grad = arg.ge(&arg.zeros_like()?)?.to_dtype(arg.dtype())?;
                        *sum_grad = sum_grad.add(&(&grad * relu_grad)?)?
                    }
-                    Op::Unary(arg, UnaryOp::Silu) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        // d/dx silu = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
-                        let sigmoid_arg = (*node / arg)?;
-                        let silu_grad = (&sigmoid_arg * (1. + (arg * (1. - &sigmoid_arg)?)?)?)?;
-                        *sum_grad = sum_grad.add(&(&grad * silu_grad)?)?
-                    }
-                    Op::Elu(arg, alpha) => {
-                        // d/dx elu(x) = 1 for x > 0, alpha * e^x for x <= 0
-                        let sum_grad = grads.or_insert(arg)?;
-                        let zeros = arg.zeros_like()?;
-                        let positive_mask = arg.gt(&zeros)?.to_dtype(arg.dtype())?;
-                        let negative_mask = arg.le(&zeros)?.to_dtype(arg.dtype())?;
-                        let negative_exp_mask = ((negative_mask * arg.exp())? * *alpha)?;
-                        let combined_mask = (positive_mask + negative_exp_mask)?;
-                        *sum_grad = sum_grad.add(&(grad * combined_mask)?)?
-                    }
+                    Op::Elu(..) => Err(Error::BackwardNotSupported { op: "elu" })?,
                    Op::Powf(arg, e) => {
                        let arg_grad = (&(grad * arg.powf(e - 1.)?)? * *e)?;
                        let sum_grad = grads.or_insert(arg)?;
@ -712,38 +517,29 @@ impl Tensor {
    }
 }

-/// A store for gradients, associating a tensor id to the corresponding gradient tensor, used for back propagation.
-#[derive(Debug)]
 pub struct GradStore(HashMap<TensorId, Tensor>);

 impl GradStore {
-    /// Create a new gradient store
    fn new() -> Self {
        GradStore(HashMap::new())
    }

-    /// Get the gradient tensor corresponding to the given tensor id
    pub fn get_id(&self, id: TensorId) -> Option<&Tensor> {
        self.0.get(&id)
    }

-    /// Get the gradient tensor associated with the given tensor
    pub fn get(&self, tensor: &Tensor) -> Option<&Tensor> {
        self.0.get(&tensor.id())
    }

-    /// Remove the gradient tensor associated with the given tensor, returning it if it exists
    pub fn remove(&mut self, tensor: &Tensor) -> Option<Tensor> {
        self.0.remove(&tensor.id())
    }

-    /// Insert a gradient tensor associated with the given tensor, returning the previous gradient tensor if it existed
    pub fn insert(&mut self, tensor: &Tensor, grad: Tensor) -> Option<Tensor> {
        self.0.insert(tensor.id(), grad)
    }

-    /// Get the gradient tensor associated with the given tensor, or, if it does not exist,
-    /// insert a tensor of zeroes, with the same shape and type as the given tensors and return it
    fn or_insert(&mut self, tensor: &Tensor) -> Result<&mut Tensor> {
        use std::collections::hash_map::Entry;
        let grad = match self.0.entry(tensor.id()) {
--- a/candle-core/src/conv.rs
+++ b/candle-core/src/conv.rs
@ -25,46 +25,6 @@ impl ParamsConv1D {
    }
 }

-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct ParamsConvTranspose1D {
-    pub(crate) b_size: usize,
-    pub(crate) l_in: usize,
-    pub(crate) c_out: usize,
-    pub(crate) c_in: usize,
-    pub(crate) k_size: usize,
-    pub(crate) padding: usize,
-    pub(crate) output_padding: usize,
-    pub(crate) stride: usize,
-    pub(crate) dilation: usize,
-}
-
-impl ParamsConvTranspose1D {
-    pub(crate) fn l_out(&self) -> usize {
-        (self.l_in - 1) * self.stride - 2 * self.padding
-            + self.dilation * (self.k_size - 1)
-            + self.output_padding
-            + 1
-    }
-
-    pub(crate) fn out_dims(&self) -> Vec<usize> {
-        let l_out = self.l_out();
-        vec![self.b_size, self.c_out, l_out]
-    }
-}
-
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum CudnnFwdAlgo {
-    ImplicitGemm,
-    ImplicitPrecompGemm,
-    Gemm,
-    Direct,
-    Fft,
-    FftTiling,
-    Winograd,
-    WinogradNonFused,
-    Count,
-}
-
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ParamsConv2D {
    pub(crate) b_size: usize,
@ -77,7 +37,6 @@ pub struct ParamsConv2D {
    pub(crate) padding: usize,
    pub(crate) stride: usize,
    pub(crate) dilation: usize,
-    pub cudnn_fwd_algo: Option<CudnnFwdAlgo>,
 }

 impl ParamsConv2D {
@ -187,72 +146,6 @@ impl Tensor {
        }
    }

-    fn conv_transpose1d_single_group(
-        &self,
-        kernel: &Self,
-        params: &ParamsConvTranspose1D,
-    ) -> Result<Self> {
-        let storage = self.storage().conv_transpose1d(
-            self.layout(),
-            &kernel.storage(),
-            kernel.layout(),
-            params,
-        )?;
-        let op = BackpropOp::new2(self, kernel, |arg, kernel| Op::ConvTranspose1D {
-            arg,
-            kernel,
-            padding: params.padding,
-            output_padding: params.output_padding,
-            stride: params.stride,
-            dilation: params.dilation,
-        });
-        let out_dims = params.out_dims();
-        Ok(crate::tensor::from_storage(storage, out_dims, op, false))
-    }
-
-    /// Applies a 1D transposed convolution over the input tensor.
-    pub fn conv_transpose1d(
-        &self,
-        kernel: &Self,
-        padding: usize,
-        output_padding: usize,
-        stride: usize,
-        dilation: usize,
-        groups: usize,
-    ) -> Result<Self> {
-        let (c_in_k, c_out, k_size) = kernel.dims3()?;
-        let (b_size, c_in, l_in) = self.dims3()?;
-        if c_in != c_in_k {
-            crate::bail!("in_channel mismatch between input ({c_in}) and kernel ({c_in_k})")
-        }
-        if c_in % groups != 0 {
-            crate::bail!("in_channel {c_in} is not divisible by the number of groups")
-        }
-        let params = ParamsConvTranspose1D {
-            b_size,
-            l_in,
-            k_size,
-            c_out,
-            c_in: c_in / groups,
-            padding,
-            output_padding,
-            stride,
-            dilation,
-        };
-        if groups == 1 {
-            self.conv_transpose1d_single_group(kernel, &params)
-        } else {
-            let blocks = self.chunk(groups, 1)?;
-            let kernel = kernel.chunk(groups, 0)?;
-            let blocks = blocks
-                .iter()
-                .zip(&kernel)
-                .map(|(block, kernel)| block.conv_transpose1d_single_group(kernel, &params))
-                .collect::<Result<Vec<_>>>()?;
-            Tensor::cat(&blocks, 1)
-        }
-    }
-
    fn conv2d_single_group(&self, kernel: &Self, params: &ParamsConv2D) -> Result<Self> {
        let storage =
            self.storage()
@ -295,7 +188,6 @@ impl Tensor {
            padding,
            stride,
            dilation,
-            cudnn_fwd_algo: None,
        };
        if groups == 1 {
            self.conv2d_single_group(kernel, &params)
--- a/candle-core/src/cpu/erf.rs
+++ b/candle-core/src/cpu/erf.rs
@ -1,763 +0,0 @@
-#![allow(clippy::excessive_precision)]
-// Code taken from https://github.com/statrs-dev/statrs
-//! Provides the [error](https://en.wikipedia.org/wiki/Error_function) and
-//! related functions
-
-mod evaluate {
-    //! Provides functions that don't have a numerical solution and must
-    //! be solved computationally (e.g. evaluation of a polynomial)
-
-    /// evaluates a polynomial at `z` where `coeff` are the coeffecients
-    /// to a polynomial of order `k` where `k` is the length of `coeff` and the
-    /// coeffecient
-    /// to the `k`th power is the `k`th element in coeff. E.g. [3,-1,2] equates to
-    /// `2z^2 - z + 3`
-    ///
-    /// # Remarks
-    ///
-    /// Returns 0 for a 0 length coefficient slice
-    pub fn polynomial(z: f64, coeff: &[f64]) -> f64 {
-        let n = coeff.len();
-        if n == 0 {
-            return 0.0;
-        }
-
-        let mut sum = *coeff.last().unwrap();
-        for c in coeff[0..n - 1].iter().rev() {
-            sum = *c + z * sum;
-        }
-        sum
-    }
-}
-use std::f64;
-
-/// `erf` calculates the error function at `x`.
-pub fn erf(x: f64) -> f64 {
-    if x.is_nan() {
-        f64::NAN
-    } else if x >= 0.0 && x.is_infinite() {
-        1.0
-    } else if x <= 0.0 && x.is_infinite() {
-        -1.0
-    } else if x == 0. {
-        0.0
-    } else {
-        erf_impl(x, false)
-    }
-}
-
-/// `erf_inv` calculates the inverse error function
-/// at `x`.
-pub fn erf_inv(x: f64) -> f64 {
-    if x == 0.0 {
-        0.0
-    } else if x >= 1.0 {
-        f64::INFINITY
-    } else if x <= -1.0 {
-        f64::NEG_INFINITY
-    } else if x < 0.0 {
-        erf_inv_impl(-x, 1.0 + x, -1.0)
-    } else {
-        erf_inv_impl(x, 1.0 - x, 1.0)
-    }
-}
-
-/// `erfc` calculates the complementary error function
-/// at `x`.
-pub fn erfc(x: f64) -> f64 {
-    if x.is_nan() {
-        f64::NAN
-    } else if x == f64::INFINITY {
-        0.0
-    } else if x == f64::NEG_INFINITY {
-        2.0
-    } else {
-        erf_impl(x, true)
-    }
-}
-
-/// `erfc_inv` calculates the complementary inverse
-/// error function at `x`.
-pub fn erfc_inv(x: f64) -> f64 {
-    if x <= 0.0 {
-        f64::INFINITY
-    } else if x >= 2.0 {
-        f64::NEG_INFINITY
-    } else if x > 1.0 {
-        erf_inv_impl(-1.0 + x, 2.0 - x, -1.0)
-    } else {
-        erf_inv_impl(1.0 - x, x, 1.0)
-    }
-}
-
-// **********************************************************
-// ********** Coefficients for erf_impl polynomial **********
-// **********************************************************
-
-/// Polynomial coefficients for a numerator of `erf_impl`
-/// in the interval [1e-10, 0.5].
-const ERF_IMPL_AN: &[f64] = &[
-    0.00337916709551257388990745,
-    -0.00073695653048167948530905,
-    -0.374732337392919607868241,
-    0.0817442448733587196071743,
-    -0.0421089319936548595203468,
-    0.0070165709512095756344528,
-    -0.00495091255982435110337458,
-    0.000871646599037922480317225,
-];
-
-/// Polynomial coefficients for a denominator of `erf_impl`
-/// in the interval [1e-10, 0.5]
-const ERF_IMPL_AD: &[f64] = &[
-    1.0,
-    -0.218088218087924645390535,
-    0.412542972725442099083918,
-    -0.0841891147873106755410271,
-    0.0655338856400241519690695,
-    -0.0120019604454941768171266,
-    0.00408165558926174048329689,
-    -0.000615900721557769691924509,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [0.5, 0.75].
-const ERF_IMPL_BN: &[f64] = &[
-    -0.0361790390718262471360258,
-    0.292251883444882683221149,
-    0.281447041797604512774415,
-    0.125610208862766947294894,
-    0.0274135028268930549240776,
-    0.00250839672168065762786937,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [0.5, 0.75].
-const ERF_IMPL_BD: &[f64] = &[
-    1.0,
-    1.8545005897903486499845,
-    1.43575803037831418074962,
-    0.582827658753036572454135,
-    0.124810476932949746447682,
-    0.0113724176546353285778481,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [0.75, 1.25].
-const ERF_IMPL_CN: &[f64] = &[
-    -0.0397876892611136856954425,
-    0.153165212467878293257683,
-    0.191260295600936245503129,
-    0.10276327061989304213645,
-    0.029637090615738836726027,
-    0.0046093486780275489468812,
-    0.000307607820348680180548455,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [0.75, 1.25].
-const ERF_IMPL_CD: &[f64] = &[
-    1.0,
-    1.95520072987627704987886,
-    1.64762317199384860109595,
-    0.768238607022126250082483,
-    0.209793185936509782784315,
-    0.0319569316899913392596356,
-    0.00213363160895785378615014,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [1.25, 2.25].
-const ERF_IMPL_DN: &[f64] = &[
-    -0.0300838560557949717328341,
-    0.0538578829844454508530552,
-    0.0726211541651914182692959,
-    0.0367628469888049348429018,
-    0.00964629015572527529605267,
-    0.00133453480075291076745275,
-    0.778087599782504251917881e-4,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [1.25, 2.25].
-const ERF_IMPL_DD: &[f64] = &[
-    1.0,
-    1.75967098147167528287343,
-    1.32883571437961120556307,
-    0.552528596508757581287907,
-    0.133793056941332861912279,
-    0.0179509645176280768640766,
-    0.00104712440019937356634038,
-    -0.106640381820357337177643e-7,
-];
-
-///  Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [2.25, 3.5].
-const ERF_IMPL_EN: &[f64] = &[
-    -0.0117907570137227847827732,
-    0.014262132090538809896674,
-    0.0202234435902960820020765,
-    0.00930668299990432009042239,
-    0.00213357802422065994322516,
-    0.00025022987386460102395382,
-    0.120534912219588189822126e-4,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [2.25, 3.5].
-const ERF_IMPL_ED: &[f64] = &[
-    1.0,
-    1.50376225203620482047419,
-    0.965397786204462896346934,
-    0.339265230476796681555511,
-    0.0689740649541569716897427,
-    0.00771060262491768307365526,
-    0.000371421101531069302990367,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [3.5, 5.25].
-const ERF_IMPL_FN: &[f64] = &[
-    -0.00546954795538729307482955,
-    0.00404190278731707110245394,
-    0.0054963369553161170521356,
-    0.00212616472603945399437862,
-    0.000394984014495083900689956,
-    0.365565477064442377259271e-4,
-    0.135485897109932323253786e-5,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [3.5, 5.25].
-const ERF_IMPL_FD: &[f64] = &[
-    1.0,
-    1.21019697773630784832251,
-    0.620914668221143886601045,
-    0.173038430661142762569515,
-    0.0276550813773432047594539,
-    0.00240625974424309709745382,
-    0.891811817251336577241006e-4,
-    -0.465528836283382684461025e-11,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [5.25, 8].
-const ERF_IMPL_GN: &[f64] = &[
-    -0.00270722535905778347999196,
-    0.0013187563425029400461378,
-    0.00119925933261002333923989,
-    0.00027849619811344664248235,
-    0.267822988218331849989363e-4,
-    0.923043672315028197865066e-6,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [5.25, 8].
-const ERF_IMPL_GD: &[f64] = &[
-    1.0,
-    0.814632808543141591118279,
-    0.268901665856299542168425,
-    0.0449877216103041118694989,
-    0.00381759663320248459168994,
-    0.000131571897888596914350697,
-    0.404815359675764138445257e-11,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [8, 11.5].
-const ERF_IMPL_HN: &[f64] = &[
-    -0.00109946720691742196814323,
-    0.000406425442750422675169153,
-    0.000274499489416900707787024,
-    0.465293770646659383436343e-4,
-    0.320955425395767463401993e-5,
-    0.778286018145020892261936e-7,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [8, 11.5].
-const ERF_IMPL_HD: &[f64] = &[
-    1.0,
-    0.588173710611846046373373,
-    0.139363331289409746077541,
-    0.0166329340417083678763028,
-    0.00100023921310234908642639,
-    0.24254837521587225125068e-4,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [11.5, 17].
-const ERF_IMPL_IN: &[f64] = &[
-    -0.00056907993601094962855594,
-    0.000169498540373762264416984,
-    0.518472354581100890120501e-4,
-    0.382819312231928859704678e-5,
-    0.824989931281894431781794e-7,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [11.5, 17].
-const ERF_IMPL_ID: &[f64] = &[
-    1.0,
-    0.339637250051139347430323,
-    0.043472647870310663055044,
-    0.00248549335224637114641629,
-    0.535633305337152900549536e-4,
-    -0.117490944405459578783846e-12,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [17, 24].
-const ERF_IMPL_JN: &[f64] = &[
-    -0.000241313599483991337479091,
-    0.574224975202501512365975e-4,
-    0.115998962927383778460557e-4,
-    0.581762134402593739370875e-6,
-    0.853971555085673614607418e-8,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [17, 24].
-const ERF_IMPL_JD: &[f64] = &[
-    1.0,
-    0.233044138299687841018015,
-    0.0204186940546440312625597,
-    0.000797185647564398289151125,
-    0.117019281670172327758019e-4,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [24, 38].
-const ERF_IMPL_KN: &[f64] = &[
-    -0.000146674699277760365803642,
-    0.162666552112280519955647e-4,
-    0.269116248509165239294897e-5,
-    0.979584479468091935086972e-7,
-    0.101994647625723465722285e-8,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [24, 38].
-const ERF_IMPL_KD: &[f64] = &[
-    1.0,
-    0.165907812944847226546036,
-    0.0103361716191505884359634,
-    0.000286593026373868366935721,
-    0.298401570840900340874568e-5,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [38, 60].
-const ERF_IMPL_LN: &[f64] = &[
-    -0.583905797629771786720406e-4,
-    0.412510325105496173512992e-5,
-    0.431790922420250949096906e-6,
-    0.993365155590013193345569e-8,
-    0.653480510020104699270084e-10,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [38, 60].
-const ERF_IMPL_LD: &[f64] = &[
-    1.0,
-    0.105077086072039915406159,
-    0.00414278428675475620830226,
-    0.726338754644523769144108e-4,
-    0.477818471047398785369849e-6,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [60, 85].
-const ERF_IMPL_MN: &[f64] = &[
-    -0.196457797609229579459841e-4,
-    0.157243887666800692441195e-5,
-    0.543902511192700878690335e-7,
-    0.317472492369117710852685e-9,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [60, 85].
-const ERF_IMPL_MD: &[f64] = &[
-    1.0,
-    0.052803989240957632204885,
-    0.000926876069151753290378112,
-    0.541011723226630257077328e-5,
-    0.535093845803642394908747e-15,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [85, 110].
-const ERF_IMPL_NN: &[f64] = &[
-    -0.789224703978722689089794e-5,
-    0.622088451660986955124162e-6,
-    0.145728445676882396797184e-7,
-    0.603715505542715364529243e-10,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [85, 110].
-const ERF_IMPL_ND: &[f64] = &[
-    1.0,
-    0.0375328846356293715248719,
-    0.000467919535974625308126054,
-    0.193847039275845656900547e-5,
-];
-
-// **********************************************************
-// ********** Coefficients for erf_inv_impl polynomial ******
-// **********************************************************
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0, 0.5].
-const ERF_INV_IMPL_AN: &[f64] = &[
-    -0.000508781949658280665617,
-    -0.00836874819741736770379,
-    0.0334806625409744615033,
-    -0.0126926147662974029034,
-    -0.0365637971411762664006,
-    0.0219878681111168899165,
-    0.00822687874676915743155,
-    -0.00538772965071242932965,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0, 0.5].
-const ERF_INV_IMPL_AD: &[f64] = &[
-    1.0,
-    -0.970005043303290640362,
-    -1.56574558234175846809,
-    1.56221558398423026363,
-    0.662328840472002992063,
-    -0.71228902341542847553,
-    -0.0527396382340099713954,
-    0.0795283687341571680018,
-    -0.00233393759374190016776,
-    0.000886216390456424707504,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.5, 0.75].
-const ERF_INV_IMPL_BN: &[f64] = &[
-    -0.202433508355938759655,
-    0.105264680699391713268,
-    8.37050328343119927838,
-    17.6447298408374015486,
-    -18.8510648058714251895,
-    -44.6382324441786960818,
-    17.445385985570866523,
-    21.1294655448340526258,
-    -3.67192254707729348546,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.5, 0.75].
-const ERF_INV_IMPL_BD: &[f64] = &[
-    1.0,
-    6.24264124854247537712,
-    3.9713437953343869095,
-    -28.6608180499800029974,
-    -20.1432634680485188801,
-    48.5609213108739935468,
-    10.8268667355460159008,
-    -22.6436933413139721736,
-    1.72114765761200282724,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x less than 3.
-const ERF_INV_IMPL_CN: &[f64] = &[
-    -0.131102781679951906451,
-    -0.163794047193317060787,
-    0.117030156341995252019,
-    0.387079738972604337464,
-    0.337785538912035898924,
-    0.142869534408157156766,
-    0.0290157910005329060432,
-    0.00214558995388805277169,
-    -0.679465575181126350155e-6,
-    0.285225331782217055858e-7,
-    -0.681149956853776992068e-9,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x less than 3.
-const ERF_INV_IMPL_CD: &[f64] = &[
-    1.0,
-    3.46625407242567245975,
-    5.38168345707006855425,
-    4.77846592945843778382,
-    2.59301921623620271374,
-    0.848854343457902036425,
-    0.152264338295331783612,
-    0.01105924229346489121,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 3 and 6.
-const ERF_INV_IMPL_DN: &[f64] = &[
-    -0.0350353787183177984712,
-    -0.00222426529213447927281,
-    0.0185573306514231072324,
-    0.00950804701325919603619,
-    0.00187123492819559223345,
-    0.000157544617424960554631,
-    0.460469890584317994083e-5,
-    -0.230404776911882601748e-9,
-    0.266339227425782031962e-11,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 3 and 6.
-const ERF_INV_IMPL_DD: &[f64] = &[
-    1.0,
-    1.3653349817554063097,
-    0.762059164553623404043,
-    0.220091105764131249824,
-    0.0341589143670947727934,
-    0.00263861676657015992959,
-    0.764675292302794483503e-4,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 6 and 18.
-const ERF_INV_IMPL_EN: &[f64] = &[
-    -0.0167431005076633737133,
-    -0.00112951438745580278863,
-    0.00105628862152492910091,
-    0.000209386317487588078668,
-    0.149624783758342370182e-4,
-    0.449696789927706453732e-6,
-    0.462596163522878599135e-8,
-    -0.281128735628831791805e-13,
-    0.99055709973310326855e-16,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 6 and 18.
-const ERF_INV_IMPL_ED: &[f64] = &[
-    1.0,
-    0.591429344886417493481,
-    0.138151865749083321638,
-    0.0160746087093676504695,
-    0.000964011807005165528527,
-    0.275335474764726041141e-4,
-    0.282243172016108031869e-6,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 18 and 44.
-const ERF_INV_IMPL_FN: &[f64] = &[
-    -0.0024978212791898131227,
-    -0.779190719229053954292e-5,
-    0.254723037413027451751e-4,
-    0.162397777342510920873e-5,
-    0.396341011304801168516e-7,
-    0.411632831190944208473e-9,
-    0.145596286718675035587e-11,
-    -0.116765012397184275695e-17,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 18 and 44.
-const ERF_INV_IMPL_FD: &[f64] = &[
-    1.0,
-    0.207123112214422517181,
-    0.0169410838120975906478,
-    0.000690538265622684595676,
-    0.145007359818232637924e-4,
-    0.144437756628144157666e-6,
-    0.509761276599778486139e-9,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x greater than 44.
-const ERF_INV_IMPL_GN: &[f64] = &[
-    -0.000539042911019078575891,
-    -0.28398759004727721098e-6,
-    0.899465114892291446442e-6,
-    0.229345859265920864296e-7,
-    0.225561444863500149219e-9,
-    0.947846627503022684216e-12,
-    0.135880130108924861008e-14,
-    -0.348890393399948882918e-21,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x greater than 44.
-const ERF_INV_IMPL_GD: &[f64] = &[
-    1.0,
-    0.0845746234001899436914,
-    0.00282092984726264681981,
-    0.468292921940894236786e-4,
-    0.399968812193862100054e-6,
-    0.161809290887904476097e-8,
-    0.231558608310259605225e-11,
-];
-
-/// `erf_impl` computes the error function at `z`.
-/// If `inv` is true, `1 - erf` is calculated as opposed to `erf`
-fn erf_impl(z: f64, inv: bool) -> f64 {
-    if z < 0.0 {
-        if !inv {
-            return -erf_impl(-z, false);
-        }
-        if z < -0.5 {
-            return 2.0 - erf_impl(-z, true);
-        }
-        return 1.0 + erf_impl(-z, false);
-    }
-
-    let result = if z < 0.5 {
-        if z < 1e-10 {
-            z * 1.125 + z * 0.003379167095512573896158903121545171688
-        } else {
-            z * 1.125
-                + z * evaluate::polynomial(z, ERF_IMPL_AN) / evaluate::polynomial(z, ERF_IMPL_AD)
-        }
-    } else if z < 110.0 {
-        let (r, b) = if z < 0.75 {
-            (
-                evaluate::polynomial(z - 0.5, ERF_IMPL_BN)
-                    / evaluate::polynomial(z - 0.5, ERF_IMPL_BD),
-                0.3440242112,
-            )
-        } else if z < 1.25 {
-            (
-                evaluate::polynomial(z - 0.75, ERF_IMPL_CN)
-                    / evaluate::polynomial(z - 0.75, ERF_IMPL_CD),
-                0.419990927,
-            )
-        } else if z < 2.25 {
-            (
-                evaluate::polynomial(z - 1.25, ERF_IMPL_DN)
-                    / evaluate::polynomial(z - 1.25, ERF_IMPL_DD),
-                0.4898625016,
-            )
-        } else if z < 3.5 {
-            (
-                evaluate::polynomial(z - 2.25, ERF_IMPL_EN)
-                    / evaluate::polynomial(z - 2.25, ERF_IMPL_ED),
-                0.5317370892,
-            )
-        } else if z < 5.25 {
-            (
-                evaluate::polynomial(z - 3.5, ERF_IMPL_FN)
-                    / evaluate::polynomial(z - 3.5, ERF_IMPL_FD),
-                0.5489973426,
-            )
-        } else if z < 8.0 {
-            (
-                evaluate::polynomial(z - 5.25, ERF_IMPL_GN)
-                    / evaluate::polynomial(z - 5.25, ERF_IMPL_GD),
-                0.5571740866,
-            )
-        } else if z < 11.5 {
-            (
-                evaluate::polynomial(z - 8.0, ERF_IMPL_HN)
-                    / evaluate::polynomial(z - 8.0, ERF_IMPL_HD),
-                0.5609807968,
-            )
-        } else if z < 17.0 {
-            (
-                evaluate::polynomial(z - 11.5, ERF_IMPL_IN)
-                    / evaluate::polynomial(z - 11.5, ERF_IMPL_ID),
-                0.5626493692,
-            )
-        } else if z < 24.0 {
-            (
-                evaluate::polynomial(z - 17.0, ERF_IMPL_JN)
-                    / evaluate::polynomial(z - 17.0, ERF_IMPL_JD),
-                0.5634598136,
-            )
-        } else if z < 38.0 {
-            (
-                evaluate::polynomial(z - 24.0, ERF_IMPL_KN)
-                    / evaluate::polynomial(z - 24.0, ERF_IMPL_KD),
-                0.5638477802,
-            )
-        } else if z < 60.0 {
-            (
-                evaluate::polynomial(z - 38.0, ERF_IMPL_LN)
-                    / evaluate::polynomial(z - 38.0, ERF_IMPL_LD),
-                0.5640528202,
-            )
-        } else if z < 85.0 {
-            (
-                evaluate::polynomial(z - 60.0, ERF_IMPL_MN)
-                    / evaluate::polynomial(z - 60.0, ERF_IMPL_MD),
-                0.5641309023,
-            )
-        } else {
-            (
-                evaluate::polynomial(z - 85.0, ERF_IMPL_NN)
-                    / evaluate::polynomial(z - 85.0, ERF_IMPL_ND),
-                0.5641584396,
-            )
-        };
-        let g = (-z * z).exp() / z;
-        g * b + g * r
-    } else {
-        0.0
-    };
-
-    if inv && z >= 0.5 {
-        result
-    } else if z >= 0.5 || inv {
-        1.0 - result
-    } else {
-        result
-    }
-}
-
-// `erf_inv_impl` computes the inverse error function where
-// `p`,`q`, and `s` are the first, second, and third intermediate
-// parameters respectively
-fn erf_inv_impl(p: f64, q: f64, s: f64) -> f64 {
-    let result = if p <= 0.5 {
-        let y = 0.0891314744949340820313;
-        let g = p * (p + 10.0);
-        let r = evaluate::polynomial(p, ERF_INV_IMPL_AN) / evaluate::polynomial(p, ERF_INV_IMPL_AD);
-        g * y + g * r
-    } else if q >= 0.25 {
-        let y = 2.249481201171875;
-        let g = (-2.0 * q.ln()).sqrt();
-        let xs = q - 0.25;
-        let r =
-            evaluate::polynomial(xs, ERF_INV_IMPL_BN) / evaluate::polynomial(xs, ERF_INV_IMPL_BD);
-        g / (y + r)
-    } else {
-        let x = (-q.ln()).sqrt();
-        if x < 3.0 {
-            let y = 0.807220458984375;
-            let xs = x - 1.125;
-            let r = evaluate::polynomial(xs, ERF_INV_IMPL_CN)
-                / evaluate::polynomial(xs, ERF_INV_IMPL_CD);
-            y * x + r * x
-        } else if x < 6.0 {
-            let y = 0.93995571136474609375;
-            let xs = x - 3.0;
-            let r = evaluate::polynomial(xs, ERF_INV_IMPL_DN)
-                / evaluate::polynomial(xs, ERF_INV_IMPL_DD);
-            y * x + r * x
-        } else if x < 18.0 {
-            let y = 0.98362827301025390625;
-            let xs = x - 6.0;
-            let r = evaluate::polynomial(xs, ERF_INV_IMPL_EN)
-                / evaluate::polynomial(xs, ERF_INV_IMPL_ED);
-            y * x + r * x
-        } else if x < 44.0 {
-            let y = 0.99714565277099609375;
-            let xs = x - 18.0;
-            let r = evaluate::polynomial(xs, ERF_INV_IMPL_FN)
-                / evaluate::polynomial(xs, ERF_INV_IMPL_FD);
-            y * x + r * x
-        } else {
-            let y = 0.99941349029541015625;
-            let xs = x - 44.0;
-            let r = evaluate::polynomial(xs, ERF_INV_IMPL_GN)
-                / evaluate::polynomial(xs, ERF_INV_IMPL_GD);
-            y * x + r * x
-        }
-    };
-    s * result
-}
--- a/candle-core/src/cpu/mod.rs
+++ b/candle-core/src/cpu/mod.rs
@ -1,4 +1,3 @@
-pub mod erf;
 pub mod kernels;

 trait Cpu<const ARR: usize> {
--- a/candle-core/src/cpu_backend/mod.rs
+++ b/candle-core/src/cpu_backend/mod.rs
--- a/candle-core/src/cpu_backend/utils.rs
+++ b/candle-core/src/cpu_backend/utils.rs
@ -1,350 +0,0 @@
-/// Helper functions to write CPU kernels.
-use crate::backend::BackendStorage;
-use crate::{Error, Layout, Result, WithDType};
-
-type C = super::CpuStorage;
-pub trait Map1 {
-    fn f<T: WithDType>(&self, vs: &[T], layout: &Layout) -> Result<Vec<T>>;
-
-    fn map(&self, vs: &C, layout: &Layout) -> Result<C> {
-        match vs {
-            C::U8(vs) => Ok(C::U8(self.f(vs, layout)?)),
-            C::U32(vs) => Ok(C::U32(self.f(vs, layout)?)),
-            C::I64(vs) => Ok(C::I64(self.f(vs, layout)?)),
-            C::BF16(vs) => Ok(C::BF16(self.f(vs, layout)?)),
-            C::F16(vs) => Ok(C::F16(self.f(vs, layout)?)),
-            C::F32(vs) => Ok(C::F32(self.f(vs, layout)?)),
-            C::F64(vs) => Ok(C::F64(self.f(vs, layout)?)),
-        }
-    }
-}
-
-pub trait Map1Any {
-    fn f<T: WithDType, W: Fn(Vec<T>) -> C>(&self, vs: &[T], layout: &Layout, wrap: W) -> Result<C>;
-
-    fn map(&self, vs: &C, layout: &Layout) -> Result<C> {
-        match vs {
-            C::U8(vs) => Ok(self.f(vs, layout, C::U8)?),
-            C::U32(vs) => Ok(self.f(vs, layout, C::U32)?),
-            C::I64(vs) => Ok(self.f(vs, layout, C::I64)?),
-            C::BF16(vs) => Ok(self.f(vs, layout, C::BF16)?),
-            C::F16(vs) => Ok(self.f(vs, layout, C::F16)?),
-            C::F32(vs) => Ok(self.f(vs, layout, C::F32)?),
-            C::F64(vs) => Ok(self.f(vs, layout, C::F64)?),
-        }
-    }
-}
-
-pub trait Map2 {
-    const OP: &'static str;
-    fn f<T: WithDType>(&self, v1: &[T], l1: &Layout, v2: &[T], l2: &Layout) -> Result<Vec<T>>;
-
-    fn map(&self, v1: &C, l1: &Layout, v2: &C, l2: &Layout) -> Result<C> {
-        match (v1, v2) {
-            (C::U8(v1), C::U8(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
-            (C::U32(v1), C::U32(v2)) => Ok(C::U32(self.f(v1, l1, v2, l2)?)),
-            (C::I64(v1), C::I64(v2)) => Ok(C::I64(self.f(v1, l1, v2, l2)?)),
-            (C::BF16(v1), C::BF16(v2)) => Ok(C::BF16(self.f(v1, l1, v2, l2)?)),
-            (C::F16(v1), C::F16(v2)) => Ok(C::F16(self.f(v1, l1, v2, l2)?)),
-            (C::F32(v1), C::F32(v2)) => Ok(C::F32(self.f(v1, l1, v2, l2)?)),
-            (C::F64(v1), C::F64(v2)) => Ok(C::F64(self.f(v1, l1, v2, l2)?)),
-            _ => Err(Error::DTypeMismatchBinaryOp {
-                lhs: v1.dtype(),
-                rhs: v2.dtype(),
-                op: Self::OP,
-            }
-            .bt()),
-        }
-    }
-}
-
-pub trait Map2U8 {
-    const OP: &'static str;
-    fn f<T: WithDType>(&self, v1: &[T], l1: &Layout, v2: &[T], l2: &Layout) -> Result<Vec<u8>>;
-
-    fn map(&self, v1: &C, l1: &Layout, v2: &C, l2: &Layout) -> Result<C> {
-        match (v1, v2) {
-            (C::U8(v1), C::U8(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
-            (C::U32(v1), C::U32(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
-            (C::I64(v1), C::I64(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
-            (C::BF16(v1), C::BF16(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
-            (C::F16(v1), C::F16(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
-            (C::F32(v1), C::F32(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
-            (C::F64(v1), C::F64(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
-            _ => Err(Error::DTypeMismatchBinaryOp {
-                lhs: v1.dtype(),
-                rhs: v2.dtype(),
-                op: Self::OP,
-            }
-            .bt()),
-        }
-    }
-}
-
-pub fn binary_map<T: Copy, U: Copy, F: FnMut(T, T) -> U>(
-    lhs_l: &Layout,
-    rhs_l: &Layout,
-    lhs: &[T],
-    rhs: &[T],
-    mut f: F,
-) -> Vec<U> {
-    match (lhs_l.contiguous_offsets(), rhs_l.contiguous_offsets()) {
-        (Some((o_l1, o_l2)), Some((o_r1, o_r2))) => lhs[o_l1..o_l2]
-            .iter()
-            .zip(rhs[o_r1..o_r2].iter())
-            .map(|(&l, &r)| f(l, r))
-            .collect(),
-        (Some((o_l1, o_l2)), None) => {
-            // TODO: Maybe we want to avoid going through the layout twice.
-            match rhs_l.offsets_b() {
-                Some(ob) => {
-                    let mut i_in_block = 0;
-                    let mut i_right_broadcast = 0;
-                    lhs[o_l1..o_l2]
-                        .iter()
-                        .map(|&l| {
-                            let r = unsafe { rhs.get_unchecked(i_in_block + ob.start) };
-                            i_right_broadcast += 1;
-                            if i_right_broadcast >= ob.right_broadcast {
-                                i_in_block += 1;
-                                i_right_broadcast = 0;
-                            }
-                            if i_in_block >= ob.len {
-                                i_in_block = 0
-                            }
-                            f(l, *r)
-                        })
-                        .collect()
-                }
-                None => lhs_l
-                    .strided_index()
-                    .zip(rhs_l.strided_index())
-                    .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
-                    .collect(),
-            }
-        }
-        (None, Some((o_r1, o_r2))) => {
-            // TODO: Maybe we want to avoid going through the layout twice.
-            match lhs_l.offsets_b() {
-                Some(ob) => {
-                    let mut i_in_block = 0;
-                    let mut i_right_broadcast = 0;
-                    rhs[o_r1..o_r2]
-                        .iter()
-                        .map(|&r| {
-                            let l = unsafe { lhs.get_unchecked(i_in_block + ob.start) };
-                            i_right_broadcast += 1;
-                            if i_right_broadcast >= ob.right_broadcast {
-                                i_in_block += 1;
-                                i_right_broadcast = 0;
-                            }
-                            if i_in_block >= ob.len {
-                                i_in_block = 0
-                            }
-                            f(*l, r)
-                        })
-                        .collect()
-                }
-                None => lhs_l
-                    .strided_index()
-                    .zip(rhs_l.strided_index())
-                    .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
-                    .collect(),
-            }
-        }
-        _ => lhs_l
-            .strided_index()
-            .zip(rhs_l.strided_index())
-            .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
-            .collect(),
-    }
-}
-
-// Similar to binary_map but with vectorized variants.
-pub fn binary_map_vec<T: Copy, F: FnMut(T, T) -> T, FV: FnMut(&[T], &[T], &mut [T])>(
-    lhs_l: &Layout,
-    rhs_l: &Layout,
-    lhs: &[T],
-    rhs: &[T],
-    mut f: F,
-    mut f_vec: FV,
-) -> Vec<T> {
-    let el_count = lhs_l.shape().elem_count();
-    match (lhs_l.contiguous_offsets(), rhs_l.contiguous_offsets()) {
-        (Some((o_l1, o_l2)), Some((o_r1, o_r2))) => {
-            let mut ys: Vec<T> = Vec::with_capacity(el_count);
-            let ys_to_set = ys.spare_capacity_mut();
-            let ys_to_set = unsafe { std::mem::transmute::<_, &mut [T]>(ys_to_set) };
-            f_vec(&lhs[o_l1..o_l2], &rhs[o_r1..o_r2], ys_to_set);
-            // SAFETY: values are all set by f_vec.
-            unsafe { ys.set_len(el_count) };
-            ys
-        }
-        (Some((o_l1, o_l2)), None) => match rhs_l.offsets_b() {
-            Some(ob) if ob.right_broadcast == 1 => {
-                let rhs = &rhs[ob.start..ob.start + ob.len];
-                let mut ys: Vec<T> = Vec::with_capacity(el_count);
-                let ys_to_set = ys.spare_capacity_mut();
-                let ys_to_set = unsafe { std::mem::transmute::<_, &mut [T]>(ys_to_set) };
-                let mut dst_i = 0;
-                for src_i in (o_l1..o_l2).step_by(ob.len) {
-                    f_vec(
-                        &lhs[src_i..src_i + ob.len],
-                        rhs,
-                        &mut ys_to_set[dst_i..dst_i + ob.len],
-                    );
-                    dst_i += ob.len;
-                }
-                // SAFETY: values are all set by f_vec.
-                unsafe { ys.set_len(el_count) };
-                ys
-            }
-            Some(ob) => {
-                let rhs = &rhs[ob.start..ob.start + ob.len];
-                let mut ys = lhs[o_l1..o_l2].to_vec();
-                for idx_l in 0..ob.left_broadcast {
-                    let start = idx_l * ob.len * ob.right_broadcast;
-                    for (i, &r) in rhs.iter().enumerate() {
-                        let start = start + i * ob.right_broadcast;
-                        for v in ys[start..start + ob.right_broadcast].iter_mut() {
-                            *v = f(*v, r)
-                        }
-                    }
-                }
-                ys
-            }
-            None => lhs_l
-                .strided_index()
-                .zip(rhs_l.strided_index())
-                .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
-                .collect(),
-        },
-        (None, Some((o_r1, o_r2))) => match lhs_l.offsets_b() {
-            Some(ob) if ob.right_broadcast == 1 => {
-                let lhs = &lhs[ob.start..ob.start + ob.len];
-                let mut ys: Vec<T> = Vec::with_capacity(el_count);
-                let ys_to_set = ys.spare_capacity_mut();
-                let ys_to_set = unsafe { std::mem::transmute::<_, &mut [T]>(ys_to_set) };
-                let mut dst_i = 0;
-                for src_i in (o_r1..o_r2).step_by(ob.len) {
-                    f_vec(
-                        lhs,
-                        &rhs[src_i..src_i + ob.len],
-                        &mut ys_to_set[dst_i..dst_i + ob.len],
-                    );
-                    dst_i += ob.len;
-                }
-                // SAFETY: values are all set by f_vec.
-                unsafe { ys.set_len(el_count) };
-                ys
-            }
-            Some(ob) => {
-                let lhs = &lhs[ob.start..ob.start + ob.len];
-                let mut ys = rhs[o_r1..o_r2].to_vec();
-                for idx_l in 0..ob.left_broadcast {
-                    let start = idx_l * ob.len * ob.right_broadcast;
-                    for (i, &l) in lhs.iter().enumerate() {
-                        let start = start + i * ob.right_broadcast;
-                        for v in ys[start..start + ob.right_broadcast].iter_mut() {
-                            *v = f(l, *v)
-                        }
-                    }
-                }
-                ys
-            }
-            None => lhs_l
-                .strided_index()
-                .zip(rhs_l.strided_index())
-                .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
-                .collect(),
-        },
-        _ => lhs_l
-            .strided_index()
-            .zip(rhs_l.strided_index())
-            .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
-            .collect(),
-    }
-}
-
-pub fn unary_map<T: Copy, U: Copy, F: FnMut(T) -> U>(
-    vs: &[T],
-    layout: &Layout,
-    mut f: F,
-) -> Vec<U> {
-    match layout.strided_blocks() {
-        crate::StridedBlocks::SingleBlock { start_offset, len } => vs
-            [start_offset..start_offset + len]
-            .iter()
-            .map(|&v| f(v))
-            .collect(),
-        crate::StridedBlocks::MultipleBlocks {
-            block_start_index,
-            block_len,
-        } => {
-            let mut result = Vec::with_capacity(layout.shape().elem_count());
-            // Specialize the case where block_len is one to avoid the second loop.
-            if block_len == 1 {
-                for index in block_start_index {
-                    let v = unsafe { vs.get_unchecked(index) };
-                    result.push(f(*v))
-                }
-            } else {
-                for index in block_start_index {
-                    for offset in 0..block_len {
-                        let v = unsafe { vs.get_unchecked(index + offset) };
-                        result.push(f(*v))
-                    }
-                }
-            }
-            result
-        }
-    }
-}
-
-pub fn unary_map_vec<T: Copy, U: Copy, F: FnMut(T) -> U, FV: FnMut(&[T], &mut [U])>(
-    vs: &[T],
-    layout: &Layout,
-    mut f: F,
-    mut f_vec: FV,
-) -> Vec<U> {
-    match layout.strided_blocks() {
-        crate::StridedBlocks::SingleBlock { start_offset, len } => {
-            let mut ys: Vec<U> = Vec::with_capacity(len);
-            let ys_to_set = ys.spare_capacity_mut();
-            let ys_to_set = unsafe { std::mem::transmute::<_, &mut [U]>(ys_to_set) };
-            f_vec(&vs[start_offset..start_offset + len], ys_to_set);
-            // SAFETY: values are all set by f_vec.
-            unsafe { ys.set_len(len) };
-            ys
-        }
-        crate::StridedBlocks::MultipleBlocks {
-            block_start_index,
-            block_len,
-        } => {
-            let el_count = layout.shape().elem_count();
-            // Specialize the case where block_len is one to avoid the second loop.
-            if block_len == 1 {
-                let mut result = Vec::with_capacity(el_count);
-                for index in block_start_index {
-                    let v = unsafe { vs.get_unchecked(index) };
-                    result.push(f(*v))
-                }
-                result
-            } else {
-                let mut ys: Vec<U> = Vec::with_capacity(el_count);
-                let ys_to_set = ys.spare_capacity_mut();
-                let ys_to_set = unsafe { std::mem::transmute::<_, &mut [U]>(ys_to_set) };
-                let mut dst_index = 0;
-                for src_index in block_start_index {
-                    let vs = &vs[src_index..src_index + block_len];
-                    let ys = &mut ys_to_set[dst_index..dst_index + block_len];
-                    f_vec(vs, ys);
-                    dst_index += block_len;
-                }
-                // SAFETY: values are all set by f_vec.
-                unsafe { ys.set_len(el_count) };
-                ys
-            }
-        }
-    }
-}
--- a/candle-core/src/cuda_backend/mod.rs
+++ b/candle-core/src/cuda_backend/mod.rs
--- a/candle-core/src/cuda_backend/device.rs
+++ b/candle-core/src/cuda_backend/device.rs
@ -1,410 +0,0 @@
-use crate::backend::BackendDevice;
-use crate::{CpuStorage, DType, Layout, Result, Shape};
-pub use candle_kernels as kernels;
-pub use cudarc;
-use cudarc::driver::{CudaFunction, LaunchAsync, LaunchConfig};
-use half::{bf16, f16};
-use std::sync::{Arc, Mutex};
-
-use super::{CudaError, CudaStorage, CudaStorageSlice, WrapErr};
-
-/// Unique identifier for cuda devices.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
-pub struct DeviceId(usize);
-
-impl DeviceId {
-    fn new() -> Self {
-        // https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805
-        use std::sync::atomic;
-        static COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::new(1);
-        Self(COUNTER.fetch_add(1, atomic::Ordering::Relaxed))
-    }
-}
-
-struct CudaRng(cudarc::curand::CudaRng);
-unsafe impl Send for CudaRng {}
-
-#[derive(Clone)]
-pub struct CudaDevice {
-    id: DeviceId,
-    device: Arc<cudarc::driver::CudaDevice>,
-    pub(crate) blas: Arc<cudarc::cublas::CudaBlas>,
-    curand: Arc<Mutex<CudaRng>>,
-}
-
-impl std::fmt::Debug for CudaDevice {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "CudaDevice({:?})", self.id)
-    }
-}
-
-impl std::ops::Deref for CudaDevice {
-    type Target = Arc<cudarc::driver::CudaDevice>;
-
-    fn deref(&self) -> &Self::Target {
-        &self.device
-    }
-}
-
-impl CudaDevice {
-    pub fn cuda_device(&self) -> Arc<cudarc::driver::CudaDevice> {
-        self.device.clone()
-    }
-
-    pub fn id(&self) -> DeviceId {
-        self.id
-    }
-
-    fn const_impl(&self, v: f64, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
-        let elem_count = shape.elem_count();
-        let cfg = LaunchConfig::for_num_elems(elem_count as u32);
-        let slice = match dtype {
-            DType::U8 => {
-                // SAFETY: Set later by running the fill kernel.
-                let data = unsafe { self.alloc::<u8>(elem_count) }.w()?;
-                let func = self.get_or_load_func("fill_u8", kernels::FILL)?;
-                let params = (&data, v as u8, elem_count);
-                unsafe { func.launch(cfg, params) }.w()?;
-                CudaStorageSlice::U8(data)
-            }
-            DType::U32 => {
-                // SAFETY: Set later by running the fill kernel.
-                let data = unsafe { self.alloc::<u32>(elem_count) }.w()?;
-                let func = self.get_or_load_func("fill_u32", kernels::FILL)?;
-                let params = (&data, v as u32, elem_count);
-                unsafe { func.launch(cfg, params) }.w()?;
-                CudaStorageSlice::U32(data)
-            }
-            DType::I64 => {
-                // SAFETY: Set later by running the fill kernel.
-                let data = unsafe { self.alloc::<i64>(elem_count) }.w()?;
-                let func = self.get_or_load_func("fill_i64", kernels::FILL)?;
-                let params = (&data, v as i64, elem_count);
-                unsafe { func.launch(cfg, params) }.w()?;
-                CudaStorageSlice::I64(data)
-            }
-            DType::BF16 => {
-                // SAFETY: Set later by running the fill kernel.
-                let data = unsafe { self.alloc::<bf16>(elem_count) }.w()?;
-                let func = self.get_or_load_func("fill_bf16", kernels::FILL)?;
-                let params = (&data, bf16::from_f64(v), elem_count);
-                unsafe { func.launch(cfg, params) }.w()?;
-                CudaStorageSlice::BF16(data)
-            }
-            DType::F16 => {
-                // SAFETY: Set later by running the fill kernel.
-                let data = unsafe { self.alloc::<f16>(elem_count) }.w()?;
-                let func = self.get_or_load_func("fill_f16", kernels::FILL)?;
-                let params = (&data, f16::from_f64(v), elem_count);
-                unsafe { func.launch(cfg, params) }.w()?;
-                CudaStorageSlice::F16(data)
-            }
-            DType::F32 => {
-                // SAFETY: Set later by running the fill kernel.
-                let data = unsafe { self.alloc::<f32>(elem_count) }.w()?;
-                let func = self.get_or_load_func("fill_f32", kernels::FILL)?;
-                let params = (&data, v as f32, elem_count);
-                unsafe { func.launch(cfg, params) }.w()?;
-                CudaStorageSlice::F32(data)
-            }
-            DType::F64 => {
-                // SAFETY: Set later by running the fill kernel.
-                let data = unsafe { self.alloc::<f64>(elem_count) }.w()?;
-                let func = self.get_or_load_func("fill_f64", kernels::FILL)?;
-                let params = (&data, v, elem_count);
-                unsafe { func.launch(cfg, params) }.w()?;
-                CudaStorageSlice::F64(data)
-            }
-        };
-        Ok(CudaStorage {
-            slice,
-            device: self.clone(),
-        })
-    }
-
-    pub fn get_or_load_func(&self, module_name: &str, ptx: &'static str) -> Result<CudaFunction> {
-        if !self.has_func(module_name, module_name) {
-            // Leaking the string here is a bit sad but we need a &'static str and this is only
-            // done once per kernel name.
-            let static_module_name = Box::leak(module_name.to_string().into_boxed_str());
-            self.load_ptx(ptx.into(), module_name, &[static_module_name])
-                .map_err(|cuda| CudaError::Load {
-                    cuda,
-                    module_name: module_name.to_string(),
-                })
-                .w()?;
-        }
-        self.get_func(module_name, module_name)
-            // Clippy recommends this `ok_or` rather than `ok_or_else` so hopefully the compiler is
-            // able to only build the error value if needed.
-            .ok_or(CudaError::MissingKernel {
-                module_name: module_name.to_string(),
-            })
-            .w()
-    }
-}
-
-impl BackendDevice for CudaDevice {
-    type Storage = CudaStorage;
-
-    fn new(ordinal: usize) -> Result<Self> {
-        let device = cudarc::driver::CudaDevice::new(ordinal).w()?;
-        let blas = cudarc::cublas::CudaBlas::new(device.clone()).w()?;
-        let curand = cudarc::curand::CudaRng::new(299792458, device.clone()).w()?;
-        Ok(Self {
-            id: DeviceId::new(),
-            device,
-            blas: Arc::new(blas),
-            curand: Arc::new(Mutex::new(CudaRng(curand))),
-        })
-    }
-
-    fn set_seed(&self, seed: u64) -> Result<()> {
-        // We do not call set_seed but instead create a new curand object. This ensures that the
-        // state will be identical and the same random numbers will be generated.
-        let mut curand = self.curand.lock().unwrap();
-        curand.0 = cudarc::curand::CudaRng::new(seed, self.device.clone()).w()?;
-        Ok(())
-    }
-
-    fn location(&self) -> crate::DeviceLocation {
-        crate::DeviceLocation::Cuda {
-            gpu_id: self.device.ordinal(),
-        }
-    }
-
-    fn same_device(&self, rhs: &Self) -> bool {
-        self.id == rhs.id
-    }
-
-    fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
-        let elem_count = shape.elem_count();
-        let slice = match dtype {
-            DType::U8 => {
-                let data = self.alloc_zeros::<u8>(elem_count).w()?;
-                CudaStorageSlice::U8(data)
-            }
-            DType::U32 => {
-                let data = self.alloc_zeros::<u32>(elem_count).w()?;
-                CudaStorageSlice::U32(data)
-            }
-            DType::I64 => {
-                let data = self.alloc_zeros::<i64>(elem_count).w()?;
-                CudaStorageSlice::I64(data)
-            }
-            DType::BF16 => {
-                let data = self.alloc_zeros::<bf16>(elem_count).w()?;
-                CudaStorageSlice::BF16(data)
-            }
-            DType::F16 => {
-                let data = self.alloc_zeros::<f16>(elem_count).w()?;
-                CudaStorageSlice::F16(data)
-            }
-            DType::F32 => {
-                let data = self.alloc_zeros::<f32>(elem_count).w()?;
-                CudaStorageSlice::F32(data)
-            }
-            DType::F64 => {
-                let data = self.alloc_zeros::<f64>(elem_count).w()?;
-                CudaStorageSlice::F64(data)
-            }
-        };
-        Ok(CudaStorage {
-            slice,
-            device: self.clone(),
-        })
-    }
-
-    fn rand_uniform(&self, shape: &Shape, dtype: DType, lo: f64, up: f64) -> Result<CudaStorage> {
-        let elem_count = shape.elem_count();
-        let curand = self.curand.lock().unwrap();
-        let slice = match dtype {
-            // TODO: Add support for F16 and BF16 though this is likely to require some upstream
-            // cudarc changes.
-            DType::U8 | DType::U32 | DType::I64 | DType::F16 | DType::BF16 => {
-                Err(CudaError::UnsupportedDtype {
-                    dtype,
-                    op: "rand_uniform",
-                })
-                .w()?
-            }
-            DType::F32 => {
-                let mut data = unsafe { self.alloc::<f32>(elem_count) }.w()?;
-                curand.0.fill_with_uniform(&mut data).w()?;
-                CudaStorageSlice::F32(data)
-            }
-            DType::F64 => {
-                let mut data = unsafe { self.alloc::<f64>(elem_count) }.w()?;
-                curand.0.fill_with_uniform(&mut data).w()?;
-                CudaStorageSlice::F64(data)
-            }
-        };
-        let slice = if lo == 0. && up == 1.0 {
-            slice
-        } else {
-            use super::utils::Map1;
-            let layout = Layout::contiguous(shape);
-            super::Affine(up - lo, lo).map(&slice, self, &layout)?
-        };
-        Ok(CudaStorage {
-            slice,
-            device: self.clone(),
-        })
-    }
-
-    fn rand_normal(&self, shape: &Shape, dtype: DType, mean: f64, std: f64) -> Result<CudaStorage> {
-        // TODO: Add support for F16 and BF16 though this is likely to require some upstream
-        // cudarc changes.
-        let elem_count = shape.elem_count();
-        let curand = self.curand.lock().unwrap();
-        // curand can only generate an even number of values, so an odd count is rounded up by one.
-        // https://github.com/huggingface/candle/issues/734
-        let elem_count_round = if elem_count % 2 == 1 {
-            elem_count + 1
-        } else {
-            elem_count
-        };
-        let slice = match dtype {
-            DType::U8 | DType::U32 | DType::I64 | DType::F16 | DType::BF16 => {
-                Err(CudaError::UnsupportedDtype {
-                    dtype,
-                    op: "rand_normal",
-                })
-                .w()?
-            }
-            DType::F32 => {
-                let mut data = unsafe { self.alloc::<f32>(elem_count_round) }.w()?;
-                curand
-                    .0
-                    .fill_with_normal(&mut data, mean as f32, std as f32)
-                    .w()?;
-                CudaStorageSlice::F32(data)
-            }
-            DType::F64 => {
-                let mut data = unsafe { self.alloc::<f64>(elem_count_round) }.w()?;
-                curand.0.fill_with_normal(&mut data, mean, std).w()?;
-                CudaStorageSlice::F64(data)
-            }
-        };
-        Ok(CudaStorage {
-            slice,
-            device: self.clone(),
-        })
-    }
-
-    fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
-        self.const_impl(1., shape, dtype)
-    }
-
-    unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result<Self::Storage> {
-        let elem_count = shape.elem_count();
-        let slice = match dtype {
-            DType::U8 => {
-                let data = self.alloc::<u8>(elem_count).w()?;
-                CudaStorageSlice::U8(data)
-            }
-            DType::U32 => {
-                let data = self.alloc::<u32>(elem_count).w()?;
-                CudaStorageSlice::U32(data)
-            }
-            DType::I64 => {
-                let data = self.alloc::<i64>(elem_count).w()?;
-                CudaStorageSlice::I64(data)
-            }
-            DType::BF16 => {
-                let data = self.alloc::<bf16>(elem_count).w()?;
-                CudaStorageSlice::BF16(data)
-            }
-            DType::F16 => {
-                let data = self.alloc::<f16>(elem_count).w()?;
-                CudaStorageSlice::F16(data)
-            }
-            DType::F32 => {
-                let data = self.alloc::<f32>(elem_count).w()?;
-                CudaStorageSlice::F32(data)
-            }
-            DType::F64 => {
-                let data = self.alloc::<f64>(elem_count).w()?;
-                CudaStorageSlice::F64(data)
-            }
-        };
-        Ok(CudaStorage {
-            slice,
-            device: self.clone(),
-        })
-    }
-
-    fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<CudaStorage> {
-        let slice = match storage {
-            CpuStorage::U8(storage) => {
-                let data = self.htod_sync_copy(storage).w()?;
-                CudaStorageSlice::U8(data)
-            }
-            CpuStorage::U32(storage) => {
-                let data = self.htod_sync_copy(storage).w()?;
-                CudaStorageSlice::U32(data)
-            }
-            CpuStorage::I64(storage) => {
-                let data = self.htod_sync_copy(storage).w()?;
-                CudaStorageSlice::I64(data)
-            }
-            CpuStorage::BF16(storage) => {
-                let data = self.htod_sync_copy(storage).w()?;
-                CudaStorageSlice::BF16(data)
-            }
-            CpuStorage::F16(storage) => {
-                let data = self.htod_sync_copy(storage).w()?;
-                CudaStorageSlice::F16(data)
-            }
-            CpuStorage::F32(storage) => {
-                let data = self.htod_sync_copy(storage).w()?;
-                CudaStorageSlice::F32(data)
-            }
-            CpuStorage::F64(storage) => {
-                let data = self.htod_sync_copy(storage).w()?;
-                CudaStorageSlice::F64(data)
-            }
-        };
-        Ok(CudaStorage {
-            slice,
-            device: self.clone(),
-        })
-    }
-
-    fn storage_from_cpu_storage_owned(&self, storage: CpuStorage) -> Result<CudaStorage> {
-        let slice = match storage {
-            CpuStorage::U8(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::U8(data)
-            }
-            CpuStorage::U32(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::U32(data)
-            }
-            CpuStorage::I64(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::I64(data)
-            }
-            CpuStorage::BF16(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::BF16(data)
-            }
-            CpuStorage::F16(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::F16(data)
-            }
-            CpuStorage::F32(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::F32(data)
-            }
-            CpuStorage::F64(storage) => {
-                let data = self.htod_copy(storage).w()?;
-                CudaStorageSlice::F64(data)
-            }
-        };
-        Ok(CudaStorage {
-            slice,
-            device: self.clone(),
-        })
-    }
-}
--- a/candle-core/src/cuda_backend/error.rs
+++ b/candle-core/src/cuda_backend/error.rs
@ -1,62 +0,0 @@
-use crate::{DType, Layout};
-
-/// cudarc related errors
-#[derive(thiserror::Error, Debug)]
-pub enum CudaError {
-    #[error(transparent)]
-    Cuda(#[from] cudarc::driver::DriverError),
-
-    #[error(transparent)]
-    Compiler(#[from] cudarc::nvrtc::CompileError),
-
-    #[error(transparent)]
-    Cublas(#[from] cudarc::cublas::result::CublasError),
-
-    #[error(transparent)]
-    Curand(#[from] cudarc::curand::result::CurandError),
-
-    #[error("missing kernel '{module_name}'")]
-    MissingKernel { module_name: String },
-
-    #[error("unsupported dtype {dtype:?} for {op}")]
-    UnsupportedDtype { dtype: DType, op: &'static str },
-
-    #[error("internal error '{0}'")]
-    InternalError(&'static str),
-
-    #[error("matmul is only supported for contiguous tensors lstride: {lhs_stride:?} rstride: {rhs_stride:?} mnk: {mnk:?}")]
-    MatMulNonContiguous {
-        lhs_stride: Layout,
-        rhs_stride: Layout,
-        mnk: (usize, usize, usize),
-    },
-
-    #[error("{msg}, expected: {expected:?}, got: {got:?}")]
-    UnexpectedDType {
-        msg: &'static str,
-        expected: DType,
-        got: DType,
-    },
-
-    #[error("{cuda} when loading {module_name}")]
-    Load {
-        cuda: cudarc::driver::DriverError,
-        module_name: String,
-    },
-}
-
-impl From<CudaError> for crate::Error {
-    fn from(val: CudaError) -> Self {
-        crate::Error::Cuda(Box::new(val)).bt()
-    }
-}
-
-pub trait WrapErr<O> {
-    fn w(self) -> std::result::Result<O, crate::Error>;
-}
-
-impl<O, E: Into<CudaError>> WrapErr<O> for std::result::Result<O, E> {
-    fn w(self) -> std::result::Result<O, crate::Error> {
-        self.map_err(|e| crate::Error::Cuda(Box::new(e.into())).bt())
-    }
-}
--- a/candle-core/src/cuda_backend/utils.rs
+++ b/candle-core/src/cuda_backend/utils.rs
@ -1,134 +0,0 @@
-/// Helper functions to plug cuda kernels in candle.
-use crate::{Layout, Result, Shape, WithDType};
-pub use cudarc;
-use cudarc::driver::{CudaSlice, DeviceRepr, ValidAsZeroBits};
-
-use super::{CudaDevice, CudaError, WrapErr};
-
-pub type S = super::CudaStorageSlice;
-
-pub trait Map1 {
-    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
-        &self,
-        src: &CudaSlice<T>,
-        dev: &CudaDevice,
-        layout: &Layout,
-    ) -> Result<CudaSlice<T>>;
-
-    fn map(&self, s: &S, d: &CudaDevice, l: &Layout) -> Result<S> {
-        let out = match s {
-            S::U8(s) => S::U8(self.f(s, d, l)?),
-            S::U32(s) => S::U32(self.f(s, d, l)?),
-            S::I64(s) => S::I64(self.f(s, d, l)?),
-            S::BF16(s) => S::BF16(self.f(s, d, l)?),
-            S::F16(s) => S::F16(self.f(s, d, l)?),
-            S::F32(s) => S::F32(self.f(s, d, l)?),
-            S::F64(s) => S::F64(self.f(s, d, l)?),
-        };
-        Ok(out)
-    }
-}
-
-pub trait Map2 {
-    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
-        &self,
-        src1: &CudaSlice<T>,
-        layout1: &Layout,
-        src2: &CudaSlice<T>,
-        layout2: &Layout,
-        dev: &CudaDevice,
-    ) -> Result<CudaSlice<T>>;
-
-    fn map(&self, s1: &S, l1: &Layout, s2: &S, l2: &Layout, d: &CudaDevice) -> Result<S> {
-        let out = match (s1, s2) {
-            (S::U8(s1), S::U8(s2)) => S::U8(self.f(s1, l1, s2, l2, d)?),
-            (S::U32(s1), S::U32(s2)) => S::U32(self.f(s1, l1, s2, l2, d)?),
-            (S::I64(s1), S::I64(s2)) => S::I64(self.f(s1, l1, s2, l2, d)?),
-            (S::BF16(s1), S::BF16(s2)) => S::BF16(self.f(s1, l1, s2, l2, d)?),
-            (S::F16(s1), S::F16(s2)) => S::F16(self.f(s1, l1, s2, l2, d)?),
-            (S::F32(s1), S::F32(s2)) => S::F32(self.f(s1, l1, s2, l2, d)?),
-            (S::F64(s1), S::F64(s2)) => S::F64(self.f(s1, l1, s2, l2, d)?),
-            _ => Err(CudaError::InternalError("dtype mismatch in binary op"))?,
-        };
-        Ok(out)
-    }
-}
-
-pub trait Map2InPlace {
-    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
-        &self,
-        dst: &mut CudaSlice<T>,
-        dst_shape: &Shape,
-        src: &CudaSlice<T>,
-        src_l: &Layout,
-        dev: &CudaDevice,
-    ) -> Result<()>;
-
-    fn map(
-        &self,
-        dst: &mut S,
-        dst_s: &Shape,
-        src: &S,
-        src_l: &Layout,
-        d: &CudaDevice,
-    ) -> Result<()> {
-        match (dst, src) {
-            (S::U8(dst), S::U8(src)) => self.f(dst, dst_s, src, src_l, d),
-            (S::U32(dst), S::U32(src)) => self.f(dst, dst_s, src, src_l, d),
-            (S::I64(dst), S::I64(src)) => self.f(dst, dst_s, src, src_l, d),
-            (S::BF16(dst), S::BF16(src)) => self.f(dst, dst_s, src, src_l, d),
-            (S::F16(dst), S::F16(src)) => self.f(dst, dst_s, src, src_l, d),
-            (S::F32(dst), S::F32(src)) => self.f(dst, dst_s, src, src_l, d),
-            (S::F64(dst), S::F64(src)) => self.f(dst, dst_s, src, src_l, d),
-            _ => Err(CudaError::InternalError("dtype mismatch in binary op"))?,
-        }
-    }
-}
-
-pub trait Map1Any {
-    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits, W: Fn(CudaSlice<T>) -> S>(
-        &self,
-        src: &CudaSlice<T>,
-        dev: &CudaDevice,
-        layout: &Layout,
-        wrap: W,
-    ) -> Result<S>;
-
-    fn map(&self, s: &S, d: &CudaDevice, l: &Layout) -> Result<S> {
-        let out = match s {
-            S::U8(s) => self.f(s, d, l, S::U8)?,
-            S::U32(s) => self.f(s, d, l, S::U32)?,
-            S::I64(s) => self.f(s, d, l, S::I64)?,
-            S::BF16(s) => self.f(s, d, l, S::BF16)?,
-            S::F16(s) => self.f(s, d, l, S::F16)?,
-            S::F32(s) => self.f(s, d, l, S::F32)?,
-            S::F64(s) => self.f(s, d, l, S::F64)?,
-        };
-        Ok(out)
-    }
-}
-
-pub trait Map2Any {
-    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
-        &self,
-        src1: &CudaSlice<T>,
-        layout1: &Layout,
-        src2: &CudaSlice<T>,
-        layout2: &Layout,
-        dev: &CudaDevice,
-    ) -> Result<S>;
-
-    fn map(&self, s1: &S, l1: &Layout, s2: &S, l2: &Layout, d: &CudaDevice) -> Result<S> {
-        let out = match (s1, s2) {
-            (S::U8(s1), S::U8(s2)) => self.f(s1, l1, s2, l2, d)?,
-            (S::U32(s1), S::U32(s2)) => self.f(s1, l1, s2, l2, d)?,
-            (S::I64(s1), S::I64(s2)) => self.f(s1, l1, s2, l2, d)?,
-            (S::BF16(s1), S::BF16(s2)) => self.f(s1, l1, s2, l2, d)?,
-            (S::F16(s1), S::F16(s2)) => self.f(s1, l1, s2, l2, d)?,
-            (S::F32(s1), S::F32(s2)) => self.f(s1, l1, s2, l2, d)?,
-            (S::F64(s1), S::F64(s2)) => self.f(s1, l1, s2, l2, d)?,
-            _ => Err(CudaError::InternalError("dtype mismatch in binary op")).w()?,
-        };
-        Ok(out)
-    }
-}
--- a/candle-core/src/cuda_backend/cudnn.rs
+++ b/candle-core/src/cuda_backend/cudnn.rs
@ -34,9 +34,6 @@ pub(crate) fn launch_conv2d<
    params: &crate::conv::ParamsConv2D,
    dev: &crate::cuda_backend::CudaDevice,
 ) -> crate::Result<()> {
-    use crate::conv::CudnnFwdAlgo as CandleAlgo;
-    use cudarc::cudnn::sys::cudnnConvolutionFwdAlgo_t as A;
-
    let device_id = dev.id();
    let cudnn = CUDNN.with(|cudnn| {
        if let Some(cudnn) = cudnn.borrow().get(&device_id) {
@ -93,20 +90,7 @@ pub(crate) fn launch_conv2d<
        w: &w,
        y: &y,
    };
-    let alg = match params.cudnn_fwd_algo {
-        None => conv2d.pick_algorithm()?,
-        Some(CandleAlgo::ImplicitGemm) => A::CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
-        Some(CandleAlgo::ImplicitPrecompGemm) => {
-            A::CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
-        }
-        Some(CandleAlgo::Gemm) => A::CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-        Some(CandleAlgo::Direct) => A::CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
-        Some(CandleAlgo::Fft) => A::CUDNN_CONVOLUTION_FWD_ALGO_FFT,
-        Some(CandleAlgo::FftTiling) => A::CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
-        Some(CandleAlgo::Winograd) => A::CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
-        Some(CandleAlgo::WinogradNonFused) => A::CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,
-        Some(CandleAlgo::Count) => A::CUDNN_CONVOLUTION_FWD_ALGO_COUNT,
-    };
+    let alg = conv2d.pick_algorithm()?;
    let workspace_size = conv2d.get_workspace_size(alg)?;
    let mut workspace = dev.cuda_device().alloc_zeros::<u8>(workspace_size)?;
    unsafe {
--- a/candle-core/src/custom_op.rs
+++ b/candle-core/src/custom_op.rs
@ -1,377 +0,0 @@
-use crate::op::{BackpropOp, Op};
-use crate::tensor::from_storage;
-use crate::{CpuStorage, CudaStorage, Layout, MetalStorage, Result, Shape, Tensor};
-use std::sync::Arc;
-
-/// Unary ops that can be defined in user-land.
-pub trait CustomOp1 {
-    // Box<dyn> does not support const yet, so use a function to get the name.
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result<(CudaStorage, Shape)> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _storage: &MetalStorage,
-        _layout: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// This function takes as argument the argument `arg` used in the forward pass, the result
-    /// produced by the forward operation `res` and the gradient of the result `grad_res`.
-    /// The function should return the gradient of the argument.
-    fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Result<Option<Tensor>> {
-        Err(crate::Error::BackwardNotSupported { op: self.name() })
-    }
-}
-
-pub trait CustomOp2 {
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(
-        &self,
-        s1: &CpuStorage,
-        l1: &Layout,
-        s2: &CpuStorage,
-        l2: &Layout,
-    ) -> Result<(CpuStorage, Shape)>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(
-        &self,
-        _: &CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-    ) -> Result<(CudaStorage, Shape)> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
-    fn bwd(
-        &self,
-        _arg1: &Tensor,
-        _arg2: &Tensor,
-        _res: &Tensor,
-        _grad_res: &Tensor,
-    ) -> Result<(Option<Tensor>, Option<Tensor>)> {
-        Err(crate::Error::BackwardNotSupported { op: self.name() })
-    }
-}
-
-pub trait CustomOp3 {
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(
-        &self,
-        s1: &CpuStorage,
-        l1: &Layout,
-        s2: &CpuStorage,
-        l2: &Layout,
-        s3: &CpuStorage,
-        l3: &Layout,
-    ) -> Result<(CpuStorage, Shape)>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(
-        &self,
-        _: &CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-    ) -> Result<(CudaStorage, Shape)> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-
-    fn bwd(
-        &self,
-        _arg1: &Tensor,
-        _arg2: &Tensor,
-        _arg3: &Tensor,
-        _res: &Tensor,
-        _grad_res: &Tensor,
-    ) -> Result<(Option<Tensor>, Option<Tensor>, Option<Tensor>)> {
-        Err(crate::Error::BackwardNotSupported { op: self.name() })
-    }
-}
-
-impl Tensor {
-    /// Applies a unary custom op without backward support
-    pub fn apply_op1_no_bwd<C: CustomOp1>(&self, c: &C) -> Result<Self> {
-        let (storage, shape) = self.storage().apply_op1(self.layout(), c)?;
-        Ok(from_storage(storage, shape, BackpropOp::none(), false))
-    }
-
-    /// Applies a binary custom op without backward support
-    pub fn apply_op2_no_bwd<C: CustomOp2>(&self, rhs: &Self, c: &C) -> Result<Self> {
-        let (storage, shape) =
-            self.storage()
-                .apply_op2(self.layout(), &rhs.storage(), rhs.layout(), c)?;
-        Ok(from_storage(storage, shape, BackpropOp::none(), false))
-    }
-
-    /// Applies a ternary custom op without backward support
-    pub fn apply_op3_no_bwd<C: CustomOp3>(&self, t2: &Self, t3: &Self, c: &C) -> Result<Self> {
-        let (storage, shape) = self.storage().apply_op3(
-            self.layout(),
-            &t2.storage(),
-            t2.layout(),
-            &t3.storage(),
-            t3.layout(),
-            c,
-        )?;
-        Ok(from_storage(storage, shape, BackpropOp::none(), false))
-    }
-
-    /// Applies a unary custom op.
-    pub fn apply_op1_arc(&self, c: Arc<Box<dyn CustomOp1 + Send + Sync>>) -> Result<Self> {
-        let (storage, shape) = self
-            .storage()
-            .apply_op1(self.layout(), c.as_ref().as_ref())?;
-        let op = BackpropOp::new1(self, |s| Op::CustomOp1(s, c.clone()));
-        Ok(from_storage(storage, shape, op, false))
-    }
-
-    pub fn apply_op1<C: 'static + CustomOp1 + Send + Sync>(&self, c: C) -> Result<Self> {
-        self.apply_op1_arc(Arc::new(Box::new(c)))
-    }
-
-    /// Applies a binary custom op.
-    pub fn apply_op2_arc(
-        &self,
-        rhs: &Self,
-        c: Arc<Box<dyn CustomOp2 + Send + Sync>>,
-    ) -> Result<Self> {
-        let (storage, shape) = self.storage().apply_op2(
-            self.layout(),
-            &rhs.storage(),
-            rhs.layout(),
-            c.as_ref().as_ref(),
-        )?;
-        let op = BackpropOp::new2(self, rhs, |t1, t2| Op::CustomOp2(t1, t2, c.clone()));
-        Ok(from_storage(storage, shape, op, false))
-    }
-
-    pub fn apply_op2<C: 'static + CustomOp2 + Send + Sync>(&self, r: &Self, c: C) -> Result<Self> {
-        self.apply_op2_arc(r, Arc::new(Box::new(c)))
-    }
-
-    /// Applies a ternary custom op.
-    pub fn apply_op3_arc(
-        &self,
-        t2: &Self,
-        t3: &Self,
-        c: Arc<Box<dyn CustomOp3 + Send + Sync>>,
-    ) -> Result<Self> {
-        let (storage, shape) = self.storage().apply_op3(
-            self.layout(),
-            &t2.storage(),
-            t2.layout(),
-            &t3.storage(),
-            t3.layout(),
-            c.as_ref().as_ref(),
-        )?;
-        let op = BackpropOp::new3(self, t2, t3, |t1, t2, t3| {
-            Op::CustomOp3(t1, t2, t3, c.clone())
-        });
-        Ok(from_storage(storage, shape, op, false))
-    }
-
-    pub fn apply_op3<C: 'static + CustomOp3 + Send + Sync>(
-        &self,
-        t2: &Self,
-        t3: &Self,
-        c: C,
-    ) -> Result<Self> {
-        self.apply_op3_arc(t2, t3, Arc::new(Box::new(c)))
-    }
-}
-
-// In place ops.
-
-/// Unary ops that can be defined in user-land.
-/// These ops work in place and as such back-prop is unsupported.
-pub trait InplaceOp1 {
-    // Box<dyn> does not support const yet, so use a function to get the name.
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(&self, storage: &mut CpuStorage, layout: &Layout) -> Result<()>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(&self, _storage: &mut CudaStorage, _layout: &Layout) -> Result<()> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(&self, _storage: &mut MetalStorage, _layout: &Layout) -> Result<()> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-}
-
-pub trait InplaceOp2 {
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(&self, s1: &mut CpuStorage, l1: &Layout, s2: &CpuStorage, l2: &Layout)
-        -> Result<()>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(&self, _: &mut CudaStorage, _: &Layout, _: &CudaStorage, _: &Layout) -> Result<()> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &mut MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<()> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-}
-
-pub trait InplaceOp3 {
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(
-        &self,
-        s1: &mut CpuStorage,
-        l1: &Layout,
-        s2: &CpuStorage,
-        l2: &Layout,
-        s3: &CpuStorage,
-        l3: &Layout,
-    ) -> Result<()>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(
-        &self,
-        _: &mut CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-    ) -> Result<()> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn metal_fwd(
-        &self,
-        _: &mut MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-        _: &MetalStorage,
-        _: &Layout,
-    ) -> Result<()> {
-        Err(crate::Error::Metal(
-            format!("no metal implementation for {}", self.name()).into(),
-        ))
-    }
-}
-
-impl Tensor {
-    /// Applies a unary custom op in place.
-    pub fn inplace_op1<C: InplaceOp1>(&self, c: &C) -> Result<()> {
-        self.storage_mut().inplace_op1(self.layout(), c)
-    }
-
-    /// Applies a unary custom op in place (for the first tensor).
-    pub fn inplace_op2<C: InplaceOp2>(&self, rhs: &Self, c: &C) -> Result<()> {
-        self.storage_mut()
-            .inplace_op2(self.layout(), &rhs.storage(), rhs.layout(), c)
-    }
-
-    /// Applies a ternary custom op in place (for the first tensor).
-    pub fn inplace_op3<C: InplaceOp3>(&self, t2: &Self, t3: &Self, c: &C) -> Result<()> {
-        self.storage_mut().inplace_op3(
-            self.layout(),
-            &t2.storage(),
-            t2.layout(),
-            &t3.storage(),
-            t3.layout(),
-            c,
-        )
-    }
-}
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@ -8,14 +8,12 @@ use crate::{CpuStorage, DType, Result, Shape, Storage, WithDType};
 pub enum DeviceLocation {
    Cpu,
    Cuda { gpu_id: usize },
-    Metal { gpu_id: usize },
 }

 #[derive(Debug, Clone)]
 pub enum Device {
    Cpu,
    Cuda(crate::CudaDevice),
-    Metal(crate::MetalDevice),
 }

 pub trait NdArray {
@ -130,23 +128,10 @@ impl Device {
        Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?))
    }

-    pub fn new_metal(ordinal: usize) -> Result<Self> {
-        Ok(Self::Metal(crate::MetalDevice::new(ordinal)?))
-    }
-
-    pub fn set_seed(&self, seed: u64) -> Result<()> {
-        match self {
-            Self::Cpu => CpuDevice.set_seed(seed),
-            Self::Cuda(c) => c.set_seed(seed),
-            Self::Metal(m) => m.set_seed(seed),
-        }
-    }
-
    pub fn same_device(&self, rhs: &Self) -> bool {
        match (self, rhs) {
            (Self::Cpu, Self::Cpu) => true,
            (Self::Cuda(lhs), Self::Cuda(rhs)) => lhs.same_device(rhs),
-            (Self::Metal(lhs), Self::Metal(rhs)) => lhs.same_device(rhs),
            _ => false,
        }
    }
@ -155,20 +140,21 @@ impl Device {
        match self {
            Self::Cpu => DeviceLocation::Cpu,
            Self::Cuda(device) => device.location(),
-            Device::Metal(device) => device.location(),
        }
    }

    pub fn is_cpu(&self) -> bool {
-        matches!(self, Self::Cpu)
+        match self {
+            Self::Cpu => true,
+            Self::Cuda(_) => false,
+        }
    }

    pub fn is_cuda(&self) -> bool {
-        matches!(self, Self::Cuda(_))
-    }
-
-    pub fn is_metal(&self) -> bool {
-        matches!(self, Self::Metal(_))
+        match self {
+            Self::Cpu => false,
+            Self::Cuda(_) => true,
+        }
    }

    pub fn cuda_if_available(ordinal: usize) -> Result<Self> {
@ -192,18 +178,8 @@ impl Device {
                Ok(Storage::Cpu(storage))
            }
            Device::Cuda(device) => {
-                // TODO: Remove the special case if we start supporting generating f16/bf16 directly.
-                if dtype == DType::F16 || dtype == DType::BF16 {
-                    let storage = device.rand_uniform(shape, DType::F32, lo, up)?;
-                    Storage::Cuda(storage).to_dtype(&crate::Layout::contiguous(shape), dtype)
-                } else {
-                    let storage = device.rand_uniform(shape, dtype, lo, up)?;
-                    Ok(Storage::Cuda(storage))
-                }
-            }
-            Device::Metal(device) => {
                let storage = device.rand_uniform(shape, dtype, lo, up)?;
-                Ok(Storage::Metal(storage))
+                Ok(Storage::Cuda(storage))
            }
        }
    }
@ -230,18 +206,8 @@ impl Device {
                Ok(Storage::Cpu(storage))
            }
            Device::Cuda(device) => {
-                // TODO: Remove the special case if we start supporting generating f16/bf16 directly.
-                if dtype == DType::F16 || dtype == DType::BF16 {
-                    let storage = device.rand_normal(shape, DType::F32, mean, std)?;
-                    Storage::Cuda(storage).to_dtype(&crate::Layout::contiguous(shape), dtype)
-                } else {
-                    let storage = device.rand_normal(shape, dtype, mean, std)?;
-                    Ok(Storage::Cuda(storage))
-                }
-            }
-            Device::Metal(device) => {
                let storage = device.rand_normal(shape, dtype, mean, std)?;
-                Ok(Storage::Metal(storage))
+                Ok(Storage::Cuda(storage))
            }
        }
    }
@ -265,10 +231,6 @@ impl Device {
                let storage = device.ones_impl(shape, dtype)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = device.ones_impl(shape, dtype)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }

@ -282,27 +244,6 @@ impl Device {
                let storage = device.zeros_impl(shape, dtype)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = device.zeros_impl(shape, dtype)?;
-                Ok(Storage::Metal(storage))
-            }
-        }
-    }
-
-    pub(crate) unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result<Storage> {
-        match self {
-            Device::Cpu => {
-                let storage = CpuDevice.alloc_uninit(shape, dtype)?;
-                Ok(Storage::Cpu(storage))
-            }
-            Device::Cuda(device) => {
-                let storage = device.alloc_uninit(shape, dtype)?;
-                Ok(Storage::Cuda(storage))
-            }
-            Device::Metal(device) => {
-                let storage = device.alloc_uninit(shape, dtype)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }

@ -311,14 +252,9 @@ impl Device {
            Device::Cpu => Ok(Storage::Cpu(array.to_cpu_storage())),
            Device::Cuda(device) => {
                let storage = array.to_cpu_storage();
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
+                let storage = device.storage_from_cpu_storage(&storage)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = array.to_cpu_storage();
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }

@ -327,14 +263,9 @@ impl Device {
            Device::Cpu => Ok(Storage::Cpu(S::to_cpu_storage_owned(data))),
            Device::Cuda(device) => {
                let storage = S::to_cpu_storage_owned(data);
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
+                let storage = device.storage_from_cpu_storage(&storage)?;
                Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = S::to_cpu_storage_owned(data);
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
-                Ok(Storage::Metal(storage))
-            }
        }
    }
 }
--- a/candle-core/src/display.rs
+++ b/candle-core/src/display.rs
@ -14,9 +14,6 @@ impl Tensor {
            crate::DeviceLocation::Cuda { gpu_id } => {
                format!(", cuda:{}", gpu_id)
            }
-            crate::DeviceLocation::Metal { gpu_id } => {
-                format!(", metal:{}", gpu_id)
-            }
        };

        write!(f, "Tensor[")?;
@ -65,13 +62,12 @@ impl std::fmt::Debug for Tensor {
 }

 /// Options for Tensor pretty printing
-#[derive(Debug, Clone)]
 pub struct PrinterOptions {
-    pub precision: usize,
-    pub threshold: usize,
-    pub edge_items: usize,
-    pub line_width: usize,
-    pub sci_mode: Option<bool>,
+    precision: usize,
+    threshold: usize,
+    edge_items: usize,
+    line_width: usize,
+    sci_mode: Option<bool>,
 }

 static PRINT_OPTS: std::sync::Mutex<PrinterOptions> =
@ -90,10 +86,6 @@ impl PrinterOptions {
    }
 }

-pub fn print_options() -> &'static std::sync::Mutex<PrinterOptions> {
-    &PRINT_OPTS
-}
-
 pub fn set_print_options(options: PrinterOptions) {
    *PRINT_OPTS.lock().unwrap() = options
 }
@ -122,26 +114,6 @@ pub fn set_print_options_full() {
    }
 }

-pub fn set_line_width(line_width: usize) {
-    PRINT_OPTS.lock().unwrap().line_width = line_width
-}
-
-pub fn set_precision(precision: usize) {
-    PRINT_OPTS.lock().unwrap().precision = precision
-}
-
-pub fn set_edge_items(edge_items: usize) {
-    PRINT_OPTS.lock().unwrap().edge_items = edge_items
-}
-
-pub fn set_threshold(threshold: usize) {
-    PRINT_OPTS.lock().unwrap().threshold = threshold
-}
-
-pub fn set_sci_mode(sci_mode: Option<bool>) {
-    PRINT_OPTS.lock().unwrap().sci_mode = sci_mode
-}
-
 struct FmtSize {
    current_size: usize,
 }
@ -504,9 +476,6 @@ impl std::fmt::Display for Tensor {
            crate::DeviceLocation::Cuda { gpu_id } => {
                format!(", cuda:{}", gpu_id)
            }
-            crate::DeviceLocation::Metal { gpu_id } => {
-                format!(", metal:{}", gpu_id)
-            }
        };

        write!(
--- a/candle-core/src/dtype.rs
+++ b/candle-core/src/dtype.rs
@ -23,15 +23,7 @@ pub enum DType {
 }

 #[derive(Debug, PartialEq, Eq)]
-pub struct DTypeParseError(String);
-
-impl std::fmt::Display for DTypeParseError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "cannot parse '{}' as a dtype", self.0)
-    }
-}
-
-impl std::error::Error for DTypeParseError {}
+pub struct DTypeParseError;

 impl std::str::FromStr for DType {
    type Err = DTypeParseError;
@ -44,7 +36,7 @@ impl std::str::FromStr for DType {
            "f16" => Ok(Self::F16),
            "f32" => Ok(Self::F32),
            "f64" => Ok(Self::F64),
-            _ => Err(DTypeParseError(s.to_string())),
+            _ => Err(DTypeParseError),
        }
    }
 }
@ -75,20 +67,6 @@ impl DType {
            Self::F64 => 8,
        }
    }
-
-    pub fn is_int(&self) -> bool {
-        match self {
-            Self::U8 | Self::U32 | Self::I64 => true,
-            Self::BF16 | Self::F16 | Self::F32 | Self::F64 => false,
-        }
-    }
-
-    pub fn is_float(&self) -> bool {
-        match self {
-            Self::U8 | Self::U32 | Self::I64 => false,
-            Self::BF16 | Self::F16 | Self::F32 | Self::F64 => true,
-        }
-    }
 }

 pub trait WithDType:
--- a/candle-core/src/dummy_cuda_backend.rs
+++ b/candle-core/src/dummy_cuda_backend.rs
@ -79,16 +79,6 @@ impl crate::backend::BackendStorage for CudaStorage {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn conv_transpose1d(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn conv2d(
        &self,
        _: &Layout,
@ -154,19 +144,6 @@ impl crate::backend::BackendStorage for CudaStorage {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn copy2d(
-        &self,
-        _: &mut Self,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-    ) -> Result<()> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
        Err(Error::NotCompiledWithCudaSupport)
    }
@ -175,10 +152,6 @@ impl crate::backend::BackendStorage for CudaStorage {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self> {
        Err(Error::NotCompiledWithCudaSupport)
    }
@ -190,10 +163,6 @@ impl crate::backend::BackendDevice for CudaDevice {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn set_seed(&self, _: u64) -> Result<()> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn location(&self) -> crate::DeviceLocation {
        fail!()
    }
@ -210,18 +179,10 @@ impl crate::backend::BackendDevice for CudaDevice {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithCudaSupport)
    }
--- a/candle-core/src/dummy_metal_backend.rs
+++ b/candle-core/src/dummy_metal_backend.rs
@ -1,244 +0,0 @@
-#![allow(dead_code)]
-use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
-use crate::{CpuStorage, DType, Error, Layout, Result, Shape};
-
-#[derive(Debug, Clone)]
-pub struct MetalDevice;
-
-#[derive(Debug)]
-pub struct MetalStorage;
-
-#[derive(thiserror::Error, Debug)]
-pub enum MetalError {
-    #[error("{0}")]
-    Message(String),
-}
-
-impl From<String> for MetalError {
-    fn from(e: String) -> Self {
-        MetalError::Message(e)
-    }
-}
-
-macro_rules! fail {
-    () => {
-        unimplemented!("metal support has not been enabled, add `metal` feature to enable.")
-    };
-}
-
-impl crate::backend::BackendStorage for MetalStorage {
-    type Device = MetalDevice;
-
-    fn try_clone(&self, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn dtype(&self) -> DType {
-        fail!()
-    }
-
-    fn device(&self) -> &Self::Device {
-        fail!()
-    }
-
-    fn to_cpu_storage(&self) -> Result<CpuStorage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn powf(&self, _: &Layout, _: f64) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn elu(&self, _: &Layout, _: f64) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn to_dtype(&self, _: &Layout, _: DType) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn where_cond(&self, _: &Layout, _: &Self, _: &Layout, _: &Self, _: &Layout) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn conv1d(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &crate::conv::ParamsConv1D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn conv_transpose1d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn conv2d(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &crate::conv::ParamsConv2D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn conv_transpose2d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &crate::conv::ParamsConvTranspose2D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-    fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn scatter_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn index_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn matmul(
-        &self,
-        _: &Self,
-        _: (usize, usize, usize, usize),
-        _: &Layout,
-        _: &Layout,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn copy2d(
-        &self,
-        _: &mut Self,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-    ) -> Result<()> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-}
-
-impl crate::backend::BackendDevice for MetalDevice {
-    type Storage = MetalStorage;
-    fn new(_: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn set_seed(&self, _: u64) -> Result<()> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn location(&self) -> crate::DeviceLocation {
-        fail!()
-    }
-
-    fn same_device(&self, _: &Self) -> bool {
-        fail!()
-    }
-
-    fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-}
--- a/candle-core/src/error.rs
+++ b/candle-core/src/error.rs
@ -1,4 +1,4 @@
-use crate::{DType, DeviceLocation, Layout, MetalError, Shape};
+use crate::{DType, DeviceLocation, Layout, Shape};

 #[derive(Debug, Clone)]
 pub struct MatMulUnexpectedStriding {
@ -142,9 +142,6 @@ pub enum Error {
    #[error("{op} expects at least one tensor")]
    OpRequiresAtLeastOneTensor { op: &'static str },

-    #[error("{op} expects at least two tensors")]
-    OpRequiresAtLeastTwoTensors { op: &'static str },
-
    #[error("backward is not supported for {op}")]
    BackwardNotSupported { op: &'static str },

@ -152,9 +149,6 @@ pub enum Error {
    #[error("the candle crate has not been built with cuda support")]
    NotCompiledWithCudaSupport,

-    #[error("the candle crate has not been built with metal support")]
-    NotCompiledWithMetalSupport,
-
    #[error("cannot find tensor {path}")]
    CannotFindTensor { path: String },

@ -162,9 +156,6 @@ pub enum Error {
    #[error(transparent)]
    Cuda(Box<dyn std::error::Error + Send + Sync>),

-    #[error("Metal error {0}")]
-    Metal(#[from] MetalError),
-
    #[error(transparent)]
    TryFromIntError(#[from] core::num::TryFromIntError),

--- a/candle-core/src/indexer.rs
+++ b/candle-core/src/indexer.rs
@ -46,31 +46,19 @@ impl Tensor {
                    current_dim += 1;
                    out
                }
-                TensorIndexer::IndexSelect(indexes) => {
-                    if indexes.rank() != 1 {
-                        crate::bail!("multi-dimensional tensor indexing is not supported")
-                    }
-                    let out = x.index_select(&indexes.to_device(x.device())?, current_dim)?;
-                    current_dim += 1;
-                    out
-                }
-                TensorIndexer::Err(e) => crate::bail!("indexing error {e:?}"),
            };
        }
        Ok(x)
    }
 }

-#[derive(Debug)]
+#[derive(Debug, Clone)]
 /// Generic structure used to index a slice of the tensor
 pub enum TensorIndexer {
-    /// This selects the elements for which an index has some specific value.
+    /// This selects the elements for which an index has some specific value.
    Select(usize),
    /// This is a regular slice, purely indexing a chunk of the tensor
    Narrow(Bound<usize>, Bound<usize>),
-    /// Indexing via a 1d tensor
-    IndexSelect(Tensor),
-    Err(Error),
 }

 impl From<usize> for TensorIndexer {
@ -79,55 +67,36 @@ impl From<usize> for TensorIndexer {
    }
 }

-impl From<&[u32]> for TensorIndexer {
-    fn from(index: &[u32]) -> Self {
-        match Tensor::new(index, &crate::Device::Cpu) {
-            Ok(tensor) => TensorIndexer::IndexSelect(tensor),
-            Err(e) => TensorIndexer::Err(e),
+macro_rules! impl_from_range {
+    ($range_type:ty) => {
+        impl From<$range_type> for TensorIndexer {
+            fn from(range: $range_type) -> Self {
+                use std::ops::Bound::*;
+
+                let start = match range.start_bound() {
+                    Included(idx) => Included(*idx),
+                    Excluded(idx) => Excluded(*idx),
+                    Unbounded => Unbounded,
+                };
+
+                let end = match range.end_bound() {
+                    Included(idx) => Included(*idx),
+                    Excluded(idx) => Excluded(*idx),
+                    Unbounded => Unbounded,
+                };
+
+                TensorIndexer::Narrow(start, end)
+            }
        }
-    }
+    };
 }

-impl From<Vec<u32>> for TensorIndexer {
-    fn from(index: Vec<u32>) -> Self {
-        let len = index.len();
-        match Tensor::from_vec(index, len, &crate::Device::Cpu) {
-            Ok(tensor) => TensorIndexer::IndexSelect(tensor),
-            Err(e) => TensorIndexer::Err(e),
-        }
-    }
-}
-
-impl From<&Tensor> for TensorIndexer {
-    fn from(tensor: &Tensor) -> Self {
-        TensorIndexer::IndexSelect(tensor.clone())
-    }
-}
-
-trait RB: RangeBounds<usize> {}
-impl RB for Range<usize> {}
-impl RB for RangeFrom<usize> {}
-impl RB for RangeFull {}
-impl RB for RangeInclusive<usize> {}
-impl RB for RangeTo<usize> {}
-impl RB for RangeToInclusive<usize> {}
-
-impl<T: RB> From<T> for TensorIndexer {
-    fn from(range: T) -> Self {
-        use std::ops::Bound::*;
-        let start = match range.start_bound() {
-            Included(idx) => Included(*idx),
-            Excluded(idx) => Excluded(*idx),
-            Unbounded => Unbounded,
-        };
-        let end = match range.end_bound() {
-            Included(idx) => Included(*idx),
-            Excluded(idx) => Excluded(*idx),
-            Unbounded => Unbounded,
-        };
-        TensorIndexer::Narrow(start, end)
-    }
-}
+impl_from_range!(Range<usize>);
+impl_from_range!(RangeFrom<usize>);
+impl_from_range!(RangeFull);
+impl_from_range!(RangeInclusive<usize>);
+impl_from_range!(RangeTo<usize>);
+impl_from_range!(RangeToInclusive<usize>);

 /// Trait used to implement multiple signatures for ease of use of the slicing
 /// of a tensor
--- a/candle-core/src/layout.rs
+++ b/candle-core/src/layout.rs
@ -70,7 +70,7 @@ impl Layout {
        self.shape.is_fortran_contiguous(&self.stride)
    }

-    pub fn narrow(&self, dim: usize, start: usize, len: usize) -> Result<Self> {
+    pub(crate) fn narrow(&self, dim: usize, start: usize, len: usize) -> Result<Self> {
        let dims = self.shape().dims();
        if dim >= dims.len() {
            Err(Error::DimOutOfRange {
@ -99,7 +99,7 @@ impl Layout {
        })
    }

-    pub fn transpose(&self, dim1: usize, dim2: usize) -> Result<Self> {
+    pub(crate) fn transpose(&self, dim1: usize, dim2: usize) -> Result<Self> {
        let rank = self.shape.rank();
        if rank <= dim1 || rank <= dim2 {
            Err(Error::UnexpectedNumberOfDims {
@ -120,7 +120,7 @@ impl Layout {
        })
    }

-    pub fn permute(&self, idxs: &[usize]) -> Result<Self> {
+    pub(crate) fn permute(&self, idxs: &[usize]) -> Result<Self> {
        let is_permutation =
            idxs.len() == self.shape.rank() && (0..idxs.len()).all(|i| idxs.contains(&i));
        if !is_permutation {
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@ -14,7 +14,7 @@
 //!
 //! ## Features
 //!
-//! - Simple syntax (looks and feels like PyTorch)
+//! - Simple syntax (looks and feels like PyTorch)
 //! - CPU and Cuda backends (and M1 support)
 //! - Enable serverless (CPU) small and fast deployments
 //! - Model training
@ -37,27 +37,25 @@
 mod accelerate;
 pub mod backend;
 pub mod backprop;
-pub mod conv;
+mod conv;
 mod convert;
 pub mod cpu;
 pub mod cpu_backend;
 #[cfg(feature = "cuda")]
 pub mod cuda_backend;
-mod custom_op;
+#[cfg(feature = "cudnn")]
+pub mod cudnn;
 mod device;
 pub mod display;
 mod dtype;
 mod dummy_cuda_backend;
-mod dummy_metal_backend;
 pub mod error;
 mod indexer;
 pub mod layout;
-#[cfg(feature = "metal")]
-pub mod metal_backend;
 #[cfg(feature = "mkl")]
 mod mkl;
 pub mod npy;
-pub mod op;
+mod op;
 pub mod pickle;
 pub mod quantized;
 pub mod safetensors;
@ -66,21 +64,17 @@ pub mod shape;
 mod storage;
 mod strided_index;
 mod tensor;
-mod tensor_cat;
 pub mod test_utils;
 pub mod utils;
 mod variable;

-#[cfg(feature = "cudnn")]
-pub use cuda_backend::cudnn;
-
 pub use cpu_backend::CpuStorage;
-pub use custom_op::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3};
-pub use device::{Device, DeviceLocation, NdArray};
-pub use dtype::{DType, DTypeParseError, FloatDType, IntDType, WithDType};
+pub use device::{Device, DeviceLocation};
+pub use dtype::{DType, FloatDType, IntDType, WithDType};
 pub use error::{Error, Result};
 pub use indexer::IndexOp;
 pub use layout::Layout;
+pub use op::{CustomOp1, CustomOp2, CustomOp3};
 pub use shape::{Shape, D};
 pub use storage::Storage;
 pub use strided_index::{StridedBlocks, StridedIndex};
@ -93,12 +87,6 @@ pub use cuda_backend::{CudaDevice, CudaStorage};
 #[cfg(not(feature = "cuda"))]
 pub use dummy_cuda_backend::{CudaDevice, CudaStorage};

-#[cfg(feature = "metal")]
-pub use metal_backend::{MetalDevice, MetalError, MetalStorage};
-
-#[cfg(not(feature = "metal"))]
-pub use dummy_metal_backend::{MetalDevice, MetalError, MetalStorage};
-
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;

@ -122,33 +110,18 @@ impl ToUsize2 for (usize, usize) {
 }

 // A simple trait defining a module with forward method using a single argument.
-pub trait Module {
+pub trait Module: std::fmt::Debug {
    fn forward(&self, xs: &Tensor) -> Result<Tensor>;
+
+    /// Change the module to use training mode vs eval mode.
+    ///
+    /// The default implementation does nothing as this is only used for a couple modules such as
+    /// dropout or batch-normalization.
+    fn set_training(&mut self, _training: bool) {}
 }

-impl<T: Fn(&Tensor) -> Result<Tensor>> Module for T {
+impl Module for quantized::QMatMul {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        self(xs)
-    }
-}
-
-impl<M: Module> Module for Option<&M> {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        match self {
-            None => Ok(xs.clone()),
-            Some(m) => m.forward(xs),
-        }
-    }
-}
-
-// A trait defining a module with forward method using a single tensor argument and a flag to
-// separate the training and evaluation behaviors.
-pub trait ModuleT {
-    fn forward_t(&self, xs: &Tensor, train: bool) -> Result<Tensor>;
-}
-
-impl<M: Module> ModuleT for M {
-    fn forward_t(&self, xs: &Tensor, _train: bool) -> Result<Tensor> {
        self.forward(xs)
    }
 }
--- a/candle-core/src/metal_backend/device.rs
+++ b/candle-core/src/metal_backend/device.rs
@ -1,287 +0,0 @@
-use crate::{DType, Result};
-use candle_metal_kernels::Kernels;
-use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger};
-use std::collections::HashMap;
-use std::ffi::c_void;
-use std::path::Path;
-use std::sync::{Arc, Mutex, RwLock, RwLockWriteGuard};
-
-use super::MetalError;
-
-/// Unique identifier for cuda devices.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
-pub struct DeviceId(usize);
-
-impl DeviceId {
-    pub(crate) fn new() -> Self {
-        // https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805
-        use std::sync::atomic;
-        static COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::new(1);
-        Self(COUNTER.fetch_add(1, atomic::Ordering::Relaxed))
-    }
-}
-
-type BufferMap = HashMap<(NSUInteger, MTLResourceOptions), Vec<Arc<Buffer>>>;
-type AllocatedBuffers = Arc<RwLock<BufferMap>>;
-
-#[derive(Clone)]
-pub struct MetalDevice {
-    /// Unique identifier, the registryID is not sufficient as it identifies the GPU rather than
-    /// the device itself.
-    pub(crate) id: DeviceId,
-
-    /// Raw metal device: <https://developer.apple.com/documentation/metal/mtldevice?language=objc>
-    pub(crate) device: metal::Device,
-
-    /// Single command queue for the entire device.
-    pub(crate) command_queue: CommandQueue,
-    /// One command buffer at a time.
-    /// The scheduler works by allowing multiple
-    /// [ComputeCommandEncoder](https://developer.apple.com/documentation/metal/mtlcomputecommandencoder?language=objc)
-    /// on a single command buffer. Using a single command buffer would be fastest on the GPU but
-    /// prevents overlapping of CPU and GPU commands (because command buffer needs to be committed
-    /// to start to work).
-    /// Despite what the documentation says, command buffers are NOT ordered. They are ordered
-    /// for their START time, but there's no guarantee that command buffer1 will finish before
-    /// command buffer2 starts (or there are metal bugs there)
-    pub(crate) command_buffer: Arc<RwLock<CommandBuffer>>,
-    /// Keeps track of the current amount of compute command encoders on the current
-    /// command buffer
-    /// Arc, RwLock because of the interior mutability.
-    pub(crate) command_buffer_index: Arc<RwLock<usize>>,
-    /// The maximum amount of [compute command encoder](https://developer.apple.com/documentation/metal/mtlcomputecommandencoder?language=objc) per [command buffer](https://developer.apple.com/documentation/metal/mtlcommandbuffer?language=objc)
-    pub(crate) compute_per_buffer: usize,
-    /// Simple keeper struct to keep track of the already compiled kernels so we can reuse them.
-    /// Heavily used by [`candle_metal_kernels`]
-    pub(crate) kernels: Arc<Kernels>,
-    /// Simple allocator struct.
-    /// The buffers are stored in size buckets since ML tends to use similar shapes over and over.
-    /// We store the buffers in [`Arc`] because it's much faster than Obj-c internal ref counting
-    /// (could be linked to FFI communication overhead).
-    ///
-    /// Whenever a buffer has a strong_count==1, we can reuse it, it means it was dropped in the
-    /// graph calculation, and only we the allocator kept a reference to it, therefore it's free
-    /// to be reused. However, in order for this to work, we need to guarantee the order of
-    /// operation, so that this buffer is not being used by another kernel at the same time.
-    /// Arc is the CPU reference count, it doesn't mean anything on the GPU side of things.
-    ///
-    /// Whenever we actually allocate a new buffer, we make a full sweep to clean up unused buffers
-    /// (strong_count = 1).
-    pub(crate) buffers: AllocatedBuffers,
-    /// Seed for random number generation.
-    pub(crate) seed: Arc<Mutex<Buffer>>,
-}
-
-impl std::fmt::Debug for MetalDevice {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "MetalDevice({:?})", self.id)
-    }
-}
-
-impl std::ops::Deref for MetalDevice {
-    type Target = metal::DeviceRef;
-
-    fn deref(&self) -> &Self::Target {
-        &self.device
-    }
-}
-
-impl MetalDevice {
-    pub fn id(&self) -> DeviceId {
-        self.id
-    }
-
-    pub fn metal_device(&self) -> &metal::Device {
-        &self.device
-    }
-
-    pub fn command_queue(&self) -> &CommandQueue {
-        &self.command_queue
-    }
-
-    pub fn command_buffer(&self) -> Result<CommandBuffer> {
-        let mut command_buffer_lock = self.command_buffer.try_write().map_err(MetalError::from)?;
-        let mut command_buffer = command_buffer_lock.to_owned();
-        let mut index = self
-            .command_buffer_index
-            .try_write()
-            .map_err(MetalError::from)?;
-        if *index > self.compute_per_buffer {
-            command_buffer.commit();
-            command_buffer = self.command_queue.new_command_buffer().to_owned();
-            *command_buffer_lock = command_buffer.clone();
-            *index = 0;
-
-            self.drop_unused_buffers()?;
-        }
-        *index += 1;
-        Ok(command_buffer)
-    }
-
-    pub fn wait_until_completed(&self) -> Result<()> {
-        let mut command_buffer = self.command_buffer.try_write().map_err(MetalError::from)?;
-        match command_buffer.status() {
-            metal::MTLCommandBufferStatus::Committed
-            | metal::MTLCommandBufferStatus::Scheduled
-            | metal::MTLCommandBufferStatus::Completed => {
-                panic!("Already committed");
-            }
-            _ => {}
-        }
-        command_buffer.commit();
-        command_buffer.wait_until_completed();
-        *command_buffer = self.command_queue.new_command_buffer().to_owned();
-
-        Ok(())
-    }
-
-    pub fn kernels(&self) -> &Kernels {
-        &self.kernels
-    }
-
-    pub fn device(&self) -> &metal::Device {
-        &self.device
-    }
-
-    /// Creates a new buffer (not necessarily zeroed).
-    /// The buffer is [MTLPrivate](https://developer.apple.com/documentation/metal/mtlstoragemode)
-    /// This means the buffer data cannot be read on the CPU directly.
-    ///
-    /// [`name`] is only used to keep track of the resource origin in case of bugs
-    pub fn new_buffer(
-        &self,
-        element_count: usize,
-        dtype: DType,
-        name: &str,
-    ) -> Result<Arc<Buffer>> {
-        let size = (element_count * dtype.size_in_bytes()) as NSUInteger;
-        self.allocate_buffer(size, MTLResourceOptions::StorageModePrivate, name)
-    }
-
-    /// Creates a new buffer (not necessarily zeroed).
-    /// The buffer is [MTLManaged](https://developer.apple.com/documentation/metal/mtlstoragemode)
-    /// This means the buffer can be read on the CPU but will require manual
-    /// synchronization when the CPU memory is modified
-    /// Used as a bridge to gather data back from the GPU
-    pub fn new_buffer_managed(&self, size: NSUInteger) -> Result<Arc<Buffer>> {
-        self.allocate_buffer(size, MTLResourceOptions::StorageModeManaged, "managed")
-    }
-
-    /// Creates a new buffer from data.
-    /// The buffer is [MTLManaged](https://developer.apple.com/documentation/metal/mtlstoragemode)
-    ///
-    /// Does not require synchronization, as [newBufferWithBytes](https://developer.apple.com/documentation/metal/mtldevice/1433429-newbufferwithbytes)
-    /// allocates the buffer and copies over the existing data before returning the MTLBuffer.
-    pub fn new_buffer_with_data<T>(&self, data: &[T]) -> Result<Arc<Buffer>> {
-        let size = core::mem::size_of_val(data) as NSUInteger;
-        let new_buffer = self.device.new_buffer_with_data(
-            data.as_ptr() as *const c_void,
-            size,
-            MTLResourceOptions::StorageModeManaged,
-        );
-        let mut buffers = self.buffers.try_write().map_err(MetalError::from)?;
-        let subbuffers = buffers
-            .entry((size, MTLResourceOptions::StorageModeManaged))
-            .or_insert(vec![]);
-
-        let new_buffer = Arc::new(new_buffer);
-        subbuffers.push(new_buffer.clone());
-        Ok(new_buffer)
-    }
-
-    pub fn allocate_zeros(&self, size_in_bytes: usize) -> Result<Arc<Buffer>> {
-        let buffer = self.allocate_buffer(
-            size_in_bytes as NSUInteger,
-            MTLResourceOptions::StorageModePrivate,
-            "allocate_zeros",
-        )?;
-        let command_buffer = self.command_buffer()?;
-        command_buffer.set_label("zeros");
-        let blit = command_buffer.new_blit_command_encoder();
-        blit.fill_buffer(
-            &buffer,
-            metal::NSRange {
-                location: 0,
-                length: buffer.length(),
-            },
-            0,
-        );
-        blit.end_encoding();
-        Ok(buffer)
-    }
-
-    fn find_available_buffer(
-        &self,
-        size: NSUInteger,
-        option: MTLResourceOptions,
-        buffers: &RwLockWriteGuard<BufferMap>,
-    ) -> Option<Arc<Buffer>> {
-        let mut best_buffer: Option<&Arc<Buffer>> = None;
-        let mut best_buffer_size: NSUInteger = NSUInteger::MAX;
-        for ((buffer_size, buffer_option), subbuffers) in buffers.iter() {
-            if buffer_size >= &size && buffer_size < &best_buffer_size && buffer_option == &option {
-                for sub in subbuffers {
-                    if Arc::strong_count(sub) == 1 {
-                        best_buffer = Some(sub);
-                        best_buffer_size = *buffer_size;
-                    }
-                }
-            }
-        }
-        best_buffer.cloned()
-    }
-
-    fn drop_unused_buffers(&self) -> Result<()> {
-        let mut buffers = self.buffers.try_write().map_err(MetalError::from)?;
-        for subbuffers in buffers.values_mut() {
-            let newbuffers = subbuffers
-                .iter()
-                .filter(|s| Arc::strong_count(*s) > 1)
-                .map(Arc::clone)
-                .collect();
-            *subbuffers = newbuffers;
-        }
-        Ok(())
-    }
-
-    /// The critical allocator algorithm
-    fn allocate_buffer(
-        &self,
-        size: NSUInteger,
-        option: MTLResourceOptions,
-        _name: &str,
-    ) -> Result<Arc<Buffer>> {
-        let mut buffers = self.buffers.try_write().map_err(MetalError::from)?;
-        if let Some(b) = self.find_available_buffer(size, option, &buffers) {
-            // Cloning also ensures we increment the strong count
-            return Ok(b.clone());
-        }
-
-        let size = buf_size(size);
-        let subbuffers = buffers.entry((size, option)).or_insert(vec![]);
-
-        let new_buffer = self.device.new_buffer(size as NSUInteger, option);
-        let new_buffer = Arc::new(new_buffer);
-        subbuffers.push(new_buffer.clone());
-
-        Ok(new_buffer)
-    }
-
-    /// Create a metal GPU capture trace on [`path`].
-    pub fn capture<P: AsRef<Path>>(&self, path: P) -> Result<()> {
-        let capture = metal::CaptureManager::shared();
-        let descriptor = metal::CaptureDescriptor::new();
-        descriptor.set_destination(metal::MTLCaptureDestination::GpuTraceDocument);
-        descriptor.set_capture_device(self);
-        descriptor.set_output_url(path);
-
-        capture
-            .start_capture(&descriptor)
-            .map_err(MetalError::from)?;
-        Ok(())
-    }
-}
-
-fn buf_size(size: NSUInteger) -> NSUInteger {
-    (size - 1).next_power_of_two() as NSUInteger
-}
--- a/candle-core/src/metal_backend/mod.rs
+++ b/candle-core/src/metal_backend/mod.rs
--- a/candle-core/src/mkl.rs
+++ b/candle-core/src/mkl.rs
@ -333,16 +333,6 @@ pub fn vd_tanh_inplace(y: &mut [f64]) {
    unsafe { ffi::vdTanh(y.len() as i32, y.as_ptr(), y.as_mut_ptr()) }
 }

-#[inline]
-pub fn vs_exp_inplace(y: &mut [f32]) {
-    unsafe { ffi::vsExp(y.len() as i32, y.as_ptr(), y.as_mut_ptr()) }
-}
-
-#[inline]
-pub fn vd_exp_inplace(y: &mut [f64]) {
-    unsafe { ffi::vdExp(y.len() as i32, y.as_ptr(), y.as_mut_ptr()) }
-}
-
 #[inline]
 pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
@ -365,28 +355,6 @@ pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
    }
 }

-#[inline]
-pub fn vs_silu(vs: &[f32], ys: &mut [f32]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vs_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
-#[inline]
-pub fn vd_silu(vs: &[f64], ys: &mut [f64]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vd_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
 macro_rules! binary_op {
    ($fn_name:ident, $ty:ty, $mkl_name:ident) => {
        #[inline]
--- a/candle-core/src/npy.rs
+++ b/candle-core/src/npy.rs
@ -250,6 +250,8 @@ impl Tensor {
        if header.fortran_order {
            return Err(Error::Npy("fortran order not supported".to_string()));
        }
+        let mut data: Vec<u8> = vec![];
+        reader.read_to_end(&mut data)?;
        Self::from_reader(header.shape(), header.descr, &mut reader)
    }

--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@ -1,5 +1,5 @@
 #![allow(clippy::redundant_closure_call)]
-use crate::Tensor;
+use crate::{CpuStorage, CudaStorage, Layout, Result, Shape, Tensor};
 use half::{bf16, f16};
 use num_traits::float::Float;

@ -58,15 +58,8 @@ pub enum UnaryOp {
    Sqr,
    Sqrt,
    Gelu,
-    GeluErf,
-    Erf,
    Relu,
-    Silu,
    Tanh,
-    Floor,
-    Ceil,
-    Round,
-    Sign,
 }

 #[derive(Clone)]
@ -92,16 +85,6 @@ pub enum Op {
        dilation: usize,
    },

-    #[allow(dead_code)]
-    ConvTranspose1D {
-        arg: Tensor,
-        kernel: Tensor,
-        padding: usize,
-        output_padding: usize,
-        stride: usize,
-        dilation: usize,
-    },
-
    #[allow(dead_code)]
    Conv2D {
        arg: Tensor,
@ -133,15 +116,7 @@ pub enum Op {
        stride: (usize, usize),
    },

-    UpsampleNearest1D {
-        arg: Tensor,
-        target_size: usize,
-    },
-    UpsampleNearest2D {
-        arg: Tensor,
-        target_h: usize,
-        target_w: usize,
-    },
+    UpsampleNearest2D(Tensor),

    Cat(Vec<Tensor>, usize),

@ -155,30 +130,132 @@ pub enum Op {
    Copy(Tensor),
    Broadcast(Tensor),
    Narrow(Tensor, usize, usize, usize),
-    SliceScatter0(Tensor, Tensor, usize),
    Reshape(Tensor),
    ToDevice(Tensor),
    Transpose(Tensor, usize, usize),
    Permute(Tensor, Vec<usize>),
    Elu(Tensor, f64),
    Powf(Tensor, f64),
-    CustomOp1(
-        Tensor,
-        std::sync::Arc<Box<dyn crate::CustomOp1 + Send + Sync>>,
-    ),
+    CustomOp1(Tensor, std::sync::Arc<Box<dyn CustomOp1 + Send + Sync>>),
    CustomOp2(
        Tensor,
        Tensor,
-        std::sync::Arc<Box<dyn crate::CustomOp2 + Send + Sync>>,
+        std::sync::Arc<Box<dyn CustomOp2 + Send + Sync>>,
    ),
    CustomOp3(
        Tensor,
        Tensor,
        Tensor,
-        std::sync::Arc<Box<dyn crate::CustomOp3 + Send + Sync>>,
+        std::sync::Arc<Box<dyn CustomOp3 + Send + Sync>>,
    ),
 }

+/// Unary ops that can be defined in user-land.
+pub trait CustomOp1 {
+    // Box<dyn> does not support const yet, so use a function to get the name.
+    fn name(&self) -> &'static str;
+
+    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)>;
+
+    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result<(CudaStorage, Shape)> {
+        Err(crate::Error::Cuda(
+            format!("no cuda implementation for {}", self.name()).into(),
+        ))
+    }
+
+    /// This function takes as argument the argument `arg` used in the forward pass, the result
+    /// produced by the forward operation `res` and the gradient of the result `grad_res`.
+    /// The function should return the gradient of the argument.
+    fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Result<Option<Tensor>> {
+        Err(crate::Error::BackwardNotSupported { op: self.name() })
+    }
+}
+
+pub trait CustomOp2 {
+    fn name(&self) -> &'static str;
+
+    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cpu_fwd(
+        &self,
+        s1: &CpuStorage,
+        l1: &Layout,
+        s2: &CpuStorage,
+        l2: &Layout,
+    ) -> Result<(CpuStorage, Shape)>;
+
+    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cuda_fwd(
+        &self,
+        _: &CudaStorage,
+        _: &Layout,
+        _: &CudaStorage,
+        _: &Layout,
+    ) -> Result<(CudaStorage, Shape)> {
+        Err(crate::Error::Cuda(
+            format!("no cuda implementation for {}", self.name()).into(),
+        ))
+    }
+
+    fn bwd(
+        &self,
+        _arg1: &Tensor,
+        _arg2: &Tensor,
+        _res: &Tensor,
+        _grad_res: &Tensor,
+    ) -> Result<(Option<Tensor>, Option<Tensor>)> {
+        Err(crate::Error::BackwardNotSupported { op: self.name() })
+    }
+}
+
+pub trait CustomOp3 {
+    fn name(&self) -> &'static str;
+
+    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cpu_fwd(
+        &self,
+        s1: &CpuStorage,
+        l1: &Layout,
+        s2: &CpuStorage,
+        l2: &Layout,
+        s3: &CpuStorage,
+        l3: &Layout,
+    ) -> Result<(CpuStorage, Shape)>;
+
+    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cuda_fwd(
+        &self,
+        _: &CudaStorage,
+        _: &Layout,
+        _: &CudaStorage,
+        _: &Layout,
+        _: &CudaStorage,
+        _: &Layout,
+    ) -> Result<(CudaStorage, Shape)> {
+        Err(crate::Error::Cuda(
+            format!("no cuda implementation for {}", self.name()).into(),
+        ))
+    }
+
+    fn bwd(
+        &self,
+        _arg1: &Tensor,
+        _arg2: &Tensor,
+        _arg3: &Tensor,
+        _res: &Tensor,
+        _grad_res: &Tensor,
+    ) -> Result<(Option<Tensor>, Option<Tensor>, Option<Tensor>)> {
+        Err(crate::Error::BackwardNotSupported { op: self.name() })
+    }
+}
+
 pub trait UnaryOpT {
    const NAME: &'static str;
    const KERNEL: &'static str;
@ -247,15 +324,8 @@ pub(crate) struct Recip;
 pub(crate) struct Sqr;
 pub(crate) struct Sqrt;
 pub(crate) struct Gelu;
-pub(crate) struct GeluErf;
-pub(crate) struct Erf;
 pub(crate) struct Relu;
-pub(crate) struct Silu;
 pub(crate) struct Tanh;
-pub(crate) struct Floor;
-pub(crate) struct Ceil;
-pub(crate) struct Round;
-pub(crate) struct Sign;

 macro_rules! bin_op {
    ($op:ident, $name: literal, $e: expr, $f32_vec: ident, $f64_vec: ident) => {
@ -454,20 +524,13 @@ unary_op!(Log, "log", v, v.ln(), vs_ln, vd_ln);
 unary_op!(Sin, "sin", v, v.sin(), vs_sin, vd_sin);
 unary_op!(Cos, "cos", v, v.cos(), vs_cos, vd_cos);
 unary_op!(Tanh, "tanh", v, v.tanh(), vs_tanh, vd_tanh);
+unary_op!(Abs, "abs", v, v.abs());
 unary_op!(Neg, "neg", v, -v);
 unary_op!(Recip, "recip", v, v.recip());
 unary_op!(Sqr, "sqr", v, v * v, vs_sqr, vd_sqr);
 unary_op!(Sqrt, "sqrt", v, v.sqrt(), vs_sqrt, vd_sqrt);

-// Hardcode the value for sqrt(2/pi)
-// https://github.com/huggingface/candle/issues/1982
-#[allow(clippy::excessive_precision)]
-const SQRT_TWO_OVER_PI_F32: f32 = 0.79788456080286535587989211986876373;
-#[allow(clippy::excessive_precision)]
-const SQRT_TWO_OVER_PI_F64: f64 = 0.79788456080286535587989211986876373;
-
-/// Tanh based approximation of the `gelu` operation
-/// GeluErf is the more precise one.
+/// `gelu` operation
 /// <https://en.wikipedia.org/wiki/Activation_function#Comparison_of_activation_functions>
 impl UnaryOpT for Gelu {
    const NAME: &'static str = "gelu";
@ -478,7 +541,7 @@ impl UnaryOpT for Gelu {
            * v
            * (bf16::ONE
                + bf16::tanh(
-                    bf16::from_f32_const(SQRT_TWO_OVER_PI_F32)
+                    (bf16::from_f32_const(2.0) / bf16::PI).sqrt()
                        * v
                        * (bf16::ONE + bf16::from_f32_const(0.044715) * v * v),
                ))
@ -489,18 +552,22 @@ impl UnaryOpT for Gelu {
            * v
            * (f16::ONE
                + f16::tanh(
-                    f16::from_f32_const(SQRT_TWO_OVER_PI_F32)
+                    (f16::from_f32_const(2.0) / f16::PI).sqrt()
                        * v
                        * (f16::ONE + f16::from_f32_const(0.044715) * v * v),
                ))
    }
    #[inline(always)]
    fn f32(v: f32) -> f32 {
-        0.5 * v * (1.0 + f32::tanh(SQRT_TWO_OVER_PI_F32 * v * (1.0 + 0.044715 * v * v)))
+        0.5 * v
+            * (1.0
+                + f32::tanh((2.0f32 / std::f32::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)))
    }
    #[inline(always)]
    fn f64(v: f64) -> f64 {
-        0.5 * v * (1.0 + f64::tanh(SQRT_TWO_OVER_PI_F64 * v * (1.0 + 0.044715 * v * v)))
+        0.5 * v
+            * (1.0
+                + f64::tanh((2.0f64 / std::f64::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)))
    }
    #[inline(always)]
    fn u8(_: u8) -> u8 {
@ -533,301 +600,6 @@ impl UnaryOpT for Gelu {
    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
        crate::mkl::vd_gelu(xs, ys)
    }
-
-    #[cfg(feature = "accelerate")]
-    const F32_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
-        crate::accelerate::vs_gelu(xs, ys)
-    }
-
-    #[cfg(feature = "accelerate")]
-    const F64_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
-        crate::accelerate::vd_gelu(xs, ys)
-    }
-}
-
-/// `erf` operation
-/// <https://en.wikipedia.org/wiki/Error_function>
-impl UnaryOpT for Erf {
-    const NAME: &'static str = "erf";
-    const KERNEL: &'static str = "uerf";
-    const V: Self = Erf;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        bf16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        f16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        Self::f64(v as f64) as f32
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        crate::cpu::erf::erf(v)
-    }
-    #[inline(always)]
-    fn u8(_: u8) -> u8 {
-        0
-    }
-    #[inline(always)]
-    fn u32(_: u32) -> u32 {
-        0
-    }
-    #[inline(always)]
-    fn i64(_: i64) -> i64 {
-        0
-    }
-}
-
-/// Silu operation
-impl UnaryOpT for Silu {
-    const NAME: &'static str = "silu";
-    const V: Self = Silu;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v / (bf16::ONE + (-v).exp())
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v / (f16::ONE + (-v).exp())
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v / (1.0 + (-v).exp())
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v / (1.0 + (-v).exp())
-    }
-    #[inline(always)]
-    fn u8(_: u8) -> u8 {
-        0
-    }
-    #[inline(always)]
-    fn u32(_: u32) -> u32 {
-        0
-    }
-    #[inline(always)]
-    fn i64(_: i64) -> i64 {
-        0
-    }
-    const KERNEL: &'static str = "usilu";
-
-    #[cfg(feature = "mkl")]
-    const F32_VEC: bool = true;
-
-    #[cfg(feature = "mkl")]
-    #[inline(always)]
-    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
-        crate::mkl::vs_silu(xs, ys)
-    }
-
-    #[cfg(feature = "mkl")]
-    const F64_VEC: bool = true;
-
-    #[cfg(feature = "mkl")]
-    #[inline(always)]
-    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
-        crate::mkl::vd_silu(xs, ys)
-    }
-
-    #[cfg(feature = "accelerate")]
-    const F32_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
-        crate::accelerate::vs_silu(xs, ys)
-    }
-
-    #[cfg(feature = "accelerate")]
-    const F64_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
-        crate::accelerate::vd_silu(xs, ys)
-    }
-}
-
-impl UnaryOpT for Abs {
-    const NAME: &'static str = "abs";
-    const KERNEL: &'static str = "uabs";
-    const V: Self = Abs;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v.abs()
-    }
-}
-
-impl UnaryOpT for Ceil {
-    const NAME: &'static str = "ceil";
-    const KERNEL: &'static str = "uceil";
-    const V: Self = Ceil;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v
-    }
-}
-
-impl UnaryOpT for Floor {
-    const NAME: &'static str = "floor";
-    const KERNEL: &'static str = "ufloor";
-    const V: Self = Floor;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v
-    }
-}
-
-impl UnaryOpT for Round {
-    const NAME: &'static str = "round";
-    const KERNEL: &'static str = "uround";
-    const V: Self = Round;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.round()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.round()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.round()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.round()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v
-    }
-}
-
-impl UnaryOpT for GeluErf {
-    const NAME: &'static str = "gelu_erf";
-    const KERNEL: &'static str = "ugelu_erf";
-    const V: Self = GeluErf;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        bf16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        f16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        Self::f64(v as f64) as f32
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        (crate::cpu::erf::erf(v / 2f64.sqrt()) + 1.) * 0.5 * v
-    }
-    #[inline(always)]
-    fn u8(_: u8) -> u8 {
-        0
-    }
-    #[inline(always)]
-    fn u32(_: u32) -> u32 {
-        0
-    }
-    #[inline(always)]
-    fn i64(_: i64) -> i64 {
-        0
-    }
 }

 impl UnaryOpT for Relu {
@ -915,10 +687,6 @@ impl BackpropOp {
        };
        Self(op)
    }
-
-    pub(crate) fn is_none(&self) -> bool {
-        self.0.is_none()
-    }
 }

 impl std::ops::Deref for BackpropOp {
@ -927,37 +695,3 @@ impl std::ops::Deref for BackpropOp {
        &self.0
    }
 }
-
-impl UnaryOpT for Sign {
-    const NAME: &'static str = "sign";
-    const KERNEL: &'static str = "usign";
-    const V: Self = Sign;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        bf16::from((v > bf16::ZERO) as i8) - bf16::from((v < bf16::ZERO) as i8)
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        f16::from((v > f16::ZERO) as i8) - f16::from((v < f16::ZERO) as i8)
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        f32::from(v > 0.) - f32::from(v < 0.)
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        f64::from(v > 0.) - f64::from(v < 0.)
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        u8::min(1, v)
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        u32::min(1, v)
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        (v > 0) as i64 - (v < 0) as i64
-    }
-}
--- a/candle-core/src/pickle.rs
+++ b/candle-core/src/pickle.rs
@ -42,7 +42,7 @@ pub enum OpCode {
    Stop = b'.',
    NewObj = 0x81,
    EmptyList = b']',
-    BinFloat = b'G',
+    BinFloat = b'g',
    Append = b'a',
    Appends = b'e',
 }
@ -193,55 +193,6 @@ impl Object {
            _ => Err(self),
        }
    }
-
-    pub fn into_tensor_info(
-        self,
-        name: Self,
-        dir_name: &std::path::Path,
-    ) -> Result<Option<TensorInfo>> {
-        let name = match name.unicode() {
-            Ok(name) => name,
-            Err(_) => return Ok(None),
-        };
-        let (callable, args) = match self.reduce() {
-            Ok(callable_args) => callable_args,
-            _ => return Ok(None),
-        };
-        let (callable, args) = match callable {
-            Object::Class {
-                module_name,
-                class_name,
-            } if module_name == "torch._tensor" && class_name == "_rebuild_from_type_v2" => {
-                let mut args = args.tuple()?;
-                let callable = args.remove(0);
-                let args = args.remove(1);
-                (callable, args)
-            }
-            Object::Class {
-                module_name,
-                class_name,
-            } if module_name == "torch._utils" && class_name == "_rebuild_parameter" => {
-                let mut args = args.tuple()?;
-                args.remove(0).reduce()?
-            }
-            _ => (callable, args),
-        };
-        match callable {
-            Object::Class {
-                module_name,
-                class_name,
-            } if module_name == "torch._utils" && class_name == "_rebuild_tensor_v2" => {}
-            _ => return Ok(None),
-        };
-        let (layout, dtype, file_path, storage_size) = rebuild_args(args)?;
-        Ok(Some(TensorInfo {
-            name,
-            dtype,
-            layout,
-            path: format!("{}/{}", dir_name.to_string_lossy(), file_path),
-            storage_size,
-        }))
-    }
 }

 impl TryFrom<Object> for String {
@ -350,10 +301,8 @@ impl Stack {
                module_name,
                class_name,
            } => {
-                if module_name == "collections"
-                    && (class_name == "OrderedDict" || class_name == "defaultdict")
-                {
-                    // TODO: have a separate ordered dict and a separate default dict.
+                if module_name == "collections" && class_name == "OrderedDict" {
+                    // TODO: have a separate ordered dict.
                    Some(Object::Dict(vec![]))
                } else {
                    None
@ -462,10 +411,7 @@ impl Stack {
                self.push(Object::Int(arg))
            }
            OpCode::BinFloat => {
-                // Somehow floats are encoded using BigEndian whereas int types use LittleEndian.
-                // https://github.com/python/cpython/blob/0c80da4c14d904a367968955544dd6ae58c8101c/Lib/pickletools.py#L855
-                // https://github.com/pytorch/pytorch/blob/372d078f361e726bb4ac0884ac334b04c58179ef/torch/_weights_only_unpickler.py#L243
-                let arg = r.read_f64::<byteorder::BigEndian>()?;
+                let arg = r.read_f64::<LittleEndian>()?;
                self.push(Object::Float(arg))
            }
            OpCode::BinUnicode => {
@ -619,7 +565,6 @@ fn rebuild_args(args: Object) -> Result<(Layout, DType, String, usize)> {
        "HalfStorage" => DType::F16,
        "BFloat16Storage" => DType::BF16,
        "ByteStorage" => DType::U8,
-        "LongStorage" => DType::I64,
        other => {
            crate::bail!("unsupported storage type {other}")
        }
@ -637,16 +582,9 @@ pub struct TensorInfo {
    pub storage_size: usize,
 }

-/// Read the tensor info from a .pth file.
-///
-/// # Arguments
-/// * `file` - The path to the .pth file.
-/// * `verbose` - Whether to print debug information.
-/// * `key` - Optional key to retrieve `state_dict` from the pth file.
 pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
    file: P,
    verbose: bool,
-    key: Option<&str>,
 ) -> Result<Vec<TensorInfo>> {
    let file = std::fs::File::open(file)?;
    let zip_reader = std::io::BufReader::new(file);
@ -668,9 +606,8 @@ pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
        stack.read_loop(&mut reader)?;
        let obj = stack.finalize()?;
        if VERBOSE || verbose {
-            println!("{obj:#?}");
+            println!("{obj:?}");
        }
-
        let obj = match obj {
            Object::Build { callable, args } => match *callable {
                Object::Reduce { callable, args: _ } => match *callable {
@ -684,30 +621,52 @@ pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
            },
            obj => obj,
        };
-
-        // If key is provided, then we need to extract the state_dict from the object.
-        let obj = if let Some(key) = key {
-            if let Object::Dict(key_values) = obj {
-                key_values
-                    .into_iter()
-                    .find(|(k, _)| *k == Object::Unicode(key.to_owned()))
-                    .map(|(_, v)| v)
-                    .ok_or_else(|| E::Msg(format!("key {key} not found")))?
-            } else {
-                obj
-            }
-        } else {
-            obj
-        };
-
-        // If the object is a dict, then we can extract the tensor info from it.
-        // NOTE: We are assuming that the `obj` is state_dict by this stage.
        if let Object::Dict(key_values) = obj {
            for (name, value) in key_values.into_iter() {
-                match value.into_tensor_info(name, &dir_name) {
-                    Ok(Some(tensor_info)) => tensor_infos.push(tensor_info),
-                    Ok(None) => {}
-                    Err(err) => eprintln!("skipping: {err:?}"),
+                let name = match name.unicode() {
+                    Ok(name) => name,
+                    Err(_) => continue,
+                };
+                let (callable, args) = match value.reduce() {
+                    Ok(callable_args) => callable_args,
+                    _ => continue,
+                };
+                let (callable, args) = match callable {
+                    Object::Class {
+                        module_name,
+                        class_name,
+                    } if module_name == "torch._tensor"
+                        && class_name == "_rebuild_from_type_v2" =>
+                    {
+                        let mut args = args.tuple()?;
+                        let callable = args.remove(0);
+                        let args = args.remove(1);
+                        (callable, args)
+                    }
+                    _ => (callable, args),
+                };
+                match callable {
+                    Object::Class {
+                        module_name,
+                        class_name,
+                    } if module_name == "torch._utils" && class_name == "_rebuild_tensor_v2" => {}
+                    _ => continue,
+                };
+                match rebuild_args(args) {
+                    Ok((layout, dtype, file_path, storage_size)) => {
+                        let mut path = dir_name.clone();
+                        path.push(file_path);
+                        tensor_infos.push(TensorInfo {
+                            name,
+                            dtype,
+                            layout,
+                            path: path.to_string_lossy().into_owned(),
+                            storage_size,
+                        })
+                    }
+                    Err(err) => {
+                        eprintln!("skipping {name}: {err:?}")
+                    }
                }
            }
        }
@ -724,8 +683,8 @@ pub struct PthTensors {
 }

 impl PthTensors {
-    pub fn new<P: AsRef<std::path::Path>>(path: P, key: Option<&str>) -> Result<Self> {
-        let tensor_infos = read_pth_tensor_info(path.as_ref(), false, key)?;
+    pub fn new<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
+        let tensor_infos = read_pth_tensor_info(path.as_ref(), false)?;
        let tensor_infos = tensor_infos
            .into_iter()
            .map(|ti| (ti.name.to_string(), ti))
@ -739,7 +698,6 @@ impl PthTensors {
    }

    pub fn get(&self, name: &str) -> Result<Option<Tensor>> {
-        use std::io::Read;
        let tensor_info = match self.tensor_infos.get(name) {
            None => return Ok(None),
            Some(tensor_info) => tensor_info,
@ -748,70 +706,20 @@ impl PthTensors {
        let zip_reader = std::io::BufReader::new(std::fs::File::open(&self.path)?);
        let mut zip = zip::ZipArchive::new(zip_reader)?;
        let mut reader = zip.by_name(&tensor_info.path)?;
-        let is_fortran_contiguous = tensor_info.layout.is_fortran_contiguous();
-        let rank = tensor_info.layout.shape().rank();

-        // Reading the data is a bit tricky as it can be strided, for now only support the basic
-        // case and when the tensor is fortran contiguous.
-        if !tensor_info.layout.is_contiguous() && !is_fortran_contiguous {
+        // Reading the data is a bit tricky as it can be strided, use an offset, etc.
+        // For now only support the basic case.
+        if tensor_info.layout.start_offset() != 0 || !tensor_info.layout.is_contiguous() {
            crate::bail!(
                "cannot retrieve non-contiguous tensors {:?}",
                tensor_info.layout
            )
        }
-        let start_offset = tensor_info.layout.start_offset();
-        if start_offset > 0 {
-            std::io::copy(
-                &mut reader.by_ref().take(start_offset as u64),
-                &mut std::io::sink(),
-            )?;
-        }
        let tensor = Tensor::from_reader(
            tensor_info.layout.shape().clone(),
            tensor_info.dtype,
            &mut reader,
        )?;
-
-        if rank > 1 && is_fortran_contiguous {
-            // Reverse the shape, e.g. Shape(2, 3, 4) -> Shape(4, 3, 2)
-            let shape_reversed: Vec<_> = tensor_info.layout.dims().iter().rev().cloned().collect();
-            let tensor = tensor.reshape(shape_reversed)?;
-
-            // Permute (transpose) the dimensions, e.g. Shape(4, 3, 2) -> Shape(2, 3, 4)
-            let dim_indeces_reversed: Vec<_> = (0..rank).rev().collect();
-            let tensor = tensor.permute(dim_indeces_reversed)?;
-            Ok(Some(tensor))
-        } else {
-            Ok(Some(tensor))
-        }
+        Ok(Some(tensor))
    }
 }
-
-/// Read all the tensors from a PyTorch pth file with a given key.
-///
-/// # Arguments
-/// * `path` - Path to the pth file.
-/// * `key` - Optional key to retrieve `state_dict` from the pth file. Sometimes the pth file
-///           contains multiple objects and the state_dict is the one we are interested in.
-pub fn read_all_with_key<P: AsRef<std::path::Path>>(
-    path: P,
-    key: Option<&str>,
-) -> Result<Vec<(String, Tensor)>> {
-    let pth = PthTensors::new(path, key)?;
-    let tensor_names = pth.tensor_infos.keys();
-    let mut tensors = Vec::with_capacity(tensor_names.len());
-    for name in tensor_names {
-        if let Some(tensor) = pth.get(name)? {
-            tensors.push((name.to_string(), tensor))
-        }
-    }
-    Ok(tensors)
-}
-
-/// Read all the tensors from a PyTorch pth file.
-///
-/// # Arguments
-/// * `path` - Path to the pth file.
-pub fn read_all<P: AsRef<std::path::Path>>(path: P) -> Result<Vec<(String, Tensor)>> {
-    read_all_with_key(path, None)
-}
--- a/candle-core/src/quantized/avx.rs
+++ b/candle-core/src/quantized/avx.rs
@ -50,9 +50,14 @@ pub(crate) unsafe fn mul_sum_i8_pairs_float(x: __m256i, y: __m256i) -> __m256 {
 #[inline(always)]
 pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
    let qk = QK8_0;
+    let nb = n / qk;
    if n % QK8_0 != 0 {
        crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
    }
+    if nb % 2 != 0 {
+        crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
+    }
+
    unsafe {
        let mut acc = _mm256_setzero_ps();
        for (x, y) in xs.iter().zip(ys.iter()) {
@ -353,7 +358,7 @@ pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Res
                q3 = q3.add(32);

                // Prepare low and high bits
-                // We hardcode the shifts here to avoid loading them into a separate register
+                // We hardcode the shifts here to avoid loading them into a seperate register
                let q3l_0 = _mm256_and_si256(q3bits, m3);
                let q3h_0 = if j == 0 {
                    _mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, 0)), 0)
@ -586,7 +591,7 @@ pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Res
                let q5bits = _mm256_loadu_si256(q5 as *const __m256i);
                q5 = q5.add(32);

-                //Similar to q3k we hardcode the shifts here to avoid loading them into a separate register
+                //Similar to q3k we hardcode the shifts here to avoid loading them into a seperate register
                let q5l_0 = _mm256_and_si256(q5bits, m4);
                let q5l_0_shift_input = _mm256_and_si256(hbits, hmask);
                let q5l_0_right_shift = match j {
@ -633,35 +638,3 @@ pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Res
        Ok(hsum_float_8(acc) + summs)
    }
 }
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Result<f32> {
-    let qk = QK_K;
-    if n % qk != 0 {
-        crate::bail!("vec_dot_q8k_8k: {n} is not divisible by {qk}")
-    }
-
-    unsafe {
-        let mut acc = _mm256_setzero_ps();
-        for (xs, ys) in xs.iter().zip(ys.iter()) {
-            let mut sumi = _mm256_setzero_si256();
-            let x_qs = xs.qs.as_ptr();
-            let y_qs = ys.qs.as_ptr();
-            for j in (0..QK_K).step_by(32) {
-                let xs = _mm256_loadu_si256(x_qs.add(j) as *const __m256i);
-                let ys = _mm256_loadu_si256(y_qs.add(j) as *const __m256i);
-
-                let xs0 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(xs, 0));
-                let ys0 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(ys, 0));
-                sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(xs0, ys0));
-
-                let xs1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(xs, 1));
-                let ys1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(ys, 1));
-                sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(xs1, ys1));
-            }
-            let d = _mm256_set1_ps(xs.d * ys.d);
-            acc = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi), acc);
-        }
-        Ok(hsum_float_8(acc))
-    }
-}
--- a/candle-core/src/quantized/cuda.rs
+++ b/candle-core/src/quantized/cuda.rs
@ -1,456 +0,0 @@
-use super::{GgmlDType, QStorage};
-use crate::quantized::k_quants::GgmlType;
-use crate::{backend::BackendDevice, cuda_backend::WrapErr};
-use crate::{CudaDevice, CudaStorage, Result};
-
-use cudarc::driver::{CudaSlice, CudaView, DeviceSlice};
-
-#[derive(Clone, Debug)]
-pub struct QCudaStorage {
-    data: CudaSlice<u8>,
-    dtype: GgmlDType,
-    device: CudaDevice,
-}
-
-static FORCE_DMMV: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
-
-pub fn set_force_dmmv(f: bool) {
-    FORCE_DMMV.store(f, std::sync::atomic::Ordering::Relaxed)
-}
-
-pub const WARP_SIZE: usize = 32;
-pub const MMQ_X_Q4_0_AMPERE: usize = 4;
-pub const MMQ_Y_Q4_0_AMPERE: usize = 32;
-pub const NWARPS_Q4_0_AMPERE: usize = 4;
-pub const GGML_CUDA_MMV_X: usize = 32;
-pub const GGML_CUDA_MMV_Y: usize = 1;
-pub const CUDA_QUANTIZE_BLOCK_SIZE: usize = 256;
-pub const CUDA_DEQUANTIZE_BLOCK_SIZE: usize = 256;
-pub const MATRIX_ROW_PADDING: usize = 512;
-
-fn ceil_div(p: usize, q: usize) -> usize {
-    (p + q - 1) / q
-}
-
-fn pad(p: usize, q: usize) -> usize {
-    ceil_div(p, q) * q
-}
-
-fn quantize_q8_1(
-    src: &CudaView<f32>,
-    dst: &mut CudaSlice<u8>,
-    elem_count: usize,
-    dev: &CudaDevice,
-) -> Result<()> {
-    use cudarc::driver::LaunchAsync;
-
-    let kx = elem_count;
-    let kx_padded = pad(kx, MATRIX_ROW_PADDING);
-    let num_blocks = ceil_div(kx_padded, CUDA_QUANTIZE_BLOCK_SIZE);
-    let func = dev.get_or_load_func("quantize_q8_1", candle_kernels::QUANTIZED)?;
-    let cfg = cudarc::driver::LaunchConfig {
-        grid_dim: (num_blocks as u32, 1, 1),
-        block_dim: (CUDA_QUANTIZE_BLOCK_SIZE as u32, 1, 1),
-        shared_mem_bytes: 0,
-    };
-    let params = (src, dst, kx as i32, kx_padded as i32);
-    unsafe { func.launch(cfg, params) }.w()?;
-    Ok(())
-}
-
-fn dequantize(
-    data: &CudaSlice<u8>,
-    dtype: GgmlDType,
-    elem_count: usize,
-    dev: &CudaDevice,
-) -> Result<CudaStorage> {
-    use cudarc::driver::LaunchAsync;
-
-    let nb = (elem_count + 255) / 256;
-    let (kernel_name, is_k, block_dim, num_blocks) = match dtype {
-        GgmlDType::Q4_0 => ("dequantize_block_q4_0", false, 32, nb),
-        GgmlDType::Q4_1 => ("dequantize_block_q4_1", false, 32, nb),
-        GgmlDType::Q5_0 => (
-            "dequantize_block_q5_0",
-            false,
-            CUDA_DEQUANTIZE_BLOCK_SIZE,
-            ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
-        ),
-        GgmlDType::Q5_1 => (
-            "dequantize_block_q5_1",
-            false,
-            CUDA_DEQUANTIZE_BLOCK_SIZE,
-            ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
-        ),
-        GgmlDType::Q8_0 => ("dequantize_block_q8_0", false, 32, nb),
-        GgmlDType::Q2K => ("dequantize_block_q2_K", true, 64, nb),
-        GgmlDType::Q3K => ("dequantize_block_q3_K", true, 64, nb),
-        GgmlDType::Q4K => ("dequantize_block_q4_K", true, 32, nb),
-        GgmlDType::Q5K => ("dequantize_block_q5_K", true, 64, nb),
-        GgmlDType::Q6K => ("dequantize_block_q6_K", true, 64, nb),
-        GgmlDType::Q8K => ("dequantize_block_q8_K", true, 32, nb),
-        _ => crate::bail!("unsupported dtype for dequantize {dtype:?}"),
-    };
-    let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?;
-    let dst = unsafe { dev.alloc::<f32>(elem_count).w()? };
-    // See e.g.
-    // https://github.com/ggerganov/llama.cpp/blob/cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b/ggml-cuda.cu#L7270
-    let cfg = cudarc::driver::LaunchConfig {
-        grid_dim: (num_blocks as u32, 1, 1),
-        block_dim: (block_dim as u32, 1, 1),
-        shared_mem_bytes: 0,
-    };
-
-    if is_k {
-        let params = (data, &dst);
-        unsafe { func.launch(cfg, params) }.w()?;
-    } else {
-        let nb32 = match dtype {
-            GgmlDType::Q5_0 | GgmlDType::Q5_1 => elem_count,
-            _ => elem_count / 32,
-        };
-        let params = (data, &dst, nb32 as i32);
-        unsafe { func.launch(cfg, params) }.w()?;
-    }
-    Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
-}
-
-fn dequantize_mul_mat_vec(
-    data: &CudaSlice<u8>,
-    y: &CudaView<f32>,
-    dtype: GgmlDType,
-    ncols: usize,
-    nrows: usize,
-    dev: &CudaDevice,
-) -> Result<CudaStorage> {
-    use cudarc::driver::LaunchAsync;
-
-    let data_elems = data.len() / dtype.type_size() * dtype.block_size();
-    if data_elems < ncols * nrows {
-        crate::bail!("unexpected data size {}, ncols {ncols} {nrows}", data_elems)
-    }
-    if y.len() != ncols {
-        crate::bail!("unexpected y size {}, ncols {ncols} {nrows}", y.len())
-    }
-    let kernel_name = match dtype {
-        GgmlDType::Q4_0 => "dequantize_mul_mat_vec_q4_0_cuda",
-        GgmlDType::Q4_1 => "dequantize_mul_mat_vec_q4_1_cuda",
-        GgmlDType::Q5_0 => "dequantize_mul_mat_vec_q5_0_cuda",
-        GgmlDType::Q5_1 => "dequantize_mul_mat_vec_q5_1_cuda",
-        GgmlDType::Q8_0 => "dequantize_mul_mat_vec_q8_0_cuda",
-        GgmlDType::Q2K => "dequantize_mul_mat_vec_q2_k",
-        GgmlDType::Q3K => "dequantize_mul_mat_vec_q3_k",
-        GgmlDType::Q4K => "dequantize_mul_mat_vec_q4_k",
-        GgmlDType::Q5K => "dequantize_mul_mat_vec_q5_k",
-        GgmlDType::Q6K => "dequantize_mul_mat_vec_q6_k",
-        _ => crate::bail!("unsupported dtype for quantized matmul {dtype:?}"),
-    };
-    let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?;
-    let dst = unsafe { dev.alloc::<f32>(nrows).w()? };
-    let block_num_y = ceil_div(nrows, GGML_CUDA_MMV_Y);
-    let cfg = cudarc::driver::LaunchConfig {
-        grid_dim: (block_num_y as u32, 1, 1),
-        block_dim: (WARP_SIZE as u32, GGML_CUDA_MMV_Y as u32, 1),
-        shared_mem_bytes: 0,
-    };
-
-    let params = (data, y, &dst, ncols as i32, nrows as i32);
-    unsafe { func.launch(cfg, params) }.w()?;
-    Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
-}
-
-fn mul_mat_vec_via_q8_1(
-    data: &CudaSlice<u8>,
-    y: &CudaView<f32>,
-    dtype: GgmlDType,
-    ncols: usize,
-    nrows: usize,
-    dev: &CudaDevice,
-) -> Result<CudaStorage> {
-    use cudarc::driver::LaunchAsync;
-
-    let data_elems = data.len() / dtype.type_size() * dtype.block_size();
-    if data_elems < ncols * nrows {
-        crate::bail!("unexpected data size {}, ncols {ncols} {nrows}", data_elems)
-    }
-    if y.len() != ncols {
-        crate::bail!("unexpected y size {}, ncols {ncols} {nrows}", y.len())
-    }
-    // Start by quantizing y
-    let ncols_padded = pad(ncols, MATRIX_ROW_PADDING);
-    let y_size_in_bytes = ncols_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size();
-    let mut y_q8_1 = unsafe { dev.alloc::<u8>(y_size_in_bytes).w()? };
-    quantize_q8_1(y, &mut y_q8_1, ncols, dev)?;
-
-    let kernel_name = match dtype {
-        GgmlDType::Q4_0 => "mul_mat_vec_q4_0_q8_1_cuda",
-        GgmlDType::Q4_1 => "mul_mat_vec_q4_1_q8_1_cuda",
-        GgmlDType::Q5_0 => "mul_mat_vec_q5_0_q8_1_cuda",
-        GgmlDType::Q5_1 => "mul_mat_vec_q5_1_q8_1_cuda",
-        GgmlDType::Q8_0 => "mul_mat_vec_q8_0_q8_1_cuda",
-        GgmlDType::Q2K => "mul_mat_vec_q2_K_q8_1_cuda",
-        GgmlDType::Q3K => "mul_mat_vec_q3_K_q8_1_cuda",
-        GgmlDType::Q4K => "mul_mat_vec_q4_K_q8_1_cuda",
-        GgmlDType::Q5K => "mul_mat_vec_q5_K_q8_1_cuda",
-        GgmlDType::Q6K => "mul_mat_vec_q6_K_q8_1_cuda",
-        _ => crate::bail!("unsupported dtype for quantized matmul {dtype:?}"),
-    };
-    let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?;
-    let dst = unsafe { dev.alloc::<f32>(nrows).w()? };
-    let cfg = cudarc::driver::LaunchConfig {
-        grid_dim: (nrows as u32, 1, 1),
-        block_dim: (WARP_SIZE as u32, 4, 1),
-        shared_mem_bytes: 0,
-    };
-
-    let params = (
-        data,
-        &y_q8_1,
-        &dst,
-        /* ncols_x */ ncols as i32,
-        /* nrows_x */ nrows as i32,
-        /* nrows_y */ ncols as i32,
-        /* nrows_dst */ nrows as i32,
-    );
-    unsafe { func.launch(cfg, params) }.w()?;
-    Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
-}
-
-impl QCudaStorage {
-    pub fn zeros(device: &CudaDevice, el_count: usize, dtype: GgmlDType) -> Result<Self> {
-        let size_in_bytes = ceil_div(el_count, dtype.block_size()) * dtype.type_size();
-        let data = device.alloc_zeros::<u8>(size_in_bytes).w()?;
-        Ok(QCudaStorage {
-            data,
-            device: device.clone(),
-            dtype,
-        })
-    }
-
-    pub fn dtype(&self) -> GgmlDType {
-        self.dtype
-    }
-
-    pub fn device(&self) -> &CudaDevice {
-        &self.device
-    }
-
-    pub fn dequantize(&self, elem_count: usize) -> Result<CudaStorage> {
-        fn deq<T: GgmlType>(buffer: &[u8], n: usize, dst: &mut [f32]) -> Result<()> {
-            let slice = unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const T, n) };
-            let vec = slice.to_vec();
-            T::to_float(&vec, dst)
-        }
-
-        let fast_kernel = matches!(
-            self.dtype,
-            GgmlDType::Q4_0
-                | GgmlDType::Q4_1
-                | GgmlDType::Q5_0
-                | GgmlDType::Q5_1
-                | GgmlDType::Q8_0
-                | GgmlDType::Q2K
-                | GgmlDType::Q3K
-                | GgmlDType::Q4K
-                | GgmlDType::Q5K
-                | GgmlDType::Q6K
-                | GgmlDType::Q8K
-        );
-        if fast_kernel {
-            return dequantize(&self.data, self.dtype, elem_count, self.device());
-        }
-        // Run the dequantization on cpu.
-
-        let buffer = self.device.dtoh_sync_copy(&self.data).w()?;
-        let mut out = vec![0.0; elem_count];
-        let block_len = elem_count / self.dtype.block_size();
-        match self.dtype {
-            GgmlDType::F32 => deq::<f32>(&buffer, block_len, &mut out)?,
-            GgmlDType::F16 => deq::<half::f16>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q4_0 => deq::<crate::quantized::BlockQ4_0>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q4_1 => deq::<crate::quantized::BlockQ4_1>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q5_0 => deq::<crate::quantized::BlockQ5_0>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q5_1 => deq::<crate::quantized::BlockQ5_1>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q8_0 => deq::<crate::quantized::BlockQ8_0>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q8_1 => deq::<crate::quantized::BlockQ8_1>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q2K => deq::<crate::quantized::BlockQ2K>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q3K => deq::<crate::quantized::BlockQ3K>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q4K => deq::<crate::quantized::BlockQ4K>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q5K => deq::<crate::quantized::BlockQ5K>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q6K => deq::<crate::quantized::BlockQ6K>(&buffer, block_len, &mut out)?,
-            GgmlDType::Q8K => deq::<crate::quantized::BlockQ8K>(&buffer, block_len, &mut out)?,
-        }
-
-        self.device
-            .storage_from_cpu_storage(&crate::CpuStorage::F32(out))
-    }
-
-    pub fn quantize(&mut self, src: &CudaStorage) -> Result<()> {
-        // Run the quantization on cpu.
-        let src = match &src.slice {
-            crate::cuda_backend::CudaStorageSlice::F32(data) => {
-                self.device.dtoh_sync_copy(data).w()?
-            }
-            _ => crate::bail!("only f32 can be quantized"),
-        };
-        let src_len = src.len();
-        let src = crate::Storage::Cpu(crate::CpuStorage::F32(src));
-        let mut qcpu_storage = crate::Device::Cpu.qzeros(src_len, self.dtype)?;
-        qcpu_storage.quantize(&src)?;
-        let data = qcpu_storage.data()?;
-        let data = self.device.htod_sync_copy(data.as_ref()).w()?;
-        self.data = data;
-        Ok(())
-    }
-
-    pub fn storage_size_in_bytes(&self) -> usize {
-        self.data.len()
-    }
-
-    pub fn fwd(
-        &self,
-        self_shape: &crate::Shape,
-        storage: &CudaStorage,
-        layout: &crate::Layout,
-    ) -> Result<(CudaStorage, crate::Shape)> {
-        if matches!(layout.shape().dims(), [1, 1, _] | [1, _]) {
-            self.dequantize_matmul_vec(self_shape, storage, layout)
-        } else {
-            self.dequantize_matmul(self_shape, storage, layout)
-        }
-    }
-}
-
-impl QCudaStorage {
-    fn dequantize_matmul_vec(
-        &self,
-        self_shape: &crate::Shape,
-        rhs: &CudaStorage,
-        rhs_l: &crate::Layout,
-    ) -> Result<(CudaStorage, crate::Shape)> {
-        let (nrows, ncols) = self_shape.dims2()?;
-        let rhs = rhs.as_cuda_slice::<f32>()?;
-        let rhs = match rhs_l.contiguous_offsets() {
-            Some((o1, o2)) => rhs.slice(o1..o2),
-            None => Err(crate::Error::RequiresContiguous { op: "dmmv" }.bt())?,
-        };
-        let (with_batch, k) = match rhs_l.shape().dims() {
-            [1, 1, k] => (true, k),
-            [1, k] => (false, k),
-            _ => crate::bail!("unexpected rhs shape in dmmv {:?}", rhs_l.shape()),
-        };
-        if ncols != *k {
-            crate::bail!("mismatch on matmul dim {self_shape:?} {:?}", rhs_l.shape())
-        }
-
-        let out = if FORCE_DMMV.load(std::sync::atomic::Ordering::Relaxed) {
-            dequantize_mul_mat_vec(&self.data, &rhs, self.dtype, ncols, nrows, self.device())?
-        } else {
-            mul_mat_vec_via_q8_1(&self.data, &rhs, self.dtype, ncols, nrows, self.device())?
-        };
-        let out_shape = if with_batch {
-            vec![1, 1, nrows]
-        } else {
-            vec![1, nrows]
-        };
-        Ok((out, out_shape.into()))
-    }
-
-    fn dequantize_matmul(
-        &self,
-        self_shape: &crate::Shape,
-        storage: &CudaStorage,
-        layout: &crate::Layout,
-    ) -> Result<(CudaStorage, crate::Shape)> {
-        use crate::backend::BackendStorage;
-        let (n, k) = self_shape.dims2()?;
-        let (b, m, k2) = match layout.shape().dims() {
-            &[b, m, k2] => (b, m, k2),
-            &[m, k2] => (1, m, k2),
-            s => crate::bail!("unexpected shape for input {s:?}"),
-        };
-        if k2 != k {
-            crate::bail!("mismatch on matmul dim {self_shape:?} {:?}", layout.shape())
-        }
-
-        let data_f32 = self.dequantize(n * k)?;
-        let rhs_l = crate::Layout::new((k, n).into(), vec![1, k], 0).broadcast_as((b, k, n))?;
-        let out = storage.matmul(&data_f32, (b, m, n, k), layout, &rhs_l)?;
-        let mut out_shape = layout.shape().dims().to_vec();
-        out_shape.pop();
-        out_shape.push(n);
-        Ok((out, out_shape.into()))
-    }
-}
-
-pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
-    device: &CudaDevice,
-    data: &[T],
-) -> Result<super::QStorage> {
-    let data = unsafe {
-        std::slice::from_raw_parts(data.as_ptr() as *const u8, core::mem::size_of_val(data))
-    };
-    let data = device.htod_sync_copy(data).w()?;
-    Ok(QStorage::Cuda(QCudaStorage {
-        data,
-        device: device.clone(),
-        dtype: T::DTYPE,
-    }))
-}
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn cuda_quantize_q8_1() -> Result<()> {
-        let dev = CudaDevice::new(0)?;
-        let el = 256;
-        let el_padded = pad(el, MATRIX_ROW_PADDING);
-        let y_size_in_bytes =
-            el_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size();
-        let mut y_q8_1 = unsafe { dev.alloc::<u8>(y_size_in_bytes).w()? };
-        let vs: Vec<f32> = (0..el).map(|v| v as f32).collect();
-        let y = dev.htod_sync_copy(&vs).w()?;
-        quantize_q8_1(&y.slice(..), &mut y_q8_1, el, &dev)?;
-        Ok(())
-    }
-
-    #[test]
-    fn cuda_mmv_q8_1() -> Result<()> {
-        let dev = CudaDevice::new(0)?;
-        let ncols = 256;
-        let vs: Vec<f32> = (0..ncols).map(|v| v as f32).collect();
-        let y = dev.htod_sync_copy(&vs).w()?;
-        let mut xs = QCudaStorage::zeros(&dev, ncols, GgmlDType::Q4_0)?;
-        xs.quantize(&CudaStorage::wrap_cuda_slice(y.clone(), dev.clone()))?;
-        let cuda_storage = mul_mat_vec_via_q8_1(
-            &xs.data,
-            &y.slice(..),
-            /* dtype */ GgmlDType::Q4_0,
-            /* ncols */ ncols,
-            /* nrows */ 1,
-            &dev,
-        )?;
-        let vs = cuda_storage.as_cuda_slice::<f32>()?;
-        let vs = dev.dtoh_sync_copy(&vs.slice(..)).unwrap();
-        assert_eq!(vs.len(), 1);
-        // for n = 255, n.(n+1).(2n+1) / 6 = 5559680
-        // Q8 means 1/256 precision.
-        assert_eq!(vs[0], 5561664.5);
-
-        let cuda_storage = dequantize_mul_mat_vec(
-            &xs.data,
-            &y.slice(..),
-            /* dtype */ GgmlDType::Q4_0,
-            /* ncols */ ncols,
-            /* nrows */ 1,
-            &dev,
-        )?;
-        let vs = cuda_storage.as_cuda_slice::<f32>()?;
-        let vs = dev.dtoh_sync_copy(&vs.slice(..)).unwrap();
-        assert_eq!(vs.len(), 1);
-        assert_eq!(vs[0], 5561851.0);
-        Ok(())
-    }
-}
--- a/candle-core/src/quantized/dummy_cuda.rs
+++ b/candle-core/src/quantized/dummy_cuda.rs
@ -1,50 +0,0 @@
-#![allow(unused)]
-use super::GgmlDType;
-use crate::{CudaDevice, CudaStorage, Error, Result};
-
-pub struct QCudaStorage {
-    dtype: GgmlDType,
-    device: CudaDevice,
-}
-
-impl QCudaStorage {
-    pub fn zeros(_: &CudaDevice, _: usize, _: GgmlDType) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    pub fn dtype(&self) -> GgmlDType {
-        self.dtype
-    }
-
-    pub fn device(&self) -> &CudaDevice {
-        &self.device
-    }
-
-    pub fn dequantize(&self, _elem_count: usize) -> Result<CudaStorage> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    pub fn quantize(&mut self, _src: &CudaStorage) -> Result<()> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    pub fn storage_size_in_bytes(&self) -> usize {
-        0
-    }
-
-    pub fn fwd(
-        &self,
-        _self_shape: &crate::Shape,
-        _storage: &CudaStorage,
-        _layout: &crate::Layout,
-    ) -> Result<(CudaStorage, crate::Shape)> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-}
-
-pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
-    _device: &CudaDevice,
-    _data: &[T],
-) -> Result<super::QStorage> {
-    Err(Error::NotCompiledWithCudaSupport)
-}
--- a/candle-core/src/quantized/dummy_metal.rs
+++ b/candle-core/src/quantized/dummy_metal.rs
@ -1,50 +0,0 @@
-#![allow(unused)]
-use super::GgmlDType;
-use crate::{Error, MetalDevice, MetalStorage, Result};
-
-pub struct QMetalStorage {
-    dtype: GgmlDType,
-    device: MetalDevice,
-}
-
-impl QMetalStorage {
-    pub fn zeros(_: &MetalDevice, _: usize, _: GgmlDType) -> Result<Self> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    pub fn dtype(&self) -> GgmlDType {
-        self.dtype
-    }
-
-    pub fn device(&self) -> &MetalDevice {
-        &self.device
-    }
-
-    pub fn dequantize(&self, _elem_count: usize) -> Result<MetalStorage> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    pub fn quantize(&mut self, _src: &MetalStorage) -> Result<()> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-
-    pub fn storage_size_in_bytes(&self) -> usize {
-        0
-    }
-
-    pub fn fwd(
-        &self,
-        _self_shape: &crate::Shape,
-        _storage: &MetalStorage,
-        _layout: &crate::Layout,
-    ) -> Result<(MetalStorage, crate::Shape)> {
-        Err(Error::NotCompiledWithMetalSupport)
-    }
-}
-
-pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
-    _device: &MetalDevice,
-    _data: &[T],
-) -> Result<super::QStorage> {
-    Err(Error::NotCompiledWithMetalSupport)
-}
--- a/candle-core/src/quantized/ggml_file.rs
+++ b/candle-core/src/quantized/ggml_file.rs
@ -1,7 +1,7 @@
 //! Support for the GGML file format.

-use super::{k_quants, GgmlDType, QStorage};
-use crate::{Device, Result};
+use super::{k_quants, GgmlDType};
+use crate::Result;
 use byteorder::{LittleEndian, ReadBytesExt};
 use std::collections::HashMap;

@ -121,17 +121,11 @@ fn from_raw_data<T: super::GgmlType + Send + Sync + 'static>(
    raw_data: &[u8],
    size_in_bytes: usize,
    dims: Vec<usize>,
-    device: &Device,
 ) -> Result<super::QTensor> {
    let raw_data_ptr = raw_data.as_ptr();
    let n_blocks = size_in_bytes / std::mem::size_of::<T>();
    let data = unsafe { std::slice::from_raw_parts(raw_data_ptr as *const T, n_blocks) };
-    let data: QStorage = match device {
-        Device::Cpu => QStorage::Cpu(Box::new(data.to_vec())),
-        Device::Metal(metal) => super::metal::load_quantized(metal, data)?,
-        Device::Cuda(cuda) => super::cuda::load_quantized(cuda, data)?,
-    };
-    super::QTensor::new(data, dims)
+    super::QTensor::new(data.to_vec(), dims)
 }

 /// Creates a [Tensor] from a raw GGML tensor.
@ -139,50 +133,23 @@ pub fn qtensor_from_ggml(
    ggml_dtype: GgmlDType,
    raw_data: &[u8],
    dims: Vec<usize>,
-    device: &Device,
 ) -> Result<super::QTensor> {
    let tensor_elems = dims.iter().product::<usize>();
-    let block_size = ggml_dtype.block_size();
-    if tensor_elems % block_size != 0 {
-        crate::bail!(
-            "the number of elements {tensor_elems} is not divisible by the block size {block_size}"
-        )
-    }
-    let size_in_bytes = tensor_elems / block_size * ggml_dtype.type_size();
+    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();

    match ggml_dtype {
-        GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims, device),
-        GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims, device),
-        GgmlDType::Q4_0 => {
-            from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q4_1 => {
-            from_raw_data::<k_quants::BlockQ4_1>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5_0 => {
-            from_raw_data::<k_quants::BlockQ5_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5_1 => {
-            from_raw_data::<k_quants::BlockQ5_1>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q8_0 => {
-            from_raw_data::<k_quants::BlockQ8_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q2K => {
-            from_raw_data::<k_quants::BlockQ2K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q3K => {
-            from_raw_data::<k_quants::BlockQ3K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q4K => {
-            from_raw_data::<k_quants::BlockQ4K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5K => {
-            from_raw_data::<k_quants::BlockQ5K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q6K => {
-            from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims, device)
-        }
+        GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims),
+        GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4_0 => from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4_1 => from_raw_data::<k_quants::BlockQ4_1>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5_0 => from_raw_data::<k_quants::BlockQ5_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5_1 => from_raw_data::<k_quants::BlockQ5_1>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q8_0 => from_raw_data::<k_quants::BlockQ8_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q2K => from_raw_data::<k_quants::BlockQ2K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q3K => from_raw_data::<k_quants::BlockQ3K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4K => from_raw_data::<k_quants::BlockQ4K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5K => from_raw_data::<k_quants::BlockQ5K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q6K => from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims),
        _ => crate::bail!("quantized type {ggml_dtype:?} is not supported yet"),
    }
 }
@ -190,7 +157,6 @@ pub fn qtensor_from_ggml(
 fn read_one_tensor<R: std::io::Seek + std::io::Read>(
    reader: &mut R,
    magic: VersionedMagic,
-    device: &Device,
 ) -> Result<(String, super::QTensor)> {
    let n_dims = reader.read_u32::<LittleEndian>()?;
    let name_len = reader.read_u32::<LittleEndian>()?;
@ -211,11 +177,11 @@ fn read_one_tensor<R: std::io::Seek + std::io::Read>(
    }
    let dims = dims.iter().map(|&u| u as usize).collect::<Vec<_>>();
    let tensor_elems = dims.iter().product::<usize>();
-    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.block_size();
+    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();
    // TODO: Mmap version to avoid copying the data around?
    let mut raw_data = vec![0u8; size_in_bytes];
    reader.read_exact(&mut raw_data)?;
-    match qtensor_from_ggml(ggml_dtype, &raw_data, dims, device) {
+    match qtensor_from_ggml(ggml_dtype, &raw_data, dims) {
        Ok(tensor) => Ok((name, tensor)),
        Err(e) => crate::bail!("Error creating tensor {name}: {e}"),
    }
@ -226,14 +192,10 @@ pub struct Content {
    pub hparams: HParams,
    pub vocab: Vocab,
    pub tensors: HashMap<String, super::QTensor>,
-    pub device: Device,
 }

 impl Content {
-    pub fn read<R: std::io::Seek + std::io::Read>(
-        reader: &mut R,
-        device: &Device,
-    ) -> Result<Content> {
+    pub fn read<R: std::io::Seek + std::io::Read>(reader: &mut R) -> Result<Content> {
        // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.cpp#L505
        let last_position = reader.seek(std::io::SeekFrom::End(0))?;
        reader.seek(std::io::SeekFrom::Start(0))?;
@ -243,16 +205,14 @@ impl Content {
        let mut tensors = HashMap::new();

        while reader.stream_position()? != last_position {
-            let (name, tensor) = read_one_tensor(reader, magic, device)?;
+            let (name, tensor) = read_one_tensor(reader, magic)?;
            tensors.insert(name, tensor);
        }
-        let device = device.clone();
        Ok(Self {
            magic,
            hparams,
            vocab,
            tensors,
-            device,
        })
    }

--- a/candle-core/src/quantized/gguf_file.rs
+++ b/candle-core/src/quantized/gguf_file.rs
@ -3,7 +3,7 @@
 //! Spec: https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md

 use super::{GgmlDType, QTensor};
-use crate::{Device, Result};
+use crate::Result;
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use std::collections::HashMap;

@ -29,7 +29,6 @@ impl TryFrom<u32> for Magic {
 pub enum VersionedMagic {
    GgufV1,
    GgufV2,
-    GgufV3,
 }

 impl VersionedMagic {
@ -40,8 +39,7 @@ impl VersionedMagic {
        let versioned_magic = match (magic, version) {
            (Magic::Gguf, 1) => Self::GgufV1,
            (Magic::Gguf, 2) => Self::GgufV2,
-            (Magic::Gguf, 3) => Self::GgufV3,
-            _ => crate::bail!("gguf: unsupported magic/version {magic:?}/{version}"),
+            _ => crate::bail!("ggml: unsupported magic/version {magic:?}/{version}"),
        };
        Ok(versioned_magic)
    }
@ -59,25 +57,14 @@ impl TensorInfo {
        &self,
        reader: &mut R,
        tensor_data_offset: u64,
-        device: &Device,
    ) -> Result<QTensor> {
        let tensor_elems = self.shape.elem_count();
-        let block_size = self.ggml_dtype.block_size();
-        if tensor_elems % block_size != 0 {
-            crate::bail!(
-            "the number of elements {tensor_elems} is not divisible by the block size {block_size}"
-        )
-        }
-        let size_in_bytes = tensor_elems / block_size * self.ggml_dtype.type_size();
+        let size_in_bytes =
+            tensor_elems * self.ggml_dtype.type_size() / self.ggml_dtype.blck_size();
        let mut raw_data = vec![0u8; size_in_bytes];
        reader.seek(std::io::SeekFrom::Start(tensor_data_offset + self.offset))?;
        reader.read_exact(&mut raw_data)?;
-        super::ggml_file::qtensor_from_ggml(
-            self.ggml_dtype,
-            &raw_data,
-            self.shape.dims().to_vec(),
-            device,
-        )
+        super::ggml_file::qtensor_from_ggml(self.ggml_dtype, &raw_data, self.shape.dims().to_vec())
    }
 }

@ -92,9 +79,7 @@ pub struct Content {
 fn read_string<R: std::io::Read>(reader: &mut R, magic: &VersionedMagic) -> Result<String> {
    let len = match magic {
        VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-        VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-            reader.read_u64::<LittleEndian>()? as usize
-        }
+        VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
    };
    let mut v = vec![0u8; len];
    reader.read_exact(&mut v)?;
@ -294,9 +279,7 @@ impl Value {
                let value_type = ValueType::from_u32(value_type)?;
                let len = match magic {
                    VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-                    VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-                        reader.read_u64::<LittleEndian>()? as usize
-                    }
+                    VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
                };
                let mut vs = Vec::with_capacity(len);
                for _ in 0..len {
@ -393,15 +376,11 @@ impl Content {

        let tensor_count = match magic {
            VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-            VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-                reader.read_u64::<LittleEndian>()? as usize
-            }
+            VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
        };
        let metadata_kv_count = match magic {
            VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-            VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-                reader.read_u64::<LittleEndian>()? as usize
-            }
+            VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
        };

        let mut metadata = HashMap::new();
@ -423,7 +402,7 @@ impl Content {
                    reader.read_u32_into::<LittleEndian>(&mut dimensions)?;
                    dimensions.into_iter().map(|c| c as usize).collect()
                }
-                VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
+                VersionedMagic::GgufV2 => {
                    let mut dimensions = vec![0; n_dimensions as usize];
                    reader.read_u64_into::<LittleEndian>(&mut dimensions)?;
                    dimensions.into_iter().map(|c| c as usize).collect()
@ -466,13 +445,12 @@ impl Content {
        &self,
        reader: &mut R,
        name: &str,
-        device: &Device,
    ) -> Result<QTensor> {
        let tensor_info = match self.tensor_infos.get(name) {
            Some(tensor_info) => tensor_info,
-            None => crate::bail!("cannot find tensor info for {name}"),
+            None => crate::bail!("cannot find tensor-infor for {name}"),
        };
-        tensor_info.read(reader, self.tensor_data_offset, device)
+        tensor_info.read(reader, self.tensor_data_offset)
    }
 }

@ -524,9 +502,10 @@ pub fn write<W: std::io::Seek + std::io::Write>(
                "internal error, unexpected current position {tensor_start_pos} {offset} {pos}"
            )
        }
-        let data = tensor.data()?;
-        let size_in_bytes = data.len();
-        w.write_all(&data)?;
+        let data_ptr = tensor.as_ptr();
+        let size_in_bytes = tensor.storage_size_in_bytes();
+        let data = unsafe { std::slice::from_raw_parts(data_ptr, size_in_bytes) };
+        w.write_all(data)?;
        let padding = 31 - (31 + size_in_bytes) % 32;
        w.write_all(&vec![0u8; padding])?;
    }
--- a/candle-core/src/quantized/k_quants.rs
+++ b/candle-core/src/quantized/k_quants.rs
@ -34,9 +34,6 @@ pub trait GgmlType: Sized + Clone + Send + Sync {
    /// Dot product used as a building block for quantized mat-mul.
    /// n is the number of elements to be considered.
    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32>;
-
-    /// Generic implementation of the dot product without simd optimizations.
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32>;
 }

 #[derive(Debug, Clone, PartialEq)]
@ -228,17 +225,15 @@ impl GgmlType for BlockQ4_0 {
        #[cfg(target_feature = "neon")]
        return super::neon::vec_dot_q4_0_q8_0(n, xs, ys);

-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q4_0_q8_0(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        let qk = QK8_0;
+        let nb = n / qk;
        if n % QK8_0 != 0 {
            crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
        }
+        if nb % 2 != 0 {
+            crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
+        }
+
        // Generic implementation.
        let mut sumf = 0f32;
        for (xs, ys) in xs.iter().zip(ys.iter()) {
@ -260,10 +255,6 @@ impl GgmlType for BlockQ4_1 {
    type VecDotType = BlockQ8_1;

    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        // ggml_vec_dot_q4_1_q8_1
        let qk = QK8_1;
        if n % qk != 0 {
@ -363,10 +354,7 @@ impl GgmlType for BlockQ5_0 {
        if nb % 2 != 0 {
            crate::bail!("vec_dot_q5_0_q8_0: {n}, nb is not divisible by 2")
        }
-        Self::vec_dot_unopt(n, xs, ys)
-    }

-    fn vec_dot_unopt(_n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        // Generic implementation.
        let mut sumf = 0f32;

@ -457,10 +445,6 @@ impl GgmlType for BlockQ5_1 {
    type VecDotType = BlockQ8_1;

    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        let qk = Self::BLCK_SIZE;
        if n % Self::BLCK_SIZE != 0 {
            crate::bail!("vec_dot_q5_1_q8_1: {n} is not divisible by {qk}")
@ -622,13 +606,6 @@ impl GgmlType for BlockQ8_0 {
        #[cfg(target_feature = "neon")]
        return super::neon::vec_dot_q8_0_q8_0(n, xs, ys);

-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q8_0_q8_0(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        let qk = QK8_0;
        if n % QK8_0 != 0 {
            crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
@ -654,11 +631,7 @@ impl GgmlType for BlockQ8_1 {
    const BLCK_SIZE: usize = QK8_1;
    type VecDotType = BlockQ8_1;

-    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(_n: usize, _xs: &[Self], _ys: &[Self::VecDotType]) -> Result<f32> {
+    fn vec_dot(_n: usize, _xs: &[Self], _ys: &[Self::VecDotType]) -> Result<f32> {
        unimplemented!("no support for vec-dot on Q8_1")
    }

@ -708,13 +681,6 @@ impl GgmlType for BlockQ2K {
        #[cfg(target_feature = "neon")]
        return super::neon::vec_dot_q2k_q8k(n, xs, ys);

-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q2k_q8k(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        if n % QK_K != 0 {
            crate::bail!("vec_dot_q2k_q8k: {n} is not divisible by {QK_K}")
        }
@ -735,17 +701,18 @@ impl GgmlType for BlockQ2K {

            let mut isum = 0;
            let mut is = 0;
+            let mut d;
            for _ in 0..(QK_K / 128) {
                let mut shift = 0;
                for _ in 0..4 {
-                    let d = (sc[is] & 0xF) as i32;
+                    d = (sc[is] & 0xF) as i32;
                    is += 1;
                    let mut isuml = 0;
                    for l in 0..16 {
                        isuml += q8[l] as i32 * (((q2[l] >> shift) & 3) as i32);
                    }
                    isum += d * isuml;
-                    let d = (sc[is] & 0xF) as i32;
+                    d = (sc[is] & 0xF) as i32;
                    is += 1;
                    isuml = 0;
                    for l in 16..32 {
@ -884,10 +851,6 @@ impl GgmlType for BlockQ3K {
        #[cfg(target_feature = "neon")]
        return super::neon::vec_dot_q3k_q8k(n, xs, ys);

-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        if n % QK_K != 0 {
            crate::bail!("vec_dot_q3k_q8k: {n} is not divisible by {QK_K}")
        }
@ -1114,6 +1077,7 @@ impl GgmlType for BlockQ3K {
            let d_all = block.d.to_f32();
            let mut m = 1;
            let mut is = 0;
+            let mut dl;

            // Dequantize both 128 long blocks
            // 32 qs values per 128 long block
@ -1124,7 +1088,7 @@ impl GgmlType for BlockQ3K {
                    for (scale_index, scale_scoped_y) in
                        shift_scoped_y.chunks_exact_mut(16).enumerate()
                    {
-                        let dl = d_all * (scales[is] as f32 - 32.0);
+                        dl = d_all * (scales[is] as f32 - 32.0);
                        for (i, inner_y) in scale_scoped_y.iter_mut().enumerate() {
                            let new_y = dl
                                * (((qs[i + 16 * scale_index] >> shift) & 3) as i8
@ -1162,13 +1126,6 @@ impl GgmlType for BlockQ4K {
        #[cfg(target_feature = "neon")]
        return super::neon::vec_dot_q4k_q8k(n, xs, ys);

-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q4k_q8k(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        if n % QK_K != 0 {
            crate::bail!("vec_dot_q4k_q8k: {n} is not divisible by {QK_K}")
        }
@ -1355,10 +1312,6 @@ impl GgmlType for BlockQ5K {
        #[cfg(target_feature = "neon")]
        return super::neon::vec_dot_q5k_q8k(n, xs, ys);

-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        if n % QK_K != 0 {
            crate::bail!("vec_dot_q5k_q8k: {n} is not divisible by {QK_K}")
        }
@ -1545,13 +1498,13 @@ impl GgmlType for BlockQ5K {
                let d2 = d * sc as f32;
                let m2 = min * m as f32;
                for (ql, qh) in ql.iter().zip(qh) {
-                    let to_add = if qh & u1 != 0 { 16f32 } else { 0f32 };
-                    y[ys_index] = d1 * ((ql & 0xF) as f32 + to_add) - m1;
+                    let to_add = if qh & u1 != 0 { 16 } else { 1 };
+                    y[ys_index] = d1 * ((ql & 0xF) + to_add) as f32 - m1;
                    ys_index += 1;
                }
                for (ql, qh) in ql.iter().zip(qh) {
-                    let to_add = if qh & u2 != 0 { 16f32 } else { 0f32 };
-                    y[ys_index] = d2 * ((ql >> 4) as f32 + to_add) - m2;
+                    let to_add = if qh & u2 != 0 { 16 } else { 1 };
+                    y[ys_index] = d2 * ((ql >> 4) + to_add) as f32 - m2;
                    ys_index += 1;
                }
                is += 2;
@ -1576,13 +1529,6 @@ impl GgmlType for BlockQ6K {
        #[cfg(target_feature = "neon")]
        return super::neon::vec_dot_q6k_q8k(n, xs, ys);

-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q6k_q8k(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        if n % QK_K != 0 {
            crate::bail!("vec_dot_q6k_q8k: {n} is not divisible by {QK_K}")
        }
@ -1751,38 +1697,8 @@ impl GgmlType for BlockQ8K {
    const BLCK_SIZE: usize = QK_K;
    type VecDotType = BlockQ8K;

-    #[allow(unreachable_code)]
-    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        #[cfg(target_feature = "avx")]
-        return super::avx::vec_dot_q8k_q8k(n, xs, ys);
-
-        #[cfg(target_feature = "neon")]
-        return super::neon::vec_dot_q8k_q8k(n, xs, ys);
-
-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q8k_q8k(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        let qk = QK_K;
-        if n % QK_K != 0 {
-            crate::bail!("vec_dot_q8k_q8k: {n} is not divisible by {qk}")
-        }
-
-        // Generic implementation.
-        let mut sumf = 0f32;
-        for (xs, ys) in xs.iter().zip(ys.iter()) {
-            let sum_i = xs
-                .qs
-                .iter()
-                .zip(ys.qs.iter())
-                .map(|(&x, &y)| x as i32 * y as i32)
-                .sum::<i32>();
-            sumf += sum_i as f32 * xs.d * ys.d
-        }
-        Ok(sumf)
+    fn vec_dot(_n: usize, _xs: &[Self], _ys: &[Self::VecDotType]) -> Result<f32> {
+        unreachable!()
    }

    fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
@ -1888,10 +1804,6 @@ impl GgmlType for f32 {
    type VecDotType = f32;

    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        if xs.len() < n {
            crate::bail!("size mismatch {} < {n}", xs.len())
        }
@ -1926,10 +1838,6 @@ impl GgmlType for f16 {
    type VecDotType = f16;

    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
        if xs.len() < n {
            crate::bail!("size mismatch {} < {n}", xs.len())
        }
--- a/candle-core/src/quantized/metal.rs
+++ b/candle-core/src/quantized/metal.rs
@ -1,222 +0,0 @@
-use super::{GgmlDType, QStorage};
-use crate::backend::BackendStorage;
-use crate::{DType, MetalDevice, MetalStorage, Result, Shape};
-use metal::Buffer;
-use std::sync::Arc;
-
-pub struct QMetalStorage {
-    dtype: GgmlDType,
-    device: MetalDevice,
-    buffer: Arc<Buffer>,
-}
-
-impl QMetalStorage {
-    pub fn zeros(device: &MetalDevice, elem_count: usize, dtype: GgmlDType) -> Result<Self> {
-        let size = elem_count * dtype.type_size() / dtype.block_size();
-        let buffer = device.allocate_zeros(size)?;
-        Ok(Self {
-            buffer,
-            device: device.clone(),
-            dtype,
-        })
-    }
-
-    pub fn dtype(&self) -> GgmlDType {
-        self.dtype
-    }
-
-    pub fn device(&self) -> &MetalDevice {
-        &self.device
-    }
-
-    pub fn buffer(&self) -> &Buffer {
-        &self.buffer
-    }
-
-    pub fn dequantize(&self, elem_count: usize) -> Result<MetalStorage> {
-        use crate::quantized::k_quants::GgmlType;
-
-        let buffer = self.device.new_buffer_managed(self.buffer.length())?;
-        let command_buffer = self.device.command_buffer()?;
-        command_buffer.set_label("to_cpu");
-        let blit = command_buffer.new_blit_command_encoder();
-        blit.set_label("blit_to_cpu");
-        blit.copy_from_buffer(&self.buffer, 0, &buffer, 0, self.buffer.length());
-        blit.end_encoding();
-        self.device.wait_until_completed()?;
-        let mut out = vec![0.0; elem_count];
-        let block_len = elem_count / self.dtype.block_size();
-        match self.dtype {
-            GgmlDType::F32 => {
-                let vec: Vec<f32> = read_to_vec(&buffer, block_len);
-                f32::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::F16 => {
-                let vec: Vec<half::f16> = read_to_vec(&buffer, block_len);
-                half::f16::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4_0 => {
-                let vec: Vec<crate::quantized::BlockQ4_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4_1 => {
-                let vec: Vec<crate::quantized::BlockQ4_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5_0 => {
-                let vec: Vec<crate::quantized::BlockQ5_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5_1 => {
-                let vec: Vec<crate::quantized::BlockQ5_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8_0 => {
-                let vec: Vec<crate::quantized::BlockQ8_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8_1 => {
-                let vec: Vec<crate::quantized::BlockQ8_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q2K => {
-                let vec: Vec<crate::quantized::BlockQ2K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ2K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q3K => {
-                let vec: Vec<crate::quantized::BlockQ3K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ3K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4K => {
-                let vec: Vec<crate::quantized::BlockQ4K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5K => {
-                let vec: Vec<crate::quantized::BlockQ5K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q6K => {
-                let vec: Vec<crate::quantized::BlockQ6K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ6K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8K => {
-                let vec: Vec<crate::quantized::BlockQ8K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8K::to_float(&vec, &mut out)?;
-            }
-        }
-
-        let buffer = self.device.new_buffer_with_data(&out)?;
-        Ok(MetalStorage::new(
-            buffer,
-            self.device.clone(),
-            elem_count,
-            DType::F32,
-        ))
-    }
-
-    pub fn quantize(&mut self, src: &MetalStorage) -> Result<()> {
-        // Quantization only happens on CPU for now.
-        let src = src.to_cpu::<f32>()?;
-        let elem_count = src.len();
-        let src = crate::Storage::Cpu(crate::CpuStorage::F32(src));
-        let mut qcpu_storage = crate::Device::Cpu.qzeros(elem_count, self.dtype)?;
-        qcpu_storage.quantize(&src)?;
-        let buffer = self.device.new_buffer_with_data(&qcpu_storage.data()?)?;
-        self.buffer = buffer;
-        Ok(())
-    }
-
-    pub fn storage_size_in_bytes(&self) -> usize {
-        self.buffer.length() as usize
-    }
-
-    pub fn fwd(
-        &self,
-        self_shape: &Shape,
-        storage: &MetalStorage,
-        layout: &crate::Layout,
-    ) -> Result<(MetalStorage, Shape)> {
-        use crate::MetalError;
-
-        if !layout.is_contiguous() {
-            crate::bail!("input tensor is not contiguous {layout:?}")
-        }
-        let src_shape = layout.shape();
-        // self is transposed so n is first then k.
-        if src_shape.rank() < 2 {
-            crate::bail!("input tensor has only one dimension {layout:?}")
-        }
-        let (n, k) = self_shape.dims2()?;
-        let mut dst_shape = src_shape.dims().to_vec();
-
-        let (b, m) = match dst_shape.len() {
-            3 => (dst_shape[0], dst_shape[1]),
-            2 => (1, dst_shape[0]),
-            n => crate::bail!("Invalid rank {n} for quantized matmul metal"),
-        };
-        let last_k = dst_shape.pop().unwrap();
-        if last_k != k {
-            crate::bail!("input tensor {layout:?} incompatible with {:?}", self_shape)
-        }
-        dst_shape.push(n);
-        let dst_shape = Shape::from(dst_shape);
-        let device = storage.device().clone();
-        let dst = device.new_buffer(dst_shape.elem_count(), DType::F32, "qmatmul")?;
-        let command_buffer = device.command_buffer()?;
-        candle_metal_kernels::call_quantized_matmul_t(
-            device.device(),
-            &command_buffer,
-            device.kernels(),
-            self.dtype.into(),
-            (b, m, n, k),
-            storage.buffer(),
-            layout.start_offset() * storage.dtype().size_in_bytes(),
-            &self.buffer,
-            &dst,
-        )
-        .map_err(MetalError::from)?;
-        let dst_storage = crate::MetalStorage::new(dst, device, dst_shape.elem_count(), DType::F32);
-        Ok((dst_storage, dst_shape))
-    }
-}
-
-pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
-    device: &MetalDevice,
-    data: &[T],
-) -> Result<QStorage> {
-    let buffer = device.new_buffer_with_data(data)?;
-    let device = device.clone();
-    Ok(QStorage::Metal(QMetalStorage {
-        dtype: T::DTYPE,
-        device,
-        buffer,
-    }))
-}
-
-fn read_to_vec<T: Clone>(buffer: &Buffer, n: usize) -> Vec<T> {
-    let ptr = buffer.contents() as *const T;
-    assert!(!ptr.is_null());
-    let slice = unsafe { std::slice::from_raw_parts(ptr, n) };
-    slice.to_vec()
-}
-
-impl From<GgmlDType> for candle_metal_kernels::GgmlDType {
-    fn from(value: GgmlDType) -> Self {
-        match value {
-            GgmlDType::Q4_0 => candle_metal_kernels::GgmlDType::Q4_0,
-            GgmlDType::Q4_1 => candle_metal_kernels::GgmlDType::Q4_1,
-            GgmlDType::Q5_0 => candle_metal_kernels::GgmlDType::Q5_0,
-            GgmlDType::Q5_1 => candle_metal_kernels::GgmlDType::Q5_1,
-            GgmlDType::Q8_0 => candle_metal_kernels::GgmlDType::Q8_0,
-            GgmlDType::Q8_1 => candle_metal_kernels::GgmlDType::Q8_1,
-            GgmlDType::Q2K => candle_metal_kernels::GgmlDType::Q2K,
-            GgmlDType::Q3K => candle_metal_kernels::GgmlDType::Q3K,
-            GgmlDType::Q4K => candle_metal_kernels::GgmlDType::Q4K,
-            GgmlDType::Q5K => candle_metal_kernels::GgmlDType::Q5K,
-            GgmlDType::Q6K => candle_metal_kernels::GgmlDType::Q6K,
-            GgmlDType::Q8K => candle_metal_kernels::GgmlDType::Q8K,
-            GgmlDType::F16 => candle_metal_kernels::GgmlDType::F16,
-            GgmlDType::F32 => candle_metal_kernels::GgmlDType::F32,
-        }
-    }
-}
--- a/candle-core/src/quantized/mod.rs
+++ b/candle-core/src/quantized/mod.rs
@ -1,134 +1,21 @@
-use crate::{CpuStorage, Device, Result, Shape, Storage, Tensor};
-use k_quants::*;
-use std::borrow::Cow;
+use crate::{Device, Result, Shape, Tensor};

 #[cfg(target_feature = "avx")]
 pub mod avx;
-mod dummy_cuda;
-mod dummy_metal;
 pub mod ggml_file;
 pub mod gguf_file;
 pub mod k_quants;
-#[cfg(feature = "metal")]
-pub mod metal;
-#[cfg(not(feature = "metal"))]
-mod metal {
-    pub use super::dummy_metal::*;
-}
-#[cfg(feature = "cuda")]
-pub mod cuda;
-#[cfg(not(feature = "cuda"))]
-mod cuda {
-    pub use super::dummy_cuda::*;
-}
-
 #[cfg(target_feature = "neon")]
 pub mod neon;
-#[cfg(target_feature = "simd128")]
-pub mod simd128;
 pub mod utils;
-use half::f16;

 pub use k_quants::GgmlType;

 pub struct QTensor {
-    storage: QStorage,
+    data: Box<dyn QuantizedType>,
    shape: Shape,
 }

-impl Device {
-    fn qzeros(&self, elem_count: usize, dtype: GgmlDType) -> Result<QStorage> {
-        match self {
-            Device::Cpu => {
-                let storage = dtype.cpu_zeros(elem_count);
-                Ok(QStorage::Cpu(storage))
-            }
-            Device::Metal(metal) => {
-                let storage = metal::QMetalStorage::zeros(metal, elem_count, dtype)?;
-                Ok(QStorage::Metal(storage))
-            }
-            Device::Cuda(cuda) => {
-                let storage = cuda::QCudaStorage::zeros(cuda, elem_count, dtype)?;
-                Ok(QStorage::Cuda(storage))
-            }
-        }
-    }
-}
-
-pub enum QStorage {
-    Cpu(Box<dyn QuantizedType>),
-    Metal(metal::QMetalStorage),
-    Cuda(cuda::QCudaStorage),
-}
-
-impl QStorage {
-    fn block_size(&self) -> usize {
-        match self {
-            QStorage::Cpu(storage) => storage.block_size(),
-            QStorage::Metal(storage) => storage.dtype().block_size(),
-            QStorage::Cuda(storage) => storage.dtype().block_size(),
-        }
-    }
-
-    fn dtype(&self) -> GgmlDType {
-        match self {
-            QStorage::Cpu(storage) => storage.dtype(),
-            QStorage::Metal(storage) => storage.dtype(),
-            QStorage::Cuda(storage) => storage.dtype(),
-        }
-    }
-
-    fn device(&self) -> Device {
-        match self {
-            QStorage::Cpu(_storage) => Device::Cpu,
-            QStorage::Metal(storage) => Device::Metal(storage.device().clone()),
-            QStorage::Cuda(storage) => Device::Cuda(storage.device().clone()),
-        }
-    }
-
-    fn size_in_bytes(&self) -> usize {
-        match self {
-            QStorage::Cpu(storage) => storage.storage_size_in_bytes(),
-            QStorage::Metal(storage) => storage.storage_size_in_bytes(),
-            QStorage::Cuda(storage) => storage.storage_size_in_bytes(),
-        }
-    }
-
-    fn quantize(&mut self, src: &Storage) -> Result<()> {
-        match (self, src) {
-            (QStorage::Cpu(storage), Storage::Cpu(src)) => {
-                storage.from_float(src.as_slice::<f32>()?)?;
-            }
-            (QStorage::Metal(storage), Storage::Metal(src)) => storage.quantize(src)?,
-            (QStorage::Cuda(storage), Storage::Cuda(src)) => storage.quantize(src)?,
-            _ => crate::bail!("Invalid dequantize storage locations do not match"),
-        }
-        Ok(())
-    }
-
-    fn dequantize(&self, elem_count: usize) -> Result<Storage> {
-        match self {
-            QStorage::Cpu(storage) => Ok(Storage::Cpu(storage.dequantize(elem_count)?)),
-            QStorage::Metal(storage) => Ok(Storage::Metal(storage.dequantize(elem_count)?)),
-            QStorage::Cuda(storage) => Ok(Storage::Cuda(storage.dequantize(elem_count)?)),
-        }
-    }
-
-    fn data(&self) -> Result<Cow<[u8]>> {
-        match self {
-            QStorage::Cpu(storage) => {
-                let data_ptr = storage.as_ptr();
-                let size_in_bytes = storage.storage_size_in_bytes();
-                let data = unsafe { std::slice::from_raw_parts(data_ptr, size_in_bytes) };
-                Ok(Cow::from(data))
-            }
-            QStorage::Metal(_) | QStorage::Cuda(_) => {
-                crate::bail!("not implemented");
-            }
-        }
-    }
-}
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum GgmlDType {
    F32,
@ -188,25 +75,6 @@ impl GgmlDType {
        }
    }

-    /// The block dtype
-    pub fn cpu_zeros(&self, elem_count: usize) -> Box<dyn QuantizedType> {
-        match self {
-            Self::F32 => Box::new(vec![f32::zeros(); elem_count]),
-            Self::F16 => Box::new(vec![f16::zeros(); elem_count]),
-            Self::Q4_0 => Box::new(vec![BlockQ4_0::zeros(); elem_count / BlockQ4_0::BLCK_SIZE]),
-            Self::Q4_1 => Box::new(vec![BlockQ4_1::zeros(); elem_count / BlockQ4_1::BLCK_SIZE]),
-            Self::Q5_0 => Box::new(vec![BlockQ5_0::zeros(); elem_count / BlockQ5_0::BLCK_SIZE]),
-            Self::Q5_1 => Box::new(vec![BlockQ5_1::zeros(); elem_count / BlockQ5_1::BLCK_SIZE]),
-            Self::Q8_0 => Box::new(vec![BlockQ8_0::zeros(); elem_count / BlockQ8_0::BLCK_SIZE]),
-            Self::Q8_1 => Box::new(vec![BlockQ8_1::zeros(); elem_count / BlockQ8_1::BLCK_SIZE]),
-            Self::Q2K => Box::new(vec![BlockQ2K::zeros(); elem_count / BlockQ2K::BLCK_SIZE]),
-            Self::Q3K => Box::new(vec![BlockQ3K::zeros(); elem_count / BlockQ3K::BLCK_SIZE]),
-            Self::Q4K => Box::new(vec![BlockQ4K::zeros(); elem_count / BlockQ4K::BLCK_SIZE]),
-            Self::Q5K => Box::new(vec![BlockQ5K::zeros(); elem_count / BlockQ5K::BLCK_SIZE]),
-            Self::Q6K => Box::new(vec![BlockQ6K::zeros(); elem_count / BlockQ6K::BLCK_SIZE]),
-            Self::Q8K => Box::new(vec![BlockQ8K::zeros(); elem_count / BlockQ8K::BLCK_SIZE]),
-        }
-    }
    /// The type size for blocks in bytes.
    pub fn type_size(&self) -> usize {
        use k_quants::*;
@ -230,7 +98,7 @@ impl GgmlDType {
    }

    /// The block size, i.e. the number of elements stored in each block.
-    pub fn block_size(&self) -> usize {
+    pub fn blck_size(&self) -> usize {
        match self {
            Self::F32 => 1,
            Self::F16 => 1,
@ -249,13 +117,9 @@ impl GgmlDType {
 pub trait QuantizedType: Send + Sync {
    fn dtype(&self) -> GgmlDType;
    fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()>;
-    fn dequantize(&self, elem_count: usize) -> Result<CpuStorage>;
+    fn to_float(&self, ys: &mut [f32]) -> Result<()>;
    fn storage_size_in_bytes(&self) -> usize;
    fn as_ptr(&self) -> *const u8;
-    fn block_size(&self) -> usize;
-    #[allow(clippy::wrong_self_convention)]
-    fn from_float(&mut self, xs: &[f32]) -> Result<()>;
-    fn size(&self) -> usize;
 }

 impl<T: k_quants::GgmlType + Send + Sync> QuantizedType for Vec<T> {
@ -263,26 +127,12 @@ impl<T: k_quants::GgmlType + Send + Sync> QuantizedType for Vec<T> {
        k_quants::matmul(mkn, lhs, self.as_slice(), dst)
    }

-    fn size(&self) -> usize {
-        self.len() * core::mem::size_of::<T>()
-    }
-
-    fn from_float(&mut self, xs: &[f32]) -> Result<()> {
-        T::from_float(xs, self)
-    }
-
    fn dtype(&self) -> GgmlDType {
        T::DTYPE
    }

-    fn block_size(&self) -> usize {
-        T::BLCK_SIZE
-    }
-
-    fn dequantize(&self, elem_count: usize) -> Result<CpuStorage> {
-        let mut ys = vec![0.0f32; elem_count];
-        T::to_float(self.as_slice(), &mut ys)?;
-        Ok(CpuStorage::F32(ys))
+    fn to_float(&self, ys: &mut [f32]) -> Result<()> {
+        T::to_float(self.as_slice(), ys)
    }

    fn storage_size_in_bytes(&self) -> usize {
@ -300,53 +150,56 @@ impl std::fmt::Debug for QTensor {
    }
 }

-fn check_shape(shape: &Shape, block_size: usize) -> Result<()> {
+fn check_shape<T: k_quants::GgmlType>(shape: &Shape) -> Result<()> {
    let dims = shape.dims();
    if dims.is_empty() {
        crate::bail!("scalar tensor cannot be quantized {shape:?}")
    }
-    if dims[dims.len() - 1] % block_size != 0 {
+    if dims[dims.len() - 1] % T::BLCK_SIZE != 0 {
        crate::bail!(
            "quantized tensor must have their last dim divisible by block size {shape:?} {}",
-            block_size
+            T::BLCK_SIZE
        )
    }
    Ok(())
 }

 impl QTensor {
-    pub fn new<S: Into<Shape>>(storage: QStorage, shape: S) -> Result<Self> {
+    pub fn new<S: Into<Shape>, T: k_quants::GgmlType + Send + Sync + 'static>(
+        data: Vec<T>,
+        shape: S,
+    ) -> Result<Self> {
        let shape = shape.into();
-        check_shape(&shape, storage.block_size())?;
-        Ok(Self { storage, shape })
+        check_shape::<T>(&shape)?;
+        Ok(Self {
+            data: Box::new(data),
+            shape,
+        })
    }

-    pub fn quantize(src: &Tensor, dtype: GgmlDType) -> Result<Self> {
+    pub fn quantize<T: k_quants::GgmlType + Send + Sync + 'static>(src: &Tensor) -> Result<Self> {
        let shape = src.shape();
-        let block_size = dtype.block_size();
-        check_shape(shape, block_size)?;
-        let src = src.to_dtype(crate::DType::F32)?.flatten_all()?;
-        let elem_count = shape.elem_count();
-        if elem_count % block_size != 0 {
+        check_shape::<T>(shape)?;
+        let src = src
+            .to_dtype(crate::DType::F32)?
+            .flatten_all()?
+            .to_vec1::<f32>()?;
+        if src.len() % T::BLCK_SIZE != 0 {
            crate::bail!(
                "tensor size ({shape:?}) is not divisible by block size {}",
-                block_size
+                T::BLCK_SIZE
            )
        }
-        let mut storage = src.device().qzeros(elem_count, dtype)?;
-        storage.quantize(&src.storage())?;
+        let mut data = vec![T::zeros(); src.len() / T::BLCK_SIZE];
+        T::from_float(&src, &mut data)?;
        Ok(Self {
-            storage,
+            data: Box::new(data),
            shape: shape.clone(),
        })
    }

    pub fn dtype(&self) -> GgmlDType {
-        self.storage.dtype()
-    }
-
-    pub fn device(&self) -> Device {
-        self.storage.device()
+        self.data.dtype()
    }

    pub fn rank(&self) -> usize {
@ -358,56 +211,38 @@ impl QTensor {
    }

    pub fn dequantize(&self, device: &Device) -> Result<Tensor> {
-        let storage = self.storage.dequantize(self.shape.elem_count())?;
-        let none = crate::op::BackpropOp::none();
-        let is_variable = false;
-        crate::tensor::from_storage(storage, self.shape.clone(), none, is_variable)
-            .to_device(device)
+        let mut f32_data = vec![0f32; self.shape.elem_count()];
+        self.data.to_float(&mut f32_data)?;
+        Tensor::from_vec(f32_data, &self.shape, device)
+    }
+
+    pub fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()> {
+        self.data.matmul_t(mkn, lhs, dst)
    }

    pub fn storage_size_in_bytes(&self) -> usize {
-        self.storage.size_in_bytes()
+        self.data.storage_size_in_bytes()
    }

-    pub fn data(&self) -> Result<Cow<'_, [u8]>> {
-        self.storage.data()
+    pub fn as_ptr(&self) -> *const u8 {
+        self.data.as_ptr()
    }
 }

-#[derive(Clone, Debug)]
-pub enum QMatMul {
-    QTensor(std::sync::Arc<QTensor>),
-    Tensor(Tensor),
-}
-
-thread_local! {
-    static DEQUANTIZE_ALL: bool = {
-        match std::env::var("CANDLE_DEQUANTIZE_ALL") {
-            Ok(s) => {
-                !s.is_empty() && s != "0"
-            },
-            Err(_) => false,
-        }
-    }
-}
+#[derive(Debug)]
+pub struct QMatMul(std::sync::Arc<QTensor>);

 impl QMatMul {
-    pub fn from_arc(qtensor: std::sync::Arc<QTensor>) -> Result<Self> {
-        let dequantize = match qtensor.dtype() {
-            GgmlDType::F32 | GgmlDType::F16 => true,
-            _ => DEQUANTIZE_ALL.with(|b| *b),
-        };
-        let t = if dequantize {
-            let tensor = qtensor.dequantize(&qtensor.device())?;
-            Self::Tensor(tensor)
-        } else {
-            Self::QTensor(qtensor)
-        };
-        Ok(t)
+    pub fn from_arc(qtensor: std::sync::Arc<QTensor>) -> Self {
+        Self(qtensor)
    }

-    pub fn from_qtensor(qtensor: QTensor) -> Result<Self> {
-        Self::from_arc(std::sync::Arc::new(qtensor))
+    pub fn from_qtensor(qtensor: QTensor) -> Self {
+        Self(std::sync::Arc::new(qtensor))
+    }
+
+    pub fn inner(&self) -> &std::sync::Arc<QTensor> {
+        &self.0
    }
 }

@ -437,55 +272,21 @@ impl crate::CustomOp1 for QTensor {
        }
        dst_shape.push(n);
        let dst_shape = Shape::from(dst_shape);
-        #[allow(clippy::infallible_destructuring_match)]
-        let self_storage = match &self.storage {
-            QStorage::Cpu(storage) => storage,
-            QStorage::Metal(_) | QStorage::Cuda(_) => crate::bail!("Invalid storage"),
-        };
-        let slice = storage.as_slice::<f32>()?;
-        let slice = &slice[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
+        let storage = storage.as_slice::<f32>()?;
+        let storage =
+            &storage[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
        let mut dst_storage = vec![0f32; dst_shape.elem_count()];
-        self_storage.matmul_t((dst_shape.elem_count() / n, k, n), slice, &mut dst_storage)?;
+        self.matmul_t(
+            (dst_shape.elem_count() / n, k, n),
+            storage,
+            &mut dst_storage,
+        )?;
        Ok((crate::CpuStorage::F32(dst_storage), dst_shape))
    }
-
-    fn metal_fwd(
-        &self,
-        storage: &crate::MetalStorage,
-        layout: &crate::Layout,
-    ) -> Result<(crate::MetalStorage, Shape)> {
-        let self_storage = match &self.storage {
-            QStorage::Metal(metal) => metal,
-            _ => unreachable!("Cannot call metal matmul on non metal QTensor"),
-        };
-        self_storage.fwd(&self.shape, storage, layout)
-    }
-
-    fn cuda_fwd(
-        &self,
-        storage: &crate::CudaStorage,
-        layout: &crate::Layout,
-    ) -> Result<(crate::CudaStorage, Shape)> {
-        let self_storage = match &self.storage {
-            QStorage::Cuda(cuda) => cuda,
-            _ => unreachable!("Cannot call cuda matmul on non cuda QTensor"),
-        };
-        self_storage.fwd(&self.shape, storage, layout)
-    }
 }

-impl crate::Module for QMatMul {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        match self {
-            Self::QTensor(t) => xs.apply_op1_no_bwd(t.as_ref()),
-            Self::Tensor(w) => {
-                let w = match *xs.dims() {
-                    [b1, b2, _, _] => w.broadcast_left((b1, b2))?.t()?,
-                    [bsize, _, _] => w.broadcast_left(bsize)?.t()?,
-                    _ => w.t()?,
-                };
-                xs.matmul(&w)
-            }
-        }
+impl QMatMul {
+    pub fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        xs.apply_op1_no_bwd(self.0.as_ref())
    }
 }
--- a/candle-core/src/quantized/neon.rs
+++ b/candle-core/src/quantized/neon.rs
@ -12,14 +12,6 @@ use core::arch::arm::*;
 #[cfg(target_arch = "aarch64")]
 use core::arch::aarch64::*;

-#[inline(always)]
-unsafe fn vdotq_s32(a: int8x16_t, b: int8x16_t) -> int32x4_t {
-    // TODO: dotprod
-    let p0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
-    let p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-    vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))
-}
-
 #[inline(always)]
 pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
    let qk = QK8_0;
@ -27,39 +19,71 @@ pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) ->
    if n % QK8_0 != 0 {
        crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
    }
+    if nb % 2 != 0 {
+        crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
+    }

    unsafe {
        let mut sumv0 = vdupq_n_f32(0.0f32);
-        for i in 0..nb {
+        let mut sumv1 = vdupq_n_f32(0.0f32);
+        for i in (0..nb).step_by(2) {
            let x0 = &xs[i];
+            let x1 = &xs[i + 1];
            let y0 = &ys[i];
+            let y1 = &ys[i + 1];

            let m4b = vdupq_n_u8(0x0F);
            let s8b = vdupq_n_s8(0x8);

            let v0_0 = vld1q_u8(x0.qs.as_ptr());
+            let v0_1 = vld1q_u8(x1.qs.as_ptr());

            // 4-bit -> 8-bit
            let v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
            let v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+            let v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
+            let v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));

            // sub 8
            let v0_0ls = vsubq_s8(v0_0l, s8b);
            let v0_0hs = vsubq_s8(v0_0h, s8b);
+            let v0_1ls = vsubq_s8(v0_1l, s8b);
+            let v0_1hs = vsubq_s8(v0_1h, s8b);

            // load y
            let v1_0l = vld1q_s8(y0.qs.as_ptr());
            let v1_0h = vld1q_s8(y0.qs.as_ptr().add(16));
+            let v1_1l = vld1q_s8(y1.qs.as_ptr());
+            let v1_1h = vld1q_s8(y1.qs.as_ptr().add(16));
+
+            // TODO: Support dotprod when it's available outside of nightly.
+            let pl0l = vmull_s8(vget_low_s8(v0_0ls), vget_low_s8(v1_0l));
+            let pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
+            let ph0l = vmull_s8(vget_low_s8(v0_0hs), vget_low_s8(v1_0h));
+            let ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
+
+            let pl1l = vmull_s8(vget_low_s8(v0_1ls), vget_low_s8(v1_1l));
+            let pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
+            let ph1l = vmull_s8(vget_low_s8(v0_1hs), vget_low_s8(v1_1h));
+            let ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
+
+            let pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
+            let ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
+            let pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
+            let ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));

-            let pl0 = vdotq_s32(v0_0ls, v1_0l);
-            let ph0 = vdotq_s32(v0_0hs, v1_0h);
            sumv0 = vmlaq_n_f32(
                sumv0,
                vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
                x0.d.to_f32() * y0.d.to_f32(),
            );
+            sumv1 = vmlaq_n_f32(
+                sumv1,
+                vcvtq_f32_s32(vaddq_s32(pl1, ph1)),
+                x1.d.to_f32() * y1.d.to_f32(),
+            );
        }
-        Ok(vaddvq_f32(sumv0))
+        Ok(vaddvq_f32(sumv0) + vaddvq_f32(sumv1))
    }
 }

@ -70,58 +94,60 @@ pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) ->
        crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
    }
    let nb = n / QK8_0;
+    if nb % 2 != 0 {
+        crate::bail!("vec_dot_q8_0_q8_0: {nb} is not even")
+    }
    unsafe {
        let mut sumv0 = vdupq_n_f32(0.0f32);
-        for i in 0..nb {
+        let mut sumv1 = vdupq_n_f32(0.0f32);
+        for i in (0..nb).step_by(2) {
            let x0 = &xs[i];
+            let x1 = &xs[i + 1];
            let y0 = &ys[i];
+            let y1 = &ys[i + 1];

            let x0_0 = vld1q_s8(x0.qs.as_ptr());
            let x0_1 = vld1q_s8(x0.qs.as_ptr().add(16));
+            let x1_0 = vld1q_s8(x1.qs.as_ptr());
+            let x1_1 = vld1q_s8(x1.qs.as_ptr().add(16));

            // load y
            let y0_0 = vld1q_s8(y0.qs.as_ptr());
            let y0_1 = vld1q_s8(y0.qs.as_ptr().add(16));
+            let y1_0 = vld1q_s8(y1.qs.as_ptr());
+            let y1_1 = vld1q_s8(y1.qs.as_ptr().add(16));

-            let p0 = vdotq_s32(x0_0, y0_0);
-            let p1 = vdotq_s32(x0_1, y0_1);
+            // TODO: use dotprod once the intrinsics are available outside of nightly.
+            let p0_0 = vmull_s8(vget_low_s8(x0_0), vget_low_s8(y0_0));
+            let p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
+            let p0_2 = vmull_s8(vget_low_s8(x0_1), vget_low_s8(y0_1));
+            let p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
+
+            let p1_0 = vmull_s8(vget_low_s8(x1_0), vget_low_s8(y1_0));
+            let p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
+            let p1_2 = vmull_s8(vget_low_s8(x1_1), vget_low_s8(y1_1));
+            let p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
+
+            let p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
+            let p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
+            let p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
+            let p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));

            sumv0 = vmlaq_n_f32(
                sumv0,
                vcvtq_f32_s32(vaddq_s32(p0, p1)),
                x0.d.to_f32() * y0.d.to_f32(),
            );
+            sumv1 = vmlaq_n_f32(
+                sumv1,
+                vcvtq_f32_s32(vaddq_s32(p2, p3)),
+                x1.d.to_f32() * y1.d.to_f32(),
+            );
        }
-        Ok(vaddvq_f32(sumv0))
+        Ok(vaddvq_f32(sumv0) + vaddvq_f32(sumv1))
    }
 }

-#[inline(always)]
-pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Result<f32> {
-    let qk = QK_K;
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q8k_q8k: {n} is not divisible by {qk}")
-    }
-
-    let mut sumf = 0f32;
-    for (xs, ys) in xs.iter().zip(ys.iter()) {
-        unsafe {
-            let mut sum_i = vdupq_n_s32(0);
-            let scale = xs.d * ys.d;
-            let xs = xs.qs.as_ptr();
-            let ys = ys.qs.as_ptr();
-            for i in (0..QK_K).step_by(16) {
-                let xs = vld1q_s8(xs.add(i));
-                let ys = vld1q_s8(ys.add(i));
-                let xy = vdotq_s32(xs, ys);
-                sum_i = vaddq_s32(sum_i, xy)
-            }
-            sumf += vaddvq_s32(sum_i) as f32 * scale
-        }
-    }
-    Ok(sumf)
-}
-
 #[inline(always)]
 pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Result<f32> {
    if n % QK_K != 0 {
@ -183,16 +209,30 @@ pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Res
                let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.2, m4b), q6h_2));
                let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.3, m4b), q6h_3));

-                let p0 = vdotq_s32(q6bytes_0, q8bytes.0);
-                let p1 = vdotq_s32(q6bytes_1, q8bytes.1);
+                // TODO: dotprod
+
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q6bytes_0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q6bytes_1), vget_high_s8(q8bytes.1)),
+                );
                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p0) * scale0 + vaddvq_s32(p1) * scale1;
+                isum += vaddvq_s16(p0) as i32 * scale0 + vaddvq_s16(p1) as i32 * scale1;
                scale = scale.add(2);

-                let p2 = vdotq_s32(q6bytes_2, q8bytes.2);
-                let p3 = vdotq_s32(q6bytes_3, q8bytes.3);
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_2), vget_low_s8(q8bytes.2)),
+                    vmull_s8(vget_high_s8(q6bytes_2), vget_high_s8(q8bytes.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_3), vget_low_s8(q8bytes.3)),
+                    vmull_s8(vget_high_s8(q6bytes_3), vget_high_s8(q8bytes.3)),
+                );
                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p2) * scale0 + vaddvq_s32(p3) * scale1;
+                isum += vaddvq_s16(p2) as i32 * scale0 + vaddvq_s16(p3) as i32 * scale1;
                scale = scale.add(2);

                let q8bytes = vld1q_s8_x4(q8);
@ -212,16 +252,29 @@ pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Res
                let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.2, 4), q6h_2));
                let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.3, 4), q6h_3));

-                let p0 = vdotq_s32(q6bytes_0, q8bytes.0);
-                let p1 = vdotq_s32(q6bytes_1, q8bytes.1);
+                // TODO: dotprod case.
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q6bytes_0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q6bytes_1), vget_high_s8(q8bytes.1)),
+                );
                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p0) * scale0 + vaddvq_s32(p1) * scale1;
+                isum += vaddvq_s16(p0) as i32 * scale0 + vaddvq_s16(p1) as i32 * scale1;
                scale = scale.add(2);

-                let p2 = vdotq_s32(q6bytes_2, q8bytes.2);
-                let p3 = vdotq_s32(q6bytes_3, q8bytes.3);
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_2), vget_low_s8(q8bytes.2)),
+                    vmull_s8(vget_high_s8(q6bytes_2), vget_high_s8(q8bytes.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_3), vget_low_s8(q8bytes.3)),
+                    vmull_s8(vget_high_s8(q6bytes_3), vget_high_s8(q8bytes.3)),
+                );
                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p2) * scale0 + vaddvq_s32(p3) * scale1;
+                isum += vaddvq_s16(p2) as i32 * scale0 + vaddvq_s16(p3) as i32 * scale1;
                scale = scale.add(2);
            }
            sum += d_all * y.d * ((isum - 32 * isum_mins) as f32);
@ -298,14 +351,28 @@ pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Res
                let q5bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.0, 4), q5h_2));
                let q5bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.1, 4), q5h_3));

-                let p0 = vdotq_s32(q5bytes_0, q8bytes.0);
-                let p1 = vdotq_s32(q5bytes_1, q8bytes.1);
-                sumi += vaddvq_s32(vaddq_s32(p0, p1)) * *scales as i32;
+                // TODO: dotprod
+
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q5bytes_0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q5bytes_1), vget_high_s8(q8bytes.1)),
+                );
+                sumi += vaddvq_s16(vaddq_s16(p0, p1)) as i32 * *scales as i32;
                scales = scales.add(1);

-                let p2 = vdotq_s32(q5bytes_2, q8bytes.2);
-                let p3 = vdotq_s32(q5bytes_3, q8bytes.3);
-                sumi += vaddvq_s32(vaddq_s32(p2, p3)) * *scales as i32;
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_2), vget_low_s8(q8bytes.2)),
+                    vmull_s8(vget_high_s8(q5bytes_2), vget_high_s8(q8bytes.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_3), vget_low_s8(q8bytes.3)),
+                    vmull_s8(vget_high_s8(q5bytes_3), vget_high_s8(q8bytes.3)),
+                );
+                sumi += vaddvq_s16(vaddq_s16(p2, p3)) as i32 * *scales as i32;
                scales = scales.add(1);
            }
            sumf += d * sumi as f32 - dmin * sumi_mins as f32;
@ -368,15 +435,22 @@ pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Res
            for j in 0..QK_K / 64 {
                let q4bits = vld1q_u8_x2(q4);
                q4 = q4.add(32);
+                // TODO: dotprod
                let q8bytes = vld1q_s8_x2(q8);
                q8 = q8.add(32);
                let q4bytes = int8x16x2_t(
                    vreinterpretq_s8_u8(vandq_u8(q4bits.0, m4b)),
                    vreinterpretq_s8_u8(vandq_u8(q4bits.1, m4b)),
                );
-                let p0 = vdotq_s32(q4bytes.0, q8bytes.0);
-                let p1 = vdotq_s32(q4bytes.1, q8bytes.1);
-                sumi1 += vaddvq_s32(vaddq_s32(p0, p1)) * scales[2 * j] as i32;
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q4bytes.0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q4bytes.1), vget_high_s8(q8bytes.1)),
+                );
+                sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) as i32 * scales[2 * j] as i32;

                let q8bytes = vld1q_s8_x2(q8);
                q8 = q8.add(32);
@ -384,9 +458,15 @@ pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Res
                    vreinterpretq_s8_u8(vshrq_n_u8(q4bits.0, 4)),
                    vreinterpretq_s8_u8(vshrq_n_u8(q4bits.1, 4)),
                );
-                let p2 = vdotq_s32(q4bytes.0, q8bytes.0);
-                let p3 = vdotq_s32(q4bytes.1, q8bytes.1);
-                sumi2 += vaddvq_s32(vaddq_s32(p2, p3)) * scales[2 * j + 1] as i32;
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q4bytes.0), vget_high_s8(q8bytes.0)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q4bytes.1), vget_high_s8(q8bytes.1)),
+                );
+                sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) as i32 * scales[2 * j + 1] as i32;
            }
            sumf += d * (sumi1 + sumi2) as f32;
        }
@ -464,14 +544,27 @@ pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Res
                    vreinterpretq_s8_u8(q3h_3),
                );

-                let p0 = vdotq_s32(q3bytes_0, q8bytes_1.0);
-                let p1 = vdotq_s32(q3bytes_1, q8bytes_1.1);
-                let p2 = vdotq_s32(q3bytes_2, q8bytes_1.2);
-                let p3 = vdotq_s32(q3bytes_3, q8bytes_1.3);
-                isum += vaddvq_s32(p0) * *scale as i32
-                    + vaddvq_s32(p1) * *scale.add(1) as i32
-                    + vaddvq_s32(p2) * *scale.add(2) as i32
-                    + vaddvq_s32(p3) * *scale.add(3) as i32;
+                // TODO: dotprod
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_0), vget_low_s8(q8bytes_1.0)),
+                    vmull_s8(vget_high_s8(q3bytes_0), vget_high_s8(q8bytes_1.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_1), vget_low_s8(q8bytes_1.1)),
+                    vmull_s8(vget_high_s8(q3bytes_1), vget_high_s8(q8bytes_1.1)),
+                );
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_2), vget_low_s8(q8bytes_1.2)),
+                    vmull_s8(vget_high_s8(q3bytes_2), vget_high_s8(q8bytes_1.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_3), vget_low_s8(q8bytes_1.3)),
+                    vmull_s8(vget_high_s8(q3bytes_3), vget_high_s8(q8bytes_1.3)),
+                );
+                isum += vaddvq_s16(p0) as i32 * *scale as i32
+                    + vaddvq_s16(p1) as i32 * *scale.add(1) as i32
+                    + vaddvq_s16(p2) as i32 * *scale.add(2) as i32
+                    + vaddvq_s16(p3) as i32 * *scale.add(3) as i32;
                scale = scale.add(4);

                let q3h_0 = vbicq_u8(m2, qhbits.0);
@ -496,14 +589,27 @@ pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Res
                    vreinterpretq_s8_u8(q3h_3),
                );

-                let p0 = vdotq_s32(q3bytes_0, q8bytes_2.0);
-                let p1 = vdotq_s32(q3bytes_1, q8bytes_2.1);
-                let p2 = vdotq_s32(q3bytes_2, q8bytes_2.2);
-                let p3 = vdotq_s32(q3bytes_3, q8bytes_2.3);
-                isum += vaddvq_s32(p0) * *scale as i32
-                    + vaddvq_s32(p1) * *scale.add(1) as i32
-                    + vaddvq_s32(p2) * *scale.add(2) as i32
-                    + vaddvq_s32(p3) * *scale.add(3) as i32;
+                // TODO: dotprod
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_0), vget_low_s8(q8bytes_2.0)),
+                    vmull_s8(vget_high_s8(q3bytes_0), vget_high_s8(q8bytes_2.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_1), vget_low_s8(q8bytes_2.1)),
+                    vmull_s8(vget_high_s8(q3bytes_1), vget_high_s8(q8bytes_2.1)),
+                );
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_2), vget_low_s8(q8bytes_2.2)),
+                    vmull_s8(vget_high_s8(q3bytes_2), vget_high_s8(q8bytes_2.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_3), vget_low_s8(q8bytes_2.3)),
+                    vmull_s8(vget_high_s8(q3bytes_3), vget_high_s8(q8bytes_2.3)),
+                );
+                isum += vaddvq_s16(p0) as i32 * *scale as i32
+                    + vaddvq_s16(p1) as i32 * *scale.add(1) as i32
+                    + vaddvq_s16(p2) as i32 * *scale.add(2) as i32
+                    + vaddvq_s16(p3) as i32 * *scale.add(3) as i32;
                scale = scale.add(4);

                if j == 0 {
@ -561,6 +667,7 @@ pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]) -> Res
            let mut is = 0usize;

            // TODO: dotprod
+
            for _j in 0..QK_K / 128 {
                let q2bits = vld1q_u8_x2(q2);
                q2 = q2.add(32);
@ -607,7 +714,14 @@ unsafe fn multiply_accum_with_scale(
    q2bytes: int8x16x2_t,
    q8bytes: int8x16x2_t,
 ) -> i32 {
-    let p1 = vdotq_s32(q2bytes.0, q8bytes.0);
-    let p2 = vdotq_s32(q2bytes.1, q8bytes.1);
-    vaddvq_s32(p1) * aux[is + index] as i32 + vaddvq_s32(p2) * aux[is + 1 + index] as i32
+    let p1 = vaddq_s16(
+        vmull_s8(vget_low_s8(q2bytes.0), vget_low_s8(q8bytes.0)),
+        vmull_s8(vget_high_s8(q2bytes.0), vget_high_s8(q8bytes.0)),
+    );
+    let p2 = vaddq_s16(
+        vmull_s8(vget_low_s8(q2bytes.1), vget_low_s8(q8bytes.1)),
+        vmull_s8(vget_high_s8(q2bytes.1), vget_high_s8(q8bytes.1)),
+    );
+    vaddvq_s16(p1) as i32 * aux[is + index] as i32
+        + vaddvq_s16(p2) as i32 * aux[is + 1 + index] as i32
 }
--- a/candle-core/src/quantized/simd128.rs
+++ b/candle-core/src/quantized/simd128.rs
@ -1,419 +0,0 @@
-use super::k_quants::{BlockQ2K, BlockQ4K, BlockQ4_0, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K};
-use crate::Result;
-use byteorder::{ByteOrder, LittleEndian};
-use half::f16;
-
-use core::arch::wasm32::*;
-
-#[inline(always)]
-pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
-    let qk = QK8_0;
-    if n % QK8_0 != 0 {
-        crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
-    }
-    unsafe {
-        let mut acc = f32x4_splat(0.0f32);
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let x1234 = v128_load(x.qs.as_ptr() as *const v128);
-            let x12 = v128_and(x1234, u8x16_splat(0x0F));
-            let x12 = i8x16_sub(x12, i8x16_splat(8));
-            let x34 = u8x16_shr(x1234, 4);
-            let x34 = i8x16_sub(x34, i8x16_splat(8));
-
-            let x1 = i16x8_extend_low_i8x16(x12);
-            let y1 = i16x8_load_extend_i8x8(y.qs.as_ptr());
-            let sum_xy = i32x4_dot_i16x8(x1, y1);
-
-            let x2 = i16x8_extend_high_i8x16(x12);
-            let y2 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(8));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x2, y2));
-
-            let x3 = i16x8_extend_low_i8x16(x34);
-            let y3 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(16));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x3, y3));
-
-            let x4 = i16x8_extend_high_i8x16(x34);
-            let y4 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(24));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x4, y4));
-
-            let sum_xy = f32x4_convert_i32x4(sum_xy);
-
-            // f32x4_relaxed_madd is nightly only.
-            let d = f32x4_splat(f16::to_f32(x.d) * f16::to_f32(y.d));
-            let scaled = f32x4_mul(sum_xy, d);
-            acc = f32x4_add(acc, scaled)
-        }
-        let res = f32x4_extract_lane::<0>(acc)
-            + f32x4_extract_lane::<1>(acc)
-            + f32x4_extract_lane::<2>(acc)
-            + f32x4_extract_lane::<3>(acc);
-        Ok(res)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) -> Result<f32> {
-    let qk = QK8_0;
-    if n % QK8_0 != 0 {
-        crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
-    }
-    unsafe {
-        let mut acc = f32x4_splat(0.0f32);
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let x1 = i16x8_load_extend_i8x8(x.qs.as_ptr());
-            let y1 = i16x8_load_extend_i8x8(y.qs.as_ptr());
-            let sum_xy = i32x4_dot_i16x8(x1, y1);
-
-            let x2 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(8));
-            let y2 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(8));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x2, y2));
-
-            let x3 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(16));
-            let y3 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(16));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x3, y3));
-
-            let x4 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(24));
-            let y4 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(24));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x4, y4));
-
-            let sum_xy = f32x4_convert_i32x4(sum_xy);
-
-            // f32x4_relaxed_madd is nightly only.
-            let d = f32x4_splat(f16::to_f32(x.d) * f16::to_f32(y.d));
-            let scaled = f32x4_mul(sum_xy, d);
-            acc = f32x4_add(acc, scaled)
-        }
-        let res = f32x4_extract_lane::<0>(acc)
-            + f32x4_extract_lane::<1>(acc)
-            + f32x4_extract_lane::<2>(acc)
-            + f32x4_extract_lane::<3>(acc);
-        Ok(res)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q2k_q8k: {n} is not divisible by {QK_K}")
-    }
-    unsafe {
-        let mut sumf = f32x4_splat(0f32);
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let mut q2: &[_] = &x.qs;
-            let mut q8: &[_] = &y.qs;
-            let sc = &x.scales;
-
-            let mut summs = i32x4_splat(0);
-            for i in (0..(QK_K / 16)).step_by(4) {
-                let bsums = i32x4_load_extend_i16x4(y.bsums.as_ptr().add(i));
-                let scales = i32x4_shr(
-                    i32x4(
-                        sc[i] as i32,
-                        sc[i + 1] as i32,
-                        sc[i + 2] as i32,
-                        sc[i + 3] as i32,
-                    ),
-                    4,
-                );
-                summs = i32x4_add(summs, i32x4_mul(bsums, scales))
-            }
-            let summs = f32x4_convert_i32x4(summs);
-
-            let dall = y.d * x.d.to_f32();
-            let dmin = y.d * x.dmin.to_f32();
-
-            let mut isum = i32x4_splat(0);
-            let mut is = 0;
-            for _ in 0..(QK_K / 128) {
-                let mut shift = 0;
-                for _ in 0..4 {
-                    let d = (sc[is] & 0xF) as i32;
-                    is += 1;
-                    let mut isuml = i16x8_splat(0);
-                    for l in (0..16).step_by(8) {
-                        let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(l));
-                        let q2 = i16x8_load_extend_u8x8(q2.as_ptr().add(l));
-                        let q2 = v128_and(i16x8_shr(q2, shift), i16x8_splat(3));
-                        isuml = i16x8_add(isuml, i16x8_mul(q2, q8))
-                    }
-                    let dd = i32x4_splat(d);
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_low_i16x8(isuml), dd));
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_high_i16x8(isuml), dd));
-                    let d = (sc[is] & 0xF) as i32;
-                    is += 1;
-                    let mut isuml = i16x8_splat(0);
-                    for l in (16..32).step_by(8) {
-                        let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(l));
-                        let q2 = i16x8_load_extend_u8x8(q2.as_ptr().add(l));
-                        let q2 = v128_and(i16x8_shr(q2, shift), i16x8_splat(3));
-                        isuml = i16x8_add(isuml, i16x8_mul(q2, q8))
-                    }
-                    let dd = i32x4_splat(d);
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_low_i16x8(isuml), dd));
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_high_i16x8(isuml), dd));
-                    shift += 2;
-                    // adjust the indexing
-                    q8 = &q8[32..];
-                }
-                // adjust the indexing
-                q2 = &q2[32..];
-            }
-            let isum = f32x4_convert_i32x4(isum);
-            sumf = f32x4_add(
-                sumf,
-                f32x4_sub(
-                    f32x4_mul(isum, f32x4_splat(dall)),
-                    f32x4_mul(summs, f32x4_splat(dmin)),
-                ),
-            );
-        }
-        let sumf = f32x4_extract_lane::<0>(sumf)
-            + f32x4_extract_lane::<1>(sumf)
-            + f32x4_extract_lane::<2>(sumf)
-            + f32x4_extract_lane::<3>(sumf);
-        Ok(sumf)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q4k_q8k: {n} is not divisible by {QK_K}")
-    }
-
-    const KMASK1: u32 = 0x3f3f3f3f;
-    const KMASK2: u32 = 0x0f0f0f0f;
-    const KMASK3: u32 = 0x03030303;
-
-    let mut utmp: [u32; 4] = [0; 4];
-    let mut scales: [u8; 8] = [0; 8];
-    let mut mins: [u8; 8] = [0; 8];
-
-    let mut aux8: [u8; QK_K] = [0; QK_K];
-    let mut sums = f32x4_splat(0f32);
-    unsafe {
-        for (y, x) in ys.iter().zip(xs.iter()) {
-            let q4 = &x.qs;
-            let q8 = &y.qs;
-
-            for j in 0..QK_K / 64 {
-                let q4_1 = v128_load(q4.as_ptr().add(32 * j) as *const v128);
-                let q4_2 = v128_load(q4.as_ptr().add(32 * j + 16) as *const v128);
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j) as *mut v128,
-                    v128_and(q4_1, u8x16_splat(0x0F)),
-                );
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j + 16) as *mut v128,
-                    v128_and(q4_2, u8x16_splat(0x0F)),
-                );
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j + 32) as *mut v128,
-                    u8x16_shr(q4_1, 4),
-                );
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j + 48) as *mut v128,
-                    u8x16_shr(q4_2, 4),
-                );
-            }
-
-            LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
-
-            utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4);
-            let uaux = utmp[1] & KMASK1;
-            utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
-            utmp[2] = uaux;
-            utmp[0] &= KMASK1;
-
-            //extract scales and mins
-            LittleEndian::write_u32_into(&utmp[0..2], &mut scales);
-            LittleEndian::write_u32_into(&utmp[2..4], &mut mins);
-
-            let mut sumi = i32x4_splat(0);
-            for j in (0..QK_K / 16).step_by(4) {
-                let bsums = i32x4_load_extend_i16x4(y.bsums.as_ptr().add(j));
-                let (m1, m2) = (mins[j / 2] as i32, mins[j / 2 + 1] as i32);
-                let mins = i32x4(m1, m1, m2, m2);
-                sumi = i32x4_add(sumi, i32x4_mul(bsums, mins));
-            }
-
-            let mut aux32 = i32x4_splat(0i32);
-            for (scale_i, scale) in scales.iter().enumerate() {
-                let scale = i32x4_splat(*scale as i32);
-                for j in 0..4 {
-                    let i = 32 * scale_i + 8 * j;
-                    let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(i));
-                    let aux8 = i16x8_load_extend_u8x8(aux8.as_ptr().add(i));
-                    let aux16 = i16x8_mul(q8, aux8);
-                    aux32 = i32x4_add(aux32, i32x4_mul(scale, i32x4_extend_low_i16x8(aux16)));
-                    aux32 = i32x4_add(aux32, i32x4_mul(scale, i32x4_extend_high_i16x8(aux16)));
-                }
-            }
-            let aux32 = f32x4_convert_i32x4(aux32);
-            let d = f32x4_splat(x.d.to_f32() * y.d);
-            sums = f32x4_add(sums, f32x4_mul(aux32, d));
-            let dmin = x.dmin.to_f32() * y.d;
-            let dmin = f32x4_splat(dmin);
-            let sumi = f32x4_convert_i32x4(sumi);
-            sums = f32x4_sub(sums, f32x4_mul(sumi, dmin));
-        }
-        let sums = f32x4_extract_lane::<0>(sums)
-            + f32x4_extract_lane::<1>(sums)
-            + f32x4_extract_lane::<2>(sums)
-            + f32x4_extract_lane::<3>(sums);
-        Ok(sums)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q6k_q8k: {n} is not divisible by {QK_K}")
-    }
-
-    let mut aux8 = [0i8; QK_K];
-    unsafe {
-        let mut sums = f32x4_splat(0f32);
-
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let q4 = &x.ql;
-            let qh = &x.qh;
-            let q8 = &y.qs;
-            let mut aux32 = f32x4_splat(0f32);
-
-            for j in (0..QK_K).step_by(128) {
-                let aux8 = aux8.as_mut_ptr().add(j);
-                let q4 = &q4.as_ptr().add(j / 2);
-                let qh = &qh.as_ptr().add(j / 4);
-                for l in (0..32).step_by(16) {
-                    // aux8[l] = (((q4[l] & 0xF) | ((qh[l] & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        v128_and(v128_load(q4.add(l) as *const v128), u8x16_splat(0xF)),
-                        u8x16_shl(
-                            v128_and(v128_load(qh.add(l) as *const v128), u8x16_splat(3)),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-
-                    // aux8[l + 32] =
-                    //    (((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        v128_and(v128_load(q4.add(l + 32) as *const v128), u8x16_splat(0xF)),
-                        u8x16_shl(
-                            v128_and(
-                                u8x16_shr(v128_load(qh.add(l) as *const v128), 2),
-                                u8x16_splat(3),
-                            ),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l + 32) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-
-                    // aux8[l + 64] = (((q4[l] >> 4) | (((qh[l] >> 4) & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        u8x16_shr(v128_load(q4.add(l) as *const v128), 4),
-                        u8x16_shl(
-                            v128_and(
-                                u8x16_shr(v128_load(qh.add(l) as *const v128), 4),
-                                u8x16_splat(3),
-                            ),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l + 64) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-
-                    // aux8[l + 96] =
-                    //    (((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        u8x16_shr(v128_load(q4.add(l + 32) as *const v128), 4),
-                        u8x16_shl(
-                            v128_and(
-                                u8x16_shr(v128_load(qh.add(l) as *const v128), 6),
-                                u8x16_splat(3),
-                            ),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l + 96) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-                }
-            }
-
-            for (j, &scale) in x.scales.iter().enumerate() {
-                let scale = f32x4_splat(scale as f32);
-                for offset in [0, 8] {
-                    let aux16 = i16x8_mul(
-                        i16x8_load_extend_i8x8(q8.as_ptr().add(16 * j + offset)),
-                        i16x8_load_extend_i8x8(aux8.as_ptr().add(16 * j + offset)),
-                    );
-                    aux32 = f32x4_add(
-                        aux32,
-                        f32x4_mul(f32x4_convert_i32x4(i32x4_extend_low_i16x8(aux16)), scale),
-                    );
-                    aux32 = f32x4_add(
-                        aux32,
-                        f32x4_mul(f32x4_convert_i32x4(i32x4_extend_high_i16x8(aux16)), scale),
-                    );
-                }
-            }
-
-            let d = f32x4_splat(x.d.to_f32() * y.d);
-            sums = f32x4_add(sums, f32x4_mul(aux32, d));
-        }
-        let sums = f32x4_extract_lane::<0>(sums)
-            + f32x4_extract_lane::<1>(sums)
-            + f32x4_extract_lane::<2>(sums)
-            + f32x4_extract_lane::<3>(sums);
-        Ok(sums)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Result<f32> {
-    let qk = QK_K;
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q8k_q8k: {n} is not divisible by {qk}")
-    }
-
-    unsafe {
-        let mut acc = f32x4_splat(0.0f32);
-        for (xs, ys) in xs.iter().zip(ys.iter()) {
-            let x_qs = xs.qs.as_ptr();
-            let y_qs = ys.qs.as_ptr();
-            let mut sumi = i32x4_splat(0);
-            for j in (0..QK_K).step_by(8) {
-                let xs = i16x8_load_extend_i8x8(x_qs.add(j));
-                let ys = i16x8_load_extend_i8x8(y_qs.add(j));
-                let sum_xy = i32x4_dot_i16x8(xs, ys);
-                sumi = i32x4_add(sumi, sum_xy)
-            }
-            let d = f32x4_splat(xs.d * ys.d);
-            acc = f32x4_add(acc, f32x4_mul(f32x4_convert_i32x4(sumi), d))
-        }
-        let res = f32x4_extract_lane::<0>(acc)
-            + f32x4_extract_lane::<1>(acc)
-            + f32x4_extract_lane::<2>(acc)
-            + f32x4_extract_lane::<3>(acc);
-        Ok(res)
-    }
-}
--- a/candle-core/src/quantized/utils.rs
+++ b/candle-core/src/quantized/utils.rs
@ -17,7 +17,7 @@ pub(super) fn group_for_quantization<'a, 'b, T: super::k_quants::GgmlType>(
    let expected_blocks = xs.len() / block_size;
    let actual_blocks = ys.len();

-    // Validate that the input is the right size
+    //validate that the input is the right size
    if expected_blocks != actual_blocks {
        crate::bail!("quantize {dtype:?}: expected {expected_blocks} blocks but only {actual_blocks} were provided!")
    }
@ -37,12 +37,12 @@ pub(super) fn group_for_dequantization<'a, 'b, T: super::k_quants::GgmlType>(

    let actual_output_len = ys.len();
    let expected_output_len = xs.len() * block_size;
-    // Validate that the output is the right size
+    //validate that the output is the right size
    if expected_output_len != actual_output_len {
        crate::bail!("dequantize {dtype:?}: ys (len = {actual_output_len}) does not match the expected length of {expected_output_len}!")
    }

-    // Zip the blocks and outputs together
+    //zip the blocks and outputs together
    Ok(xs.iter().zip(ys.chunks_exact_mut(block_size)).collect())
 }

--- a/candle-core/src/safetensors.rs
+++ b/candle-core/src/safetensors.rs
@ -78,7 +78,11 @@ impl st::View for &Tensor {
 }

 impl Tensor {
-    pub fn save_safetensors<P: AsRef<Path>>(&self, name: &str, filename: P) -> Result<()> {
+    pub fn save_safetensors<P: AsRef<std::path::Path>>(
+        &self,
+        name: &str,
+        filename: P,
+    ) -> Result<()> {
        let data = [(name, self.clone())];
        Ok(st::serialize_to_file(data, &None, filename.as_ref())?)
    }
@ -251,134 +255,6 @@ pub fn save<K: AsRef<str> + Ord + std::fmt::Display, P: AsRef<Path>>(
    Ok(st::serialize_to_file(tensors, &None, filename.as_ref())?)
 }

-#[derive(yoke::Yokeable)]
-struct SafeTensors_<'a>(SafeTensors<'a>);
-
-pub struct MmapedSafetensors {
-    safetensors: Vec<yoke::Yoke<SafeTensors_<'static>, memmap2::Mmap>>,
-    routing: Option<HashMap<String, usize>>,
-}
-
-impl MmapedSafetensors {
-    /// Creates a wrapper around a memory mapped file and deserialize the safetensors header.
-    ///
-    /// # Safety
-    ///
-    /// The unsafe is inherited from [`memmap2::MmapOptions`].
-    pub unsafe fn new<P: AsRef<Path>>(p: P) -> Result<Self> {
-        let p = p.as_ref();
-        let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
-        let file = memmap2::MmapOptions::new()
-            .map(&file)
-            .map_err(|e| Error::from(e).with_path(p))?;
-        let safetensors = yoke::Yoke::<SafeTensors_<'static>, memmap2::Mmap>::try_attach_to_cart(
-            file,
-            |data: &[u8]| {
-                let st = safetensors::SafeTensors::deserialize(data)
-                    .map_err(|e| Error::from(e).with_path(p))?;
-                Ok::<_, Error>(SafeTensors_(st))
-            },
-        )?;
-        Ok(Self {
-            safetensors: vec![safetensors],
-            routing: None,
-        })
-    }
-
-    /// Creates a wrapper around multiple memory mapped file and deserialize the safetensors headers.
-    ///
-    /// If a tensor name appears in multiple files, the last entry is returned.
-    ///
-    /// # Safety
-    ///
-    /// The unsafe is inherited from [`memmap2::MmapOptions`].
-    pub unsafe fn multi<P: AsRef<Path>>(paths: &[P]) -> Result<Self> {
-        let mut routing = HashMap::new();
-        let mut safetensors = vec![];
-        for (index, p) in paths.iter().enumerate() {
-            let p = p.as_ref();
-            let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
-            let file = memmap2::MmapOptions::new()
-                .map(&file)
-                .map_err(|e| Error::from(e).with_path(p))?;
-            let data = yoke::Yoke::<SafeTensors_<'static>, memmap2::Mmap>::try_attach_to_cart(
-                file,
-                |data: &[u8]| {
-                    let st = safetensors::SafeTensors::deserialize(data)
-                        .map_err(|e| Error::from(e).with_path(p))?;
-                    Ok::<_, Error>(SafeTensors_(st))
-                },
-            )?;
-            for k in data.get().0.names() {
-                routing.insert(k.to_string(), index);
-            }
-            safetensors.push(data)
-        }
-        Ok(Self {
-            safetensors,
-            routing: Some(routing),
-        })
-    }
-
-    pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
-        self.get(name)?.load(dev)
-    }
-
-    pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
-        let mut tensors = vec![];
-        for safetensors in self.safetensors.iter() {
-            tensors.push(safetensors.get().0.tensors())
-        }
-        tensors.into_iter().flatten().collect()
-    }
-
-    pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
-        let index = match &self.routing {
-            None => 0,
-            Some(routing) => {
-                let index = routing.get(name).ok_or_else(|| {
-                    Error::CannotFindTensor {
-                        path: name.to_string(),
-                    }
-                    .bt()
-                })?;
-                *index
-            }
-        };
-        Ok(self.safetensors[index].get().0.tensor(name)?)
-    }
-}
-
-pub struct BufferedSafetensors {
-    safetensors: yoke::Yoke<SafeTensors_<'static>, Vec<u8>>,
-}
-
-impl BufferedSafetensors {
-    /// Creates a wrapper around a binary buffer and deserialize the safetensors header.
-    pub fn new(buffer: Vec<u8>) -> Result<Self> {
-        let safetensors = yoke::Yoke::<SafeTensors_<'static>, Vec<u8>>::try_attach_to_cart(
-            buffer,
-            |data: &[u8]| {
-                let st = safetensors::SafeTensors::deserialize(data)?;
-                Ok::<_, Error>(SafeTensors_(st))
-            },
-        )?;
-        Ok(Self { safetensors })
-    }
-
-    pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
-        self.get(name)?.load(dev)
-    }
-
-    pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
-        self.safetensors.get().0.tensors()
-    }
-
-    pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
-        Ok(self.safetensors.get().0.tensor(name)?)
-    }
-}
-
 pub struct MmapedFile {
    path: std::path::PathBuf,
    inner: memmap2::Mmap,
@ -391,7 +267,7 @@ impl MmapedFile {
    /// # Safety
    ///
    /// The unsafe is inherited from [`memmap2::MmapOptions`].
-    pub unsafe fn new<P: AsRef<Path>>(p: P) -> Result<Self> {
+    pub unsafe fn new<P: AsRef<std::path::Path>>(p: P) -> Result<Self> {
        let p = p.as_ref();
        let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
        let inner = memmap2::MmapOptions::new()
--- a/candle-core/src/shape.rs
+++ b/candle-core/src/shape.rs
@ -171,7 +171,7 @@ impl Shape {
        }
        let mut acc = 1;
        for (&stride, &dim) in stride.iter().zip(self.0.iter()).rev() {
-            if dim > 1 && stride != acc {
+            if stride != acc {
                return false;
            }
            acc *= dim;
@ -186,7 +186,7 @@ impl Shape {
        }
        let mut acc = 1;
        for (&stride, &dim) in stride.iter().zip(self.0.iter()) {
-            if dim > 1 && stride != acc {
+            if stride != acc {
                return false;
            }
            acc *= dim;
@ -203,7 +203,7 @@ impl Shape {

    /// Check whether the two shapes are compatible for broadcast, and if it is the case return the
    /// broadcasted shape. This is to be used for binary pointwise ops.
-    pub fn broadcast_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<Shape> {
+    pub(crate) fn broadcast_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<Shape> {
        let lhs = self;
        let lhs_dims = lhs.dims();
        let rhs_dims = rhs.dims();
@ -444,18 +444,6 @@ impl<D1: Dim, D2: Dim, D3: Dim, D4: Dim, D5: Dim> Dims for (D1, D2, D3, D4, D5)
    }
 }

-impl<D1: Dim, D2: Dim, D3: Dim, D4: Dim, D5: Dim, D6: Dim> Dims for (D1, D2, D3, D4, D5, D6) {
-    fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
-        let d0 = self.0.to_index(shape, op)?;
-        let d1 = self.1.to_index(shape, op)?;
-        let d2 = self.2.to_index(shape, op)?;
-        let d3 = self.3.to_index(shape, op)?;
-        let d4 = self.4.to_index(shape, op)?;
-        let d5 = self.5.to_index(shape, op)?;
-        Ok(vec![d0, d1, d2, d3, d4, d5])
-    }
-}
-
 extract_dims!(dims0, 0, |_: &[usize]| (), ());
 extract_dims!(dims1, 1, |d: &[usize]| d[0], usize);
 extract_dims!(dims2, 2, |d: &[usize]| (d[0], d[1]), (usize, usize));
@ -478,139 +466,6 @@ extract_dims!(
    (usize, usize, usize, usize, usize)
 );

-pub trait ShapeWithOneHole {
-    fn into_shape(self, el_count: usize) -> Result<Shape>;
-}
-
-impl<S: Into<Shape>> ShapeWithOneHole for S {
-    fn into_shape(self, _el_count: usize) -> Result<Shape> {
-        Ok(self.into())
-    }
-}
-
-impl ShapeWithOneHole for ((),) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        Ok(el_count.into())
-    }
-}
-
-fn hole_size(el_count: usize, prod_d: usize, s: &dyn std::fmt::Debug) -> Result<usize> {
-    if prod_d == 0 {
-        crate::bail!("cannot reshape tensor of {el_count} elements to {s:?}")
-    }
-    if el_count % prod_d != 0 {
-        crate::bail!("cannot reshape tensor with {el_count} elements to {s:?}")
-    }
-    Ok(el_count / prod_d)
-}
-
-impl ShapeWithOneHole for ((), usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let ((), d1) = self;
-        Ok((hole_size(el_count, d1, &self)?, d1).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, ()) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, ()) = self;
-        Ok((d1, hole_size(el_count, d1, &self)?).into())
-    }
-}
-
-impl ShapeWithOneHole for ((), usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let ((), d1, d2) = self;
-        Ok((hole_size(el_count, d1 * d2, &self)?, d1, d2).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, (), usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, (), d2) = self;
-        Ok((d1, hole_size(el_count, d1 * d2, &self)?, d2).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, ()) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, ()) = self;
-        Ok((d1, d2, hole_size(el_count, d1 * d2, &self)?).into())
-    }
-}
-
-impl ShapeWithOneHole for ((), usize, usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let ((), d1, d2, d3) = self;
-        let d = hole_size(el_count, d1 * d2 * d3, &self)?;
-        Ok((d, d1, d2, d3).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, (), usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, (), d2, d3) = self;
-        let d = hole_size(el_count, d1 * d2 * d3, &self)?;
-        Ok((d1, d, d2, d3).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, (), usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, (), d3) = self;
-        let d = hole_size(el_count, d1 * d2 * d3, &self)?;
-        Ok((d1, d2, d, d3).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, usize, ()) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, d3, ()) = self;
-        let d = hole_size(el_count, d1 * d2 * d3, &self)?;
-        Ok((d1, d2, d3, d).into())
-    }
-}
-
-impl ShapeWithOneHole for ((), usize, usize, usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let ((), d1, d2, d3, d4) = self;
-        let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
-        Ok((d, d1, d2, d3, d4).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, (), usize, usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, (), d2, d3, d4) = self;
-        let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
-        Ok((d1, d, d2, d3, d4).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, (), usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, (), d3, d4) = self;
-        let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
-        Ok((d1, d2, d, d3, d4).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, usize, (), usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, d3, (), d4) = self;
-        let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
-        Ok((d1, d2, d3, d, d4).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, usize, usize, ()) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, d3, d4, ()) = self;
-        let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
-        Ok((d1, d2, d3, d4, d).into())
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@ -627,3 +482,171 @@ mod tests {
        assert_eq!(shape.stride_contiguous(), [458 * 792, 458, 1]);
    }
 }
+
+/// A target shape in which one dimension may be written as `()`, a "hole" whose
+/// size is filled in from the total number of elements.
+pub trait ShapeWithOneHole {
+    /// Converts `self` into a concrete [`Shape`], given the total element count `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape>;
+}
+
+impl<S: Into<Shape>> ShapeWithOneHole for S {
+    /// Anything already convertible into a [`Shape`] has no hole to fill, so the
+    /// element count is ignored and the value is converted as is.
+    fn into_shape(self, _el_count: usize) -> Result<Shape> {
+        let shape: Shape = self.into();
+        Ok(shape)
+    }
+}
+
+impl ShapeWithOneHole for ((),) {
+    /// A lone hole takes every element: the result is built from `el_count` alone.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let shape: Shape = el_count.into();
+        Ok(shape)
+    }
+}
+
+impl ShapeWithOneHole for ((), usize) {
+    /// Resolves the leading hole to `el_count / d1`.
+    ///
+    /// Returns an error when `d1` is zero or does not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let ((), d1) = self;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d1 == 0 || el_count % d1 != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d1}")
+        }
+        Ok((el_count / d1, d1).into())
+    }
+}
+
+impl ShapeWithOneHole for (usize, ()) {
+    /// Resolves the trailing hole to `el_count / d1`.
+    ///
+    /// Returns an error when `d1` is zero or does not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let (d1, ()) = self;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d1 == 0 || el_count % d1 != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d1}")
+        }
+        Ok((d1, el_count / d1).into())
+    }
+}
+
+impl ShapeWithOneHole for ((), usize, usize) {
+    /// Resolves the hole in position 0 to `el_count / (d1 * d2)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let ((), d1, d2) = self;
+        let d = d1 * d2;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((el_count / d, d1, d2).into())
+    }
+}
+
+impl ShapeWithOneHole for (usize, (), usize) {
+    /// Resolves the hole in position 1 to `el_count / (d1 * d2)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let (d1, (), d2) = self;
+        let d = d1 * d2;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, el_count / d, d2).into())
+    }
+}
+
+impl ShapeWithOneHole for (usize, usize, ()) {
+    /// Resolves the hole in position 2 to `el_count / (d1 * d2)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let (d1, d2, ()) = self;
+        let d = d1 * d2;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, el_count / d).into())
+    }
+}
+
+impl ShapeWithOneHole for ((), usize, usize, usize) {
+    /// Resolves the hole in position 0 to `el_count / (d1 * d2 * d3)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let ((), d1, d2, d3) = self;
+        let d = d1 * d2 * d3;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((el_count / d, d1, d2, d3).into())
+    }
+}
+
+impl ShapeWithOneHole for (usize, (), usize, usize) {
+    /// Resolves the hole in position 1 to `el_count / (d1 * d2 * d3)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let (d1, (), d2, d3) = self;
+        let d = d1 * d2 * d3;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, el_count / d, d2, d3).into())
+    }
+}
+
+impl ShapeWithOneHole for (usize, usize, (), usize) {
+    /// Resolves the hole in position 2 to `el_count / (d1 * d2 * d3)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let (d1, d2, (), d3) = self;
+        let d = d1 * d2 * d3;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, el_count / d, d3).into())
+    }
+}
+
+impl ShapeWithOneHole for (usize, usize, usize, ()) {
+    /// Resolves the hole in position 3 to `el_count / (d1 * d2 * d3)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let (d1, d2, d3, ()) = self;
+        let d = d1 * d2 * d3;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, d3, el_count / d).into())
+    }
+}
+
+impl ShapeWithOneHole for ((), usize, usize, usize, usize) {
+    /// Resolves the hole in position 0 to `el_count / (d1 * d2 * d3 * d4)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let ((), d1, d2, d3, d4) = self;
+        let d = d1 * d2 * d3 * d4;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((el_count / d, d1, d2, d3, d4).into())
+    }
+}
+
+impl ShapeWithOneHole for (usize, (), usize, usize, usize) {
+    /// Resolves the hole in position 1 to `el_count / (d1 * d2 * d3 * d4)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let (d1, (), d2, d3, d4) = self;
+        let d = d1 * d2 * d3 * d4;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, el_count / d, d2, d3, d4).into())
+    }
+}
+
+impl ShapeWithOneHole for (usize, usize, (), usize, usize) {
+    /// Resolves the hole in position 2 to `el_count / (d1 * d2 * d3 * d4)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let (d1, d2, (), d3, d4) = self;
+        let d = d1 * d2 * d3 * d4;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, el_count / d, d3, d4).into())
+    }
+}
+
+impl ShapeWithOneHole for (usize, usize, usize, (), usize) {
+    /// Resolves the hole in position 3 to `el_count / (d1 * d2 * d3 * d4)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let (d1, d2, d3, (), d4) = self;
+        let d = d1 * d2 * d3 * d4;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, d3, el_count / d, d4).into())
+    }
+}
+
+impl ShapeWithOneHole for (usize, usize, usize, usize, ()) {
+    /// Resolves the hole in position 4 to `el_count / (d1 * d2 * d3 * d4)`.
+    ///
+    /// Returns an error when the product of the known dimensions is zero or does
+    /// not evenly divide `el_count`.
+    fn into_shape(self, el_count: usize) -> Result<Shape> {
+        let (d1, d2, d3, d4, ()) = self;
+        let d = d1 * d2 * d3 * d4;
+        // Test for zero first: `el_count % 0` would panic rather than report an error.
+        if d == 0 || el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, d3, d4, el_count / d).into())
+    }
+}
--- a/candle-core/src/storage.rs
+++ b/candle-core/src/storage.rs
@ -1,7 +1,6 @@
 use crate::backend::BackendStorage;
-use crate::op::{self, CmpOp, ReduceOp};
-use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage, Result, Shape};
-use crate::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3};
+use crate::op::{self, CmpOp, CustomOp1, CustomOp2, CustomOp3, ReduceOp};
+use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, Result, Shape};

 // We do not want to implement Clone on Storage as cloning may fail because of
 // out of memory. Instead try_clone should be used.
@ -9,7 +8,6 @@ use crate::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3}
 pub enum Storage {
    Cpu(CpuStorage),
    Cuda(CudaStorage),
-    Metal(MetalStorage),
 }

 impl Storage {
@ -20,10 +18,6 @@ impl Storage {
                let storage = storage.try_clone(layout)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.try_clone(layout)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -31,7 +25,6 @@ impl Storage {
        match self {
            Self::Cpu(_) => Device::Cpu,
            Self::Cuda(storage) => Device::Cuda(storage.device().clone()),
-            Self::Metal(storage) => Device::Metal(storage.device().clone()),
        }
    }

@ -39,24 +32,13 @@ impl Storage {
        match self {
            Self::Cpu(storage) => storage.dtype(),
            Self::Cuda(storage) => storage.dtype(),
-            Self::Metal(storage) => storage.dtype(),
        }
    }

    pub(crate) fn same_device(&self, rhs: &Self, op: &'static str) -> Result<()> {
-        let lhs_device = self.device();
-        let rhs_device = rhs.device();
-        let lhs = lhs_device.location();
-        let rhs = rhs_device.location();
-        let same_device = if self.device().is_metal() {
-            // On metal, we require the device to be exactly the same rather than
-            // having the same location. In cuda this is not necessary as all CudaDevice on the
-            // same GPU will use the same cuda stream.
-            lhs_device.same_device(&rhs_device)
-        } else {
-            lhs == rhs
-        };
-        if !same_device {
+        let lhs = self.device().location();
+        let rhs = rhs.device().location();
+        if lhs != rhs {
            Err(Error::DeviceMismatchBinaryOp { lhs, rhs, op }.bt())
        } else {
            Ok(())
@ -83,10 +65,6 @@ impl Storage {
                let storage = storage.affine(layout, mul, add)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.affine(layout, mul, add)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -100,10 +78,6 @@ impl Storage {
                let storage = storage.powf(layout, alpha)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.powf(layout, alpha)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -117,10 +91,6 @@ impl Storage {
                let storage = storage.elu(layout, alpha)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.elu(layout, alpha)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -142,10 +112,6 @@ impl Storage {
                let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => {
                // Should not happen because of the same device check above but we're defensive
                // anyway.
@ -169,10 +135,6 @@ impl Storage {
                let storage = storage.reduce_op(op, layout, s)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.reduce_op(op, layout, s)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -186,10 +148,6 @@ impl Storage {
                let storage = storage.to_dtype(layout, dtype)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.to_dtype(layout, dtype)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -203,10 +161,6 @@ impl Storage {
                let (storage, shape) = c.cuda_fwd(storage, l)?;
                Ok((Self::Cuda(storage), shape))
            }
-            Self::Metal(storage) => {
-                let (storage, shape) = c.metal_fwd(storage, l)?;
-                Ok((Self::Metal(storage), shape))
-            }
        }
    }

@ -227,10 +181,6 @@ impl Storage {
                let (s, shape) = c.cuda_fwd(s1, l1, s2, l2)?;
                Ok((Self::Cuda(s), shape))
            }
-            (Self::Metal(s1), Self::Metal(s2)) => {
-                let (s, shape) = c.metal_fwd(s1, l1, s2, l2)?;
-                Ok((Self::Metal(s), shape))
-            }
            _ => unreachable!(),
        }
    }
@ -255,55 +205,6 @@ impl Storage {
                let (s, shape) = c.cuda_fwd(s1, l1, s2, l2, s3, l3)?;
                Ok((Self::Cuda(s), shape))
            }
-            (Self::Metal(s1), Self::Metal(s2), Self::Metal(s3)) => {
-                let (s, shape) = c.metal_fwd(s1, l1, s2, l2, s3, l3)?;
-                Ok((Self::Metal(s), shape))
-            }
-            _ => unreachable!(),
-        }
-    }
-
-    pub(crate) fn inplace_op1(&mut self, l: &Layout, c: &dyn InplaceOp1) -> Result<()> {
-        match self {
-            Self::Cpu(storage) => c.cpu_fwd(storage, l),
-            Self::Cuda(storage) => c.cuda_fwd(storage, l),
-            Self::Metal(storage) => c.metal_fwd(storage, l),
-        }
-    }
-
-    pub(crate) fn inplace_op2(
-        &mut self,
-        l1: &Layout,
-        t2: &Self,
-        l2: &Layout,
-        c: &dyn InplaceOp2,
-    ) -> Result<()> {
-        self.same_device(t2, c.name())?;
-        match (self, t2) {
-            (Self::Cpu(s1), Self::Cpu(s2)) => c.cpu_fwd(s1, l1, s2, l2),
-            (Self::Cuda(s1), Self::Cuda(s2)) => c.cuda_fwd(s1, l1, s2, l2),
-            (Self::Metal(s1), Self::Metal(s2)) => c.metal_fwd(s1, l1, s2, l2),
-            _ => unreachable!(),
-        }
-    }
-
-    pub(crate) fn inplace_op3(
-        &mut self,
-        l1: &Layout,
-        t2: &Self,
-        l2: &Layout,
-        t3: &Self,
-        l3: &Layout,
-        c: &dyn InplaceOp3,
-    ) -> Result<()> {
-        self.same_device(t2, c.name())?;
-        self.same_device(t3, c.name())?;
-        match (self, t2, t3) {
-            (Self::Cpu(s1), Self::Cpu(s2), Self::Cpu(s3)) => c.cpu_fwd(s1, l1, s2, l2, s3, l3),
-            (Self::Cuda(s1), Self::Cuda(s2), Self::Cuda(s3)) => c.cuda_fwd(s1, l1, s2, l2, s3, l3),
-            (Self::Metal(s1), Self::Metal(s2), Self::Metal(s3)) => {
-                c.metal_fwd(s1, l1, s2, l2, s3, l3)
-            }
            _ => unreachable!(),
        }
    }
@ -318,10 +219,6 @@ impl Storage {
                let storage = storage.unary_impl::<B>(layout)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.unary_impl::<B>(layout)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -342,10 +239,6 @@ impl Storage {
                let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => {
                // Should not happen because of the same device check above but we're defensive
                // anyway.
@ -377,10 +270,6 @@ impl Storage {
                let s = inp.conv1d(l, kernel, kernel_l, params)?;
                Ok(Self::Cuda(s))
            }
-            (Storage::Metal(inp), Storage::Metal(kernel)) => {
-                let s = inp.conv1d(l, kernel, kernel_l, params)?;
-                Ok(Self::Metal(s))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -390,37 +279,6 @@ impl Storage {
        }
    }

-    pub(crate) fn conv_transpose1d(
-        &self,
-        l: &Layout,
-        kernel: &Self,
-        kernel_l: &Layout,
-        params: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self> {
-        self.same_device(kernel, "conv-transpose1d")?;
-        self.same_dtype(kernel, "conv-transpose1d")?;
-        match (self, &kernel) {
-            (Storage::Cpu(inp), Storage::Cpu(kernel)) => {
-                let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
-                Ok(Self::Cpu(s))
-            }
-            (Storage::Cuda(inp), Storage::Cuda(kernel)) => {
-                let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
-                Ok(Self::Cuda(s))
-            }
-            (Storage::Metal(inp), Storage::Metal(kernel)) => {
-                let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
-                Ok(Self::Metal(s))
-            }
-            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
-                lhs: lhs.device().location(),
-                rhs: rhs.device().location(),
-                op: "conv-transpose1d",
-            }
-            .bt()),
-        }
-    }
-
    pub(crate) fn conv2d(
        &self,
        l: &Layout,
@ -439,10 +297,6 @@ impl Storage {
                let s = inp.conv2d(l, kernel, kernel_l, params)?;
                Ok(Self::Cuda(s))
            }
-            (Storage::Metal(inp), Storage::Metal(kernel)) => {
-                let s = inp.conv2d(l, kernel, kernel_l, params)?;
-                Ok(Self::Metal(s))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -470,10 +324,6 @@ impl Storage {
                let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
                Ok(Self::Cuda(s))
            }
-            (Storage::Metal(inp), Storage::Metal(kernel)) => {
-                let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
-                Ok(Self::Metal(s))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -498,10 +348,6 @@ impl Storage {
                let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -520,27 +366,6 @@ impl Storage {
                let storage = storage.max_pool2d(layout, kernel_size, stride)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.max_pool2d(layout, kernel_size, stride)?;
-                Ok(Self::Metal(storage))
-            }
-        }
-    }
-
-    pub(crate) fn upsample_nearest1d(&self, layout: &Layout, sz: usize) -> Result<Self> {
-        match self {
-            Storage::Cpu(storage) => {
-                let storage = storage.upsample_nearest1d(layout, sz)?;
-                Ok(Self::Cpu(storage))
-            }
-            Self::Cuda(storage) => {
-                let storage = storage.upsample_nearest1d(layout, sz)?;
-                Ok(Self::Cuda(storage))
-            }
-            Self::Metal(storage) => {
-                let storage = storage.upsample_nearest1d(layout, sz)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -554,10 +379,6 @@ impl Storage {
                let storage = storage.upsample_nearest2d(layout, h, w)?;
                Ok(Self::Cuda(storage))
            }
-            Self::Metal(storage) => {
-                let storage = storage.upsample_nearest2d(layout, h, w)?;
-                Ok(Self::Metal(storage))
-            }
        }
    }

@ -581,10 +402,6 @@ impl Storage {
                let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(cond), Self::Metal(t), Self::Metal(f)) => {
-                let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
-                Ok(Self::Metal(storage))
-            }
            (_, lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -611,10 +428,6 @@ impl Storage {
                let storage = s.gather(l, indexes, indexes_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(s), Self::Metal(indexes)) => {
-                let storage = s.gather(l, indexes, indexes_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            _ => unreachable!(),
        }
    }
@ -639,10 +452,6 @@ impl Storage {
                let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
-                let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            _ => unreachable!(),
        }
    }
@ -667,10 +476,6 @@ impl Storage {
                let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
-                let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            _ => unreachable!(),
        }
    }
@ -692,10 +497,6 @@ impl Storage {
                let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -723,10 +524,6 @@ impl Storage {
                let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
-                Ok(Self::Metal(storage))
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -746,9 +543,6 @@ impl Storage {
        match (self, dst) {
            (Self::Cpu(src), Self::Cpu(dst)) => src.copy_strided_src(dst, dst_offset, src_l),
            (Self::Cuda(src), Self::Cuda(dst)) => Ok(src.copy_strided_src(dst, dst_offset, src_l)?),
-            (Self::Metal(src), Self::Metal(dst)) => {
-                Ok(src.copy_strided_src(dst, dst_offset, src_l)?)
-            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
@ -757,32 +551,4 @@ impl Storage {
            .bt()),
        }
    }
-
-    #[allow(clippy::too_many_arguments)]
-    pub(crate) fn copy2d(
-        &self,
-        dst: &mut Self,
-        d1: usize,
-        d2: usize,
-        src_s: usize,
-        dst_s: usize,
-        src_o: usize,
-        dst_o: usize,
-    ) -> Result<()> {
-        match (self, dst) {
-            (Self::Cpu(src), Self::Cpu(dst)) => src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o),
-            (Self::Cuda(src), Self::Cuda(dst)) => {
-                Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?)
-            }
-            (Self::Metal(src), Self::Metal(dst)) => {
-                Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?)
-            }
-            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
-                lhs: lhs.device().location(),
-                rhs: rhs.device().location(),
-                op: "copy2d",
-            }
-            .bt()),
-        }
-    }
 }
--- a/candle-core/src/tensor.rs
+++ b/candle-core/src/tensor.rs
--- a/candle-core/src/tensor_cat.rs
+++ b/candle-core/src/tensor_cat.rs
@ -1,238 +0,0 @@
-use crate::{shape::Dim, Error, Result, Shape, Tensor};
-
-impl Tensor {
-    /// Concatenates two or more tensors along a particular dimension.
-    ///
-    /// All tensors must of the same rank, and the output will have
-    /// the same rank
-    ///
-    /// ```rust
-    /// # use candle_core::{Tensor, DType, Device};
-    /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
-    /// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
-    ///
-    /// let c = Tensor::cat(&[&a, &b], 0)?;
-    /// assert_eq!(c.shape().dims(), &[4, 3]);
-    ///
-    /// let c = Tensor::cat(&[&a, &b], 1)?;
-    /// assert_eq!(c.shape().dims(), &[2, 6]);
-    /// # Ok::<(), candle_core::Error>(())
-    /// ```
-    pub fn cat<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
-        if args.is_empty() {
-            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
-        }
-        let arg0 = args[0].as_ref();
-        if args.len() == 1 {
-            return Ok(arg0.clone());
-        }
-        let dim = dim.to_index(arg0.shape(), "cat")?;
-        for arg in args {
-            arg.as_ref().check_dim(dim, "cat")?;
-        }
-        for (arg_idx, arg) in args.iter().enumerate() {
-            let arg = arg.as_ref();
-            if arg0.rank() != arg.rank() {
-                Err(Error::UnexpectedNumberOfDims {
-                    expected: arg0.rank(),
-                    got: arg.rank(),
-                    shape: arg.shape().clone(),
-                }
-                .bt())?
-            }
-            for (dim_idx, (v1, v2)) in arg0
-                .shape()
-                .dims()
-                .iter()
-                .zip(arg.shape().dims().iter())
-                .enumerate()
-            {
-                if dim_idx != dim && v1 != v2 {
-                    Err(Error::ShapeMismatchCat {
-                        dim: dim_idx,
-                        first_shape: arg0.shape().clone(),
-                        n: arg_idx + 1,
-                        nth_shape: arg.shape().clone(),
-                    }
-                    .bt())?
-                }
-            }
-        }
-        let all_contiguous = args.iter().all(|v| v.as_ref().is_contiguous());
-        if all_contiguous {
-            Self::cat_contiguous(args, dim)
-        } else if dim == 0 {
-            Self::cat0(args)
-        } else {
-            let args: Vec<Tensor> = args
-                .iter()
-                .map(|a| a.as_ref().transpose(0, dim))
-                .collect::<Result<Vec<_>>>()?;
-            let cat = Self::cat0(&args)?;
-            cat.transpose(0, dim)
-        }
-    }
-
-    fn cat0<A: AsRef<Tensor>>(args: &[A]) -> Result<Self> {
-        if args.is_empty() {
-            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
-        }
-        let arg0 = args[0].as_ref();
-        if args.len() == 1 {
-            return Ok(arg0.clone());
-        }
-        let rank = arg0.rank();
-        let device = arg0.device();
-        let dtype = arg0.dtype();
-        let first_dims = arg0.shape().dims();
-        let mut cat_dims = first_dims.to_vec();
-        cat_dims[0] = 0;
-        let mut offsets = vec![0usize];
-        for (arg_idx, arg) in args.iter().enumerate() {
-            let arg = arg.as_ref();
-            if arg.dtype() != dtype {
-                Err(Error::DTypeMismatchBinaryOp {
-                    lhs: dtype,
-                    rhs: arg.dtype(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if arg.device().location() != device.location() {
-                Err(Error::DeviceMismatchBinaryOp {
-                    lhs: device.location(),
-                    rhs: arg.device().location(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if rank != arg.rank() {
-                Err(Error::UnexpectedNumberOfDims {
-                    expected: rank,
-                    got: arg.rank(),
-                    shape: arg.shape().clone(),
-                }
-                .bt())?
-            }
-            for (dim_idx, (v1, v2)) in arg0
-                .shape()
-                .dims()
-                .iter()
-                .zip(arg.shape().dims().iter())
-                .enumerate()
-            {
-                if dim_idx == 0 {
-                    cat_dims[0] += v2;
-                }
-                if dim_idx != 0 && v1 != v2 {
-                    Err(Error::ShapeMismatchCat {
-                        dim: dim_idx,
-                        first_shape: arg0.shape().clone(),
-                        n: arg_idx + 1,
-                        nth_shape: arg.shape().clone(),
-                    }
-                    .bt())?
-                }
-            }
-            let next_offset = offsets.last().unwrap() + arg.elem_count();
-            offsets.push(next_offset);
-        }
-        let shape = Shape::from(cat_dims);
-        let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, 0));
-        let mut storage = unsafe { device.alloc_uninit(&shape, dtype)? };
-        for (arg, &offset) in args.iter().zip(offsets.iter()) {
-            let arg = arg.as_ref();
-            arg.storage()
-                .copy_strided_src(&mut storage, offset, arg.layout())?;
-        }
-        Ok(crate::tensor::from_storage(storage, shape, op, false))
-    }
-
-    fn cat_contiguous<A: AsRef<Tensor>>(args: &[A], dim: usize) -> Result<Self> {
-        if args.is_empty() {
-            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
-        }
-        let arg0 = args[0].as_ref();
-        if args.len() == 1 {
-            return Ok(arg0.clone());
-        }
-        let rank = arg0.rank();
-        let device = arg0.device();
-        let dtype = arg0.dtype();
-        let first_dims = arg0.shape().dims();
-        let mut cat_dims = first_dims.to_vec();
-        cat_dims[dim] = 0;
-        for (arg_idx, arg) in args.iter().enumerate() {
-            let arg = arg.as_ref();
-            if arg.dtype() != dtype {
-                Err(Error::DTypeMismatchBinaryOp {
-                    lhs: dtype,
-                    rhs: arg.dtype(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if arg.device().location() != device.location() {
-                Err(Error::DeviceMismatchBinaryOp {
-                    lhs: device.location(),
-                    rhs: arg.device().location(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if rank != arg.rank() {
-                Err(Error::UnexpectedNumberOfDims {
-                    expected: rank,
-                    got: arg.rank(),
-                    shape: arg.shape().clone(),
-                }
-                .bt())?
-            }
-            for (dim_idx, (v1, v2)) in arg0
-                .shape()
-                .dims()
-                .iter()
-                .zip(arg.shape().dims().iter())
-                .enumerate()
-            {
-                if dim_idx == dim {
-                    cat_dims[dim] += v2;
-                }
-                if dim_idx != dim && v1 != v2 {
-                    Err(Error::ShapeMismatchCat {
-                        dim: dim_idx,
-                        first_shape: arg0.shape().clone(),
-                        n: arg_idx + 1,
-                        nth_shape: arg.shape().clone(),
-                    }
-                    .bt())?
-                }
-            }
-        }
-        let cat_target_dim_len = cat_dims[dim];
-        let block_size: usize = cat_dims.iter().skip(1 + dim).product();
-        let shape = Shape::from(cat_dims);
-        let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, dim));
-        let mut storage = unsafe { device.alloc_uninit(&shape, dtype)? };
-        let mut dst_o = 0;
-        for arg in args.iter() {
-            let arg = arg.as_ref();
-            let arg_dims = arg.shape().dims();
-            let d1: usize = arg_dims.iter().take(dim).product();
-            let d2 = block_size * arg_dims[dim];
-            let dst_s = block_size * cat_target_dim_len;
-            let src_o = arg.layout().start_offset();
-            arg.storage().copy2d(
-                &mut storage,
-                d1,
-                d2,
-                /* src_s */ d2,
-                dst_s,
-                src_o,
-                dst_o,
-            )?;
-            dst_o += d2;
-        }
-        Ok(crate::tensor::from_storage(storage, shape, op, false))
-    }
-}
--- a/candle-core/src/test_utils.rs
+++ b/candle-core/src/test_utils.rs
@ -4,7 +4,7 @@ use crate::{Result, Tensor};
 macro_rules! test_device {
    // TODO: Switch to generating the two last arguments automatically once concat_idents is
    // stable. https://github.com/rust-lang/rust/issues/29599
-    ($fn_name: ident, $test_cpu: ident, $test_cuda: ident, $test_metal: ident) => {
+    ($fn_name: ident, $test_cpu: ident, $test_cuda: ident) => {
        #[test]
        fn $test_cpu() -> Result<()> {
            $fn_name(&Device::Cpu)
@ -15,12 +15,6 @@ macro_rules! test_device {
        fn $test_cuda() -> Result<()> {
            $fn_name(&Device::new_cuda(0)?)
        }
-
-        #[cfg(feature = "metal")]
-        #[test]
-        fn $test_metal() -> Result<()> {
-            $fn_name(&Device::new_metal(0)?)
-        }
    };
 }

--- a/candle-core/src/utils.rs
+++ b/candle-core/src/utils.rs
@ -23,10 +23,6 @@ pub fn cuda_is_available() -> bool {
    cfg!(feature = "cuda")
 }

-pub fn metal_is_available() -> bool {
-    cfg!(feature = "metal")
-}
-
 pub fn with_avx() -> bool {
    cfg!(target_feature = "avx")
 }
--- a/candle-core/src/variable.rs
+++ b/candle-core/src/variable.rs
@ -107,10 +107,6 @@ impl Var {
        Ok(Self(inner))
    }

-    pub fn as_detached_tensor(&self) -> Tensor {
-        self.0.detach()
-    }
-
    pub fn as_tensor(&self) -> &Tensor {
        &self.0
    }
--- a/candle-core/tests/conv_tests.rs
+++ b/candle-core/tests/conv_tests.rs
@ -13,14 +13,6 @@ res = torch.nn.functional.conv1d(t, w)
 print(res.flatten())
 res = torch.nn.functional.conv1d(t, w, padding=1)
 print(res.flatten())
-
-w_t = w.transpose(0, 1)
-res = torch.nn.functional.conv_transpose1d(t, w_t)
-print(res.shape)
-print(res)
-res = torch.nn.functional.conv_transpose1d(t, w_t, groups=2)
-print(res.shape)
-print(res)
 */
 fn conv1d(dev: &Device) -> Result<()> {
    let t = Tensor::new(
@ -53,31 +45,6 @@ fn conv1d(dev: &Device) -> Result<()> {
        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
        [2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352]
    );
-
-    let w = w.transpose(0, 1)?;
-    // The CPU kernels applied in the contiguous and non contiguous cases are different.
-    for w in [w.clone(), w.contiguous()?] {
-        let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 1)?;
-        assert_eq!(res.dims(), [1, 2, 7]);
-        assert_eq!(
-            test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-            [
-                0.0699, -1.2899, 8.3018, 5.5873, 2.4572, -2.6143, -0.0706, 1.8765, 4.8318, 1.1538,
-                4.7076, -5.9745, -0.8276, 1.621
-            ],
-        );
-        let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 2)?;
-        assert_eq!(res.dims(), [1, 4, 7]);
-        assert_eq!(
-            test_utils::to_vec2_round(&res.squeeze(0)?, 4)?,
-            [
-                [-1.5596, -1.8099, 2.0407, 4.8764, -0.1743, -0.735, -0.7819],
-                [0.7816, 3.8152, -0.5926, 2.2515, -5.1844, -0.3157, 1.4721],
-                [1.6295, 0.52, 6.2611, 0.7109, 2.6315, -1.8793, 0.7113],
-                [1.0949, 1.0166, 1.7464, 2.4561, -0.79, -0.5119, 0.1488]
-            ]
-        );
-    }
    Ok(())
 }

@ -135,7 +102,7 @@ fn conv2d(dev: &Device) -> Result<()> {
            0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712,
            0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790,
            -0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006,
-            -0.8, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
+            -0.8000, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
        ],
        dev,
    )?;
@ -163,9 +130,7 @@ fn conv2d(dev: &Device) -> Result<()> {
            10.389, 3.6023, -4.2808, 0.2672, 5.3646, -5.2023, -2.1955, -9.4075
        ]
    );
-
    let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
-
    assert_eq!(res.dims(), [1, 2, 7, 7]);
    assert_eq!(
        test_utils::to_vec3_round(&res.i(0)?, 4)?,
@ -190,7 +155,6 @@ fn conv2d(dev: &Device) -> Result<()> {
            ]
        ]
    );
-
    // Dilations.
    let res = t.conv2d(&w, 0, 1, 2, 1)?;
    assert_eq!(res.dims(), [1, 2, 1, 1]);
@ -229,7 +193,6 @@ fn conv2d(dev: &Device) -> Result<()> {
            ]
        ]
    );
-
    Ok(())
 }

@ -276,13 +239,13 @@ fn conv2d_small(dev: &Device) -> Result<()> {
    assert_eq!(
        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
        [
-            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1640,
-            -0.0111, -0.1742, 0.0, 0.0, 0.0, 0.0, 2.6437, -2.0268, 1.1823, 0.0, 0.0, 0.0, 0.0,
-            3.2855, -1.0324, 0.2539, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-            0.0, 0.0, 0.0, 0.0
+            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
+            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1640, -0.0111, -0.1742, 0.0000, 0.0000,
+            0.0000, 0.0000, 2.6437, -2.0268, 1.1823, 0.0000, 0.0000, 0.0000, 0.0000, 3.2855,
+            -1.0324, 0.2539, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
+            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000
        ]
    );
-
    let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
    assert_eq!(res.dims(), [1, 1, 3, 3]);
    assert_eq!(
@ -384,7 +347,6 @@ print(w.grad.shape)
 print(w.grad[0])
 */
 fn conv2d_grad(dev: &Device) -> Result<()> {
-    // conv-transposes are not implemented for metal
    use candle_core::Var;
    let t = Var::from_slice(
        &[
@ -397,7 +359,7 @@ fn conv2d_grad(dev: &Device) -> Result<()> {
            0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712,
            0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790,
            -0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006,
-            -0.8, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
+            -0.8000, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
        ],
        (1, 4, 5, 5),
        dev,
@ -517,251 +479,17 @@ fn conv2d_grad(dev: &Device) -> Result<()> {
            ]
        ]
    );
-
-    // Replicate the issue from https://github.com/huggingface/candle/issues/1212
-    let res = t.i((.., .., 0..4, 0..4))?.conv2d(&w, 0, 2, 1, 1)?;
-    let loss = res.sqr()?.sum_all()?;
-    assert_eq!(test_utils::to_vec0_round(&loss, 2)?, 21.12f32);
-    let grads = loss.backward()?;
-    let grad_t = grads.get(&t).unwrap();
-    let grad_w = grads.get(&w).unwrap();
-    assert_eq!(grad_t.dims(), [1, 4, 5, 5]);
-    assert_eq!(grad_w.dims(), [2, 4, 3, 3]);
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_t.i(0)?, 2)?,
-        [
-            [
-                [9.29, -7.03, 7.87, 0.0, 0.0],
-                [-1.8, -7.82, 5.9, 0.0, 0.0],
-                [-3.12, 4.49, 5.52, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ],
-            [
-                [21.73, 3.39, 4.77, 0.0, 0.0],
-                [8.25, 3.73, 27.61, 0.0, 0.0],
-                [-20.55, -5.61, -2.77, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ],
-            [
-                [-8.98, 9.91, -7.15, 0.0, 0.0],
-                [4.93, -0.33, 4.56, 0.0, 0.0],
-                [-6.7, -5.76, -8.05, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ],
-            [
-                [23.54, 6.98, -10.0, 0.0, 0.0],
-                [9.65, 6.18, 18.72, 0.0, 0.0],
-                [3.29, -5.27, 0.79, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ]
-        ]
-    );
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_w.i(0)?, 2)?,
-        [
-            [
-                [-3.47, 7.44, 0.66],
-                [12.89, -3.4, -9.29],
-                [-14.16, -0.83, 7.14]
-            ],
-            [
-                [-3.23, 5.37, -3.02],
-                [-2.12, -11.24, 1.94],
-                [6.97, 7.2, 2.99]
-            ],
-            [
-                [-4.04, -3.31, 4.87],
-                [-6.68, -5.68, 1.73],
-                [-5.54, 4.32, 0.52]
-            ],
-            [[-4.72, 1.5, 4.72], [3.79, 4.04, 6.76], [-4.6, 5.8, 6.93]]
-        ]
-    );
-
-    // Conv Transpose 2d Test
-    // Tested against the following Python code:
-
-    // import torch
-    // torch.manual_seed(4242)
-    // padding = 4
-    // outpadding = 2
-    // dilation = 3
-    // stride = 3
-    // input = torch.randn((1, 4, 7, 5), requires_grad=True)
-    // kernel = torch.randn((4, 2, 3, 5), requires_grad=True)
-    // print("input", input.flatten())
-    // print("kernel", kernel.flatten())
-    // res = torch.nn.functional.conv_transpose2d(
-    //     input,
-    //     kernel,
-    //     stride=stride,
-    //     padding=padding,
-    //     dilation=dilation,
-    //     output_padding=outpadding,
-    // )
-    // res.retain_grad()
-    // print(res.shape)
-    // loss = (res**2).sum()
-    // print(loss)
-    // loss.backward()
-    // print(input.grad.shape)
-    // print("input grad", torch.round(input.grad, decimals=1))
-    // print(kernel.grad.shape)
-    // print("kernel grad", torch.round(kernel.grad.flatten(), decimals=1))
-
-    let padding = 4;
-    let outpadding = 2;
-    let dilation = 3;
-    let stride = 3;
-
-    let t = Var::from_slice(
-        &[
-            0.4056_f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997,
-            3.0616, 1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699, 0.0823, 0.3526, 0.6843,
-            0.2395, 1.2279, -0.9287, -1.7030, 0.1370, 0.6047, 0.3770, -0.6266, 0.3529, 2.2013,
-            -0.6836, 0.2477, 1.3127, -0.2260, 0.2622, -1.2974, -0.8140, -0.8404, -0.3490, 0.0130,
-            1.3123, 1.7569, -0.3956, -1.8255, 0.1727, -0.3538, 2.6941, 1.0529, 0.4219, -0.2071,
-            1.1586, 0.4717, 0.3865, -0.5690, -0.5010, -0.1310, 0.7796, 0.6630, -0.2021, 2.6090,
-            0.2049, 0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323,
-            -1.3712, 0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742,
-            0.3790, -0.4431, -0.4720, -0.7890, 0.2620, 0.5411, -1.1715, -2.4997, 2.3249, -0.8912,
-            -0.4733, -0.5701, -2.8888, -1.4112, -0.5471, -0.9234, -1.1660, 0.4189, -0.7465,
-            -0.6473, 0.1402, 0.7875, 0.5377, -0.6779, -0.8088, -0.4864, -0.2312, 0.9279, 0.1264,
-            1.5480, 0.8265, -0.1025, 0.5138, -0.2512, 0.1576, 1.2705, 0.3641, -0.9325, 0.6451,
-            -0.8537, 0.2378, 0.1794, 0.2752, -0.3687, -1.1149, -0.1410, -0.5829, -0.0892, 1.4258,
-            -2.2789, 0.5270, 0.1825, 1.7007, -0.5263, -0.2954, 0.4440, 0.5537, 0.3492, 0.6186,
-            1.6475, 0.2219,
-        ],
-        (1, 4, 7, 5),
-        dev,
-    )?;
-
-    #[rustfmt::skip]
-    let w = Var::from_slice(
-        &[
-            -1.1744_f32, 0.3266, 2.5893, 1.0142, 0.1763, 0.7752, 0.6604, 0.2029, -0.2145, 0.7234,
-            -0.3441, -1.5400, -0.6333, 0.6613, 0.2083, 0.6230, -1.7002, 0.3393, 0.4049, 1.0762,
-            0.2723, 1.4181, 0.0029, -0.2122, 1.7668, 1.4168, 0.3320, -0.2719, 0.7932, -0.7204,
-            0.4447, 0.1211, 0.5908, 1.0089, -0.1646, 1.8033, -0.6286, 0.2016, -0.3370, 1.2555,
-            0.8009, -0.6488, -0.4652, -1.5685, 1.5860, 0.5583, 0.4623, 0.6026, 0.8828, 2.4990,
-            0.6811, -0.3369, 1.3320, 1.7669, -1.1067, 1.2958, -0.9415, -0.9655, -0.4462, 0.7181,
-            0.5181, -1.1658, -1.8467, -0.7763, 1.2769, 0.8651, 0.9890, 1.5092, 0.7207, -0.8481,
-            0.7417, 0.3375, -1.2685, 1.4572, 1.0915, 0.1093, -0.8550, -0.5831, -0.6309, -0.2509,
-            0.5220, -0.0914, 0.7900, 0.1096, 0.3258, 0.2723, -1.0942, -0.3393, -0.1653, 0.5732,
-            -0.8014, 1.8194, -1.9023, 0.2127, 1.8636, -0.8979, 0.1927, -0.2778, 0.3105, 0.0071,
-            -1.1823, 0.2476, -0.7178, -1.3821, 1.0769, -0.4376, -0.9967, -0.1227, 1.6197, -1.0604,
-            0.1372, 0.8141, -0.6163, 0.7304, -0.8285, 2.0636, -0.7176, 0.2495, -0.2581, -0.4478,
-        ],
-        (4, 2, 3, 5),
-        dev,
-    )?;
-    let res = t.conv_transpose2d(&w, padding, outpadding, stride, dilation)?;
-    let loss = res.sqr()?.sum_all()?;
-    assert_eq!(test_utils::to_vec0_round(&loss, 0)?, 2904.0);
-    let grads = loss.backward()?;
-
-    let grad_t = grads.get(&t).unwrap();
-    let grad_w = grads.get(&w).unwrap();
-    assert_eq!(grad_t.dims(), [1, 4, 7, 5]);
-    assert_eq!(grad_w.dims(), [4, 2, 3, 5]);
-
-    assert_eq!(
-        test_utils::to_vec1_round(&grad_w.flatten_all()?, 1)?,
-        [
-            // torch gets 89.1
-            -89.0, -135.3, 136.7, 102.0, -53.4, 117.9, 118.6, -43.9, -218.0, -58.5, -114.3, -150.0,
-            -15.6, 172.1, 66.3, -64.3, -27.9, -19.8, 31.7, 62.1, 5.5, 92.6, 28.2, -29.6, 55.9,
-            52.7, -72.7, -119.8, 53.8, -25.5, 128.8, 19.3, 68.0, 190.9, -64.1, -86.2, -111.2,
-            106.6, -67.7, 37.8, 115.9, 50.4, -77.7, -54.9, 22.3, -4.6, 89.8, 61.7, 122.4, 192.6,
-            -27.8, -104.6, 57.0, 166.4, 27.1, 6.1, 18.7, -93.2, 31.5, 168.2, -3.7, -99.5, -55.5,
-            -10.8, 17.5, 20.8, 16.9, 43.8, 42.0, -89.2, 18.8, -9.6, -84.1, 212.6, 19.7, -50.0,
-            -52.0, -40.0, -166.6, -73.2, -10.8, -73.3, 31.5, -23.4, -79.3, -27.0, -84.4, -42.9,
-            -20.3, 51.8, -16.7, 76.3, -120.5, -65.8, 96.5, -10.7, -45.9, -88.1, 65.4, -7.0, -1.5,
-            92.8, -25.1, -114.2, -5.8, -14.8, -51.2, -20.7, 54.2, -79.8, 47.7, -29.2, -8.8, 53.5,
-            -28.4, 85.0, -18.3, 107.0, 28.3, -71.8
-        ]
-    );
-
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_t.i(0)?, 1)?,
-        [
-            [
-                [32.3, -41.6, -24.0, 14.1, 17.6],
-                [-11.8, 72.5, 87.6, 46.4, 61.5],
-                [115.0, 108.5, -48.6, -63.4, -50.0],
-                [51.3, 5.4, 31.3, 91.1, -30.9],
-                [52.7, 92.8, -68.0, -47.0, 83.0],
-                // pytorch gets -107.1
-                [-10.2, -107.0, -5.4, 213.1, -31.4],
-                [-2.4, 65.1, 9.2, -146.2, -24.2]
-            ],
-            [
-                [-72.6, -63.9, -61.9, 45.3, 33.0],
-                [79.3, -0.5, -26.2, 78.2, 42.7],
-                [90.9, 141.6, 40.1, -62.7, 37.0],
-                [32.8, 198.2, -0.8, -31.1, 27.3],
-                // torch gets 48.0
-                [34.5, 34.9, -47.9, 127.6, -12.3],
-                [-61.4, -3.2, -2.9, -10.9, -16.6],
-                [74.6, 60.1, -68.9, 34.5, -50.4]
-            ],
-            [
-                [37.5, -56.9, -43.6, -13.5, -9.9],
-                [40.0, 97.3, 28.6, 14.2, -30.1],
-                [-22.3, -126.3, -68.8, -8.2, 26.1],
-                [-32.9, 37.3, 108.5, -54.8, 29.6],
-                [34.9, -176.9, -125.0, -28.3, -13.9],
-                [-54.9, 142.6, 62.1, -80.4, -65.6],
-                [7.4, -91.1, -67.6, 35.0, 39.7]
-            ],
-            [
-                [-57.2, -40.9, -10.1, 32.6, 29.4],
-                [18.7, -18.0, 29.5, -1.2, 59.2],
-                [-14.0, -74.4, 19.8, -117.0, 58.2],
-                [-21.8, 163.5, -71.1, -99.0, 80.9],
-                [-58.9, -10.9, 93.8, -139.6, 98.0],
-                // torch gets 54.5
-                [-54.4, 135.3, 6.0, -79.1, 134.6],
-                [27.5, -76.0, 43.4, -2.8, -7.8]
-            ]
-        ]
-    );
    Ok(())
 }

-test_device!(conv1d, conv1d_cpu, conv1d_gpu, conv1d_metal);
-test_device!(
-    conv1d_small,
-    conv1d_small_cpu,
-    conv1d_small_gpu,
-    conv1d_small_metal
-);
-test_device!(conv2d, conv2d_cpu, conv2d_gpu, conv2d_metal);
+test_device!(conv1d, conv1d_cpu, conv1d_gpu);
+test_device!(conv1d_small, conv1d_small_cpu, conv1d_small_gpu);
+test_device!(conv2d, conv2d_cpu, conv2d_gpu);
 test_device!(
    conv2d_non_square,
    conv2d_non_square_cpu,
-    conv2d_non_square_gpu,
-    conv2d_non_square_metal
-);
-test_device!(
-    conv2d_small,
-    conv2d_small_cpu,
-    conv2d_small_gpu,
-    conv2d_small_metal
-);
-test_device!(
-    conv2d_smaller,
-    conv2d_smaller_cpu,
-    conv2d_smaller_gpu,
-    conv2d_smaller_metal
-);
-test_device!(
-    conv2d_grad,
-    conv2d_grad_cpu,
-    conv2d_grad_gpu,
-    conv2_grad_metal
+    conv2d_non_square_gpu
 );
+test_device!(conv2d_small, conv2d_small_cpu, conv2d_small_gpu);
+test_device!(conv2d_smaller, conv2d_smaller_cpu, conv2d_smaller_gpu);
+test_device!(conv2d_grad, conv2d_grad_cpu, conv2d_grad_gpu);
--- a/candle-core/tests/custom_op_tests.rs
+++ b/candle-core/tests/custom_op_tests.rs
@ -112,34 +112,3 @@ fn custom_op1_with_backward() -> Result<()> {

    Ok(())
 }
-
-impl candle_core::InplaceOp1 for Elu {
-    fn name(&self) -> &'static str {
-        "elu"
-    }
-
-    fn cpu_fwd(&self, s: &mut CpuStorage, _l: &Layout) -> Result<()> {
-        let alpha = self.alpha;
-        match s {
-            CpuStorage::BF16(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            CpuStorage::F16(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            CpuStorage::F32(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            CpuStorage::F64(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            _ => candle_core::bail!("unsupported dtype for inplace elu"),
-        }
-        Ok(())
-    }
-}
-
-#[test]
-fn inplace_op1() -> Result<()> {
-    let cpu = &Device::Cpu;
-    let t = Tensor::arange(0u32, 12u32, cpu)?.to_dtype(DType::F32)?;
-    let t = (t - 5.)?;
-    t.inplace_op1(&Elu { alpha: 1. })?;
-    assert_eq!(
-        to_vec1_round(&t, 4)?,
-        &[-0.9933, -0.9817, -0.9502, -0.8647, -0.6321, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
-    );
-    Ok(())
-}
--- a/candle-core/tests/fortran_tensor_3d.pth
+++ b/candle-core/tests/fortran_tensor_3d.pth
--- a/candle-core/tests/grad_tests.rs
+++ b/candle-core/tests/grad_tests.rs
@ -1,4 +1,3 @@
-#![allow(clippy::approx_constant)]
 use anyhow::{Context, Result};
 use candle_core::{test_device, test_utils, Device, Shape, Tensor, Var};

@ -97,24 +96,24 @@ fn unary_grad(device: &Device) -> Result<()> {
    let grads = y.backward()?;
    let grad_x = grads.get(x).context("no grad for x")?;
    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [20.0855, 2.7183, 54.5982, 1.1618]
+        y.to_vec1::<f32>()?,
+        [20.085537, 2.7182817, 54.59815, 1.1618342]
    );
    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [20.0855, 2.7183, 54.5982, 1.1618]
+        grad_x.to_vec1::<f32>()?,
+        [20.085537, 2.7182817, 54.59815, 1.1618342]
    );
    let y = x.exp()?.sqr()?;
    let grads = y.backward()?;
    let grad_x = grads.get(x).context("no grad for x")?;
    assert_eq!(
-        test_utils::to_vec1_round(&y, 3)?,
-        [403.429, 7.389, 2980.958, 1.35]
+        y.to_vec1::<f32>()?,
+        [403.4288, 7.3890557, 2980.9578, 1.3498588]
    );
    // exp(x)^2 = exp(2*x)
    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 2)?,
-        [806.86, 14.78, 5961.92, 2.7]
+        grad_x.to_vec1::<f32>()?,
+        [806.8576, 14.778111, 5961.9155, 2.6997175]
    );
    let y = x.sin()?;
    let grads = y.backward()?;
@ -193,273 +192,6 @@ fn unary_grad(device: &Device) -> Result<()> {
        test_utils::to_vec1_round(grad_x, 2)?,
        [0.01, 0.42, 0.0, 0.98],
    );
-
-    // testing compared to pytorch nn.GELU(approximate = 'tanh')
-    let y = x.gelu()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [2.9964, 0.8412, 3.9999, 0.0839]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [1.0116, 1.0830, 1.0003, 0.6188],
-    );
-
-    // Testing compared to pytorch torch.erf
-    //
-    // import torch
-    // x = torch.tensor([3.0, 1.0, 4.0, 0.15], requires_grad=True)
-    // y = x.erf()
-    // print(y)
-    // loss = y.sum()
-    // loss.backward()
-    // print(x.grad)
-    let y = x.erf()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(test_utils::to_vec1_round(&y, 4)?, [1.0, 0.8427, 1.0, 0.168]);
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [0.0001, 0.4151, 0.0, 1.1033],
-    );
-
-    // Testing compared to pytorch nn.GELU(approximate = 'none')
-    //
-    // import torch
-    // import torch.nn.functional as F
-    // x = torch.tensor([3.0, 1.0, 4.0, 0.15], requires_grad=True)
-    // y = F.gelu(x, approximate='none')
-    // print(y)
-    // loss = y.sum()
-    // loss.backward()
-    // print(x.grad)
-    let y = x.gelu_erf()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [2.9960, 0.8413, 3.9999, 0.0839]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [1.0119, 1.0833, 1.0005, 0.6188],
-    );
-
-    // Testing compared to pytorch elu
-    //
-    // import torch
-    // import torch.nn.functional as F
-    // x = torch.tensor([-1.0, 0.0, -2.0, 3.0], requires_grad=True)
-    // y = F.elu(x, alpha=2.0)
-    // print(y)
-    // loss = y.min
-    // loss = y.sum()
-    // loss.backward()
-    // print(x.grad)
-    let elu_x = Var::new(&[-1.0f32, 0., -2., 3.], device)?;
-    let y = elu_x.elu(2.)?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(&elu_x).context("no grad for x")?;
-
-    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [-1.2642, 0.0000, -1.7293, 3.0000]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [0.7358, 2.0000, 0.2707, 1.0000]
-    );
-
-    // testing compared to pytorch nn.Silu()
-    let y = x.silu()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [2.8577, 0.7311, 3.9281, 0.0806]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [1.0881, 0.9277, 1.0527, 0.5747],
-    );
-
-    if device.is_cpu() {
-        let x = Var::new(&[[[1f32, 2., 3.], [4., 5., 6.], [7., 8., 9.]]], device)?;
-        let y = x.interpolate1d(12)?.reshape(36)?;
-
-        let z = Tensor::new(
-            &[
-                1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16.,
-                17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
-                33., 34., 35., 36.,
-            ],
-            device,
-        )?;
-
-        let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
-        let grads = loss.backward()?;
-        let grad_x = grads.get(&x).context("no grad for x")?;
-
-        assert_eq!(
-            test_utils::to_vec3_round(grad_x, 4)?,
-            [[[10_f32, 26., 42.], [58., 74., 90.], [106., 122., 138.]]]
-        );
-    }
-
-    // manually checked: see comments
-    let x = Var::new(&[[[[1f32, 2., 3.], [4., 5., 6.], [7., 8., 9.]]]], device)?;
-    let y = x.interpolate2d(6, 6)?.reshape(36)?;
-
-    let z = Tensor::new(
-        &[
-            1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16., 17.,
-            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34.,
-            35., 36.,
-        ],
-        device,
-    )?;
-    // gradient should be
-    // row 1
-    // 1+2+7+8 = 18
-    // 3+4+9+10 = 26
-    // 5+6+11+12 = 34
-    // row 2
-    // 13+14+19+20 = 66
-    // 15+16+21+22 = 74
-    // 17+18+23+24 = 82
-    // row 3
-    // 25+26+31+32 = 114
-    // 27+28+33+34 = 122
-    // 29+30+35+36 = 130
-    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
-
-    let grads = loss.backward()?;
-
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec2_round(&grad_x.flatten(0, 2)?, 4)?,
-        [[18_f32, 26., 34.], [66., 74., 82.], [114., 122., 130.]]
-    );
-
-    // manually checked: see comments
-    let x = Var::new(&[[[[1f32, 2.], [4., 5.]]]], device)?;
-    let y = x.interpolate2d(6, 6)?.reshape(36)?;
-
-    let z = Tensor::new(
-        &[
-            1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16., 17.,
-            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34.,
-            35., 36.,
-        ],
-        device,
-    )?;
-    // gradient should be
-    // row 1
-    // 1+2+3+7+8+9+13+14+15 = 72
-    // 4+5+6+10+11+12+16+17+18 = 99
-    // row 2
-    // 19+20+21+25+26+27+31+32+33 = 234
-    // 22+23+24+28+29+30+34+35+36 = 261
-    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
-
-    let grads = loss.backward()?;
-
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec2_round(&grad_x.flatten(0, 2)?, 4)?,
-        [[72_f32, 99.], [234., 261.]]
-    );
-
-    // manually checked: see comments
-    let x = Var::new(&[[[[1f32, 2.], [4., 5.]], [[6f32, 7.], [8., 9.]]]], device)?;
-
-    let y = x.interpolate2d(4, 4)?.reshape(32)?;
-
-    #[rustfmt::skip]
-    let z = Tensor::new(
-        &[
-            1_f32, 02., 03., 04.,
-            05.,   06., 07., 08.,
-            09.,   10., 11., 12.,
-            13.,   14., 15., 16.,
-            17.,   18., 19., 20.,
-            21.,   22., 23., 24.,
-            25.,   26., 27., 28.,
-            29.,   30., 31., 32.
-        ],
-        device,
-    )?;
-    // gradient should be
-    // m1r1
-    // 1+2+5+6=14
-    // 3+4+7+8=22
-    // m1r2
-    // 9+10+13+14=46
-    // 11+12+15+16=54
-    // m2r1
-    // 17+18+21+22=78
-    // 19+20+23+24=86
-    // m2r2
-    // 25+26+29+30=110
-    // 27+28+31+32=118
-    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
-
-    let grads = loss.backward()?;
-
-    let grad_x = grads.get(&x).context("no grad for x")?;
-
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_x.flatten(0, 1)?, 4)?,
-        [[[14_f32, 22.], [46., 54.]], [[78., 86.], [110., 118.]]]
-    );
-
-    // manually checked: see comments
-    let x = Var::new(
-        &[[[[1f32, 2.], [4., 5.]]], [[[6f32, 7.], [8., 9.]]]],
-        device,
-    )?;
-
-    let y = x.interpolate2d(4, 4)?.reshape(32)?;
-
-    #[rustfmt::skip]
-       let z = Tensor::new(
-           &[
-               1_f32, 02., 03., 04.,
-               05.,   06., 07., 08.,
-               09.,   10., 11., 12.,
-               13.,   14., 15., 16.,
-               17.,   18., 19., 20.,
-               21.,   22., 23., 24.,
-               25.,   26., 27., 28.,
-               29.,   30., 31., 32.
-           ],
-           device,
-       )?;
-    // gradient should be
-    // m1r1
-    // 1+2+5+6=14
-    // 3+4+7+8=22
-    // m1r2
-    // 9+10+13+14=46
-    // 11+12+15+16=54
-    // m2r1
-    // 17+18+21+22=78
-    // 19+20+23+24=86
-    // m2r2
-    // 25+26+29+30=110
-    // 27+28+31+32=118
-    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
-
-    let grads = loss.backward()?;
-
-    let grad_x = grads.get(&x).context("no grad for x")?;
-
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_x.flatten(0, 1)?, 4)?,
-        [[[14_f32, 22.], [46., 54.]], [[78., 86.], [110., 118.]]]
-    );
    Ok(())
 }

@ -486,48 +218,12 @@ fn binary_grad(device: &Device) -> Result<()> {
    let grad_x = grads.get(x).context("no grad for x")?;
    assert_eq!(y.to_vec1::<f32>()?, [3., 1., -4., -1.]);
    assert_eq!(grad_x.to_vec1::<f32>()?, [1., 1., 1., 1.]);
-
-    let x_var = Var::new(&[3f32, 1., -4., -1., 5., 9.], device)?;
-    let x = x_var.as_tensor();
-    let y_var = Var::new(&[2f32, 7., 1.], device)?;
-    let y = y_var.as_tensor();
-
-    let ss = x
-        .reshape((2, 3))?
-        .slice_scatter0(&y.reshape((1, 3))?, 1)?
-        .sqr()?;
-    let grads = ss.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    let grad_y = grads.get(y).context("no grad for y")?;
-    assert_eq!(ss.to_vec2::<f32>()?, [[9., 1., 16.], [4., 49., 1.]]);
-    assert_eq!(grad_x.to_vec1::<f32>()?, [6.0, 2.0, -8.0, 0.0, 0.0, 0.0]);
-    assert_eq!(grad_y.to_vec1::<f32>()?, [4.0, 14.0, 2.0]);
    Ok(())
 }

-test_device!(
-    simple_grad,
-    simple_grad_cpu,
-    simple_grad_gpu,
-    simple_grad_metal
-);
-test_device!(sum_grad, sum_grad_cpu, sum_grad_gpu, sum_grad_metal);
-test_device!(
-    matmul_grad,
-    matmul_grad_cpu,
-    matmul_grad_gpu,
-    matmul_grad_metal
-);
-test_device!(
-    grad_descent,
-    grad_descent_cpu,
-    grad_descent_gpu,
-    grad_descent_metal
-);
-test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu, unary_grad_metal);
-test_device!(
-    binary_grad,
-    binary_grad_cpu,
-    binary_grad_gpu,
-    binary_grad_metal
-);
+test_device!(simple_grad, simple_grad_cpu, simple_grad_gpu);
+test_device!(sum_grad, sum_grad_cpu, sum_grad_gpu);
+test_device!(matmul_grad, matmul_grad_cpu, matmul_grad_gpu);
+test_device!(grad_descent, grad_descent_cpu, grad_descent_gpu);
+test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu);
+test_device!(binary_grad, binary_grad_cpu, binary_grad_gpu);
--- a/candle-core/tests/indexing_tests.rs
+++ b/candle-core/tests/indexing_tests.rs
@ -91,32 +91,3 @@ fn index_3d() -> Result<()> {
    assert_eq!(tensor.i((1, .., 3))?.to_vec1::<u32>()?, &[15, 19, 23]);
    Ok(())
 }
-
-#[test]
-fn slice_assign() -> Result<()> {
-    let dev = Device::Cpu;
-
-    let tensor = Tensor::arange(0u32, 4 * 5, &dev)?.reshape((4, 5))?;
-    let src = Tensor::arange(0u32, 2 * 3, &dev)?.reshape((3, 2))?;
-    let out = tensor.slice_assign(&[1..4, 3..5], &src)?;
-    assert_eq!(
-        out.to_vec2::<u32>()?,
-        &[
-            [0, 1, 2, 3, 4],
-            [5, 6, 7, 0, 1],
-            [10, 11, 12, 2, 3],
-            [15, 16, 17, 4, 5]
-        ]
-    );
-    let out = tensor.slice_assign(&[0..3, 0..2], &src)?;
-    assert_eq!(
-        out.to_vec2::<u32>()?,
-        &[
-            [0, 1, 2, 3, 4],
-            [2, 3, 7, 8, 9],
-            [4, 5, 12, 13, 14],
-            [15, 16, 17, 18, 19]
-        ]
-    );
-    Ok(())
-}
--- a/candle-core/tests/layout_tests.rs
+++ b/candle-core/tests/layout_tests.rs
@ -49,7 +49,7 @@ fn contiguous(device: &Device) -> Result<()> {
    Ok(())
 }

-test_device!(contiguous, contiguous_cpu, contiguous_gpu, contiguous_metal);
+test_device!(contiguous, contiguous_cpu, contiguous_gpu);

 #[test]
 fn strided_blocks() -> Result<()> {
@ -88,7 +88,7 @@ fn strided_blocks() -> Result<()> {
        }
    };
    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
-    let tensor = tensor.i((.., 1))?.contiguous()?;
+    let tensor = tensor.i((.., 1))?;
    match tensor.strided_blocks() {
        candle::StridedBlocks::SingleBlock { start_offset, len } => {
            assert_eq!(start_offset, 0);
@ -100,20 +100,6 @@ fn strided_blocks() -> Result<()> {
        }
    };
    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
-    let tensor = tensor.i((.., 1))?;
-    match tensor.strided_blocks() {
-        candle::StridedBlocks::SingleBlock { .. } => {
-            panic!("unexpected block structure")
-        }
-        candle::StridedBlocks::MultipleBlocks {
-            block_len,
-            block_start_index,
-        } => {
-            assert_eq!(block_len, 4);
-            assert_eq!(block_start_index.collect::<Vec<_>>(), &[4, 16])
-        }
-    };
-    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
    match tensor.t()?.strided_blocks() {
        candle::StridedBlocks::SingleBlock { .. } => {
            panic!("unexpected block structure")
--- a/candle-core/tests/matmul_tests.rs
+++ b/candle-core/tests/matmul_tests.rs
@ -1,106 +0,0 @@
-use candle_core::{test_device, DType, Device, IndexOp, Result, Tensor};
-
-fn matmul(device: &Device) -> Result<()> {
-    let data = vec![1.0f32, 2.0, 3.0, 4.0];
-    let a = Tensor::from_slice(&data, (2, 2), device)?;
-    let data = vec![1.0f32, 2.0, 3.0, 4.0];
-    let b = Tensor::from_slice(&data, (2, 2), device)?;
-
-    let c = a.matmul(&b)?;
-    assert_eq!(c.to_vec2::<f32>()?, &[[7.0f32, 10.0], [15.0, 22.0]]);
-
-    let data = vec![1.0f32, 2.0];
-    let a = Tensor::from_slice(&data, (2, 1), device)?;
-    let data = vec![3.0f32, 4.0];
-    let b = Tensor::from_slice(&data, (1, 2), device)?;
-    let c = a.matmul(&b)?;
-    assert_eq!(c.to_vec2::<f32>()?, &[&[3.0, 4.0], &[6.0, 8.0]]);
-
-    let data: Vec<_> = (0..6).map(|i| i as f32).collect();
-    let a = Tensor::from_slice(&data, (2, 3), device)?;
-    let data: Vec<_> = (0..6).map(|i| (i + 2) as f32).collect();
-    let b = Tensor::from_slice(&data, (3, 2), device)?;
-    let c = a.matmul(&b)?;
-    assert_eq!(c.to_vec2::<f32>()?, &[&[16., 19.], &[52., 64.]]);
-
-    let data: Vec<_> = (0..12).map(|i| i as f32).collect();
-    let a = Tensor::from_slice(&data, (2, 2, 3), device)?;
-    let data: Vec<_> = (0..12).map(|i| (i + 2) as f32).collect();
-    let b = Tensor::from_slice(&data, (2, 3, 2), device)?;
-    let expected = [[[16., 19.], [52., 64.]], [[214., 235.], [304., 334.]]];
-
-    let c = a.matmul(&b)?;
-    assert_eq!(c.to_vec3::<f32>()?, &expected);
-
-    // Also perform the matmul on non-contiguous views (transposes of contiguous transposed copies).
-    let a_tt = a.t()?.contiguous()?.t()?;
-    assert!(!a_tt.is_contiguous());
-    assert_eq!(a.dims(), a_tt.dims());
-    assert_eq!(a_tt.stride(), &[6, 1, 2]);
-
-    let b_tt = b.t()?.contiguous()?.t()?;
-    assert!(!b_tt.is_contiguous());
-    assert_eq!(b.dims(), b_tt.dims());
-    assert_eq!(b_tt.stride(), &[6, 1, 3]);
-
-    assert_eq!(a_tt.matmul(&b)?.to_vec3::<f32>()?, &expected);
-    assert_eq!(a.matmul(&b_tt)?.to_vec3::<f32>()?, &expected);
-    assert_eq!(a_tt.matmul(&b_tt)?.to_vec3::<f32>()?, &expected);
-    Ok(())
-}
-
-fn broadcast_matmul(device: &Device) -> Result<()> {
-    let lhs = Tensor::randn(0f32, 1f32, (3, 1, 4, 5), device)?;
-    let rhs = Tensor::randn(0f32, 1f32, (6, 5, 2), device)?;
-    let out = lhs.broadcast_matmul(&rhs)?;
-    assert_eq!(out.dims(), &[3, 6, 4, 2]);
-    for idx1 in 0..3 {
-        for idx2 in 0..6 {
-            let out = out.i((idx1, idx2))?;
-            let lhs = lhs.i((idx1, 0))?;
-            let rhs = rhs.i(idx2)?;
-            let out2 = lhs.matmul(&rhs);
-            let sum_diff2 = (out - out2)?.sqr()?.sum_all()?;
-            // With cuda, we see errors of up to ~1e-12.
-            assert!(sum_diff2.to_vec0::<f32>()? < 1e-6)
-        }
-    }
-    Ok(())
-}
-
-// https://github.com/huggingface/candle/issues/1948
-fn squeeze_mm(device: &Device) -> Result<()> {
-    let seq_len = 8_usize;
-    let a = Tensor::zeros((1, seq_len, 16), DType::F32, device)?;
-    let x = a.i((.., seq_len - 1, ..))?;
-    let w = Tensor::zeros((32, 16), DType::F32, device)?.t()?;
-    let x = x.matmul(&w)?;
-    assert_eq!(x.dims(), &[1, 32]);
-    Ok(())
-}
-
-// https://github.com/huggingface/candle/issues/1992
-fn mm_layout(device: &Device) -> Result<()> {
-    let a = Tensor::arange(0f32, 16f32, device)?.reshape((1, 1, 4, 4))?;
-    let b = Tensor::arange(0f32, 8f32, device)?.reshape((1, 1, 4, 2))?;
-    let mm1 = a.matmul(&b)?;
-    // Forces the layout to be:
-    // shape: [1, 1, 4, 2], stride: [8, 2, 2, 1], start_offset: 0
-    // This is still a contiguous matrix, since only the last two dimensions have sizes other
-    // than 1, but the matmul layout checks may be reluctant to handle it.
-    let b = b.transpose(1, 2)?.force_contiguous()?.transpose(1, 2)?;
-    let mm2 = a.matmul(&b)?;
-    let diff = (mm1 - mm2)?.abs()?.sum_all()?.to_vec0::<f32>()?;
-    assert_eq!(diff, 0.);
-    Ok(())
-}
-
-test_device!(matmul, matmul_cpu, matmul_gpu, matmul_metal);
-test_device!(
-    broadcast_matmul,
-    broadcast_matmul_cpu,
-    broadcast_matmul_gpu,
-    broadcast_matmul_metal
-);
-test_device!(squeeze_mm, squeeze_mm_cpu, squeeze_mm_gpu, squeeze_mm_metal);
-test_device!(mm_layout, mm_layout_cpu, mm_layout_gpu, mm_layout_metal);
--- a/candle-core/tests/npy.py
+++ b/candle-core/tests/npy.py
@ -1,9 +0,0 @@
-import numpy as np
-x = np.arange(10)
-
-# Write a npy file.
-np.save("test.npy", x)
-
-# Write multiple values to a npz file.
-values = { "x": x, "x_plus_one": x + 1 }
-np.savez("test.npz", **values)
--- a/candle-core/tests/pool_tests.rs
+++ b/candle-core/tests/pool_tests.rs
@ -43,9 +43,6 @@ res = torch.nn.functional.avg_pool2d(t, 2)
 print(res)
 */
 fn avg_pool2d_pytorch(dev: &Device) -> Result<()> {
-    if dev.is_metal() {
-        return Ok(());
-    }
    let t = Tensor::new(
        &[
            0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616,
@ -101,17 +98,15 @@ fn upsample_nearest2d(dev: &Device) -> Result<()> {
    Ok(())
 }

-test_device!(avg_pool2d, avg_pool2d_cpu, avg_pool2d_gpu, avg_pool2d_metal);
+test_device!(avg_pool2d, avg_pool2d_cpu, avg_pool2d_gpu);
 test_device!(
    avg_pool2d_pytorch,
    avg_pool2d_pytorch_cpu,
-    avg_pool2d_pytorch_gpu,
-    avg_pool2d_pytorch_metal
+    avg_pool2d_pytorch_gpu
 );
-test_device!(max_pool2d, max_pool2d_cpu, max_pool2d_gpu, max_pool2d_metal);
+test_device!(max_pool2d, max_pool2d_cpu, max_pool2d_gpu);
 test_device!(
    upsample_nearest2d,
    upsample_nearest2d_cpu,
-    upsample_nearest2d_gpu,
-    upsample_nearest2d_metal
+    upsample_nearest2d_gpu
 );
--- a/candle-core/tests/pth.py
+++ b/candle-core/tests/pth.py
@ -1,37 +0,0 @@
-import torch
-from collections import OrderedDict
-
-# Write a trivial tensor to a pt file
-a= torch.tensor([[1,2,3,4], [5,6,7,8]])
-o = OrderedDict()
-o["test"] = a
-
-# Write a trivial tensor to a pt file
-torch.save(o, "test.pt")
-
-############################################################################################################
-# Write a trivial tensor to a pt file with a key
-torch.save({"model_state_dict": o}, "test_with_key.pt")
-
-############################################################################################################
-# Create a tensor with fortran contiguous memory layout
-import numpy as np
-
-# Step 1: Create a 3D NumPy array with Fortran order using a range of numbers
-# For example, creating a 2x3x4 array
-array_fortran = np.asfortranarray(np.arange(1, 2*3*4 + 1).reshape(2, 3, 4))
-
-# Verify the memory order
-print("Is Fortran contiguous (F order):", array_fortran.flags['F_CONTIGUOUS'])  # Should be True
-print("Is C contiguous (C order):", array_fortran.flags['C_CONTIGUOUS'])  # Should be False
-
-# Step 2: Convert the NumPy array to a PyTorch tensor
-tensor_fortran = torch.from_numpy(array_fortran)
-
-# Verify the tensor layout
-print("Tensor stride:", tensor_fortran.stride())  # Stride will reflect the Fortran memory layout
-
-# Step 3: Save the PyTorch tensor to a .pth file
-torch.save({"tensor_fortran": tensor_fortran}, 'fortran_tensor_3d.pth')
-
-print("3D Tensor saved with Fortran layout.")
--- a/candle-core/tests/pth_tests.rs
+++ b/candle-core/tests/pth_tests.rs
@ -1,31 +0,0 @@
-/// Regression test for pth files not loading on Windows.
-#[test]
-fn test_pth() {
-    let tensors = candle_core::pickle::PthTensors::new("tests/test.pt", None).unwrap();
-    tensors.get("test").unwrap().unwrap();
-}
-
-#[test]
-fn test_pth_with_key() {
-    let tensors =
-        candle_core::pickle::PthTensors::new("tests/test_with_key.pt", Some("model_state_dict"))
-            .unwrap();
-    tensors.get("test").unwrap().unwrap();
-}
-
-#[test]
-fn test_pth_fortran_congiguous() {
-    let tensors =
-        candle_core::pickle::PthTensors::new("tests/fortran_tensor_3d.pth", None).unwrap();
-    let tensor = tensors.get("tensor_fortran").unwrap().unwrap();
-
-    assert_eq!(tensor.dims3().unwrap(), (2, 3, 4));
-
-    assert_eq!(
-        tensor.to_vec3::<i64>().unwrap(),
-        [
-            [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
-            [[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]
-        ]
-    );
-}
--- a/candle-core/tests/quantized_tests.rs
+++ b/candle-core/tests/quantized_tests.rs
@ -1,9 +1,7 @@
 use candle_core::{
-    bail,
    quantized::{self, GgmlDType},
-    test_device,
    test_utils::to_vec2_round,
-    Device, Module, Result, Tensor,
+    Device, Result, Tensor,
 };
 use quantized::{k_quants, GgmlType};
 use rand::prelude::*;
@ -15,48 +13,16 @@ const GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS: f32 = 0.0075;
 const GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS: f32 = 0.0040;
 const GGML_MAX_DOT_PRODUCT_ERROR: f32 = 0.02;

-fn test_matmul(
-    device: &Device,
-    (b, m, n, k): (usize, usize, usize, usize),
-    dtype: GgmlDType,
-) -> Result<()> {
-    let lhs = (0..(m * k))
-        .map(|v| v as f32 / (m * k) as f32)
-        .collect::<Vec<_>>();
-    let rhs = (0..(k * n))
-        .map(|v| v as f32 / (n * k) as f32)
-        .collect::<Vec<_>>();
-
-    let lhs = Tensor::from_slice(&lhs, (m, k), device)?;
-    let rhs = Tensor::from_slice(&rhs, (k, n), device)?;
-    let mm = lhs.matmul(&rhs)?;
-    let qtensor = quantized::QTensor::quantize(&rhs.t()?, dtype)?;
-    let matmul = quantized::QMatMul::from_qtensor(qtensor)?;
-    let res = matmul.forward(&lhs)?;
-
-    let error: f32 = ((&mm - &res)?.abs()? / &mm.abs()?)?
-        .sum_all()?
-        .to_scalar()?;
-    let error = error / (b * m * n) as f32;
-    assert!(
-        error <= 0.02,
-        "Error {error} is too big. \nExpected:\n {mm} \nFound:\n {res}\n for {dtype:?}"
-    );
-
-    Ok(())
-}
-
-fn quantized_matmul(device: &Device) -> Result<()> {
-    // TODO Enable this later when we enable cuda.
-    if device.is_cuda() {
-        return Ok(());
-    }
+#[test]
+fn quantized_matmul() -> Result<()> {
+    let cpu = &Device::Cpu;
    let (m, k, n) = (3, 64, 4);
    let lhs = (0..(m * k)).map(|v| v as f32).collect::<Vec<_>>();
-    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), device)?;
+    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), cpu)?;
    let mut dst = vec![42.; 3 * 4];
    let mut rhs_t = vec![k_quants::BlockQ4_0::zeros(); 8];
    let rhs = (0..(k * n)).map(|v| v as f32).collect::<Vec<_>>();
+    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), cpu)?.t()?;
    k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_t)?;
    k_quants::matmul((m, k, n), &lhs, &rhs_t, &mut dst)?;
    assert_eq!(
@ -66,7 +32,6 @@ fn quantized_matmul(device: &Device) -> Result<()> {
            341876.0, 994283.0, 1655709.0, 2301518.0
        ]
    );
-    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), device)?.t()?;
    let mm = tensor_lhs.matmul(&tensor_rhs)?;
    assert_eq!(
        mm.to_vec2::<f32>()?,
@ -77,49 +42,35 @@ fn quantized_matmul(device: &Device) -> Result<()> {
        ]
    );

-    let qtensor = quantized::QTensor::quantize(&tensor_rhs.t()?, GgmlDType::Q4_0)?;
-    let matmul = quantized::QMatMul::from_qtensor(qtensor)?;
+    let qtensor = quantized::QTensor::new(rhs_t, (4, 64))?;
+    let matmul = quantized::QMatMul::from_qtensor(qtensor);
    let res = matmul.forward(&tensor_lhs)?;
-    match device {
-        Device::Metal(_) => assert_eq!(
-            to_vec2_round(&res, 0)?,
-            &[
-                [84946.0, 214126.0, 344757.0, 473798.0],
-                [213458.0, 604350.0, 1000469.0, 1387990.0],
-                [341970.0, 994574.0, 1656181.0, 2302182.0]
-            ]
-        ),
-        _ => assert_eq!(
-            to_vec2_round(&res, 0)?,
-            &[
-                [85120.0, 214562.0, 345455.0, 474748.0],
-                [213475.0, 604465.0, 1000686.0, 1388317.0],
-                [341876.0, 994283.0, 1655709.0, 2301518.0]
-            ]
-        ),
-    }
-
-    test_matmul(device, (1, 3, 4, 256), GgmlDType::Q4_0)?;
+    assert_eq!(
+        to_vec2_round(&res, 0)?,
+        &[
+            [85120.0, 214562.0, 345455.0, 474748.0],
+            [213475.0, 604465.0, 1000686.0, 1388317.0],
+            [341876.0, 994283.0, 1655709.0, 2301518.0]
+        ]
+    );

    Ok(())
 }

-fn quantized_matmul_neg(device: &Device) -> Result<()> {
-    // TODO Enable this later when we enable cuda.
-    if device.is_cuda() {
-        return Ok(());
-    }
+#[test]
+fn quantized_matmul_neg() -> Result<()> {
+    let cpu = &Device::Cpu;
    let (m, k, n) = (3, 64, 4);
    let lhs = (0..(m * k))
        .map(|v| v as f32 - (m * k) as f32 / 2.0)
        .collect::<Vec<_>>();
-    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), device)?;
+    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), cpu)?;
    let mut dst = vec![42.; 3 * 4];
    let mut rhs_t = vec![k_quants::BlockQ4_0::zeros(); 8];
    let rhs = (0..k * n)
        .map(|v| v as f32 - (k * n) as f32 / 3.0)
        .collect::<Vec<_>>();
-    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), device)?.t()?;
+    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), cpu)?.t()?;
    k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_t)?;
    k_quants::matmul((m, k, n), &lhs, &rhs_t, &mut dst)?;
    assert_eq!(
@ -139,52 +90,32 @@ fn quantized_matmul_neg(device: &Device) -> Result<()> {
        ]
    );

-    let qtensor = quantized::QTensor::quantize(&tensor_rhs.t()?, GgmlDType::Q4_0)?;
-    let matmul = quantized::QMatMul::from_qtensor(qtensor)?;
+    let qtensor = quantized::QTensor::new(rhs_t, (4, 64))?;
+    let matmul = quantized::QMatMul::from_qtensor(qtensor);
    let res = matmul.forward(&tensor_lhs)?;
-    match device {
-        Device::Metal(_) => assert_eq!(
-            to_vec2_round(&res, 0)?,
-            &[
-                [243666.0, -19714.0, -285433.0, -550453.0],
-                [23782.0, 21654.0, 19400.0, 18369.0],
-                [-196102.0, 63022.0, 324233.0, 587191.0]
-            ]
-        ),
-        _ => assert_eq!(
-            to_vec2_round(&res, 0)?,
-            &[
-                [243524.0, -19596.0, -285051.0, -549815.0],
-                [23777.0, 21651.0, 19398.0, 18367.0],
-                [-196472.0, 63012.0, 324585.0, 587902.0]
-            ]
-        ),
-    }
+    assert_eq!(
+        to_vec2_round(&res, 0)?,
+        &[
+            [243524.0, -19596.0, -285051.0, -549815.0],
+            [23777.0, 21651.0, 19398.0, 18367.0],
+            [-196472.0, 63012.0, 324585.0, 587902.0]
+        ]
+    );

    Ok(())
 }

-test_device!(
-    quantized_matmul,
-    quantized_matmul_cpu,
-    quantized_matmul_cuda,
-    quantized_matmul_metal
-);
-test_device!(
-    quantized_matmul_neg,
-    quantized_matmul_neg_cpu,
-    quantized_matmul_neg_cuda,
-    quantized_matmul_neg_metal
-);
+#[test]
+fn quantize_q4_0() -> Result<()> {
+    use k_quants::BlockQ4_0;

-fn quantize_q4_0(device: &Device) -> Result<()> {
    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-
-    let src = Tensor::from_slice(&src, (32 * 4,), device)?;
-    let quant = quantized::QTensor::quantize(&src, GgmlDType::Q4_0)?;
-    let dst = quant.dequantize(device)?;
+    let mut dst = vec![0f32; 32 * 4];
+    let mut quant = vec![BlockQ4_0::zeros(); 4];
+    BlockQ4_0::from_float(&src, &mut quant)?;
+    BlockQ4_0::to_float(&quant, dst.as_mut_slice())?;
    assert_eq!(
-        dst.to_vec1::<f32>()?,
+        dst,
        &[
            -0.0, -0.0, 3.875, 3.875, 3.875, 3.875, 7.75, 7.75, 7.75, 7.75, 11.625, 11.625, 11.625,
            11.625, 15.5, 15.5, 15.5, 15.5, 19.375, 19.375, 19.375, 19.375, 23.25, 23.25, 23.25,
@ -200,17 +131,21 @@ fn quantize_q4_0(device: &Device) -> Result<()> {
            127.0, 127.0
        ]
    );
-    ggml_quantization_error_test(GgmlDType::Q4_0, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ4_0>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
    Ok(())
 }

-fn quantize_q4_1(device: &Device) -> Result<()> {
+#[test]
+fn quantize_q4_1() -> Result<()> {
+    use k_quants::BlockQ4_1;
+
    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-    let src = Tensor::from_slice(&src, (32 * 4,), device)?;
-    let quant = quantized::QTensor::quantize(&src, GgmlDType::Q4_1)?;
-    let dst = quant.dequantize(device)?;
+    let mut dst = vec![0f32; 32 * 4];
+    let mut quant = vec![BlockQ4_1::zeros(); 4];
+    BlockQ4_1::from_float(&src, &mut quant)?;
+    BlockQ4_1::to_float(&quant, dst.as_mut_slice())?;
    assert_eq!(
-        round_vector(&dst.to_vec1::<f32>()?),
+        round_vector(&dst),
        &[
            0.0, 0.0, 2.066, 2.066, 4.133, 4.133, 6.199, 6.199, 8.266, 8.266, 10.332, 10.332,
            12.398, 12.398, 14.465, 14.465, 16.531, 16.531, 18.598, 18.598, 20.664, 20.664, 22.73,
@ -226,17 +161,21 @@ fn quantize_q4_1(device: &Device) -> Result<()> {
            118.73, 118.73, 120.797, 120.797, 122.863, 122.863, 124.93, 124.93, 126.996, 126.996
        ]
    );
-    ggml_quantization_error_test(GgmlDType::Q4_1, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ4_1>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
    Ok(())
 }

-fn quantize_q5_0(device: &Device) -> Result<()> {
+#[test]
+fn quantize_q5_0() -> Result<()> {
+    use k_quants::BlockQ5_0;
+
    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-    let src = Tensor::from_slice(&src, (32 * 4,), device)?;
-    let quant = quantized::QTensor::quantize(&src, GgmlDType::Q5_0)?;
-    let dst = quant.dequantize(device)?;
+    let mut dst = vec![0f32; 32 * 4];
+    let mut quant = vec![BlockQ5_0::zeros(); 4];
+    BlockQ5_0::from_float(&src, &mut quant)?;
+    BlockQ5_0::to_float(&quant, dst.as_mut_slice())?;
    assert_eq!(
-        round_vector(&dst.to_vec1::<f32>()?),
+        round_vector(&dst),
        &[
            -0.0, 1.938, 1.938, 3.875, 3.875, 5.813, 5.813, 7.75, 7.75, 9.688, 9.688, 11.625,
            11.625, 13.563, 13.563, 15.5, 15.5, 17.438, 17.438, 19.375, 19.375, 21.313, 21.313,
@ -252,17 +191,21 @@ fn quantize_q5_0(device: &Device) -> Result<()> {
            119.063, 119.063, 119.063, 119.063, 127.0, 127.0, 127.0, 127.0
        ]
    );
-    ggml_quantization_error_test(GgmlDType::Q5_0, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ5_0>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
    Ok(())
 }

-fn quantize_q5_1(device: &Device) -> Result<()> {
+#[test]
+fn quantize_q5_1() -> Result<()> {
+    use k_quants::BlockQ5_1;
+
    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-    let src = Tensor::from_slice(&src, (32 * 4,), device)?;
-    let quant = quantized::QTensor::quantize(&src, GgmlDType::Q5_1)?;
-    let dst = quant.dequantize(device)?;
+    let mut dst = vec![0f32; 32 * 4];
+    let mut quant = vec![BlockQ5_1::zeros(); 4];
+    BlockQ5_1::from_float(&src, &mut quant)?;
+    BlockQ5_1::to_float(&quant, dst.as_mut_slice())?;
    assert_eq!(
-        round_vector(&dst.to_vec1::<f32>()?),
+        dst,
        &[
            0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
            16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0,
@ -276,11 +219,13 @@ fn quantize_q5_1(device: &Device) -> Result<()> {
            124.0, 125.0, 126.0, 127.0
        ]
    );
-    ggml_quantization_error_test(GgmlDType::Q5_1, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+
+    ggml_quantization_error_test::<BlockQ5_1>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
    Ok(())
 }

-fn get_test_vector2(bound: f32, size: usize, device: &Device) -> Result<Tensor> {
+/// Generates a small test vector ranging from -`bound` to `bound` with `size` steps
+fn get_test_vector(bound: f32, size: usize) -> (Vec<f32>, Vec<f32>) {
    assert!(
        size % crate::quantized::k_quants::QK_K == 0,
        "size must be a multiple of {}",
@ -290,8 +235,10 @@ fn get_test_vector2(bound: f32, size: usize, device: &Device) -> Result<Tensor>
    let src = (0..size)
        .map(|v| (v as f32 - size as f32 / 2.) * bound / (size as f32 / 2.))
        .collect::<Vec<_>>();
+
+    let dst = vec![0f32; size];
    assert_eq!([src[0], src[size / 2]], [-bound, 0.0]);
-    Tensor::from_vec(src, (size,), device)
+    (src, dst)
 }

 /// Round a vector
@ -318,8 +265,7 @@ fn compare_with_error(values: &[f32], expected: &[f32], tolerance: f32) {
    }
 }

-/// Creates a vector similar to the ones used in GGML unit tests:
-/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L26-L30
+/// Creates a vector similar to the one used in GGML unit tests: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L26-L30
 fn create_ggml_like_vector(offset: f32) -> Vec<f32> {
    (0..GGML_TEST_SIZE)
        .map(|i| 0.1 + 2.0 * (i as f32 + offset).cos())
@ -338,16 +284,14 @@ fn calculate_rmse(a: &[f32], b: &[f32]) -> f32 {
    sum / a.len() as f32
 }

-/// Similar to the GGML quantization unit test:
-/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L43-L50
-fn ggml_quantization_error_test(dtype: GgmlDType, device: &Device, max_error: f32) -> Result<()> {
+/// Mirrors the GGML quantization unit test: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L43-L50
+fn ggml_quantization_error_test<T: GgmlType>(max_error: f32) -> Result<()> {
    let src = create_ggml_like_vector(0.0);
-    let src = Tensor::from_slice(&src, (GGML_TEST_SIZE,), device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
-    let error = calculate_rmse(&src.to_vec1::<f32>()?, &dst.to_vec1::<f32>()?);
+    let mut dst = vec![0.0; GGML_TEST_SIZE];
+    let _quant = quantize_roundtrip::<T>(src.as_slice(), dst.as_mut_slice())?;
+    let error = calculate_rmse(src.as_slice(), dst.as_slice());
    if error > max_error {
-        bail!(
+        candle_core::bail!(
            "Quantization error {} exceeds max error {}",
            error,
            max_error
@ -356,15 +300,19 @@ fn ggml_quantization_error_test(dtype: GgmlDType, device: &Device, max_error: f3
    Ok(())
 }

-fn quantize_q2k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q2K;
+fn quantize_roundtrip<T: GgmlType>(src: &[f32], dst: &mut [f32]) -> Result<Vec<T>> {
+    let mut quant = vec![T::zeros(); src.len() / T::BLCK_SIZE];
+    T::from_float(src, &mut quant)?;
+    T::to_float(&quant, dst)?;
+    Ok(quant)
+}

-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q2k() -> Result<()> {
+    use k_quants::BlockQ2K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ2K>(src.as_slice(), dst.as_mut_slice())?;
    compare_with_error(dst.as_slice(), src.as_slice(), 0.1);

    // Test some specific values
@ -378,26 +326,20 @@ fn quantize_q2k(device: &Device) -> Result<()> {
        [-0.499, -0.366, -0.249, 0.0, 0.295, 0.492]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ2K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 6.0);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS)?;
+    ggml_quantization_error_test::<BlockQ2K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS)?;
    Ok(())
 }

-fn quantize_q3k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q3K;
-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q3k() -> Result<()> {
+    use k_quants::BlockQ3K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ3K>(src.as_slice(), dst.as_mut_slice())?;
    compare_with_error(dst.as_slice(), src.as_slice(), 0.03);

    // Test some specific values
@ -411,26 +353,20 @@ fn quantize_q3k(device: &Device) -> Result<()> {
        [-0.493, -0.37, -0.243, -0.0, 0.292, 0.492]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ3K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 3.5);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS)?;
+    ggml_quantization_error_test::<BlockQ3K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS)?;
    Ok(())
 }

-fn quantize_q4k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q4K;
-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q4k() -> Result<()> {
+    use k_quants::BlockQ4K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ4K>(src.as_slice(), dst.as_mut_slice())?;
    compare_with_error(dst.as_slice(), src.as_slice(), 0.017);

    // Test some specific values
@ -444,27 +380,21 @@ fn quantize_q4k(device: &Device) -> Result<()> {
        [-0.5, -0.373, -0.25, 0.0, 0.288, 0.498]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ4K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 4.5);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ4K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
    Ok(())
 }

-fn quantize_q5k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q5K;
-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q5k() -> Result<()> {
+    use k_quants::BlockQ5K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
-    compare_with_error(dst.as_slice(), src.as_slice(), 0.009);
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ5K>(src.as_slice(), dst.as_mut_slice())?;
+    compare_with_error(dst.as_slice(), src.as_slice(), 0.008);

    // Test some specific values
    assert_eq!(
@ -474,29 +404,24 @@ fn quantize_q5k(device: &Device) -> Result<()> {
    let dst = round_vector(&dst);
    assert_eq!(
        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
-        [-0.5, -0.373, -0.25, 0.0, 0.279, 0.499]
+        [-0.499, -0.372, -0.249, 0.001, 0.279, 0.499]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ5K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 2.5);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ5K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+
    Ok(())
 }

-fn quantize_q6k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q6K;
-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q6k() -> Result<()> {
+    use k_quants::BlockQ6K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ6K>(src.as_slice(), dst.as_mut_slice())?;
    compare_with_error(dst.as_slice(), src.as_slice(), 0.008);

    // Test some specific values
@ -510,27 +435,22 @@ fn quantize_q6k(device: &Device) -> Result<()> {
        [-0.497, -0.372, -0.25, -0.0, 0.284, 0.5]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ6K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 2.0);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ6K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+
    Ok(())
 }

-fn quantize_q8k(device: &Device) -> Result<()> {
-    let dtype = GgmlDType::Q8K;
-    let src = get_test_vector2(0.5, 1024, device)?;
-    let quant = quantized::QTensor::quantize(&src, dtype)?;
-    let dst = quant.dequantize(device)?;
+#[test]
+fn quantize_q8k() -> Result<()> {
+    use k_quants::BlockQ8K;

-    let src = src.to_vec1::<f32>()?;
-    let dst = dst.to_vec1::<f32>()?;
-    compare_with_error(dst.as_slice(), src.as_slice(), 0.008);
+    let (src, mut dst) = get_test_vector(0.5, 1024);
+    let _quant = quantize_roundtrip::<BlockQ8K>(src.as_slice(), dst.as_mut_slice())?;
+    compare_with_error(dst.as_slice(), src.as_slice(), 0.003);

    // Test some specific values
    assert_eq!(
@ -543,79 +463,15 @@ fn quantize_q8k(device: &Device) -> Result<()> {
        [-0.5, -0.375, -0.25, -0.0, 0.281, 0.499]
    );

-    let src_big = get_test_vector2(128.0, 1024, device)?;
-    let quant_big = quantized::QTensor::quantize(&src_big, dtype)?;
-    let dst_big = quant_big.dequantize(device)?;
-
-    let src_big = src_big.to_vec1::<f32>()?;
-    let dst_big = dst_big.to_vec1::<f32>()?;
+    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
+    let _quant_big = quantize_roundtrip::<BlockQ8K>(src_big.as_slice(), dst_big.as_mut_slice())?;
    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 0.6);

-    ggml_quantization_error_test(dtype, device, GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+    ggml_quantization_error_test::<BlockQ8K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
+
    Ok(())
 }

-test_device!(
-    quantize_q4_0,
-    quantize_q4_0_cpu,
-    quantize_q4_0_cuda,
-    quantize_q4_0_metal
-);
-test_device!(
-    quantize_q4_1,
-    quantize_q4_1_cpu,
-    quantize_q4_1_cuda,
-    quantize_q4_1_metal
-);
-test_device!(
-    quantize_q5_0,
-    quantize_q5_0_cpu,
-    quantize_q5_0_cuda,
-    quantize_q5_0_metal
-);
-test_device!(
-    quantize_q5_1,
-    quantize_q5_1_cpu,
-    quantize_q5_1_cuda,
-    quantize_q5_1_metal
-);
-test_device!(
-    quantize_q2k,
-    quantize_q2k_cpu,
-    quantize_q2k_cuda,
-    quantize_q2k_metal
-);
-test_device!(
-    quantize_q3k,
-    quantize_q3k_cpu,
-    quantize_q3k_cuda,
-    quantize_q3k_metal
-);
-test_device!(
-    quantize_q4k,
-    quantize_q4k_cpu,
-    quantize_q4k_cuda,
-    quantize_q4k_metal
-);
-test_device!(
-    quantize_q5k,
-    quantize_q5k_cpu,
-    quantize_q5k_cuda,
-    quantize_q5k_metal
-);
-test_device!(
-    quantize_q6k,
-    quantize_q6k_cpu,
-    quantize_q6k_cuda,
-    quantize_q6k_metal
-);
-test_device!(
-    quantize_q8k,
-    quantize_q8k_cpu,
-    quantize_q8k_cuda,
-    quantize_q8k_metal
-);
-
 /// Very simple dot product implementation
 fn vec_dot_reference(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(a, b)| a * b).sum()
@ -631,66 +487,46 @@ fn ggml_reference_matmul_error(dtype: GgmlDType) -> Result<f32> {
        GgmlDType::Q5K => 0.000740,
        GgmlDType::Q6K => 0.000952,
        GgmlDType::Q4_0 => 0.001143,
-        GgmlDType::Q4_1 => 0.008,
+        GgmlDType::Q4_1 => 0.007784,
        GgmlDType::Q5_0 => 0.001353,
-        GgmlDType::Q5_1 => 0.00149,
+        GgmlDType::Q5_1 => 0.001363,
        GgmlDType::Q8_0 => 0.000092,
-
-        // Not from the ggml repo.
-        GgmlDType::Q8K => 0.00065,
-        _ => bail!("No GGML results for quantization type {dtype:?}",),
+        _ => candle_core::bail!("No GGML results for quantization type {dtype:?}",),
    };
    Ok(err)
 }

-/// Similar to the GGML matmul unit test:
-/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L76-L91
+/// Mirrors the GGML matmul unit test: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L76-L91
 fn ggml_matmul_error_test<T: GgmlType>() -> Result<()> {
    let a = create_ggml_like_vector(0.0);
    let b = create_ggml_like_vector(1.0);
-    ggml_matmul_error_test_::<T>(a.as_slice(), b.as_slice(), 1.0)?;
-    // Another example that is more likely to trigger the overflow reported in #1526
-    let a = (0..GGML_TEST_SIZE)
-        .map(|i| i as f32 / GGML_TEST_SIZE as f32)
-        .collect::<Vec<_>>();
-    let b = (0..GGML_TEST_SIZE)
-        .map(|i| i as f32 / GGML_TEST_SIZE as f32)
-        .collect::<Vec<_>>();
-    ggml_matmul_error_test_::<T>(a.as_slice(), b.as_slice(), 2.0)?;
-    Ok(())
-}
-
-fn ggml_matmul_error_test_<T: GgmlType>(a: &[f32], b: &[f32], err_m: f32) -> Result<()> {
    let length = a.len();

    let mut a_quant = vec![T::zeros(); length / T::BLCK_SIZE];
    let mut b_quant = vec![T::VecDotType::zeros(); length / T::VecDotType::BLCK_SIZE];
-    T::from_float(a, &mut a_quant)?;
-    T::VecDotType::from_float(b, &mut b_quant)?;
+    T::from_float(&a, &mut a_quant)?;
+    T::VecDotType::from_float(&b, &mut b_quant)?;

    let result = T::vec_dot(length, &a_quant, &b_quant)?;
-    let result_unopt = T::vec_dot_unopt(length, &a_quant, &b_quant)?;
-    let reference_result = vec_dot_reference(a, b);
-
-    if (result - result_unopt).abs() / length as f32 > 1e-6 {
-        bail!(
-            "the opt and unopt vec-dot returned different values, opt {result}, unopt {result_unopt}"
-        )
-    }
+    let reference_result = vec_dot_reference(&a, &b);

    let error = (result - reference_result).abs() / length as f32;

-    let ggml_error = ggml_reference_matmul_error(T::DTYPE)? * err_m;
+    let ggml_error = ggml_reference_matmul_error(T::DTYPE)?;

-    if !error.is_finite() || error > GGML_MAX_DOT_PRODUCT_ERROR {
-        bail!("Dot product error {error} exceeds max error {GGML_MAX_DOT_PRODUCT_ERROR}",);
+    if error > GGML_MAX_DOT_PRODUCT_ERROR {
+        candle_core::bail!(
+            "Dot product error {} exceeds max error {}",
+            error,
+            GGML_MAX_DOT_PRODUCT_ERROR
+        );
    }

    // We diverge slightly due to different rounding behavior / f16 to f32 conversions in GGML
    // => we use a slightly higher error threshold
    const ERROR_LENIENCY: f32 = 0.00001;
    if error - ERROR_LENIENCY > ggml_error {
-        bail!(
+        candle_core::bail!(
            "Dot product error {} exceeds ggml reference error {}",
            error,
            ggml_error
@ -699,16 +535,6 @@ fn ggml_matmul_error_test_<T: GgmlType>(a: &[f32], b: &[f32], err_m: f32) -> Res
    Ok(())
 }

-#[test]
-fn quantized_mm() -> Result<()> {
-    ggml_matmul_error_test::<k_quants::BlockQ4_0>()?;
-    ggml_matmul_error_test::<k_quants::BlockQ4_1>()?;
-    ggml_matmul_error_test::<k_quants::BlockQ5_0>()?;
-    ggml_matmul_error_test::<k_quants::BlockQ5_1>()?;
-    ggml_matmul_error_test::<k_quants::BlockQ8_0>()?;
-    Ok(())
-}
-
 /// generates random tensors of size `m x k` and `n x k` and calculates their expected matrix multiplication result.
 fn get_random_tensors(
    m: usize,
@ -732,108 +558,6 @@ fn get_random_tensors(
    Ok((lhs, rhs, mm))
 }

-#[macro_export]
-macro_rules! quantized_matmul {
-    // TODO: Switch to generating the two last arguments automatically once concat_idents is
-    // stable. https://github.com/rust-lang/rust/issues/29599
-    ($fn_name: ident, $fn_name_cpu: ident, $fn_name_cuda: ident, $fn_name_metal: ident, $dtype: expr) => {
-        fn $fn_name(device: &Device) -> Result<()> {
-            test_matmul(device, (1, 3, 4, 256), $dtype)?;
-            Ok(())
-        }
-
-        test_device!($fn_name, $fn_name_cpu, $fn_name_cuda, $fn_name_metal);
-    };
-}
-
-quantized_matmul!(
-    quantized_matmul_q4_0_bis,
-    quantized_matmul_q4_0_cpu,
-    quantized_matmul_q4_0_cuda,
-    quantized_matmul_q4_0_metal,
-    GgmlDType::Q4_0
-);
-quantized_matmul!(
-    quantized_matmul_q4_1_bis,
-    quantized_matmul_q4_1_cpu,
-    quantized_matmul_q4_1_cuda,
-    quantized_matmul_q4_1_metal,
-    GgmlDType::Q4_1
-);
-quantized_matmul!(
-    quantized_matmul_q5_0_bis,
-    quantized_matmul_q5_0_cpu,
-    quantized_matmul_q5_0_cuda,
-    quantized_matmul_q5_0_metal,
-    GgmlDType::Q5_0
-);
-quantized_matmul!(
-    quantized_matmul_q5_1_bis,
-    quantized_matmul_q5_1_cpu,
-    quantized_matmul_q5_1_cuda,
-    quantized_matmul_q5_1_metal,
-    GgmlDType::Q5_1
-);
-quantized_matmul!(
-    quantized_matmul_q8_0_bis,
-    quantized_matmul_q8_0_cpu,
-    quantized_matmul_q8_0_cuda,
-    quantized_matmul_q8_0_metal,
-    GgmlDType::Q8_0
-);
-// Not implemented in Ggml
-// quantized_matmul!(
-//     quantized_matmul_q8_1_bis,
-//     quantized_matmul_q8_1_cpu,
-//     quantized_matmul_q8_1_cuda,
-//     quantized_matmul_q8_1_metal,
-//     GgmlDType::Q8_1
-// );
-// TODO: This is bugged (also bugged in GGML).
-quantized_matmul!(
-    quantized_matmul_q2k_bis,
-    quantized_matmul_q2k_cpu,
-    quantized_matmul_q2k_cuda,
-    quantized_matmul_q2k_metal,
-    GgmlDType::Q2K
-);
-quantized_matmul!(
-    quantized_matmul_q3k_bis,
-    quantized_matmul_q3k_cpu,
-    quantized_matmul_q3k_cuda,
-    quantized_matmul_q3k_metal,
-    GgmlDType::Q3K
-);
-quantized_matmul!(
-    quantized_matmul_q4k_bis,
-    quantized_matmul_q4k_cpu,
-    quantized_matmul_q4k_cuda,
-    quantized_matmul_q4k_metal,
-    GgmlDType::Q4K
-);
-quantized_matmul!(
-    quantized_matmul_q5k_bis,
-    quantized_matmul_q5k_cpu,
-    quantized_matmul_q5k_cuda,
-    quantized_matmul_q5k_metal,
-    GgmlDType::Q5K
-);
-quantized_matmul!(
-    quantized_matmul_q6k_bis,
-    quantized_matmul_q6k_cpu,
-    quantized_matmul_q6k_cuda,
-    quantized_matmul_q6k_metal,
-    GgmlDType::Q6K
-);
-// Not implemented on metal
-// quantized_matmul!(
-//     quantized_matmul_q8k_bis,
-//     quantized_matmul_q8k_cpu,
-//     quantized_matmul_q8k_cuda,
-//     quantized_matmul_q8k_metal,
-//     GgmlDType::Q8K
-// );
-
 #[test]
 fn quantized_matmul_q2k() -> Result<()> {
    use k_quants::BlockQ2K;
@ -846,8 +570,8 @@ fn quantized_matmul_q2k() -> Result<()> {
    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);

-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q2K)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
+    let rhs = quantized::QTensor::quantize::<BlockQ2K>(&rhs)?;
+    let rhs = quantized::QMatMul::from_qtensor(rhs);
    let mm = rhs.forward(&lhs)?;

    assert_eq!(mm.dims(), [m, n]);
@ -872,8 +596,8 @@ fn quantized_matmul_q3k() -> Result<()> {
    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);

-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q3K)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
+    let rhs = quantized::QTensor::quantize::<BlockQ3K>(&rhs)?;
+    let rhs = quantized::QMatMul::from_qtensor(rhs);
    let mm = rhs.forward(&lhs)?;

    assert_eq!(mm.dims(), [m, n]);
@ -898,8 +622,8 @@ fn quantized_matmul_q4k() -> Result<()> {
    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);

-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q4K)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
+    let rhs = quantized::QTensor::quantize::<BlockQ4K>(&rhs)?;
+    let rhs = quantized::QMatMul::from_qtensor(rhs);
    let mm = rhs.forward(&lhs)?;

    assert_eq!(mm.dims(), [m, n]);
@ -924,8 +648,8 @@ fn quantized_matmul_q5k() -> Result<()> {
    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);

-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q5K)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
+    let rhs = quantized::QTensor::quantize::<BlockQ5K>(&rhs)?;
+    let rhs = quantized::QMatMul::from_qtensor(rhs);
    let mm = rhs.forward(&lhs)?;

    assert_eq!(mm.dims(), [m, n]);
@ -951,8 +675,8 @@ fn quantized_matmul_q6k() -> Result<()> {
    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);

-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q6K)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
+    let rhs = quantized::QTensor::quantize::<BlockQ6K>(&rhs)?;
+    let rhs = quantized::QMatMul::from_qtensor(rhs);
    let mm = rhs.forward(&lhs)?;

    assert_eq!(mm.dims(), [m, n]);
@ -963,28 +687,3 @@ fn quantized_matmul_q6k() -> Result<()> {
    ggml_matmul_error_test::<BlockQ6K>()?;
    Ok(())
 }
-
-#[test]
-fn quantized_matmul_q8k() -> Result<()> {
-    use k_quants::BlockQ8K;
-
-    let cpu = &Device::Cpu;
-    let (m, k, n) = (11, 512, 21);
-    let (lhs, rhs, mm) = get_random_tensors(m, k, n, cpu)?;
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);
-
-    let rhs = quantized::QTensor::quantize(&rhs, GgmlDType::Q8K)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
-    let mm = rhs.forward(&lhs)?;
-
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.266, 1.504, -0.204, 1.7]);
-
-    ggml_matmul_error_test::<BlockQ8K>()?;
-    Ok(())
-}
--- a/candle-core/tests/serialization_tests.rs
+++ b/candle-core/tests/serialization_tests.rs
@ -1,24 +0,0 @@
-use candle_core::{DType, Result, Tensor};
-
-#[test]
-fn npy() -> Result<()> {
-    let npy = Tensor::read_npy("tests/test.npy")?;
-    assert_eq!(
-        npy.to_dtype(DType::U8)?.to_vec1::<u8>()?,
-        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
-    );
-    Ok(())
-}
-
-#[test]
-fn npz() -> Result<()> {
-    let npz = Tensor::read_npz("tests/test.npz")?;
-    assert_eq!(npz.len(), 2);
-    assert_eq!(npz[0].0, "x");
-    assert_eq!(npz[1].0, "x_plus_one");
-    assert_eq!(
-        npz[1].1.to_dtype(DType::U8)?.to_vec1::<u8>()?,
-        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-    );
-    Ok(())
-}
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@ -1,4 +1,4 @@
-use candle_core::{test_device, test_utils, DType, Device, IndexOp, Result, Tensor, D};
+use candle_core::{test_device, DType, Device, IndexOp, Result, Tensor};

 fn zeros(device: &Device) -> Result<()> {
    let tensor = Tensor::zeros((5, 2), DType::F32, device)?;
@ -8,58 +8,6 @@ fn zeros(device: &Device) -> Result<()> {
    Ok(())
 }

-fn ones(device: &Device) -> Result<()> {
-    assert_eq!(
-        Tensor::ones((2, 3), DType::U8, device)?.to_vec2::<u8>()?,
-        [[1, 1, 1], [1, 1, 1]],
-    );
-    assert_eq!(
-        Tensor::ones((2, 3), DType::U32, device)?.to_vec2::<u32>()?,
-        [[1, 1, 1], [1, 1, 1]],
-    );
-    assert_eq!(
-        Tensor::ones((2, 3), DType::I64, device)?.to_vec2::<i64>()?,
-        [[1, 1, 1], [1, 1, 1]],
-    );
-    assert_eq!(
-        Tensor::ones((2, 3), DType::F32, device)?.to_vec2::<f32>()?,
-        [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]],
-    );
-    assert_eq!(
-        Tensor::ones((2, 3), DType::F64, device)?.to_vec2::<f64>()?,
-        [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]],
-    );
-    Ok(())
-}
-
-fn full(device: &Device) -> Result<()> {
-    assert_eq!(
-        Tensor::full(42u32, (2, 3), device)?.to_vec2::<u32>()?,
-        [[42, 42, 42], [42, 42, 42]],
-    );
-    Ok(())
-}
-
-fn arange(device: &Device) -> Result<()> {
-    assert_eq!(
-        Tensor::arange(0u8, 5u8, device)?.to_vec1::<u8>()?,
-        [0, 1, 2, 3, 4],
-    );
-    assert_eq!(
-        Tensor::arange_step(0u8, 5u8, 2, device)?.to_vec1::<u8>()?,
-        [0, 2, 4],
-    );
-    assert_eq!(
-        Tensor::arange_step(0u8, 5u8, 3, device)?.to_vec1::<u8>()?,
-        [0, 3],
-    );
-    assert_eq!(
-        Tensor::arange_step(5i64, 0i64, -1, device)?.to_vec1::<i64>()?,
-        [5, 4, 3, 2, 1],
-    );
-    Ok(())
-}
-
 fn add_mul(device: &Device) -> Result<()> {
    let tensor = Tensor::new(&[3f32, 1., 4.], device)?;
    let dim1 = tensor.dims1()?;
@ -85,83 +33,6 @@ fn tensor_2d(device: &Device) -> Result<()> {
    Ok(())
 }

-fn clamp(device: &Device) -> Result<()> {
-    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
-    let tensor = Tensor::new(data, device)?;
-    let tensor = tensor.clamp(1.5, 6.2)?;
-    assert_eq!(
-        tensor.to_vec2::<f32>()?,
-        [[3.0, 1.5, 4.0, 1.5, 5.0], [2.0, 1.5, 6.2, 6.2, 2.0]],
-    );
-    Ok(())
-}
-
-fn unary_op(device: &Device) -> Result<()> {
-    let data = &[[-3f32, 1., 4., -0.1, 0.5], [2.7, -1.8, -0.28, 1.8, 2.8]];
-    let tensor = Tensor::new(data, device)?;
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.gelu()?, 4)?,
-        [
-            [-0.0036, 0.8412, 3.9999, -0.046, 0.3457],
-            [2.6911, -0.0647, -0.1091, 1.7353, 2.7933]
-        ]
-    );
-    let t_f16 = tensor.to_dtype(DType::F16)?.gelu()?.to_dtype(DType::F32)?;
-    let max_diff = (tensor.gelu()? - t_f16)?.flatten_all()?.max(0)?;
-    assert!(max_diff.to_vec0::<f32>()? < 5e-3);
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.gelu_erf()?, 4)?,
-        [
-            [-0.004, 0.8413, 3.9999, -0.046, 0.3457],
-            [2.6906, -0.0647, -0.1091, 1.7353, 2.7928]
-        ]
-    );
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.erf()?, 4)?,
-        [
-            [-1.0, 0.8427, 1.0, -0.1125, 0.5205],
-            [0.9999, -0.9891, -0.3079, 0.9891, 0.9999]
-        ]
-    );
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.silu()?, 4)?,
-        [
-            [-0.1423, 0.7311, 3.9281, -0.0475, 0.3112],
-            [2.53, -0.2553, -0.1205, 1.5447, 2.6395]
-        ]
-    );
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.ceil()?, 4)?,
-        [[-3.0, 1.0, 4.0, -0.0, 1.0], [3.0, -1.0, -0.0, 2.0, 3.0]]
-    );
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.floor()?, 4)?,
-        [[-3.0, 1.0, 4.0, -1.0, 0.0], [2.0, -2.0, -1.0, 1.0, 2.0]]
-    );
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.round()?, 4)?,
-        [[-3.0, 1.0, 4.0, -0.0, 1.0], [3.0, -2.0, -0.0, 2.0, 3.0]]
-    );
-    let tensor = Tensor::new(&[2997.9246, 314.15926f32], device)?;
-    assert_eq!(
-        test_utils::to_vec1_round(&tensor.round_to(2)?, 4)?,
-        [2997.92, 314.16]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(&tensor.round_to(-2)?, 4)?,
-        [3000.0, 300.]
-    );
-    let tensor = Tensor::new(
-        &[-1.01f32, -0.9, -0.1, 0.0, -0.0, 0.1, 0.9, 1.0, 1.1],
-        device,
-    )?;
-    assert_eq!(
-        tensor.sign()?.to_vec1::<f32>()?,
-        [-1., -1., -1., 0., 0., 1., 1., 1., 1.]
-    );
-    Ok(())
-}
-
 fn binary_op(device: &Device) -> Result<()> {
    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
    let tensor1 = Tensor::new(data, device)?;
@ -206,22 +77,6 @@ fn transpose(device: &Device) -> Result<()> {
    Ok(())
 }

-fn var(device: &Device) -> Result<()> {
-    // Values taken from https://pytorch.org/docs/stable/generated/torch.var.html
-    let data = &[
-        [0.2035f32, 1.2959, 1.8101, -0.4644],
-        [1.5027, -0.3270, 0.5905, 0.6538],
-        [-1.5745, 1.3330, -0.5596, -0.6548],
-        [0.1264, -0.5080, 1.6420, 0.1992],
-    ];
-    let tensor = Tensor::new(data, device)?;
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.var_keepdim(1)?, 4)?,
-        &[[1.0631], [0.559], [1.4893], [0.8258]]
-    );
-    Ok(())
-}
-
 fn sum(device: &Device) -> Result<()> {
    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
    let tensor = Tensor::new(data, device)?;
@ -683,31 +538,6 @@ fn cat(device: &Device) -> Result<()> {
            [2.0, 7.0, 1.0, 8.0, 2.0, 2.0, 7.0, 1.0, 8.0, 2.0]
        ]
    );
-
-    // 3D
-    let t1 = Tensor::arange(0, 48i64, device)?.reshape((2, 6, 4))?;
-    let t2 = Tensor::arange(100, 124i64, device)?.reshape((2, 3, 4))?;
-    let t3 = Tensor::arange(10000, 10032i64, device)?.reshape((2, 4, 4))?;
-
-    let t_cat = Tensor::cat(&[&t1, &t2, &t3], 1)?;
-
-    let t1 = t1.t()?.contiguous()?.t()?;
-    let t2 = t2.t()?.contiguous()?.t()?;
-    let t3 = t3.t()?.contiguous()?.t()?;
-    let t_cat2 = Tensor::cat(&[&t1, &t2, &t3], 1)?;
-
-    let diff = t_cat.eq(&t_cat2)?.to_dtype(DType::F32)?.sum_all()?;
-    assert_eq!(diff.to_vec0::<f32>()?, 104.0);
-    assert_eq!(t_cat.i((0, 0, 0))?.to_vec0::<i64>()?, 0);
-    assert_eq!(t_cat.i((0, 4, 0))?.to_vec0::<i64>()?, 16);
-    assert_eq!(t_cat.i((0, 5, 0))?.to_vec0::<i64>()?, 20);
-    assert_eq!(t_cat.i((1, 5, 0))?.to_vec0::<i64>()?, 44);
-    assert_eq!(t_cat.i((0, 6, 0))?.to_vec0::<i64>()?, 100);
-    assert_eq!(t_cat.i((1, 6, 0))?.to_vec0::<i64>()?, 112);
-    assert_eq!(t_cat.i((0, 6, 1))?.to_vec0::<i64>()?, 101);
-    assert_eq!(t_cat.i((0, 7, 1))?.to_vec0::<i64>()?, 105);
-    assert_eq!(t_cat.i((0, 12, 1))?.to_vec0::<i64>()?, 10013);
-    assert_eq!(t_cat.i((1, 12, 3))?.to_vec0::<i64>()?, 10031);
    Ok(())
 }

@ -718,8 +548,6 @@ fn embeddings(device: &Device) -> Result<()> {
    assert_eq!(hs.to_vec2::<f32>()?, &[[0.0, 1.0], [4.0, 5.0], [2.0, 3.0]]);
    let hs = t.index_select(&ids, 0)?;
    assert_eq!(hs.to_vec2::<f32>()?, &[[0.0, 1.0], [4.0, 5.0], [2.0, 3.0]]);
-    let hs = t.index_select(&ids.to_dtype(DType::I64)?, 0)?;
-    assert_eq!(hs.to_vec2::<f32>()?, &[[0.0, 1.0], [4.0, 5.0], [2.0, 3.0]]);
    Ok(())
 }

@ -747,48 +575,21 @@ fn index_select(device: &Device) -> Result<()> {
            [9.0, 10.0, 11.0]
        ]
    );
-    for dtype in [DType::U8, DType::U32, DType::I64] {
-        let ids = ids.to_dtype(dtype)?;
-        let hs = t.index_select(&ids, 1)?;
-        assert_eq!(
-            hs.to_vec2::<f32>()?,
-            &[
-                [0.0, 2.0, 1.0],
-                [3.0, 5.0, 4.0],
-                [6.0, 8.0, 7.0],
-                [9.0, 11.0, 10.0]
-            ]
-        );
-        let hs = t.index_select(&ids, 0)?;
-        assert_eq!(
-            hs.to_vec2::<f32>()?,
-            &[[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]]
-        );
-        // Prior to https://github.com/huggingface/candle/pull/1022
-        // There would be a bug where the last values in the result tensor would be set to 0.
-        let ids = Tensor::new(&[0u32, 2u32, 1u32, 0u32, 2u32, 1u32], device)?;
-        let hs = t.index_select(&ids, 0)?;
-        assert_eq!(
-            hs.to_vec2::<f32>()?,
-            &[
-                [0.0, 1.0, 2.0],
-                [6.0, 7.0, 8.0],
-                [3.0, 4.0, 5.0],
-                [0.0, 1.0, 2.0],
-                [6.0, 7.0, 8.0],
-                [3.0, 4.0, 5.0],
-            ]
-        );
-
-        // Test when selecting dim > 0 with ids size different from elem count of
-        // target dim in source/input.
-        let ids = Tensor::new(&[1u32, 0u32, 1u32], device)?;
-        let t = Tensor::arange(1f32, 5f32, device)?.reshape((2, 2))?;
-        assert_eq!(t.to_vec2::<f32>()?, &[[1.0, 2.0], [3.0, 4.0]]);
-        let hs = t.index_select(&ids, 1)?;
-        assert_eq!(hs.to_vec2::<f32>()?, &[[2.0, 1.0, 2.0], [4.0, 3.0, 4.0]]);
-    }
-
+    let hs = t.index_select(&ids, 1)?;
+    assert_eq!(
+        hs.to_vec2::<f32>()?,
+        &[
+            [0.0, 2.0, 1.0],
+            [3.0, 5.0, 4.0],
+            [6.0, 8.0, 7.0],
+            [9.0, 11.0, 10.0]
+        ]
+    );
+    let hs = t.index_select(&ids, 0)?;
+    assert_eq!(
+        hs.to_vec2::<f32>()?,
+        &[[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]]
+    );
    Ok(())
 }

@ -835,48 +636,6 @@ fn index_add(device: &Device) -> Result<()> {
    Ok(())
 }

-fn slice_scatter(device: &Device) -> Result<()> {
-    let t = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        &[
-            [0.0, 1.0, 2.0],
-            [3.0, 4.0, 5.0],
-            [6.0, 7.0, 8.0],
-            [9.0, 10.0, 11.0]
-        ]
-    );
-    let src = Tensor::arange(100f32, 106f32, device)?.reshape((2, 3))?;
-    assert_eq!(
-        t.slice_scatter0(&src, 0)?.to_vec2::<f32>()?,
-        &[
-            [100.0, 101.0, 102.0],
-            [103.0, 104.0, 105.0],
-            [6.0, 7.0, 8.0],
-            [9.0, 10.0, 11.0]
-        ]
-    );
-    assert_eq!(
-        t.slice_scatter0(&src, 1)?.to_vec2::<f32>()?,
-        &[
-            [0.0, 1.0, 2.0],
-            [100.0, 101.0, 102.0],
-            [103.0, 104.0, 105.0],
-            [9.0, 10.0, 11.0]
-        ]
-    );
-    assert_eq!(
-        t.slice_scatter0(&src, 2)?.to_vec2::<f32>()?,
-        &[
-            [0.0, 1.0, 2.0],
-            [3.0, 4.0, 5.0],
-            [100.0, 101.0, 102.0],
-            [103.0, 104.0, 105.0],
-        ]
-    );
-    Ok(())
-}
-
 fn scatter_add(device: &Device) -> Result<()> {
    let t = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
    assert_eq!(
@ -949,6 +708,74 @@ fn gather(device: &Device) -> Result<()> {
    Ok(())
 }

+/// Checks `Tensor::matmul` on `device` against hand-computed products.
+///
+/// Covers a (2, 2) x (2, 2) product, a (2, 1) x (1, 2) product, a (2, 3) x (3, 2) product and a
+/// batched (2, 2, 3) x (2, 3, 2) product. The batched case is then repeated with non-contiguous
+/// operands, which must give the same values.
+fn matmul(device: &Device) -> Result<()> {
+    // Square case: [[1, 2], [3, 4]] multiplied by itself.
+    let data = vec![1.0f32, 2.0, 3.0, 4.0];
+    let a = Tensor::from_slice(&data, (2, 2), device)?;
+    let data = vec![1.0f32, 2.0, 3.0, 4.0];
+    let b = Tensor::from_slice(&data, (2, 2), device)?;
+
+    let c = a.matmul(&b)?;
+    assert_eq!(c.to_vec2::<f32>()?, &[[7.0f32, 10.0], [15.0, 22.0]]);
+
+    // Column vector (2, 1) times row vector (1, 2).
+    let data = vec![1.0f32, 2.0];
+    let a = Tensor::from_slice(&data, (2, 1), device)?;
+    let data = vec![3.0f32, 4.0];
+    let b = Tensor::from_slice(&data, (1, 2), device)?;
+    let c = a.matmul(&b)?;
+    assert_eq!(c.to_vec2::<f32>()?, &[&[3.0, 4.0], &[6.0, 8.0]]);
+
+    // Rectangular case: (2, 3) filled with 0..6 times (3, 2) filled with 2..8.
+    let data: Vec<_> = (0..6).map(|i| i as f32).collect();
+    let a = Tensor::from_slice(&data, (2, 3), device)?;
+    let data: Vec<_> = (0..6).map(|i| (i + 2) as f32).collect();
+    let b = Tensor::from_slice(&data, (3, 2), device)?;
+    let c = a.matmul(&b)?;
+    assert_eq!(c.to_vec2::<f32>()?, &[&[16., 19.], &[52., 64.]]);
+
+    // Batched case: two (2, 3) x (3, 2) products stacked along a leading dimension of size 2.
+    let data: Vec<_> = (0..12).map(|i| i as f32).collect();
+    let a = Tensor::from_slice(&data, (2, 2, 3), device)?;
+    let data: Vec<_> = (0..12).map(|i| (i + 2) as f32).collect();
+    let b = Tensor::from_slice(&data, (2, 3, 2), device)?;
+    let expected = [[[16., 19.], [52., 64.]], [[214., 235.], [304., 334.]]];
+
+    let c = a.matmul(&b)?;
+    assert_eq!(c.to_vec3::<f32>()?, &expected);
+
+    // Also perform the matmul on non-contiguous operands: `t().contiguous().t()` keeps the dims
+    // but changes the strides, as the assertions below verify.
+    let a_tt = a.t()?.contiguous()?.t()?;
+    assert!(!a_tt.is_contiguous());
+    assert_eq!(a.dims(), a_tt.dims());
+    assert_eq!(a_tt.stride(), &[6, 1, 2]);
+
+    let b_tt = b.t()?.contiguous()?.t()?;
+    assert!(!b_tt.is_contiguous());
+    assert_eq!(b.dims(), b_tt.dims());
+    assert_eq!(b_tt.stride(), &[6, 1, 3]);
+
+    // Every mix of contiguous and non-contiguous operands must yield the same result.
+    assert_eq!(a_tt.matmul(&b)?.to_vec3::<f32>()?, &expected);
+    assert_eq!(a.matmul(&b_tt)?.to_vec3::<f32>()?, &expected);
+    assert_eq!(a_tt.matmul(&b_tt)?.to_vec3::<f32>()?, &expected);
+    Ok(())
+}
+
+/// Checks `Tensor::broadcast_matmul` on `device` against per-slice `matmul` results.
+///
+/// A random (3, 1, 4, 5) lhs is multiplied with a random (6, 5, 2) rhs. The output is asserted to
+/// have dims (3, 6, 4, 2), and each `(idx1, idx2)` slice of it is compared with the plain
+/// `matmul` of the `(idx1, 0)` slice of lhs and the `idx2` slice of rhs.
+fn broadcast_matmul(device: &Device) -> Result<()> {
+    let lhs = Tensor::randn(0f32, 1f32, (3, 1, 4, 5), device)?;
+    let rhs = Tensor::randn(0f32, 1f32, (6, 5, 2), device)?;
+    let out = lhs.broadcast_matmul(&rhs)?;
+    assert_eq!(out.dims(), &[3, 6, 4, 2]);
+    for idx1 in 0..3 {
+        for idx2 in 0..6 {
+            // These bindings shadow the outer tensors with their slices for this iteration only.
+            let out = out.i((idx1, idx2))?;
+            let lhs = lhs.i((idx1, 0))?;
+            let rhs = rhs.i(idx2)?;
+            // `out2` is left as a `Result`; the `?` applied to the subtraction below surfaces
+            // any error. NOTE(review): relies on candle's operator impls accepting a `Result`
+            // operand - confirm.
+            let out2 = lhs.matmul(&rhs);
+            // Sum of squared element-wise differences between the two results.
+            let sum_diff2 = (out - out2)?.sqr()?.sum_all()?;
+            // With cuda, we see errors of up to ~1e-12.
+            assert!(sum_diff2.to_vec0::<f32>()? < 1e-6)
+        }
+    }
+    Ok(())
+}
+
 fn broadcasting(device: &Device) -> Result<()> {
    let t1 = Tensor::arange(0f32, 24f32, device)?.reshape((4, 2, 3))?;
    let t2 = Tensor::new(&[100f32, 200f32], device)?;
@ -1050,87 +877,28 @@ fn broadcasting(device: &Device) -> Result<()> {
    Ok(())
 }

-fn randn(device: &Device) -> Result<()> {
-    let tensor = Tensor::randn(0f32, 1f32, (5, 3), device)?;
-    assert_eq!(tensor.dims(), [5, 3]);
-    // Check that the seed gets updated by checking that
-    // a new series of numbers is generated each time
-    let tensor2 = Tensor::randn(0f32, 1f32, (5, 3), device)?;
-    assert_ne!(tensor.to_vec2::<f32>()?, tensor2.to_vec2::<f32>()?);
-    let tensor = Tensor::rand(0f32, 1f32, (5, 3), device)?;
-    assert_eq!(tensor.dims(), [5, 3]);
-    // Check that the seed gets updated by checking that
-    // a new series of numbers is generated each time
-    let tensor2 = Tensor::rand(0f32, 1f32, (5, 3), device)?;
-    assert_ne!(tensor.to_vec2::<f32>()?, tensor2.to_vec2::<f32>()?);
-    // We do not expect deterministic elements at any index.
-    // There once was a bug that had a deterministic zero element in evenly sized tensors.
-    const N: usize = 2;
-    let v = (0..100)
-        .map(|_| Tensor::randn(0f32, 1f32, N, device).and_then(|t| t.to_vec1::<f32>()))
-        .collect::<Result<Vec<_>>>()?;
-    assert!(
-        (0..N).all(|i| v.windows(2).any(|pair| pair[0][i] != pair[1][i])),
-        "There are deterministic values in the randn tensors"
-    );
-    let v = (0..100)
-        .map(|_| Tensor::rand(0f32, 1f32, N, device).and_then(|t| t.to_vec1::<f32>()))
-        .collect::<Result<Vec<_>>>()?;
-    assert!(
-        (0..N).all(|i| v.windows(2).any(|pair| pair[0][i] != pair[1][i])),
-        "There are deterministic values in the rand tensors"
-    );
-    Ok(())
-}
-
-test_device!(zeros, zeros_cpu, zeros_gpu, zeros_metal);
-test_device!(ones, ones_cpu, ones_gpu, ones_metal);
-test_device!(full, full_cpu, full_gpu, full_metal);
-test_device!(arange, arange_cpu, arange_gpu, arange_metal);
-test_device!(add_mul, add_mul_cpu, add_mul_gpu, add_mul_metal);
-test_device!(tensor_2d, tensor_2d_cpu, tensor_2d_gpu, tensor_2d_metal);
-test_device!(narrow, narrow_cpu, narrow_gpu, narrow_metal);
-test_device!(broadcast, broadcast_cpu, broadcast_gpu, broadcast_metal);
-test_device!(cat, cat_cpu, cat_gpu, cat_metal);
-test_device!(sum, sum_cpu, sum_gpu, sum_metal);
-test_device!(min, min_cpu, min_gpu, min_metal);
-test_device!(max, max_cpu, max_gpu, max_metal);
-test_device!(argmax, argmax_cpu, argmax_gpu, argmax_metal);
-test_device!(argmin, argmin_cpu, argmin_gpu, argmin_metal);
-test_device!(transpose, transpose_cpu, transpose_gpu, transpose_metal);
-test_device!(unary_op, unary_op_cpu, unary_op_gpu, unary_op_metal);
-test_device!(binary_op, binary_op_cpu, binary_op_gpu, binary_op_metal);
-test_device!(embeddings, embeddings_cpu, embeddings_gpu, embeddings_metal);
-test_device!(cmp, cmp_cpu, cmp_gpu, cmp_metal);
-test_device!(
-    broadcasting,
-    broadcasting_cpu,
-    broadcasting_gpu,
-    broadcasting_metal
-);
-test_device!(
-    index_select,
-    index_select_cpu,
-    index_select_gpu,
-    index_select_metal
-);
-test_device!(index_add, index_add_cpu, index_add_gpu, index_add_metal);
-test_device!(gather, gather_cpu, gather_gpu, gather_metal);
-test_device!(
-    scatter_add,
-    scatter_add_cpu,
-    scatter_add_gpu,
-    scatter_add_metal
-);
-test_device!(
-    slice_scatter,
-    slice_scatter_cpu,
-    slice_scatter_gpu,
-    slice_scatter_metal
-);
-test_device!(randn, randn_cpu, randn_gpu, randn_metal);
-test_device!(clamp, clamp_cpu, clamp_gpu, clamp_metal);
-test_device!(var, var_cpu, var_gpu, var_metal);
+// Register the device-generic test functions above through `test_device!` (imported from
+// `candle_core`). Each call passes the test function followed by two identifiers, presumably the
+// names of the generated CPU and GPU test variants.
+// NOTE(review): the macro expansion is not visible in this file - confirm.
+test_device!(zeros, zeros_cpu, zeros_gpu);
+test_device!(add_mul, add_mul_cpu, add_mul_gpu);
+test_device!(tensor_2d, tensor_2d_cpu, tensor_2d_gpu);
+test_device!(narrow, narrow_cpu, narrow_gpu);
+test_device!(broadcast, broadcast_cpu, broadcast_gpu);
+test_device!(cat, cat_cpu, cat_gpu);
+test_device!(sum, sum_cpu, sum_gpu);
+test_device!(min, min_cpu, min_gpu);
+test_device!(max, max_cpu, max_gpu);
+test_device!(argmax, argmax_cpu, argmax_gpu);
+test_device!(argmin, argmin_cpu, argmin_gpu);
+test_device!(transpose, transpose_cpu, transpose_gpu);
+test_device!(binary_op, binary_op_cpu, binary_op_gpu);
+test_device!(embeddings, embeddings_cpu, embeddings_gpu);
+test_device!(cmp, cmp_cpu, cmp_gpu);
+test_device!(matmul, matmul_cpu, matmul_gpu);
+test_device!(broadcast_matmul, broadcast_matmul_cpu, broadcast_matmul_gpu);
+test_device!(broadcasting, broadcasting_cpu, broadcasting_gpu);
+test_device!(index_select, index_select_cpu, index_select_gpu);
+test_device!(index_add, index_add_cpu, index_add_gpu);
+test_device!(gather, gather_cpu, gather_gpu);
+test_device!(scatter_add, scatter_add_cpu, scatter_add_gpu);

 // There was originally a bug on the CPU implementation for randn
 // https://github.com/huggingface/candle/issues/381
@ -1142,124 +910,3 @@ fn randn_hasneg() -> Result<()> {
    }
    Ok(())
 }
-
-#[test]
-fn pad_with_same() -> Result<()> {
-    let t = Tensor::arange(1f32, 5f32, &Device::Cpu)?.reshape((2, 2))?;
-    let t0 = t.pad_with_same(0, 1, 2)?;
-    assert_eq!(
-        t0.to_vec2::<f32>()?,
-        [[1.0, 2.0], [1.0, 2.0], [3.0, 4.0], [3.0, 4.0], [3.0, 4.0]]
-    );
-    let t1 = t.pad_with_same(1, 1, 2)?;
-    assert_eq!(
-        t1.to_vec2::<f32>()?,
-        [[1.0, 1.0, 2.0, 2.0, 2.0], [3.0, 3.0, 4.0, 4.0, 4.0]]
-    );
-    Ok(())
-}
-
-#[test]
-fn i64_abs() -> Result<()> {
-    let t = Tensor::new(&[-42i64, 1337], &Device::Cpu)?;
-    let t = t.abs()?;
-    assert_eq!(t.to_vec1::<i64>()?, [42, 1337]);
-    Ok(())
-}
-
-#[test]
-fn tril_triu_eye() -> Result<()> {
-    let t = Tensor::tril2(4, DType::F32, &Device::Cpu)?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        [
-            [1.0, 0.0, 0.0, 0.0],
-            [1.0, 1.0, 0.0, 0.0],
-            [1.0, 1.0, 1.0, 0.0],
-            [1.0, 1.0, 1.0, 1.0]
-        ],
-    );
-    let t = Tensor::triu2(4, DType::F32, &Device::Cpu)?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        [
-            [1.0, 1.0, 1.0, 1.0],
-            [0.0, 1.0, 1.0, 1.0],
-            [0.0, 0.0, 1.0, 1.0],
-            [0.0, 0.0, 0.0, 1.0]
-        ]
-    );
-    let t = Tensor::eye(4, DType::F32, &Device::Cpu)?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        [
-            [1.0, 0.0, 0.0, 0.0],
-            [0.0, 1.0, 0.0, 0.0],
-            [0.0, 0.0, 1.0, 0.0],
-            [0.0, 0.0, 0.0, 1.0]
-        ]
-    );
-    Ok(())
-}
-
-#[test]
-fn cumsum() -> Result<()> {
-    let t = &[3f32, 1., 4., 1., 5.];
-    let t = Tensor::new(t, &Device::Cpu)?;
-    assert_eq!(t.cumsum(0)?.to_vec1::<f32>()?, [3., 4., 8., 9., 14.]);
-    let t = t.unsqueeze(1)?;
-    assert_eq!(
-        t.cumsum(0)?.to_vec2::<f32>()?,
-        [[3.0], [4.0], [8.0], [9.0], [14.0]]
-    );
-    assert_eq!(
-        t.cumsum(1)?.to_vec2::<f32>()?,
-        [[3.0], [1.0], [4.0], [1.0], [5.0]]
-    );
-    let t = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
-    let t = Tensor::new(t, &Device::Cpu)?;
-    assert_eq!(
-        t.cumsum(1)?.to_vec2::<f32>()?,
-        [[3.0, 4.0, 8.0, 9.0, 14.0], [2.0, 3.0, 10.0, 18.0, 20.0]],
-    );
-    assert_eq!(
-        t.cumsum(0)?.to_vec2::<f32>()?,
-        [[3.0, 1.0, 4.0, 1.0, 5.0], [5.0, 2.0, 11.0, 9.0, 7.0]]
-    );
-    Ok(())
-}
-
-/// A helper function for floating-point comparison. Both `a` and `b` must be 1D tensors containing the same number of elements.
-/// The assertion passes if the absolute difference of every pair of corresponding elements of `a` and `b` is smaller than `epsilon`.
-fn assert_close(a: &Tensor, b: &Tensor, epsilon: f64) -> Result<()> {
-    let a_vec: Vec<f64> = a.to_vec1()?;
-    let b_vec: Vec<f64> = b.to_vec1()?;
-
-    assert_eq!(a_vec.len(), b_vec.len());
-    for (a, b) in a_vec.iter().zip(b_vec.iter()) {
-        assert!((a - b).abs() < epsilon);
-    }
-    Ok(())
-}
-
-#[test]
-fn log_sum_exp() -> Result<()> {
-    let input = Tensor::new(&[[1f64, 2., 3.], [4., 5., 6.]], &Device::Cpu)?;
-    let output = input.log_sum_exp(D::Minus1)?;
-    // The expectations obtained from pytorch.
-    let expected = Tensor::new(&[3.4076, 6.4076], &Device::Cpu)?;
-    assert_close(&output, &expected, 0.00001)?;
-    Ok(())
-}
-
-#[test]
-fn pow() -> Result<()> {
-    let lhs = Tensor::new(&[[1f32, 2., 3.], [4., 5., 6.]], &Device::Cpu)?;
-    let rhs = (&lhs - 2.)?;
-    let res = lhs.pow(&rhs)?;
-    assert_eq!(
-        test_utils::to_vec2_round(&res, 3)?,
-        [[1.0, 1.0, 3.0], [16.0, 125.0, 1296.0]]
-    );
-    Ok(())
-}
--- a/candle-core/tests/test.npy
+++ b/candle-core/tests/test.npy
--- a/candle-core/tests/test.npz
+++ b/candle-core/tests/test.npz
--- a/candle-core/tests/test.pt
+++ b/candle-core/tests/test.pt
--- a/candle-core/tests/test_with_key.pt
+++ b/candle-core/tests/test_with_key.pt
--- a/candle-datasets/Cargo.toml
+++ b/candle-datasets/Cargo.toml
@ -11,8 +11,8 @@ readme = "README.md"

 [dependencies]
 byteorder = { workspace = true }
-candle = { workspace = true }
-candle-nn = { workspace = true }
+candle = { path = "../candle-core", version = "0.2.1", package = "candle-core" }
+candle-nn = { path = "../candle-nn", version = "0.2.1" }
 hf-hub = { workspace = true}
 intel-mkl-src = { workspace = true, optional = true }
 memmap2 = { workspace = true }
--- a/candle-datasets/src/vision/cifar.rs
+++ b/candle-datasets/src/vision/cifar.rs
@ -4,9 +4,7 @@
 //! <https://www.cs.toronto.edu/~kriz/cifar.html>
 //! The binary version of the dataset is used.
 use crate::vision::Dataset;
-use candle::{DType, Device, Error, Result, Tensor};
-use hf_hub::{api::sync::Api, Repo, RepoType};
-use parquet::file::reader::{FileReader, SerializedFileReader};
+use candle::{DType, Device, Result, Tensor};
 use std::fs::File;
 use std::io::{BufReader, Read};

@ -62,58 +60,3 @@ pub fn load_dir<T: AsRef<std::path::Path>>(dir: T) -> Result<Dataset> {
        labels: 10,
    })
 }
-
-fn load_parquet(parquet: SerializedFileReader<std::fs::File>) -> Result<(Tensor, Tensor)> {
-    let samples = parquet.metadata().file_metadata().num_rows() as usize;
-    let mut buffer_images: Vec<u8> = Vec::with_capacity(samples * 1_024);
-    let mut buffer_labels: Vec<u8> = Vec::with_capacity(samples);
-    for row in parquet.into_iter().flatten() {
-        for (_name, field) in row.get_column_iter() {
-            if let parquet::record::Field::Group(subrow) = field {
-                for (_name, field) in subrow.get_column_iter() {
-                    if let parquet::record::Field::Bytes(value) = field {
-                        let image = image::load_from_memory(value.data()).unwrap();
-                        buffer_images.extend(image.to_rgb8().as_raw());
-                    }
-                }
-            } else if let parquet::record::Field::Long(label) = field {
-                buffer_labels.push(*label as u8);
-            }
-        }
-    }
-    let images = (Tensor::from_vec(buffer_images, (samples, 3, 32, 32), &Device::Cpu)?
-        .to_dtype(DType::U8)?
-        / 255.)?;
-    let labels = Tensor::from_vec(buffer_labels, (samples,), &Device::Cpu)?;
-    Ok((images, labels))
-}
-
-pub fn load() -> Result<Dataset> {
-    let api = Api::new().map_err(|e| Error::Msg(format!("Api error: {e}")))?;
-    let dataset_id = "cifar10".to_string();
-    let repo = Repo::with_revision(
-        dataset_id,
-        RepoType::Dataset,
-        "refs/convert/parquet".to_string(),
-    );
-    let repo = api.repo(repo);
-    let test_parquet_filename = repo
-        .get("plain_text/test/0000.parquet")
-        .map_err(|e| Error::Msg(format!("Api error: {e}")))?;
-    let train_parquet_filename = repo
-        .get("plain_text/train/0000.parquet")
-        .map_err(|e| Error::Msg(format!("Api error: {e}")))?;
-    let test_parquet = SerializedFileReader::new(std::fs::File::open(test_parquet_filename)?)
-        .map_err(|e| Error::Msg(format!("Parquet error: {e}")))?;
-    let train_parquet = SerializedFileReader::new(std::fs::File::open(train_parquet_filename)?)
-        .map_err(|e| Error::Msg(format!("Parquet error: {e}")))?;
-    let (test_images, test_labels) = load_parquet(test_parquet)?;
-    let (train_images, train_labels) = load_parquet(train_parquet)?;
-    Ok(crate::vision::Dataset {
-        train_images,
-        train_labels,
-        test_images,
-        test_labels,
-        labels: 10,
-    })
-}
--- a/Show More
+++ b/Show More