Mirror of https://github.com/huggingface/candle.git, synced 2025-06-17 02:58:50 +00:00.

Compare commits: 0.9.0-alph...einsum-cus (1 commit)

Author | SHA1 | Date
---|---|---
| a910ec5993 |

.github/dependabot.yml (vendored, 7 lines changed)

@@ -1,7 +0,0 @@
-version: 2
-updates:
-  - package-ecosystem: "cargo"
-    directory: "/"
-    schedule:
-      interval: "weekly"
-    open-pull-requests-limit: 5

.github/workflows/book-cd.yml (vendored, new file, 40 lines)

@@ -0,0 +1,40 @@
+name: Deploy Rust book
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write # To push a branch
+      pull-requests: write # To create a PR from that branch
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Install latest mdbook
+        run: |
+          tag=$(curl 'https://api.github.com/repos/rust-lang/mdbook/releases/latest' | jq -r '.tag_name')
+          url="https://github.com/rust-lang/mdbook/releases/download/${tag}/mdbook-${tag}-x86_64-unknown-linux-gnu.tar.gz"
+          mkdir mdbook
+          curl -sSL $url | tar -xz --directory=./mdbook
+          echo `pwd`/mdbook >> $GITHUB_PATH
+      - name: Deploy GitHub Pages
+        run: |
+          # This assumes your book is in the root of your repository.
+          # Just add a `cd` here if you need to change to another directory.
+          cd candle-book
+          mdbook build
+          git worktree add gh-pages
+          git config user.name "Deploy from CI"
+          git config user.email ""
+          cd gh-pages
+          # Delete the ref to avoid keeping history.
+          git update-ref -d refs/heads/gh-pages
+          rm -rf *
+          mv ../book/* .
+          git add .
+          git commit -m "Deploy $GITHUB_SHA to gh-pages"
+          git push --force --set-upstream origin gh-pages

.github/workflows/book.yml (vendored, new file, 29 lines)

@@ -0,0 +1,29 @@
+name: CI
+on:
+  pull_request:
+
+jobs:
+  test:
+    name: Test candle-book
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write # To push a branch
+      pull-requests: write # To create a PR from that branch
+    steps:
+      - uses: actions/checkout@master
+      - name: Install Rust
+        run: |
+          rustup set profile minimal
+          rustup toolchain install stable
+          rustup default stable
+      - name: Install latest mdbook
+        run: |
+          tag=$(curl 'https://api.github.com/repos/rust-lang/mdbook/releases/latest' | jq -r '.tag_name')
+          url="https://github.com/rust-lang/mdbook/releases/download/${tag}/mdbook-${tag}-x86_64-unknown-linux-gnu.tar.gz"
+          mkdir bin
+          curl -sSL $url | tar -xz --directory=bin
+          echo "$(pwd)/bin" >> $GITHUB_PATH
+      - name: Run tests
+        run: cd candle-book && cargo build && mdbook test -L ../target/debug/deps/
+
+

.github/workflows/ci_cuda.yaml (vendored, 73 lines changed)

@@ -5,16 +5,47 @@ on:
   pull_request:
 
 jobs:
+  start-runner:
+    name: Start self-hosted EC2 runner
+    runs-on: ubuntu-latest
+    env:
+      AWS_REGION: us-east-1
+      EC2_AMI_ID: ami-03cfed9ea28f4b002
+      EC2_INSTANCE_TYPE: g5.xlarge
+      EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc
+      EC2_SECURITY_GROUP: sg-030175c435ac141d6
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ env.AWS_REGION }}
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: philschmid/philschmid-ec2-github-runner@main
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ${{ env.EC2_AMI_ID }}
+          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
+          subnet-id: ${{ env.EC2_SUBNET_ID }}
+          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
+          aws-resource-tags: > # optional, requires additional permissions
+            [
+              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
+            ]
+
   test-cuda:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    runs-on:
-      group: aws-g4dn-2xlarge
-    container:
-      image: nvidia/cuda:12.3.1-devel-ubuntu22.04
-      options: --gpus 0
-    if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }}
+    needs: start-runner # required to start the main job when the runner is ready
+    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
     permissions:
       contents: write
       packages: write
@@ -25,10 +56,32 @@ jobs:
     steps:
       - name: Checkout repository
        uses: actions/checkout@v3
-      - name: Install dependencies
-        run: apt-get update && apt install curl build-essential libssl-dev protobuf-compiler pkg-config -y
       - name: Install Rust Stable
-        uses: actions-rust-lang/setup-rust-toolchain@v1
+        run: curl https://sh.rustup.rs -sSf | sh -s -- -y
      - uses: Swatinem/rust-cache@v2
+      - run: apt-get update -y && apt-get install libssl-dev -y
       - name: Test (cuda)
-        run: cargo test --features cuda
+        run: PATH=$PATH:/usr/local/cuda-11.8/bin/ /root/.cargo/bin/cargo test --features cuda
+  stop-runner:
+    name: Stop self-hosted EC2 runner
+    needs:
+      - start-runner
+      - test-cuda
+    runs-on: ubuntu-latest
+    env:
+      AWS_REGION: us-east-1
+    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ env.AWS_REGION }}
+      - name: Stop EC2 runner
+        uses: philschmid/philschmid-ec2-github-runner@main
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

.github/workflows/maturin.yml (vendored): binary file not shown.

.github/workflows/python.yml (vendored, 68 lines changed)

@@ -1,68 +0,0 @@
-name: PyO3-CI
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - main
-    paths:
-      - candle-pyo3/**
-  pull_request:
-    paths:
-      - candle-pyo3/**
-
-jobs:
-  build_and_test:
-    name: Check everything builds & tests
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest] # For now, only test on Linux
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Install Rust
-        uses: actions-rs/toolchain@v1
-        with:
-          toolchain: stable
-
-      - name: Install Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: 3.11
-          architecture: "x64"
-
-      - name: Cache Cargo Registry
-        uses: actions/cache@v1
-        with:
-          path: ~/.cargo/registry
-          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
-
-      - name: Install Protoc
-        uses: arduino/setup-protoc@v2
-        with:
-          version: "25.0"
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Install
-        working-directory: ./candle-pyo3
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          pip install -U pip
-          pip install pytest maturin black
-          python -m maturin develop -r --features onnx
-
-      - name: Check style
-        working-directory: ./candle-pyo3
-        run: |
-          source .env/bin/activate
-          python stub.py --check
-          black --check .
-
-      - name: Run tests
-        working-directory: ./candle-pyo3
-        run: |
-          source .env/bin/activate
-          python -m pytest -s -v tests

.github/workflows/rust-ci.yml (vendored, 21 lines changed)

@@ -1,6 +1,6 @@
 on:
   push:
     branches:
       - main
   pull_request:
 
@@ -15,10 +15,7 @@ jobs:
         os: [ubuntu-latest, windows-latest, macOS-latest]
         rust: [stable]
     steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
+      - uses: actions/checkout@v2
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
@@ -37,13 +34,7 @@ jobs:
         os: [ubuntu-latest, windows-latest, macOS-latest]
         rust: [stable]
     steps:
-      - name: Delete huge unnecessary tools folder
-        if: runner.os == 'Linux'
-        run: rm -rf /opt/hostedtoolcache
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
+      - uses: actions/checkout@v2
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
@@ -58,7 +49,7 @@ jobs:
     name: Rustfmt
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
@@ -74,7 +65,7 @@ jobs:
     name: Clippy
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal

.github/workflows/trufflehog.yml (vendored, 15 lines changed)

@@ -1,15 +0,0 @@
-on:
-  push:
-
-name: Secret Leaks
-
-jobs:
-  trufflehog:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-      - name: Secret Scanning
-        uses: trufflesecurity/trufflehog@main

.gitignore (vendored, 18 lines changed)

@@ -9,10 +9,6 @@ target/
 # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 Cargo.lock
 
-# editor config
-.helix
-.vscode
-
 # These are backup files generated by rustfmt
 **/*.rs.bk
 
@@ -27,22 +23,14 @@ flamegraph.svg
 *.dylib
 *.so
 *.swp
-*.swo
 trace-*.json
 
 candle-wasm-examples/*/build
 candle-wasm-examples/*/*.bin
 candle-wasm-examples/*/*.jpeg
-candle-wasm-examples/*/audios/*.wav
-candle-wasm-examples/**/*.safetensors
-candle-wasm-examples/**/*.gguf
+candle-wasm-examples/*/*.wav
+candle-wasm-examples/*/*.safetensors
 candle-wasm-examples/*/package-lock.json
-candle-wasm-examples/**/config*.json
 .DS_Store
 .idea/*
-__pycache__
-out.safetensors
-out.wav
-bria.mp3
-bria.safetensors
-bria.wav

.vscode/settings.json (vendored, 11 lines changed)

@@ -1,11 +0,0 @@
-{
-  "[python]": {
-    "editor.defaultFormatter": "ms-python.black-formatter"
-  },
-  "python.formatting.provider": "none",
-  "python.testing.pytestArgs": [
-    "candle-pyo3"
-  ],
-  "python.testing.unittestEnabled": false,
-  "python.testing.pytestEnabled": true
-}

CHANGELOG.md (73 lines changed)

@@ -1,84 +1,13 @@
 # Changelog
 This documents the main changes to the `candle` crate.
 
-## v0.3.1 - Unreleased
+## v0.2.1 - Unreleased
 
 ### Added
 
-### Modified
-
-## v0.3.0 - 2023-10-01
-
-### Added
-
-- Added the Mistral 7b v0.1 model
-  [983](https://github.com/huggingface/candle/pull/983).
-- Quantized version of the Mistral model
-  [1009](https://github.com/huggingface/candle/pull/1009).
-- Add the gelu-erf op and activation function
-  [969](https://github.com/huggingface/candle/pull/969).
-- Add the mixformer/phi-v1.5 model
-  [930](https://github.com/huggingface/candle/pull/930).
-- Add the sclice-scatter op
-  [927](https://github.com/huggingface/candle/pull/927).
-- Add the Wuerstchen diffusion model
-  [911](https://github.com/huggingface/candle/pull/911).
-
-### Modified
-
-- Support for simd128 intrinsics in some quantized vecdots
-  [982](https://github.com/huggingface/candle/pull/982).
-- Optimize the index-select cuda kernel
-  [976](https://github.com/huggingface/candle/pull/976).
-- Self-contained safetensor wrappers
-  [946](https://github.com/huggingface/candle/pull/946).
-
-## v0.2.2 - 2023-09-18
-
-### Added
-- Support for `top_p` sampling
-  [819](https://github.com/huggingface/candle/pull/819).
-- T5 model including decoding
-  [864](https://github.com/huggingface/candle/pull/864).
-- 1-d upsampling
-  [839](https://github.com/huggingface/candle/pull/839).
-
-### Modified
-- Bugfix for conv2d
-  [820](https://github.com/huggingface/candle/pull/820).
-- Support tensor based indexing using `.i`
-  [842](https://github.com/huggingface/candle/pull/842).
-
-## v0.2.1 - 2023-09-11
-
-### Added
-- Add some RNNs (GRU and LSTM) in `candle-nn`
-  [674](https://github.com/huggingface/candle/pull/674),
-  [688](https://github.com/huggingface/candle/pull/688).
-- gguf v2 support
-  [725](https://github.com/huggingface/candle/pull/725).
-- Quantized llama example in Python using the pyo3 api
-  [716](https://github.com/huggingface/candle/pull/716).
-- `candle-nn` layer for conv2d-transposed
-  [760](https://github.com/huggingface/candle/pull/760).
-- Add the Segment-Anything Model (SAM) as an example
-  [773](https://github.com/huggingface/candle/pull/773).
-- TinyViT backbone for the segment anything example
-  [787](https://github.com/huggingface/candle/pull/787).
-- Shape with holes support
-  [770](https://github.com/huggingface/candle/pull/770).
-
 ### Modified
 - Dilations are now supported in conv-transpose2d.
   [671](https://github.com/huggingface/candle/pull/671).
-- Interactive mode for the quantized model
-  [690](https://github.com/huggingface/candle/pull/690).
-- Faster softmax operation
-  [747](https://github.com/huggingface/candle/pull/747).
-- Faster convolution operations on CPU and CUDA via im2col
-  [802](https://github.com/huggingface/candle/pull/802).
-- Moving some models to a more central location
-  [796](https://github.com/huggingface/candle/pull/796).
 
 ## v0.2.0 - 2023-08-30
 

Cargo.toml (63 lines changed)

@@ -3,24 +3,22 @@ members = [
     "candle-core",
     "candle-datasets",
     "candle-examples",
+    "candle-book",
     "candle-nn",
     "candle-pyo3",
     "candle-transformers",
-    "candle-wasm-examples/*",
-    "candle-wasm-tests",
-    "tensor-tools",
+    "candle-wasm-examples/llama2-c",
+    "candle-wasm-examples/whisper",
+    "candle-wasm-examples/yolo",
 ]
 exclude = [
-    "candle-book",
-    "candle-flash-attn",
-    "candle-kernels",
-    "candle-metal-kernels",
-    "candle-onnx",
+    "candle-flash-attn",
+    "candle-kernels",
 ]
 resolver = "2"
 
 [workspace.package]
-version = "0.9.0-alpha.2"
+version = "0.2.1"
 edition = "2021"
 description = "Minimalist ML framework."
 repository = "https://github.com/huggingface/candle"
@@ -29,53 +27,38 @@ categories = ["science"]
 license = "MIT OR Apache-2.0"
 
 [workspace.dependencies]
-ab_glyph = "0.2.23"
 accelerate-src = { version = "0.3.2" }
 anyhow = { version = "1", features = ["backtrace"] }
 byteorder = "1.4.3"
-candle = { path = "./candle-core", package = "candle-core", version = "0.9.0-alpha.2" }
-candle-datasets = { path = "./candle-datasets", version = "0.9.0-alpha.2" }
-candle-flash-attn = { path = "./candle-flash-attn", version = "0.9.0-alpha.2" }
-candle-kernels = { path = "./candle-kernels", version = "0.9.0-alpha.2" }
-candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.9.0-alpha.2" }
-candle-nn = { path = "./candle-nn", version = "0.9.0-alpha.2" }
-candle-onnx = { path = "./candle-onnx", version = "0.9.0-alpha.2" }
-candle-transformers = { path = "./candle-transformers", version = "0.9.0-alpha.2" }
 clap = { version = "4.2.4", features = ["derive"] }
-criterion = { version = "0.5.1", default-features=false }
-cudarc = { version = "0.15.1", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
-fancy-regex = "0.13.0"
-gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] }
-hf-hub = "0.4.1"
-half = { version = "2.5.0", features = ["num-traits", "use-intrinsics", "rand_distr"] }
-hound = "3.5.1"
-image = { version = "0.25.2", default-features = false, features = ["jpeg", "png"] }
-imageproc = { version = "0.24.0", default-features = false }
+cudarc = { version = "0.9.14", features = ["f16"] }
+# TODO: Switch back to the official gemm implementation once it has caught up.
+gemm = { version = "0.15.6", package = "candle-gemm" }
+hf-hub = "0.3.0"
+half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
+image = { version = "0.24.7", default-features = false, features = ["jpeg", "png"] }
+imageproc = { version = "0.23.0", default-features = false }
 intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
 libc = { version = "0.2.147" }
 log = "0.4"
-memmap2 = { version = "0.9.3", features = ["stable_deref_trait"] }
+memmap2 = "0.7.1"
 num_cpus = "1.15.0"
 num-traits = "0.2.15"
-parquet = { version = "51.0.0" }
-rand = "0.9.0"
-rand_distr = "0.5.1"
+rand = "0.8.5"
+rand_distr = "0.4.3"
 rayon = "1.7.0"
-safetensors = "0.4.1"
+rusttype = { version = "0.9", default-features = false }
+safetensors = "0.3.1"
 serde = { version = "1.0.171", features = ["derive"] }
-serde_plain = "1.0.2"
 serde_json = "1.0.99"
 thiserror = "1"
-tokenizers = { version = "0.21.0", default-features = false }
+tokenizers = { version = "0.13.4", default-features = false }
 tracing = "0.1.37"
 tracing-chrome = "0.7.1"
 tracing-subscriber = "0.3.7"
-ug = "0.3.1"
-ug-cuda = "0.3.1"
-ug-metal = "0.3.1"
-yoke = { version = "0.7.2", features = ["derive"] }
-zip = { version = "1.1.1", default-features = false }
-metal = { version = "0.27.0", features = ["mps"]}
+wav = "1.0.0"
+zip = { version = "0.6.6", default-features = false }
+parquet = { version = "45.0.0" }
 
 [profile.release-with-debug]
 inherits = "release"

README.md (234 lines changed)

@@ -2,17 +2,13 @@
 [](https://discord.gg/hugging-face-879548962464493619)
 [](https://crates.io/crates/candle-core)
 [](https://docs.rs/candle-core)
-[](https://github.com/huggingface/candle/blob/main/LICENSE-MIT)
-[](https://github.com/huggingface/candle/blob/main/LICENSE-APACHE)
+
 
 Candle is a minimalist ML framework for Rust with a focus on performance (including GPU support)
 and ease of use. Try our online demos:
 [whisper](https://huggingface.co/spaces/lmz/candle-whisper),
 [LLaMA2](https://huggingface.co/spaces/lmz/candle-llama2),
-[T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm),
-[yolo](https://huggingface.co/spaces/lmz/candle-yolo),
-[Segment
-Anything](https://huggingface.co/spaces/radames/candle-segment-anything-wasm).
+[yolo](https://huggingface.co/spaces/lmz/candle-yolo).
 
 ## Get started
 
@@ -49,101 +45,40 @@ For more advanced examples, please have a look at the following section.
 
 ## Check out our examples
 
-These online demos run entirely in your browser:
-- [yolo](https://huggingface.co/spaces/lmz/candle-yolo): pose estimation and
-  object recognition.
-- [whisper](https://huggingface.co/spaces/lmz/candle-whisper): speech recognition.
-- [LLaMA2](https://huggingface.co/spaces/lmz/candle-llama2): text generation.
-- [T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm): text generation.
-- [Phi-1.5, and Phi-2](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm): text generation.
-- [Segment Anything Model](https://huggingface.co/spaces/radames/candle-segment-anything-wasm): Image segmentation.
-- [BLIP](https://huggingface.co/spaces/radames/Candle-BLIP-Image-Captioning): image captioning.
-
-We also provide a some command line based examples using state of the art models:
-
-- [LLaMA v1, v2, and v3](./candle-examples/examples/llama/): general LLM, includes
-  the SOLAR-10.7B variant.
-- [Falcon](./candle-examples/examples/falcon/): general LLM.
-- [Codegeex4](./candle-examples/examples/codegeex4-9b/): Code completion,code interpreter,web search,fuction calling,repository-level
-- [GLM4](./candle-examples/examples/glm4/): Open Multilingual Multimodal Chat LMs by THUDM
-- [Gemma v1 and v2](./candle-examples/examples/gemma/): 2b and 7b+/9b general LLMs from Google Deepmind.
-- [RecurrentGemma](./candle-examples/examples/recurrent-gemma/): 2b and 7b
-  Griffin based models from Google that mix attention with a RNN like state.
-- [Phi-1, Phi-1.5, Phi-2, and Phi-3](./candle-examples/examples/phi/): 1.3b,
-  2.7b, and 3.8b general LLMs with performance on par with 7b models.
-- [StableLM-3B-4E1T](./candle-examples/examples/stable-lm/): a 3b general LLM
-  pre-trained on 1T tokens of English and code datasets. Also supports
-  StableLM-2, a 1.6b LLM trained on 2T tokens, as well as the code variants.
-- [Mamba](./candle-examples/examples/mamba/): an inference only
-  implementation of the Mamba state space model.
-- [Mistral7b-v0.1](./candle-examples/examples/mistral/): a 7b general LLM with
-  better performance than all publicly available 13b models as of 2023-09-28.
-- [Mixtral8x7b-v0.1](./candle-examples/examples/mixtral/): a sparse mixture of
-  experts 8x7b general LLM with better performance than a Llama 2 70B model with
-  much faster inference.
-- [StarCoder](./candle-examples/examples/bigcode/) and
-  [StarCoder2](./candle-examples/examples/starcoder2/): LLM specialized to code generation.
-- [Qwen1.5](./candle-examples/examples/qwen/): Bilingual (English/Chinese) LLMs.
-- [RWKV v5 and v6](./candle-examples/examples/rwkv/): An RNN with transformer level LLM
-  performance.
-- [Replit-code-v1.5](./candle-examples/examples/replit-code/): a 3.3b LLM specialized for code completion.
-- [Yi-6B / Yi-34B](./candle-examples/examples/yi/): two bilingual
-  (English/Chinese) general LLMs with 6b and 34b parameters.
-- [Quantized LLaMA](./candle-examples/examples/quantized/): quantized version of
-  the LLaMA model using the same quantization techniques as
-  [llama.cpp](https://github.com/ggerganov/llama.cpp).
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/quantized/assets/aoc.gif" width="600">
-
-- [Stable Diffusion](./candle-examples/examples/stable-diffusion/): text to
-  image generative model, support for the 1.5, 2.1, SDXL 1.0 and Turbo versions.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg" width="200">
-
-- [Wuerstchen](./candle-examples/examples/wuerstchen/): another text to
-  image generative model.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/wuerstchen/assets/cat.jpg" width="200">
-
-- [yolo-v3](./candle-examples/examples/yolo-v3/) and
-  [yolo-v8](./candle-examples/examples/yolo-v8/): object detection and pose
-  estimation models.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/yolo-v8/assets/bike.od.jpg" width="200"><img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/yolo-v8/assets/bike.pose.jpg" width="200">
-- [segment-anything](./candle-examples/examples/segment-anything/): image
-  segmentation model with prompt.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/segment-anything/assets/sam_merged.jpg" width="200">
-
-- [SegFormer](./candle-examples/examples/segformer/): transformer based semantic segmentation model.
+Check out our [examples](./candle-examples/examples/):
 - [Whisper](./candle-examples/examples/whisper/): speech recognition model.
-- [EnCodec](./candle-examples/examples/encodec/): high-quality audio compression
-  model using residual vector quantization.
-- [MetaVoice](./candle-examples/examples/metavoice/): foundational model for
-  text-to-speech.
-- [Parler-TTS](./candle-examples/examples/parler-tts/): large text-to-speech
-  model.
-- [T5](./candle-examples/examples/t5), [Bert](./candle-examples/examples/bert/),
-  [JinaBert](./candle-examples/examples/jina-bert/) : useful for sentence embeddings.
+- [LLaMA and LLaMA-v2](./candle-examples/examples/llama/): general LLM.
+- [Falcon](./candle-examples/examples/falcon/): general LLM.
+- [Bert](./candle-examples/examples/bert/): useful for sentence embeddings.
+- [StarCoder](./candle-examples/examples/bigcode/): LLM specialized to code
+  generation.
+- [Stable Diffusion](./candle-examples/examples/stable-diffusion/): text to
+  image generative model, support for the 1.5, 2.1, and SDXL 1.0 versions.
 - [DINOv2](./candle-examples/examples/dinov2/): computer vision model trained
   using self-supervision (can be used for imagenet classification, depth
   evaluation, segmentation).
-- [VGG](./candle-examples/examples/vgg/),
-  [RepVGG](./candle-examples/examples/repvgg): computer vision models.
-- [BLIP](./candle-examples/examples/blip/): image to text model, can be used to
-  generate captions for an image.
-- [CLIP](./candle-examples/examples/clip/): multi-model vision and language
-  model.
-- [TrOCR](./candle-examples/examples/trocr/): a transformer OCR model, with
-  dedicated submodels for hand-writing and printed recognition.
-- [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation
-  model, generates the translated text from the input text.
-- [Moondream](./candle-examples/examples/moondream/): tiny computer-vision model
-  that can answer real-world questions about images.
-
-Run them using commands like:
+- [Quantized LLaMA](./candle-examples/examples/quantized/): quantized version of
+  the LLaMA model using the same quantization techniques as
+  [llama.cpp](https://github.com/ggerganov/llama.cpp).
+- [yolo-v3](./candle-examples/examples/yolo-v3/) and
+  [yolo-v8](./candle-examples/examples/yolo-v8/): object detection and pose
+  estimation models.
+  [segment-anything](./candle-examples/examples/segment-anything/): image
+  segmentation model with prompt.
+Run them using the following commands:
 ```
+cargo run --example whisper --release
+cargo run --example llama --release
+cargo run --example falcon --release
+cargo run --example bert --release
+cargo run --example bigcode --release
+cargo run --example stable-diffusion --release -- --prompt "a rusty robot holding a fire torch"
+cargo run --example dinov2 --release -- --image path/to/myinput.jpg
 cargo run --example quantized --release
+cargo run --example yolo-v3 --release -- myimage.jpg
+cargo run --example yolo-v8 --release -- myimage.jpg # for pose estimation, add --task pose
+cargo run --example segment-anything --release -- --image myimage.jpg
 ```
 
 In order to use **CUDA** add `--features cuda` to the example command line. If
@@ -153,10 +88,7 @@ There are also some wasm examples for whisper and
 [llama2.c](https://github.com/karpathy/llama2.c). You can either build them with
 `trunk` or try them online:
 [whisper](https://huggingface.co/spaces/lmz/candle-whisper),
-[llama2](https://huggingface.co/spaces/lmz/candle-llama2),
-[T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm),
-[Phi-1.5, and Phi-2](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm),
-[Segment Anything Model](https://huggingface.co/spaces/radames/candle-segment-anything-wasm).
+[llama2](https://huggingface.co/spaces/lmz/candle-llama2).
 
 For LLaMA2, run the following command to retrieve the weight files and start a
 test server:
@@ -169,32 +101,6 @@ trunk serve --release --port 8081
 And then head over to
 [http://localhost:8081/](http://localhost:8081/).
-
-<!--- ANCHOR: useful_libraries --->
-
-## Useful External Resources
-- [`candle-tutorial`](https://github.com/ToluClassics/candle-tutorial): A
-  very detailed tutorial showing how to convert a PyTorch model to Candle.
-- [`candle-lora`](https://github.com/EricLBuehler/candle-lora): Efficient and
-  ergonomic LoRA implementation for Candle. `candle-lora` has
-  out-of-the-box LoRA support for many models from Candle, which can be found
-  [here](https://github.com/EricLBuehler/candle-lora/tree/master/candle-lora-transformers/examples).
-- [`optimisers`](https://github.com/KGrewal1/optimisers): A collection of optimisers
-  including SGD with momentum, AdaGrad, AdaDelta, AdaMax, NAdam, RAdam, and RMSprop.
-- [`candle-vllm`](https://github.com/EricLBuehler/candle-vllm): Efficient platform for inference and
-  serving local LLMs including an OpenAI compatible API server.
-- [`candle-ext`](https://github.com/mokeyish/candle-ext): An extension library to Candle that provides PyTorch functions not currently available in Candle.
-- [`candle-coursera-ml`](https://github.com/vishpat/candle-coursera-ml): Implementation of ML algorithms from Coursera's [Machine Learning Specialization](https://www.coursera.org/specializations/machine-learning-introduction) course.
-- [`kalosm`](https://github.com/floneum/floneum/tree/master/interfaces/kalosm): A multi-modal meta-framework in Rust for interfacing with local pre-trained models with support for controlled generation, custom samplers, in-memory vector databases, audio transcription, and more.
-- [`candle-sampling`](https://github.com/EricLBuehler/candle-sampling): Sampling techniques for Candle.
-- [`gpt-from-scratch-rs`](https://github.com/jeroenvlek/gpt-from-scratch-rs): A port of Andrej Karpathy's _Let's build GPT_ tutorial on YouTube showcasing the Candle API on a toy problem.
-- [`candle-einops`](https://github.com/tomsanbear/candle-einops): A pure rust implementation of the python [einops](https://github.com/arogozhnikov/einops) library.
-- [`atoma-infer`](https://github.com/atoma-network/atoma-infer): A Rust library for fast inference at scale, leveraging FlashAttention2 for efficient attention computation, PagedAttention for efficient KV-cache memory management, and multi-GPU support. It is OpenAI api compatible.
-- [`llms-from-scratch-rs`](https://github.com/nerdai/llms-from-scratch-rs): A comprehensive Rust translation of the code from Sebastian Raschka's Build an LLM from Scratch book.
-
-If you have an addition to this list, please submit a pull request.
-
-<!--- ANCHOR_END: useful_libraries --->
-
 <!--- ANCHOR: features --->
 
 ## Features
@@ -207,47 +113,10 @@ If you have an addition to this list, please submit a pull request.
 - CUDA backend for efficiently running on GPUs, multiple GPU distribution via NCCL.
 - WASM support, run your models in a browser.
 - Included models.
-    - Language Models.
-        - LLaMA v1, v2, and v3 with variants such as SOLAR-10.7B.
-        - Falcon.
-        - StarCoder, StarCoder2.
-        - Phi 1, 1.5, 2, and 3.
-        - Mamba, Minimal Mamba
-        - Gemma v1 2b and 7b+, v2 2b and 9b.
-        - Mistral 7b v0.1.
-        - Mixtral 8x7b v0.1.
-        - StableLM-3B-4E1T, StableLM-2-1.6B, Stable-Code-3B.
-        - Replit-code-v1.5-3B.
-        - Bert.
-        - Yi-6B and Yi-34B.
-        - Qwen1.5, Qwen1.5 MoE.
-        - RWKV v5 and v6.
-    - Quantized LLMs.
-        - Llama 7b, 13b, 70b, as well as the chat and code variants.
-        - Mistral 7b, and 7b instruct.
-        - Mixtral 8x7b.
-        - Zephyr 7b a and b (Mistral-7b based).
-        - OpenChat 3.5 (Mistral-7b based).
-    - Text to text.
-        - T5 and its variants: FlanT5, UL2, MADLAD400 (translation), CoEdit (Grammar correction).
-        - Marian MT (Machine Translation).
-    - Text to image.
-        - Stable Diffusion v1.5, v2.1, XL v1.0.
-        - Wurstchen v2.
-    - Image to text.
-        - BLIP.
-        - TrOCR.
-    - Audio.
-        - Whisper, multi-lingual speech-to-text.
-        - EnCodec, audio compression model.
-        - MetaVoice-1B, text-to-speech model.
-        - Parler-TTS, text-to-speech model.
-    - Computer Vision Models.
-        - DINOv2, ConvMixer, EfficientNet, ResNet, ViT, VGG, RepVGG, ConvNeXT,
-          ConvNeXTv2, MobileOne, EfficientVit (MSRA), MobileNetv4, Hiera, FastViT.
-        - yolo-v3, yolo-v8.
-        - Segment-Anything Model (SAM).
-        - SegFormer.
+    - LLMs: LLaMA v1 and v2, Falcon, StarCoder.
+    - Whisper (multi-lingual support).
+    - Stable Diffusion.
+    - Computer Vision: DINOv2, EfficientNet, yolo-v3, yolo-v8.
 - File formats: load models from safetensors, npz, ggml, or PyTorch files.
 - Serverless (on CPU), small and fast deployments.
 - Quantization support using the llama.cpp quantized types.
@@ -284,7 +153,6 @@ Cheatsheet:
 - [candle-datasets](./candle-datasets/): Datasets and data loaders.
 - [candle-transformers](./candle-transformers): transformers-related utilities.
 - [candle-flash-attn](./candle-flash-attn): Flash attention v2 layer.
-- [candle-onnx](./candle-onnx/): ONNX model evaluation.
 
 ## FAQ
 
@@ -384,42 +252,12 @@ git submodule update --init
 /usr/include/c++/11/bits/std_function.h:530:146: error: parameter packs not expanded with ‘...’:
 ```
 
-This is a bug in gcc-11 triggered by the Cuda compiler. To fix this, install a different, supported gcc version - for example gcc-10, and specify the path to the compiler in the NVCC_CCBIN environment variable.
+This is a bug in gcc-11 triggered by the Cuda compiler. To fix this, install a different, supported gcc version - for example gcc-10, and specify the path to the compiler in the CANDLE_NVCC_CCBIN environment variable.
 ```
-env NVCC_CCBIN=/usr/lib/gcc/x86_64-linux-gnu/10 cargo ...
+env CANDLE_NVCC_CCBIN=/usr/lib/gcc/x86_64-linux-gnu/10 cargo ...
 ```
 
-#### Linking error on windows when running rustdoc or mdbook tests
-
-```
-Couldn't compile the test.
----- .\candle-book\src\inference\hub.md - Using_the_hub::Using_in_a_real_model_ (line 50) stdout ----
-error: linking with `link.exe` failed: exit code: 1181
-//very long chain of linking
-= note: LINK : fatal error LNK1181: cannot open input file 'windows.0.48.5.lib'
-```
-
-Make sure you link all native libraries that might be located outside a project target, e.g., to run mdbook tests, you should run:
-
-```
-mdbook test candle-book -L .\target\debug\deps\ `
--L native=$env:USERPROFILE\.cargo\registry\src\index.crates.io-6f17d22bba15001f\windows_x86_64_msvc-0.42.2\lib `
--L native=$env:USERPROFILE\.cargo\registry\src\index.crates.io-6f17d22bba15001f\windows_x86_64_msvc-0.48.5\lib
-```
-
-#### Extremely slow model load time with WSL
-
-This may be caused by the models being loaded from `/mnt/c`, more details on
-[stackoverflow](https://stackoverflow.com/questions/68972448/why-is-wsl-extremely-slow-when-compared-with-native-windows-npm-yarn-processing).
-
 #### Tracking down errors
 
 You can set `RUST_BACKTRACE=1` to be provided with backtraces when a candle
 error is generated.
-
-#### CudaRC error
-
-If you encounter an error like this one `called `Result::unwrap()` on an `Err` value: LoadLibraryExW { source: Os { code: 126, kind: Uncategorized, message: "The specified module could not be found." } }` on windows. To fix copy and rename these 3 files (make sure they are in path). The paths depend on your cuda version.
-`c:\Windows\System32\nvcuda.dll` -> `cuda.dll`
-`c:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\cublas64_12.dll` -> `cublas.dll`
-`c:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\curand64_10.dll` -> `curand.dll`

@@ -11,11 +11,11 @@ readme = "README.md"
 
 [dependencies]
 accelerate-src = { workspace = true, optional = true }
-candle = { workspace = true }
-candle-datasets = { workspace = true }
-candle-nn = { workspace = true }
-candle-transformers = { workspace = true }
-candle-flash-attn = { workspace = true, optional = true }
+candle = { path = "../candle-core", version = "0.2.1", package = "candle-core" }
+candle-datasets = { path = "../candle-datasets", version = "0.2.1" }
+candle-nn = { path = "../candle-nn", version = "0.2.1" }
+candle-transformers = { path = "../candle-transformers", version = "0.2.1" }
+candle-flash-attn = { path = "../candle-flash-attn", version = "0.2.1", optional = true }
 safetensors = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
@@ -24,10 +24,9 @@ intel-mkl-src = { workspace = true, optional = true }
 cudarc = { workspace = true, optional = true }
 half = { workspace = true, optional = true }
 image = { workspace = true, optional = true }
-anyhow = { workspace = true }
-tokio = "1.43.0"
 
 [dev-dependencies]
+anyhow = { workspace = true }
 byteorder = { workspace = true }
 hf-hub = { workspace = true, features=["tokio"]}
 clap = { workspace = true }
@@ -37,7 +36,9 @@ tokenizers = { workspace = true, features = ["onig"] }
 tracing = { workspace = true }
 tracing-chrome = { workspace = true }
 tracing-subscriber = { workspace = true }
+wav = { workspace = true }
 # Necessary to disambiguate with tokio in wasm examples which are 1.28.1
+tokio = "1.29.1"
 parquet = { workspace = true }
 image = { workspace = true }
 

@@ -10,11 +10,10 @@
 
 # Reference Guide
 
-- [Running a model](inference/inference.md)
+- [Running a model](inference/README.md)
 - [Using the hub](inference/hub.md)
 - [Error management](error_manage.md)
-- [Training](training/training.md)
-    - [Simplified](training/simplified.md)
+- [Training](training/README.md)
     - [MNIST](training/mnist.md)
     - [Fine-tuning]()
     - [Serialization]()

@@ -29,7 +29,7 @@ After adding `RUST_BACKTRACE=1`:
 Error: WithBacktrace { inner: ShapeMismatchBinaryOp { lhs: [1, 784], rhs: [1, 784], op: "matmul" }, backtrace: Backtrace [{ fn: "candle::error::Error::bt", file: "/home/nicolas/.cargo/git/checkouts/candle-5bb8ef7e0626d693/f291065/candle-core/src/error.rs", line: 200 }, { fn: "candle::tensor::Tensor::matmul", file: "/home/nicolas/.cargo/git/checkouts/candle-5bb8ef7e0626d693/f291065/candle-core/src/tensor.rs", line: 816 }, { fn: "myapp::main", file: "./src/main.rs", line: 29 }, { fn: "core::ops::function::FnOnce::call_once", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs", line: 250 }, { fn: "std::sys_common::backtrace::__rust_begin_short_backtrace", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/sys_common/backtrace.rs", line: 135 }, { fn: "std::rt::lang_start::{{closure}}", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 166 }, { fn: "core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs", line: 284 }, { fn: "std::panicking::try::do_call", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 500 }, { fn: "std::panicking::try", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 464 }, { fn: "std::panic::catch_unwind", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panic.rs", line: 142 }, { fn: "std::rt::lang_start_internal::{{closure}}", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 148 }, { fn: "std::panicking::try::do_call", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 500 }, { fn: "std::panicking::try", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 464 }, { fn: "std::panic::catch_unwind", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panic.rs", line: 142 }, { fn: "std::rt::lang_start_internal", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 148 }, { fn: "std::rt::lang_start", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 165 }, { fn: "main" }, { fn: "__libc_start_main" }, { fn: "_start" }] }
 ```
 
-Not super pretty at the moment, but we can see error occurred on `{ fn: "myapp::main", file: "./src/main.rs", line: 29 }`
+Not super pretty at the moment, but we can see error occured on `{ fn: "myapp::main", file: "./src/main.rs", line: 29 }`
 
 
 Another thing to note, is that since Rust is compiled it is not necessarily as easy to recover proper stacktraces

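For reference, a minimal sketch that reproduces the `ShapeMismatchBinaryOp` error quoted in this hunk. It is not part of the diff; it assumes the `candle_core` API that appears in the guide hunks below:

```rust
// Hypothetical reproduction: matmul requires the inner dimensions to agree,
// so a (1, 784) x (1, 784) product fails exactly as in the backtrace above.
// Run with RUST_BACKTRACE=1 to get the full trace.
use candle_core::{DType, Device, Result, Tensor};

fn main() -> Result<()> {
    let device = Device::Cpu;
    let lhs = Tensor::zeros((1, 784), DType::F32, &device)?;
    let rhs = Tensor::zeros((1, 784), DType::F32, &device)?;
    let _digit = lhs.matmul(&rhs)?; // errors: lhs has 784 columns, rhs has 1 row
    Ok(())
}
```
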
@@ -6,7 +6,7 @@ Open `src/main.rs` and fill in this content:
 
 ```rust
 # extern crate candle_core;
-use candle_core::{Device, Result, Tensor};
+use candle_core::{DType, Device, Result, Tensor};
 
 struct Model {
     first: Tensor,
@@ -25,11 +25,11 @@ fn main() -> Result<()> {
     // Use Device::new_cuda(0)?; to use the GPU.
     let device = Device::Cpu;
 
-    let first = Tensor::randn(0f32, 1.0, (784, 100), &device)?;
-    let second = Tensor::randn(0f32, 1.0, (100, 10), &device)?;
+    let first = Tensor::zeros((784, 100), DType::F32, &device)?;
+    let second = Tensor::zeros((100, 10), DType::F32, &device)?;
     let model = Model { first, second };
 
-    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
+    let dummy_image = Tensor::zeros((1, 784), DType::F32, &device)?;
 
     let digit = model.forward(&dummy_image)?;
     println!("Digit {digit:?} digit");
@@ -50,7 +50,7 @@ the classical `Linear` layer. We can do as such
 
 ```rust
 # extern crate candle_core;
-# use candle_core::{Device, Result, Tensor};
+# use candle_core::{DType, Device, Result, Tensor};
 struct Linear{
     weight: Tensor,
     bias: Tensor,
@@ -80,7 +80,7 @@ This will change the model running code into a new function
 
 ```rust
 # extern crate candle_core;
-# use candle_core::{Device, Result, Tensor};
+# use candle_core::{DType, Device, Result, Tensor};
 # struct Linear{
 #     weight: Tensor,
 #     bias: Tensor,
@@ -110,15 +110,15 @@ fn main() -> Result<()> {
     let device = Device::cuda_if_available(0)?;
 
     // Creating a dummy model
-    let weight = Tensor::randn(0f32, 1.0, (784, 100), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (100, ), &device)?;
+    let weight = Tensor::zeros((784, 100), DType::F32, &device)?;
+    let bias = Tensor::zeros((100, ), DType::F32, &device)?;
     let first = Linear{weight, bias};
-    let weight = Tensor::randn(0f32, 1.0, (100, 10), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (10, ), &device)?;
+    let weight = Tensor::zeros((100, 10), DType::F32, &device)?;
+    let bias = Tensor::zeros((10, ), DType::F32, &device)?;
     let second = Linear{weight, bias};
     let model = Model { first, second };
 
-    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
+    let dummy_image = Tensor::zeros((1, 784), DType::F32, &device)?;
 
     // Inference on the model
     let digit = model.forward(&dummy_image)?;
@@ -146,7 +146,7 @@ And rewrite our examples using it
 ```rust
 # extern crate candle_core;
 # extern crate candle_nn;
-use candle_core::{Device, Result, Tensor};
+use candle_core::{DType, Device, Result, Tensor};
 use candle_nn::{Linear, Module};
 
 struct Model {
@@ -167,15 +167,15 @@ fn main() -> Result<()> {
     let device = Device::Cpu;
 
     // This has changed (784, 100) -> (100, 784) !
-    let weight = Tensor::randn(0f32, 1.0, (100, 784), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (100, ), &device)?;
+    let weight = Tensor::zeros((100, 784), DType::F32, &device)?;
+    let bias = Tensor::zeros((100, ), DType::F32, &device)?;
     let first = Linear::new(weight, Some(bias));
-    let weight = Tensor::randn(0f32, 1.0, (10, 100), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (10, ), &device)?;
+    let weight = Tensor::zeros((10, 100), DType::F32, &device)?;
+    let bias = Tensor::zeros((10, ), DType::F32, &device)?;
     let second = Linear::new(weight, Some(bias));
     let model = Model { first, second };
 
-    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
+    let dummy_image = Tensor::zeros((1, 784), DType::F32, &device)?;
 
     let digit = model.forward(&dummy_image)?;
     println!("Digit {digit:?} digit");
@@ -188,8 +188,8 @@ Feel free to modify this example to use `Conv2d` to create a classical convnet instead.
 
 Now that we have the running dummy code we can get to more advanced topics:
 
-- [For PyTorch users](../guide/cheatsheet.md)
-- [Running existing models](../inference/inference.md)
-- [Training models](../training/training.md)
+- [For PyTorch users](./guide/cheatsheet.md)
+- [Running existing models](./inference/README.md)
+- [Training models](./training/README.md)
 
 

|
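The `(784, 100) -> (100, 784)` comment above follows from `candle_nn::Linear` storing its weight as `(out_features, in_features)` and transposing it inside `forward`. A minimal sketch of that convention (this snippet is our addition, not part of the book):

```rust
# extern crate candle_core;
# extern crate candle_nn;
use candle_core::{DType, Device, Result, Tensor};
use candle_nn::{Linear, Module};

fn main() -> Result<()> {
    let device = Device::Cpu;
    // (out_features, in_features) = (100, 784): maps a 784-dim input to 100 dims.
    let weight = Tensor::zeros((100, 784), DType::F32, &device)?;
    let layer = Linear::new(weight, None); // no bias, for brevity
    let x = Tensor::zeros((1, 784), DType::F32, &device)?;
    assert_eq!(layer.forward(&x)?.dims(), &[1, 100]);
    Ok(())
}
```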
@ -12,9 +12,6 @@ compute_cap
8.9
```

- You can also compile the Cuda kernels for a specific compute cap using the
- `CUDA_COMPUTE_CAP=<compute cap>` environment variable.
-
If any of the above commands errors out, please make sure to update your Cuda version.

2. Create a new app and add [`candle-core`](https://github.com/huggingface/candle/tree/main/candle-core) with Cuda support.
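For the compute-cap variable removed above, a typical invocation might look like the following (the integer form of the value and the `cuda` feature flag are assumptions about a standard candle setup, not taken from this diff):

```bash
# Compile the CUDA kernels for compute capability 8.9 only.
CUDA_COMPUTE_CAP=89 cargo build --release --features cuda
```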
@ -1,6 +1,3 @@
- #[cfg(test)]
- pub mod simplified;
-
#[cfg(test)]
mod tests {
    use anyhow::Result;

@ -28,7 +25,6 @@ let weights = candle::safetensors::load(weights_filename, &Device::Cpu).unwrap()
    #[rustfmt::skip]
    #[test]
    fn book_hub_2() {
-         {
        // ANCHOR: book_hub_2
        use candle::Device;
        use hf_hub::api::sync::Api;

@ -46,10 +42,9 @@ let weights = candle::safetensors::load_buffer(&mmap[..], &Device::Cpu).unwrap()
        assert_eq!(weights.len(), 206);
    }

-     // #[rustfmt::skip]
+     #[rustfmt::skip]
-     // #[test]
+     #[test]
-     // fn book_hub_3() {
+     fn book_hub_3() {
-         {
        // ANCHOR: book_hub_3
        use candle::{DType, Device, Tensor};
        use hf_hub::api::sync::Api;

@ -81,7 +76,7 @@ let mut tp_shape = view.shape().to_vec();
        let size = tp_shape[0];

        if size % world_size != 0 {
-             panic!("The dimension is not divisible by `world_size`");
+             panic!("The dimension is not divisble by `world_size`");
        }
        let block_size = size / world_size;
        let start = rank * block_size;

@ -104,10 +99,9 @@ let tp_tensor = Tensor::from_raw_buffer(&raw, dtype, &tp_shape, &Device::Cpu).un
        assert_eq!(view.shape(), &[768, 768]);
        assert_eq!(tp_tensor.dims(), &[192, 768]);
    }
-     }

-     #[allow(unused)]
    #[rustfmt::skip]
+     #[test]
    fn book_training_1() -> Result<()> {
        // ANCHOR: book_training_1
        use hf_hub::{api::sync::Api, Repo, RepoType};
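The `book_hub_3` hunk above shards dimension 0 of a tensor across ranks. A self-contained sketch of the same arithmetic (the helper name and the sample `rank`/`world_size` values are ours, for illustration):

```rust
/// Half-open row range owned by `rank` when a dimension of length `size`
/// is split evenly across `world_size` workers, as in the test above.
fn shard_range(size: usize, rank: usize, world_size: usize) -> (usize, usize) {
    assert!(size % world_size == 0, "The dimension is not divisible by `world_size`");
    let block_size = size / world_size;
    let start = rank * block_size;
    (start, start + block_size)
}

fn main() {
    // size = 768 split over world_size = 4 gives 192 rows per rank, which is
    // exactly why the test asserts tp_tensor.dims() == [192, 768].
    assert_eq!(shard_range(768, 0, 4), (0, 192));
    assert_eq!(shard_range(768, 3, 4), (576, 768));
}
```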
@ -1,196 +0,0 @@
//! # A simplified example in Rust of training a neural network and then using it, based on the Candle framework by Hugging Face.
//! Author: Evgeny Igumnov 2023 igumnovnsk@gmail.com
//! This program implements a neural network to predict the winner of the second round of elections based on the results of the first round.
//!
//! ## Key points:
//!
//! A multilayer perceptron with two hidden layers is used. The first hidden layer has 4 neurons, the second has 2 neurons.
//! The input is a vector of 2 numbers - the percentage of votes for the first and second candidates in the first stage.
//! The output is the number 0 or 1, where 1 means that the first candidate will win in the second stage, 0 means that he will lose.
//! For training, samples with real data on the results of the first and second stages of different elections are used.
//! The model is trained by backpropagation using gradient descent and the cross-entropy loss function.
//! Model parameters (weights of neurons) are initialized randomly, then optimized during training.
//! After training, the model is tested on a held-out sample to evaluate the accuracy.
//! If the accuracy on the test set is below 100%, the model is considered underfit and the learning process is repeated.
//! Thus, this neural network learns to find hidden relationships between the results of the first and second rounds of voting in order to make predictions for new data.

#[rustfmt::skip]
mod tests {

    use candle::{DType, Result, Tensor, D, Device};
    use candle_nn::{loss, ops, Linear, Module, VarBuilder, VarMap, Optimizer};

    // ANCHOR: book_training_simplified1
    const VOTE_DIM: usize = 2;
    const RESULTS: usize = 1;
    const EPOCHS: usize = 10;
    const LAYER1_OUT_SIZE: usize = 4;
    const LAYER2_OUT_SIZE: usize = 2;
    const LEARNING_RATE: f64 = 0.05;

    #[derive(Clone)]
    pub struct Dataset {
        pub train_votes: Tensor,
        pub train_results: Tensor,
        pub test_votes: Tensor,
        pub test_results: Tensor,
    }

    struct MultiLevelPerceptron {
        ln1: Linear,
        ln2: Linear,
        ln3: Linear,
    }

    impl MultiLevelPerceptron {
        fn new(vs: VarBuilder) -> Result<Self> {
            let ln1 = candle_nn::linear(VOTE_DIM, LAYER1_OUT_SIZE, vs.pp("ln1"))?;
            let ln2 = candle_nn::linear(LAYER1_OUT_SIZE, LAYER2_OUT_SIZE, vs.pp("ln2"))?;
            let ln3 = candle_nn::linear(LAYER2_OUT_SIZE, RESULTS + 1, vs.pp("ln3"))?;
            Ok(Self { ln1, ln2, ln3 })
        }

        fn forward(&self, xs: &Tensor) -> Result<Tensor> {
            let xs = self.ln1.forward(xs)?;
            let xs = xs.relu()?;
            let xs = self.ln2.forward(&xs)?;
            let xs = xs.relu()?;
            self.ln3.forward(&xs)
        }
    }

    // ANCHOR_END: book_training_simplified1

    // ANCHOR: book_training_simplified3
    #[tokio::test]
    async fn simplified() -> anyhow::Result<()> {

        let dev = Device::cuda_if_available(0)?;

        let train_votes_vec: Vec<u32> = vec![
            15, 10,
            10, 15,
            5, 12,
            30, 20,
            16, 12,
            13, 25,
            6, 14,
            31, 21,
        ];
        let train_votes_tensor = Tensor::from_vec(train_votes_vec.clone(), (train_votes_vec.len() / VOTE_DIM, VOTE_DIM), &dev)?.to_dtype(DType::F32)?;

        let train_results_vec: Vec<u32> = vec![
            1,
            0,
            0,
            1,
            1,
            0,
            0,
            1,
        ];
        let train_results_tensor = Tensor::from_vec(train_results_vec, train_votes_vec.len() / VOTE_DIM, &dev)?;

        let test_votes_vec: Vec<u32> = vec![
            13, 9,
            8, 14,
            3, 10,
        ];
        let test_votes_tensor = Tensor::from_vec(test_votes_vec.clone(), (test_votes_vec.len() / VOTE_DIM, VOTE_DIM), &dev)?.to_dtype(DType::F32)?;

        let test_results_vec: Vec<u32> = vec![
            1,
            0,
            0,
        ];
        let test_results_tensor = Tensor::from_vec(test_results_vec.clone(), test_results_vec.len(), &dev)?;

        let m = Dataset {
            train_votes: train_votes_tensor,
            train_results: train_results_tensor,
            test_votes: test_votes_tensor,
            test_results: test_results_tensor,
        };

        let trained_model: MultiLevelPerceptron;
        loop {
            println!("Trying to train neural network.");
            match train(m.clone(), &dev) {
                Ok(model) => {
                    trained_model = model;
                    break;
                },
                Err(e) => {
                    println!("Error: {}", e);
                    continue;
                }
            }
        }

        let real_world_votes: Vec<u32> = vec![
            13, 22,
        ];

        let tensor_test_votes = Tensor::from_vec(real_world_votes.clone(), (1, VOTE_DIM), &dev)?.to_dtype(DType::F32)?;

        let final_result = trained_model.forward(&tensor_test_votes)?;

        let result = final_result
            .argmax(D::Minus1)?
            .to_dtype(DType::F32)?
            .get(0).map(|x| x.to_scalar::<f32>())??;
        println!("real_life_votes: {:?}", real_world_votes);
        println!("neural_network_prediction_result: {:?}", result);

        Ok(())
    }
    // ANCHOR_END: book_training_simplified3

    // ANCHOR: book_training_simplified2
    fn train(m: Dataset, dev: &Device) -> anyhow::Result<MultiLevelPerceptron> {
        let train_results = m.train_results.to_device(dev)?;
        let train_votes = m.train_votes.to_device(dev)?;
        let varmap = VarMap::new();
        let vs = VarBuilder::from_varmap(&varmap, DType::F32, dev);
        let model = MultiLevelPerceptron::new(vs.clone())?;
        let mut sgd = candle_nn::SGD::new(varmap.all_vars(), LEARNING_RATE)?;
        let test_votes = m.test_votes.to_device(dev)?;
        let test_results = m.test_results.to_device(dev)?;
        let mut final_accuracy: f32 = 0.0;
        for epoch in 1..EPOCHS + 1 {
            let logits = model.forward(&train_votes)?;
            let log_sm = ops::log_softmax(&logits, D::Minus1)?;
            let loss = loss::nll(&log_sm, &train_results)?;
            sgd.backward_step(&loss)?;

            let test_logits = model.forward(&test_votes)?;
            let sum_ok = test_logits
                .argmax(D::Minus1)?
                .eq(&test_results)?
                .to_dtype(DType::F32)?
                .sum_all()?
                .to_scalar::<f32>()?;
            let test_accuracy = sum_ok / test_results.dims1()? as f32;
            final_accuracy = 100. * test_accuracy;
            println!("Epoch: {epoch:3} Train loss: {:8.5} Test accuracy: {:5.2}%",
                loss.to_scalar::<f32>()?,
                final_accuracy
            );
            if final_accuracy == 100.0 {
                break;
            }
        }
        if final_accuracy < 100.0 {
            Err(anyhow::Error::msg("The model is not trained well enough."))
        } else {
            Ok(model)
        }
    }
    // ANCHOR_END: book_training_simplified2

}
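For reference (our note, not part of the removed file), the `log_softmax` followed by `loss::nll` in `train` amounts to the usual cross-entropy over the N training rows, with logits z_i and labels y_i:

```latex
\mathcal{L} = -\frac{1}{N}\sum_{i=1}^{N} \log\big(\mathrm{softmax}(z_i)\big)_{y_i}
```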
@ -1,45 +0,0 @@
# Simplified

## How it works

This program implements a neural network to predict the winner of the second round of elections based on the results of the first round.

Key points:

1. A multilayer perceptron with two hidden layers is used. The first hidden layer has 4 neurons, the second has 2 neurons.
2. The input is a vector of 2 numbers - the percentage of votes for the first and second candidates in the first stage.
3. The output is the number 0 or 1, where 1 means that the first candidate will win in the second stage, 0 means that he will lose.
4. For training, samples with real data on the results of the first and second stages of different elections are used.
5. The model is trained by backpropagation using gradient descent and the cross-entropy loss function.
6. Model parameters (weights of neurons) are initialized randomly, then optimized during training.
7. After training, the model is tested on a held-out sample to evaluate the accuracy.
8. If the accuracy on the test set is below 100%, the model is considered underfit and the learning process is repeated.

Thus, this neural network learns to find hidden relationships between the results of the first and second rounds of voting in order to make predictions for new data.

```rust,ignore
{{#include ../simplified.rs:book_training_simplified1}}
```

```rust,ignore
{{#include ../simplified.rs:book_training_simplified2}}
```

```rust,ignore
{{#include ../simplified.rs:book_training_simplified3}}
```

## Example output

```bash
Trying to train neural network.
Epoch:   1 Train loss:  4.42555 Test accuracy:  0.00%
Epoch:   2 Train loss:  0.84677 Test accuracy: 33.33%
Epoch:   3 Train loss:  2.54335 Test accuracy: 33.33%
Epoch:   4 Train loss:  0.37806 Test accuracy: 33.33%
Epoch:   5 Train loss:  0.36647 Test accuracy: 100.00%
real_life_votes: [13, 22]
neural_network_prediction_result: 0.0
```
@ -12,9 +12,7 @@ readme = "README.md"
[dependencies]
accelerate-src = { workspace = true, optional = true }
byteorder = { workspace = true }
- candle-kernels = { workspace = true, optional = true }
+ candle-kernels = { path = "../candle-kernels", version = "0.2.1", optional = true }
- candle-metal-kernels = { workspace = true, optional = true }
- metal = { workspace = true, optional = true }
cudarc = { workspace = true, optional = true }
gemm = { workspace = true }
half = { workspace = true }

@ -28,35 +26,15 @@ rand_distr = { workspace = true }
rayon = { workspace = true }
safetensors = { workspace = true }
thiserror = { workspace = true }
- ug-cuda = { workspace = true, optional = true }
- ug-metal = { workspace = true, optional = true }
- yoke = { workspace = true }
zip = { workspace = true }

- [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
- ug = { workspace = true }
-
[dev-dependencies]
anyhow = { workspace = true }
clap = { workspace = true }
- criterion = { workspace = true }

[features]
default = []
- cuda = ["cudarc", "dep:candle-kernels", "dep:ug-cuda"]
+ cuda = ["cudarc", "dep:candle-kernels"]
cudnn = ["cuda", "cudarc/cudnn"]
mkl = ["dep:libc", "dep:intel-mkl-src"]
accelerate = ["dep:libc", "dep:accelerate-src"]
- metal = ["dep:metal", "dep:candle-metal-kernels", "dep:ug-metal"]
-
- [[bench]]
- name = "bench_main"
- harness = false
-
- [[example]]
- name = "metal_basics"
- required-features = ["metal"]
-
- [[example]]
- name = "cuda_basics"
- required-features = ["cuda"]
@ -1,14 +0,0 @@
mod benchmarks;

use criterion::criterion_main;

criterion_main!(
    benchmarks::affine::benches,
    benchmarks::matmul::benches,
    benchmarks::random::benches,
    benchmarks::reduce::benches,
    benchmarks::where_cond::benches,
    benchmarks::conv_transpose2d::benches,
    benchmarks::qmatmul::benches,
    benchmarks::unary::benches
);
@ -1,43 +0,0 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use std::time::Instant;

fn run(a: &Tensor) {
    a.affine(12.34, 56.78).unwrap();
}

fn run_affine_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
    let b = 1;
    let m = 1024;
    let k = 1024;

    let tensor = Tensor::zeros((b, m, k), dtype, device).unwrap();

    let flops = b * m * k * dtype.size_in_bytes();

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                run(black_box(&tensor));
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        run_affine_benchmark(c, &device, DType::F32, "affine_f32");
        run_affine_benchmark(c, &device, DType::F16, "affine_f16");
        run_affine_benchmark(c, &device, DType::BF16, "affine_bf16");
    }
}

criterion_group!(benches, criterion_benchmark);
@ -1,59 +0,0 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use std::time::Instant;

fn run(
    x: &Tensor,
    k: &Tensor,
    padding: usize,
    output_padding: usize,
    stride: usize,
    dilation: usize,
) {
    x.conv_transpose2d(k, padding, output_padding, stride, dilation)
        .unwrap();
}

fn run_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
    let t = Tensor::arange(0.0f32, 10000.0, device)
        .unwrap()
        .reshape((1, 4, 50, 50))
        .unwrap()
        .to_dtype(dtype)
        .unwrap();

    let kernel = Tensor::arange(0.0f32, 100.0, device)
        .unwrap()
        .reshape((4, 1, 5, 5))
        .unwrap()
        .to_dtype(dtype)
        .unwrap();

    let flops = t.dims().iter().product::<usize>() * dtype.size_in_bytes();

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                run(black_box(&t), black_box(&kernel), 1, 0, 1, 2);
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        run_benchmark(c, &device, DType::F32, "conv_transpose2d_f32");
        run_benchmark(c, &device, DType::F16, "conv_transpose2d_f16");
        run_benchmark(c, &device, DType::BF16, "conv_transpose2d_bf16");
    }
}

criterion_group!(benches, criterion_benchmark);
@ -1,44 +0,0 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use std::time::Instant;

fn run(a: &Tensor, b: &Tensor) {
    a.matmul(&b.t().unwrap()).unwrap();
}

fn run_bench(c: &mut Criterion, device: &Device) {
    let b = 1;
    let m = 1;
    let n = 2048;
    let k = 2048;

    let dtype = DType::F32;
    let lhs = Tensor::zeros((b, m, k), dtype, device).unwrap();
    let rhs = Tensor::zeros((b, n, k), dtype, device).unwrap();

    let flops = b * m * n * k;

    let mut group = c.benchmark_group(device.bench_name("matmul"));
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                run(black_box(&lhs), black_box(&rhs));
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        run_bench(c, &device);
    }
}

criterion_group!(benches, criterion_benchmark);
@ -1,72 +0,0 @@
pub(crate) mod affine;
pub(crate) mod conv_transpose2d;
pub(crate) mod matmul;
pub(crate) mod qmatmul;
pub(crate) mod random;
pub(crate) mod reduce;
pub(crate) mod unary;
pub(crate) mod where_cond;

use candle_core::{Device, Result};

pub(crate) trait BenchDevice {
    fn sync(&self) -> Result<()>;

    fn bench_name<S: Into<String>>(&self, name: S) -> String;
}

impl BenchDevice for Device {
    fn sync(&self) -> Result<()> {
        match self {
            Device::Cpu => Ok(()),
            Device::Cuda(device) => {
                #[cfg(feature = "cuda")]
                return Ok(device
                    .synchronize()
                    .map_err(|e| candle_core::Error::Cuda(Box::new(e)))?);
                #[cfg(not(feature = "cuda"))]
                panic!("Cuda device without cuda feature enabled: {:?}", device)
            }
            Device::Metal(device) => {
                #[cfg(feature = "metal")]
                return Ok(device.wait_until_completed()?);
                #[cfg(not(feature = "metal"))]
                panic!("Metal device without metal feature enabled: {:?}", device)
            }
        }
    }

    fn bench_name<S: Into<String>>(&self, name: S) -> String {
        match self {
            Device::Cpu => {
                let cpu_type = if cfg!(feature = "accelerate") {
                    "accelerate"
                } else if cfg!(feature = "mkl") {
                    "mkl"
                } else {
                    "cpu"
                };
                format!("{}_{}", cpu_type, name.into())
            }
            Device::Cuda(_) => format!("cuda_{}", name.into()),
            Device::Metal(_) => format!("metal_{}", name.into()),
        }
    }
}

struct BenchDeviceHandler {
    devices: Vec<Device>,
}

impl BenchDeviceHandler {
    pub fn new() -> Result<Self> {
        let mut devices = Vec::new();
        if cfg!(feature = "metal") {
            devices.push(Device::new_metal(0)?);
        } else if cfg!(feature = "cuda") {
            devices.push(Device::new_cuda(0)?);
        }
        devices.push(Device::Cpu);
        Ok(Self { devices })
    }
}
@ -1,72 +0,0 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{
    quantized::{self, GgmlDType, QMatMul},
    Device, Module, Tensor,
};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use std::time::Instant;

fn run(matmul: &QMatMul, x: &Tensor) {
    matmul.forward(x).unwrap();
}

fn run_bench(c: &mut Criterion, device: &Device, dtype: GgmlDType) {
    let b = 1;
    let m = 1;
    let n = 1024;
    let k = 1024;

    let lhs = (0..(m * k))
        .map(|v| v as f32 / (m * k) as f32)
        .collect::<Vec<_>>();
    let rhs = (0..(k * n))
        .map(|v| v as f32 / (n * k) as f32)
        .collect::<Vec<_>>();

    let lhs = Tensor::from_slice(&lhs, (m, k), device).unwrap();
    let rhs = Tensor::from_slice(&rhs, (k, n), device).unwrap();

    let qtensor = quantized::QTensor::quantize(&rhs.t().unwrap(), dtype).unwrap();
    let matmul = quantized::QMatMul::from_qtensor(qtensor).unwrap();

    let flops = b * m * n * k;

    let mut group = c.benchmark_group(device.bench_name(format!("qmatmul_{:?}", dtype)));
    group.sample_size(200);
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                run(black_box(&matmul), black_box(&lhs));
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        for dtype in [
            GgmlDType::F32,
            GgmlDType::F16,
            GgmlDType::Q4_0,
            GgmlDType::Q4_1,
            GgmlDType::Q5_0,
            GgmlDType::Q5_1,
            GgmlDType::Q8_0,
            GgmlDType::Q2K,
            GgmlDType::Q3K,
            GgmlDType::Q4K,
            GgmlDType::Q5K,
            GgmlDType::Q6K,
        ] {
            run_bench(c, &device, dtype);
        }
    }
}

criterion_group!(benches, criterion_benchmark);
@ -1,63 +0,0 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use std::time::Instant;

fn rand_uniform(a: &Tensor) {
    a.rand_like(-1.0, 123.0).unwrap();
}

fn rand_normal(a: &Tensor) {
    a.randn_like(100.0, 15.0).unwrap();
}

fn run_random_bench(c: &mut Criterion, device: &Device) {
    let b = 1;

    let rows = 2048;
    let cols = 2048;

    let dtype = DType::F32;
    let tensor = Tensor::zeros((b, rows, cols), dtype, device).unwrap();

    let flops = b * rows * cols * dtype.size_in_bytes();

    let mut group = c.benchmark_group(device.bench_name("random_uniform"));
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |benches| {
        benches.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                rand_uniform(black_box(&tensor));
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();

    let tensor = Tensor::zeros((b, rows, cols), dtype, device).unwrap();

    let mut group = c.benchmark_group(device.bench_name("random_normal"));
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |benches| {
        benches.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                rand_normal(black_box(&tensor));
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        run_random_bench(c, &device);
    }
}

criterion_group!(benches, criterion_benchmark);
@ -1,158 +0,0 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use half::{bf16, f16};
use std::time::Instant;

fn run_sum(a: &Tensor) {
    a.sum_keepdim(2).unwrap();
}
fn run_arg_min(a: &Tensor) {
    a.argmin_keepdim(2).unwrap();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    let (lo, up) = (-1000.0f32, 1000.0f32);
    for device in handler.devices {
        run_reduce(c, &device, (lo, up), false);
        run_reduce(c, &device, (f16::from_f32(lo), f16::from_f32(up)), false);
        run_reduce(c, &device, (bf16::from_f32(lo), bf16::from_f32(up)), false);

        run_arg_reduce(c, &device, (lo, up), false);
        run_arg_reduce(c, &device, (f16::from_f32(lo), f16::from_f32(up)), false);
        run_arg_reduce(c, &device, (bf16::from_f32(lo), bf16::from_f32(up)), false);

        run_reduce(c, &device, (lo, up), true);
        run_reduce(c, &device, (f16::from_f32(lo), f16::from_f32(up)), true);
        run_reduce(c, &device, (bf16::from_f32(lo), bf16::from_f32(up)), true);

        run_arg_reduce(c, &device, (lo, up), true);
        run_arg_reduce(c, &device, (f16::from_f32(lo), f16::from_f32(up)), true);
        run_arg_reduce(c, &device, (bf16::from_f32(lo), bf16::from_f32(up)), true);
    }
}

fn run_reduce<T: candle_core::FloatDType>(
    c: &mut Criterion,
    device: &Device,
    (lo, up): (T, T),
    strided: bool,
) {
    let b = 1;
    let m = 1024;
    let k = 1024;

    let a = if strided {
        Tensor::rand(lo, up, (b, m, k), &device)
            .unwrap()
            .transpose(0, 2)
            .unwrap()
    } else {
        Tensor::rand(lo, up, (b, m, k), &device).unwrap()
    };

    let flops = b * m * k * T::DTYPE.size_in_bytes();

    let name = match T::DTYPE {
        DType::F32 => {
            if strided {
                "reduce_f32_strided"
            } else {
                "reduce_f32"
            }
        }
        DType::F16 => {
            if strided {
                "reduce_f16_strided"
            } else {
                "reduce_f16"
            }
        }
        DType::BF16 => {
            if strided {
                "reduce_bf16_strided"
            } else {
                "reduce_bf16"
            }
        }
        _ => "unknown",
    };

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                run_sum(black_box(&a));
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

fn run_arg_reduce<T: candle_core::FloatDType>(
    c: &mut Criterion,
    device: &Device,
    (lo, up): (T, T),
    strided: bool,
) {
    let b = 1;
    let m = 1024;
    let k = 1024;

    let a = if strided {
        Tensor::rand(lo, up, (b, m, k), &device)
            .unwrap()
            .transpose(0, 2)
            .unwrap()
    } else {
        Tensor::rand(lo, up, (b, m, k), &device).unwrap()
    };

    let flops = b * m * k * T::DTYPE.size_in_bytes();

    let name = match T::DTYPE {
        DType::F32 => {
            if strided {
                "arg_reduce_f32_strided"
            } else {
                "arg_reduce_f32"
            }
        }
        DType::F16 => {
            if strided {
                "arg_reduce_f16_strided"
            } else {
                "arg_reduce_f16"
            }
        }
        DType::BF16 => {
            if strided {
                "arg_reduce_bf16_strided"
            } else {
                "arg_reduce_bf16"
            }
        }
        _ => "unknown",
    };

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                run_arg_min(black_box(&a));
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

criterion_group!(benches, criterion_benchmark);
@ -1,49 +0,0 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use std::time::Instant;

fn run(a: &Tensor) {
    a.sqrt().unwrap();
}

fn run_unary_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
    let b = 1;
    let m = 1024;
    let k = 1024;

    let tensor = Tensor::arange(0.0f32, (b * m * k) as f32, device)
        .unwrap()
        .to_dtype(dtype)
        .unwrap()
        .reshape((b, m, k))
        .unwrap();

    let flops = b * m * k * dtype.size_in_bytes();

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                run(black_box(&tensor));
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let handler = BenchDeviceHandler::new().unwrap();
    for device in handler.devices {
        for dtype in [DType::F32, DType::BF16, DType::F16] {
            let name = format!("sqrt_{:?}", dtype);
            run_unary_benchmark(c, &device, dtype, &name);
        }
    }
}

criterion_group!(benches, criterion_benchmark);
@ -1,64 +0,0 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use std::time::Instant;

fn run(a: &Tensor, b: &Tensor, c: &Tensor) {
    a.where_cond(b, c).unwrap();
}

const fn create_cond_arr<const N: usize>() -> [u8; N] {
    let mut arr = [0u8; N];
    let mut i = 0;
    while i < N {
        arr[i] = (i % 2) as u8;
        i += 1;
    }
    arr
}

const B: usize = 1;
const M: usize = 1024;
const K: usize = 1024;
const SIZE: usize = B * M * K;

const DATA: [u8; SIZE] = create_cond_arr::<SIZE>();

fn run_where_cond_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
    let tensor = Tensor::from_slice(DATA.as_slice(), (B, M, K), device).unwrap();
    let on_true = Tensor::ones((B, M, K), dtype, device).unwrap();
    let on_false = Tensor::zeros((B, M, K), dtype, device).unwrap();

    let elements = B * M * K;
    // E.g. 2 f32 tensors + 1 u8 tensor
    let flops = (2 * elements * dtype.size_in_bytes()) + elements;

    let mut group = c.benchmark_group(device.bench_name(name));
    group.throughput(Throughput::Bytes(flops as u64));
    group.bench_function("iter", move |b| {
        b.iter_custom(|iters| {
            let start = Instant::now();
            for _i in 0..iters {
                run(
                    black_box(&tensor),
                    black_box(&on_true),
                    black_box(&on_false),
                );
            }
            device.sync().unwrap();
            start.elapsed()
        })
    });
    group.finish();
}

fn criterion_benchmark(c: &mut Criterion) {
    let device = BenchDeviceHandler::new().unwrap();
    for d in device.devices {
        run_where_cond_benchmark(c, &d, DType::F32, "where_cond_f32");
        run_where_cond_benchmark(c, &d, DType::BF16, "where_cond_bf16");
        run_where_cond_benchmark(c, &d, DType::F16, "where_cond_f16");
    }
}

criterion_group!(benches, criterion_benchmark);
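To make the throughput figure concrete (our arithmetic, not in the source): with B * M * K = 1,048,576 elements and an F32 payload, each `where_cond` call moves

```latex
2 \cdot 1\,048\,576 \cdot 4 + 1\,048\,576 = 9\,437\,184 \text{ bytes} = 9\ \mathrm{MiB},
```

which is the per-iteration figure reported through `Throughput::Bytes` above.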
@ -8,10 +8,11 @@ use anyhow::Result;
use candle_core::{Device, Tensor};

fn main() -> Result<()> {
-     let a = Tensor::new(&[[0.0f32, 1.0, 2.0], [3.0, 4.0, 5.0]], &Device::Cpu)?;
-     let b = Tensor::new(&[[88.0f32, 99.0]], &Device::Cpu)?;
-     let new_a = a.slice_scatter(&b, 1, 2)?;
-     assert_eq!(a.to_vec2::<f32>()?, [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
-     assert_eq!(new_a.to_vec2::<f32>()?, [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
+     let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?;
+     let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?;
+     let start = std::time::Instant::now();
+     let res = inp.conv2d(&w, 0, 1, 1, 1)?;
+     println!("{:?}", start.elapsed());
+     println!("{res:?}");
    Ok(())
}
@ -9,25 +9,21 @@ use candle_core::{Device, Tensor};

fn main() -> Result<()> {
    let device = Device::new_cuda(0)?;
-     let x = Tensor::randn(0f32, 1.0, (8 * 4096, 8 * 4096), &device)?
-         .to_dtype(candle_core::DType::BF16)?;
-     candle_core::cuda::set_gemm_reduced_precision_f32(false);
-     candle_core::cuda::set_gemm_reduced_precision_bf16(false);
-     let _x1 = x.matmul(&x)?;
-     drop(_x1);
-     let start_time = std::time::Instant::now();
-     let _x1 = x.matmul(&x)?;
-     device.synchronize()?;
-     println!("fp32: {:?}", start_time.elapsed());
-     drop(_x1);
-     candle_core::cuda::set_gemm_reduced_precision_f32(true);
-     candle_core::cuda::set_gemm_reduced_precision_bf16(true);
-     let _x1 = x.matmul(&x)?;
-     drop(_x1);
-     let start_time = std::time::Instant::now();
-     let _x1 = x.matmul(&x)?;
-     device.synchronize()?;
-     println!("tf32: {:?}", start_time.elapsed());
-     drop(_x1);
+     let in_t = Tensor::rand(-1f32, 1f32, (1, 3, 12, 7), &device)?;
+     let k_t = Tensor::rand(-1f32, 1f32, (6, 3, 1, 1), &device)?;
+     let out_t = in_t.conv2d(&k_t, 0, 1, 1, 1)?;
+     println!("{out_t}");
+     let in_t = in_t.to_device(&Device::Cpu)?;
+     let k_t = k_t.to_device(&Device::Cpu)?;
+     let out_t2 = in_t.conv2d(&k_t, 0, 1, 1, 1)?;
+     let diff = (out_t.to_device(&Device::Cpu)? - out_t2)?
+         .sqr()?
+         .sum_all()?;
+     println!("{diff}");
+
+     let t = Tensor::randn(0f32, 1f32, (2, 4, 96, 96), &device)?;
+     let w = Tensor::randn(0f32, 1f32, (320, 4, 3, 3), &device)?;
+     let res = t.conv2d(&w, 1, 1, 1, 1)?;
+     println!("{res:?}");
    Ok(())
}
@ -1,28 +0,0 @@
#[cfg(feature = "accelerate")]
extern crate accelerate_src;

#[cfg(feature = "mkl")]
extern crate intel_mkl_src;

use anyhow::Result;
use candle_core::{Device, Tensor};

fn main() -> Result<()> {
    // This requires the code to be run with MTL_CAPTURE_ENABLED=1
    let device = Device::new_metal(0)?;
    let metal_device = match &device {
        Device::Metal(m) => m,
        _ => anyhow::bail!("unexpected device"),
    };
    metal_device.capture("/tmp/candle.gputrace")?;
    // This first synchronize ensures that a new command buffer gets created after setting up the
    // capture scope.
    device.synchronize()?;
    let x = Tensor::randn(0f32, 1.0, (128, 128), &device)?;
    let x1 = x.add(&x)?;
    println!("{x1:?}");
    // This second synchronize ensures that the command buffer gets committed before the end of the
    // capture scope.
    device.synchronize()?;
    Ok(())
}
299 candle-core/examples/tensor-tools.rs (new file)
@ -0,0 +1,299 @@
use candle_core::quantized::{gguf_file, k_quants, QTensor};
use candle_core::{Device, Result, Tensor};
use clap::{Parser, Subcommand, ValueEnum};
use rayon::prelude::*;

#[derive(ValueEnum, Debug, Clone)]
enum QuantizationMode {
    /// The default quantization includes all 2d tensors, except the output tensor which always
    /// uses Q6_K.
    Llama,
}

impl QuantizationMode {
    fn quantize(
        &self,
        name: &str,
        tensor: QTensor,
        default: fn(&Tensor) -> Result<QTensor>,
    ) -> Result<QTensor> {
        match self {
            Self::Llama => {
                // Same behavior as the llama.cpp quantization.
                let should_quantize = name.ends_with(".weight") && tensor.rank() == 2;
                if should_quantize {
                    let tensor = tensor.dequantize(&Device::Cpu)?;
                    if name == "output.weight" {
                        QTensor::quantize::<k_quants::BlockQ6K>(&tensor)
                    } else {
                        default(&tensor)
                    }
                } else {
                    Ok(tensor)
                }
            }
        }
    }
}

#[derive(ValueEnum, Debug, Clone)]
enum Quantization {
    #[value(name = "q4_0")]
    Q4_0,
    #[value(name = "q4_1")]
    Q4_1,
    #[value(name = "q5_0")]
    Q5_0,
    #[value(name = "q5_1")]
    Q5_1,
    #[value(name = "q8_0")]
    Q8_0,
    #[value(name = "q8_1")]
    Q8_1,
    Q2k,
    Q3k,
    Q4k,
    Q5k,
    Q6k,
    Q8k,
    F16,
    F32,
}

#[derive(ValueEnum, Debug, Clone)]
enum Format {
    Safetensors,
    Npz,
    Ggml,
    Gguf,
    Pth,
    Pickle,
}

impl Format {
    fn infer<P: AsRef<std::path::Path>>(p: P) -> Option<Self> {
        p.as_ref()
            .extension()
            .and_then(|e| e.to_str())
            .and_then(|e| match e {
                // We don't infer any format for .bin as it can be used for ggml/gguf or pytorch.
                "safetensors" | "safetensor" => Some(Self::Safetensors),
                "npz" => Some(Self::Npz),
                "pth" | "pt" => Some(Self::Pth),
                "ggml" => Some(Self::Ggml),
                "gguf" => Some(Self::Gguf),
                _ => None,
            })
    }
}

#[derive(Subcommand, Debug, Clone)]
enum Command {
    Ls {
        files: Vec<std::path::PathBuf>,

        /// The file format to use, if unspecified infer from the file extension.
        #[arg(long, value_enum)]
        format: Option<Format>,

        /// Enable verbose mode.
        #[arg(short, long)]
        verbose: bool,
    },

    Quantize {
        /// The input file, in gguf format.
        in_file: std::path::PathBuf,
        /// The output file, in gguf format.
        out_file: std::path::PathBuf,

        /// The quantization schema to apply.
        #[arg(long, value_enum)]
        quantization: Quantization,

        /// Which tensor to quantize.
        #[arg(long, value_enum, default_value_t = QuantizationMode::Llama)]
        mode: QuantizationMode,
    },
}

#[derive(Parser, Debug, Clone)]
struct Args {
    #[command(subcommand)]
    command: Command,
}

fn run_ls(file: &std::path::PathBuf, format: Option<Format>, verbose: bool) -> Result<()> {
    let format = match format {
        Some(format) => format,
        None => match Format::infer(file) {
            Some(format) => format,
            None => {
                println!(
                    "{file:?}: cannot infer format from file extension, use the --format flag"
                );
                return Ok(());
            }
        },
    };
    match format {
        Format::Npz => {
            let tensors = candle_core::npy::NpzTensors::new(file)?;
            let mut names = tensors.names();
            names.sort();
            for name in names {
                let shape_dtype = match tensors.get_shape_and_dtype(name) {
                    Ok((shape, dtype)) => format!("[{shape:?}; {dtype:?}]"),
                    Err(err) => err.to_string(),
                };
                println!("{name}: {shape_dtype}")
            }
        }
        Format::Safetensors => {
            let tensors = unsafe { candle_core::safetensors::MmapedFile::new(file)? };
            let tensors = tensors.deserialize()?;
            let mut tensors = tensors.tensors();
            tensors.sort_by(|a, b| a.0.cmp(&b.0));
            for (name, view) in tensors.iter() {
                let dtype = view.dtype();
                let dtype = match candle_core::DType::try_from(dtype) {
                    Ok(dtype) => format!("{dtype:?}"),
                    Err(_) => format!("{dtype:?}"),
                };
                let shape = view.shape();
                println!("{name}: [{shape:?}; {dtype}]")
            }
        }
        Format::Pth => {
            let mut tensors = candle_core::pickle::read_pth_tensor_info(file, verbose)?;
            tensors.sort_by(|a, b| a.name.cmp(&b.name));
            for tensor_info in tensors.iter() {
                println!(
                    "{}: [{:?}; {:?}]",
                    tensor_info.name,
                    tensor_info.layout.shape(),
                    tensor_info.dtype,
                );
                if verbose {
                    println!("    {:?}", tensor_info);
                }
            }
        }
        Format::Pickle => {
            let file = std::fs::File::open(file)?;
            let mut reader = std::io::BufReader::new(file);
            let mut stack = candle_core::pickle::Stack::empty();
            stack.read_loop(&mut reader)?;
            for (i, obj) in stack.stack().iter().enumerate() {
                println!("{i} {obj:?}");
            }
        }
        Format::Ggml => {
            let mut file = std::fs::File::open(file)?;
            let content = candle_core::quantized::ggml_file::Content::read(&mut file)?;
            let mut tensors = content.tensors.into_iter().collect::<Vec<_>>();
            tensors.sort_by(|a, b| a.0.cmp(&b.0));
            for (name, qtensor) in tensors.iter() {
                println!("{name}: [{:?}; {:?}]", qtensor.shape(), qtensor.dtype());
            }
        }
        Format::Gguf => {
            let mut file = std::fs::File::open(file)?;
            let content = gguf_file::Content::read(&mut file)?;
            if verbose {
                let mut metadata = content.metadata.into_iter().collect::<Vec<_>>();
                metadata.sort_by(|a, b| a.0.cmp(&b.0));
                println!("metadata entries ({})", metadata.len());
                for (key, value) in metadata.iter() {
                    println!("  {key}: {value:?}");
                }
            }
            let mut tensors = content.tensor_infos.into_iter().collect::<Vec<_>>();
            tensors.sort_by(|a, b| a.0.cmp(&b.0));
            for (name, info) in tensors.iter() {
                println!("{name}: [{:?}; {:?}]", info.shape, info.ggml_dtype);
            }
        }
    }
    Ok(())
}

fn run_quantize(
    in_file: std::path::PathBuf,
    out_file: std::path::PathBuf,
    q: Quantization,
    qmode: QuantizationMode,
) -> Result<()> {
    // Open the out file early so as to fail directly on missing directories etc.
    let mut out_file = std::fs::File::create(out_file)?;
    let mut in_ = std::fs::File::open(&in_file)?;
    let content = gguf_file::Content::read(&mut in_)?;
    println!("tensors: {}", content.tensor_infos.len());

    let quantize_fn = match q {
        Quantization::Q4_0 => QTensor::quantize::<k_quants::BlockQ4_0>,
        Quantization::Q4_1 => QTensor::quantize::<k_quants::BlockQ4_1>,
        Quantization::Q5_0 => QTensor::quantize::<k_quants::BlockQ5_0>,
        Quantization::Q5_1 => QTensor::quantize::<k_quants::BlockQ5_1>,
        Quantization::Q8_0 => QTensor::quantize::<k_quants::BlockQ8_0>,
        Quantization::Q8_1 => QTensor::quantize::<k_quants::BlockQ8_1>,
        Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>,
        Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>,
        Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>,
        Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>,
        Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>,
        Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>,
        Quantization::F16 => QTensor::quantize::<half::f16>,
        Quantization::F32 => QTensor::quantize::<f32>,
    };

    let qtensors = content
        .tensor_infos
        .par_iter()
        .map(|(name, _)| {
            println!("  quantizing {name}");
            let mut in_file = std::fs::File::open(&in_file)?;
            let tensor = content.tensor(&mut in_file, name)?;
            let tensor = qmode.quantize(name, tensor, quantize_fn)?;
            Ok((name, tensor))
        })
        .collect::<Result<Vec<_>>>()?;
    let qtensors = qtensors
        .iter()
        .map(|(k, v)| (k.as_str(), v))
        .collect::<Vec<_>>();

    let metadata = content
        .metadata
        .iter()
        .map(|(k, v)| (k.as_str(), v))
        .collect::<Vec<_>>();
    gguf_file::write(&mut out_file, metadata.as_slice(), &qtensors)?;
    Ok(())
}

fn main() -> anyhow::Result<()> {
    let args = Args::parse();
    match args.command {
        Command::Ls {
            files,
            format,
            verbose,
        } => {
            let multiple_files = files.len() > 1;
            for file in files.iter() {
                if multiple_files {
                    println!("--- {file:?} ---");
                }
                run_ls(file, format.clone(), verbose)?
            }
        }
        Command::Quantize {
            in_file,
            out_file,
            quantization,
            mode,
        } => run_quantize(in_file, out_file, quantization, mode)?,
    }
    Ok(())
}
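Assuming the file is wired up as a regular cargo example (the invocations below are our guess at typical usage, not taken from the diff):

```bash
# List the tensors in a safetensors file; the format is inferred from the extension.
cargo run --example tensor-tools -- ls model.safetensors
# Re-quantize a gguf file to q4_0 with the default llama mode.
cargo run --example tensor-tools -- quantize --quantization q4_0 model.gguf out.gguf
```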
@@ -370,70 +370,6 @@ pub fn vd_sqr(a: &[f64], y: &mut [f64]) {
     y.iter_mut().zip(a.iter()).for_each(|(y, a)| *y = *a * *a)
 }
 
-#[inline]
-pub fn vs_tanh_inplace(y: &mut [f32]) {
-    unsafe { ffi::vvtanhf(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
-#[inline]
-pub fn vd_tanh_inplace(y: &mut [f64]) {
-    unsafe { ffi::vvtanh(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
-#[inline]
-pub fn vs_exp_inplace(y: &mut [f32]) {
-    unsafe { ffi::vvexpf(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
-#[inline]
-pub fn vd_exp_inplace(y: &mut [f64]) {
-    unsafe { ffi::vvexp(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
-#[inline]
-pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = (2.0f32 / std::f32::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)
-    }
-    vs_tanh_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = 0.5 * v * (1.0 + *y)
-    }
-}
-
-#[inline]
-pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = (2.0f64 / std::f64::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)
-    }
-    vd_tanh_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = 0.5 * v * (1.0 + *y)
-    }
-}
-
-#[inline]
-pub fn vs_silu(vs: &[f32], ys: &mut [f32]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vs_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
-#[inline]
-pub fn vd_silu(vs: &[f64], ys: &mut [f64]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vd_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
 macro_rules! binary_op {
     ($fn_name:ident, $ty:ty, $accelerate_name:ident) => {
         #[inline]
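
Aside on the two removed activation kernels: `vs_gelu` stages the tanh approximation gelu(x) ≈ 0.5·x·(1 + tanh(sqrt(2/π)·(x + 0.044715·x³))) so the vendor `vvtanhf` call can sweep the whole buffer at once, and `vs_silu` computes x·σ(x) as x / (1 + e^(-x)) via the staged `vvexpf`. A scalar reference sketch of both (plain Rust, no Accelerate ffi; not the library code):

```rust
// Scalar reference for the two staged kernels above: gelu via the tanh
// approximation, silu via exp, matching the removed buffer-level staging.
fn gelu_tanh(x: f32) -> f32 {
    let inner = (2.0f32 / std::f32::consts::PI).sqrt() * x * (1.0 + 0.044715 * x * x);
    0.5 * x * (1.0 + inner.tanh())
}

fn silu(x: f32) -> f32 {
    // x * sigmoid(x), written as x / (1 + e^(-x)) to match the exp-based staging.
    x / (1.0 + (-x).exp())
}

fn main() {
    for x in [-2.0f32, 0.0, 1.5] {
        println!("gelu({x}) = {}, silu({x}) = {}", gelu_tanh(x), silu(x));
    }
}
```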
@@ -1,5 +1,3 @@
-//! Traits to Define Backend Behavior
-//!
 use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
 use crate::{CpuStorage, DType, Layout, Result, Shape};
 
@@ -41,14 +39,6 @@ pub trait BackendStorage: Sized {
         _params: &crate::conv::ParamsConv1D,
     ) -> Result<Self>;
 
-    fn conv_transpose1d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self>;
-
     fn conv2d(
         &self,
         _l: &Layout,
@@ -67,7 +57,6 @@ pub trait BackendStorage: Sized {
 
     fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self>;
     fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self>;
-    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self>;
     fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self>;
 
     fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self>;
@@ -100,19 +89,6 @@ pub trait BackendStorage: Sized {
     ) -> Result<Self>;
 
     fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()>;
-
-    #[allow(clippy::too_many_arguments)]
-    // Similar to cudaMemcpy2D, though values are in elements and not in bytes.
-    fn copy2d(
-        &self,
-        _: &mut Self,
-        _d1: usize,
-        _d2: usize,
-        _src_stride1: usize,
-        _dst_stride1: usize,
-        _src_offset: usize,
-        _dst_offset: usize,
-    ) -> Result<()>;
 }
 
 pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
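
The removed `copy2d` follows `cudaMemcpy2D` semantics with sizes in elements rather than bytes: `d1` rows of `d2` contiguous elements each, with independent row strides on the two sides. A plain-slice sketch of those semantics (a hypothetical free function, not the trait method):

```rust
// Sketch of copy2d semantics on plain slices (element counts, not bytes).
fn copy2d<T: Copy>(
    src: &[T], dst: &mut [T],
    d1: usize, d2: usize,
    src_stride1: usize, dst_stride1: usize,
    src_offset: usize, dst_offset: usize,
) {
    for i in 0..d1 {
        let s = src_offset + i * src_stride1;
        let d = dst_offset + i * dst_stride1;
        // Each row is contiguous; only the stride between rows differs.
        dst[d..d + d2].copy_from_slice(&src[s..s + d2]);
    }
}

fn main() {
    let src = [1, 2, 3, 4, 5, 6]; // 2 rows of 3, stride 3
    let mut dst = [0; 8]; // 2 rows of 3, stride 4
    copy2d(&src, &mut dst, 2, 3, 3, 4, 0, 0);
    assert_eq!(dst, [1, 2, 3, 0, 4, 5, 6, 0]);
}
```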
@@ -129,24 +105,9 @@ pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
 
     fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;
 
-    /// # Safety
-    /// This function is unsafe as it doesn't initialize the underlying data store.
-    /// The caller should ensure that the data is properly initialized as early as possible
-    /// after this call.
-    unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;
-
-    fn storage_from_slice<T: crate::WithDType>(&self, _: &[T]) -> Result<Self::Storage>;
-
     fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage>;
 
-    fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self::Storage>;
-
     fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;
 
     fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;
 
-    fn set_seed(&self, _: u64) -> Result<()>;
-
-    /// Synchronize should block until all the operations on the device are completed.
-    fn synchronize(&self) -> Result<()>;
 }
 
@@ -1,4 +1,3 @@
-//! Methods for backpropagation of gradients.
 use crate::op::{BinaryOp, Op, ReduceOp, UnaryOp};
 use crate::{Error, Result, Tensor, TensorId};
 use std::collections::HashMap;
@@ -16,23 +15,12 @@ fn broadcast_back(arg: &Tensor, node: &Tensor, reduced_dims: &[usize]) -> Result
     }
 }
 
-thread_local! {
-    static CANDLE_GRAD_DO_NOT_DETACH: bool = {
-        match std::env::var("CANDLE_GRAD_DO_NOT_DETACH") {
-            Ok(s) => {
-                !s.is_empty() && s != "0"
-            },
-            Err(_) => false,
-        }
-    }
-}
-
 impl Tensor {
     /// Return all the nodes that lead to this value in a topologically sorted vec, the first
     /// elements having dependencies on the latter ones, e.g. the first element if any is the
     /// argument.
     /// This assumes that the op graph is a DAG.
-    pub fn sorted_nodes(&self) -> Vec<&Tensor> {
+    fn sorted_nodes(&self) -> Vec<&Tensor> {
         // The vec of sorted nodes is passed as an owned value rather than a mutable reference
         // to get around some lifetime limitations.
         fn walk<'a>(
@@ -48,8 +36,6 @@ impl Tensor {
                 // Do not call recursively on the "leaf" nodes.
                 track_grad = true;
                 nodes
-            } else if node.dtype().is_int() {
-                nodes
             } else if let Some(op) = node.op() {
                 match op {
                     Op::IndexAdd(t1, t2, t3, _)
@@ -69,11 +55,6 @@ impl Tensor {
                         kernel: rhs,
                         ..
                     }
-                    | Op::ConvTranspose1D {
-                        arg: lhs,
-                        kernel: rhs,
-                        ..
-                    }
                     | Op::Conv2D {
                         arg: lhs,
                         kernel: rhs,
@@ -88,8 +69,7 @@ impl Tensor {
                     | Op::Binary(lhs, rhs, _)
                     | Op::Gather(lhs, rhs, _)
                     | Op::IndexSelect(lhs, rhs, _)
-                    | Op::Matmul(lhs, rhs)
-                    | Op::SliceScatter0(lhs, rhs, _) => {
+                    | Op::Matmul(lhs, rhs) => {
                         let (tg, nodes) = walk(lhs, nodes, already_seen);
                         track_grad |= tg;
                         let (tg, nodes) = walk(rhs, nodes, already_seen);
@@ -110,19 +90,15 @@ impl Tensor {
                         nodes
                     }
                 }
-                Op::Unary(_node, UnaryOp::Ceil)
-                | Op::Unary(_node, UnaryOp::Floor)
-                | Op::Unary(_node, UnaryOp::Round)
-                | Op::Unary(_node, UnaryOp::Sign) => nodes,
                 Op::Reshape(node)
-                | Op::UpsampleNearest1D { arg: node, .. }
-                | Op::UpsampleNearest2D { arg: node, .. }
+                | Op::UpsampleNearest2D(node)
                 | Op::AvgPool2D { arg: node, .. }
                 | Op::MaxPool2D { arg: node, .. }
                 | Op::Copy(node)
                 | Op::Broadcast(node)
                 | Op::Cmp(node, _)
-                | Op::Reduce(node, ReduceOp::Min | ReduceOp::Sum | ReduceOp::Max, _)
+                | Op::Reduce(node, _, _)
+                | Op::ToDType(node)
                 | Op::ToDevice(node)
                 | Op::Transpose(node, _, _)
                 | Op::Permute(node, _)
@@ -135,16 +111,6 @@ impl Tensor {
                         track_grad |= tg;
                         nodes
                     }
-                    Op::ToDType(node) => {
-                        if node.dtype().is_float() {
-                            let (tg, nodes) = walk(node, nodes, already_seen);
-                            track_grad |= tg;
-                            nodes
-                        } else {
-                            nodes
-                        }
-                    }
-                    Op::Reduce(_, ReduceOp::ArgMin | ReduceOp::ArgMax, _) => nodes,
                 }
             } else {
                 nodes
@@ -168,16 +134,10 @@ impl Tensor {
             if node.is_variable() {
                 continue;
             }
-            let grad = grads
-                .remove(node)
-                .expect("candle internal error - grad not populated");
-            // https://github.com/huggingface/candle/issues/1241
-            // Ideally, we would make these operations in place where possible to ensure that we
-            // do not have to allocate too often. Here we just call `.detach` to avoid computing
-            // the backprop graph of the backprop itself. This would be an issue for second order
-            // derivatives but these are out of scope at the moment.
-            let do_not_detach = CANDLE_GRAD_DO_NOT_DETACH.with(|b| *b);
-            let grad = if do_not_detach { grad } else { grad.detach() };
+            let grad = grads.remove(node).unwrap();
+            // TODO: We should perform all these operations in place (or at least not track the
+            // whole graph). The only drawback would be if we wanted to support grad of grad but
+            // this is out of scope.
             if let Some(op) = node.op() {
                 match op {
                     Op::Binary(lhs, rhs, BinaryOp::Add) => {
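
On the 0.9.0-alpha side, the popped gradient is detached (unless `CANDLE_GRAD_DO_NOT_DETACH` is set) so the tensor ops issued during backprop are not themselves recorded; recording them would build the graph needed for second-order derivatives at a real allocation cost. A small sketch of what callers end up seeing, assuming candle_core's public API (an illustration, not the library internals):

```rust
use candle_core::{DType, Device, Result, Var};

fn main() -> Result<()> {
    let x = Var::zeros((2, 2), DType::F32, &Device::Cpu)?;
    let loss = x.as_tensor().sqr()?.sum_all()?;
    let grads = loss.backward()?;
    // `grads.get` returns the gradient w.r.t. `x`; because it was detached
    // during the backward pass, expressions built from it do not
    // differentiate back through the first backward pass.
    let gx = grads.get(x.as_tensor()).expect("gradient for x");
    println!("{gx}");
    Ok(())
}
```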
@@ -232,45 +192,7 @@ impl Tensor {
                         let f_grad = pred.where_cond(&zeros, &grad)?;
                         *f_sum_grad = f_sum_grad.add(&f_grad)?;
                     }
-                    Op::Conv1D {
-                        arg,
-                        kernel,
-                        padding,
-                        stride,
-                        dilation,
-                    } => {
-                        // The output height for conv_transpose1d is:
-                        // (l_in - 1) * stride - 2 * padding + dilation * (k_size - 1) + out_padding + 1
-                        let grad_l_in = grad.dim(2)?;
-                        let k_size = kernel.dim(2)?;
-                        let out_size =
-                            (grad_l_in - 1) * stride + dilation * (k_size - 1) + 1 - 2 * padding;
-                        let out_padding = arg.dim(2)? - out_size;
-                        let grad_arg = grad.conv_transpose1d(
-                            kernel,
-                            *padding,
-                            out_padding,
-                            *stride,
-                            *dilation,
-                            /* groups */ 1,
-                        )?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad_arg)?;
-
-                        let grad_kernel = arg
-                            .transpose(0, 1)?
-                            .conv1d(&grad.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
-                            .transpose(0, 1)?;
-                        let sum_grad = grads.or_insert(kernel)?;
-                        let (_, _, k0) = kernel.dims3()?;
-                        let (_, _, g_k0) = grad_kernel.dims3()?;
-                        let grad_kernel = if g_k0 != k0 {
-                            grad_kernel.narrow(2, 0, k0)?
-                        } else {
-                            grad_kernel
-                        };
-                        *sum_grad = sum_grad.add(&grad_kernel)?;
-                    }
+                    Op::Conv1D { .. } => Err(Error::BackwardNotSupported { op: "conv1d" })?,
                     Op::Conv2D {
                         arg,
                         kernel,
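
The removed `Op::Conv1D` backward recovers the input gradient through `conv_transpose1d`, picking `out_padding` so the transposed output length lands back exactly on `l_in`. A worked check of that arithmetic under assumed sizes (l_in = 10, k = 3, stride = 2, no padding or dilation):

```rust
// Sketch of the out_padding computation from the removed Conv1D backward
// (assumed example sizes; integer arithmetic as in the diff).
fn main() {
    let (l_in, k_size, stride, padding, dilation) = (10usize, 3, 2, 0, 1);
    // Forward conv output length, which is also grad's length: (10 - 3) / 2 + 1 = 4.
    let grad_l_in = (l_in + 2 * padding - dilation * (k_size - 1) - 1) / stride + 1;
    // Transposed-conv length without extra padding: (4 - 1) * 2 + 2 + 1 = 9.
    let out_size = (grad_l_in - 1) * stride + dilation * (k_size - 1) + 1 - 2 * padding;
    // One extra element is needed so the gradient rounds back to l_in = 10.
    let out_padding = l_in - out_size;
    assert_eq!((grad_l_in, out_size, out_padding), (4, 9, 1));
}
```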
@@ -300,44 +222,11 @@ impl Tensor {
                             .conv2d(&grad.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
                             .transpose(0, 1)?;
                         let sum_grad = grads.or_insert(kernel)?;
-                        let (_, _, k0, k1) = kernel.dims4()?;
-                        let (_, _, g_k0, g_k1) = grad_kernel.dims4()?;
-                        let grad_kernel = if g_k0 != k0 || g_k1 != k1 {
-                            grad_kernel.narrow(2, 0, k0)?.narrow(3, 0, k1)?
-                        } else {
-                            grad_kernel
-                        };
                         *sum_grad = sum_grad.add(&grad_kernel)?;
                     }
-                    Op::ConvTranspose1D { .. } => Err(Error::BackwardNotSupported {
-                        op: "conv-transpose1d",
+                    Op::ConvTranspose2D { .. } => Err(Error::BackwardNotSupported {
+                        op: "conv-transpose2d",
                     })?,
-                    Op::ConvTranspose2D {
-                        arg,
-                        kernel,
-                        padding,
-                        stride,
-                        dilation,
-                        output_padding: _output_padding,
-                    } => {
-                        let grad_arg = grad.conv2d(kernel, *padding, *stride, *dilation, 1)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad_arg)?;
-
-                        let grad_kernel = grad
-                            .transpose(0, 1)?
-                            .conv2d(&arg.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
-                            .transpose(0, 1)?;
-                        let sum_grad = grads.or_insert(kernel)?;
-                        let (_, _, k0, k1) = kernel.dims4()?;
-                        let (_, _, g_k0, g_k1) = grad_kernel.dims4()?;
-                        let grad_kernel = if g_k0 != k0 || g_k1 != k1 {
-                            grad_kernel.narrow(2, 0, k0)?.narrow(3, 0, k1)?
-                        } else {
-                            grad_kernel
-                        };
-                        *sum_grad = sum_grad.add(&grad_kernel)?;
-                    }
                     Op::AvgPool2D {
                         arg,
                         kernel_size,
@@ -373,48 +262,9 @@ impl Tensor {
                         let sum_grad = grads.or_insert(arg)?;
                         *sum_grad = sum_grad.add(&grad_arg)?;
                     }
-                    Op::UpsampleNearest1D { arg, target_size } => {
-                        let (_n, c, size) = arg.dims3()?;
-                        if target_size % size != 0 {
-                            crate::bail!("backward not supported for non integer upscaling factors")
-                        }
-                        let scale = target_size / size;
-
-                        let kernel = Tensor::ones((c, 1, scale), arg.dtype(), arg.device())?;
-                        let conv_sum = grad.conv1d(&kernel, 0, scale, 1, c)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = conv_sum;
-                    }
-                    Op::UpsampleNearest2D {
-                        arg,
-                        target_h,
-                        target_w,
-                    } => {
-                        let (_n, c, h, w) = arg.dims4()?;
-                        if target_h % h != 0 || target_w % w != 0 {
-                            crate::bail!("backward not supported for non integer upscaling factors")
-                        }
-                        let scale_h = target_h / h;
-                        let scale_w = target_w / w;
-
-                        if scale_h != scale_w {
-                            crate::bail!("backward not supported for non uniform upscaling factors")
-                        };
-                        let kernel =
-                            Tensor::ones((c, 1, scale_h, scale_w), arg.dtype(), arg.device())?;
-                        let conv_sum = grad.conv2d(&kernel, 0, scale_h, 1, c)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = conv_sum;
-                    }
-                    Op::SliceScatter0(lhs, rhs, start_rhs) => {
-                        let rhs_sum_grad = grads.or_insert(rhs)?;
-                        let rhs_grad = grad.narrow(0, *start_rhs, rhs.dim(0)?)?;
-                        *rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
-
-                        let lhs_sum_grad = grads.or_insert(lhs)?;
-                        let lhs_grad = grad.slice_scatter0(&rhs.zeros_like()?, *start_rhs)?;
-                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?
-                    }
+                    Op::UpsampleNearest2D { .. } => Err(Error::BackwardNotSupported {
+                        op: "upsample-nearest2d",
+                    })?,
                     Op::Gather(arg, indexes, dim) => {
                         let sum_grad = grads.or_insert(arg)?;
                         *sum_grad = sum_grad.scatter_add(indexes, &grad, *dim)?;
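
The removed upsample-nearest backwards rest on the observation that nearest upsampling copies each input cell to a `scale`-sized block, so the input gradient is the block-wise sum of the output gradient; the code realizes this as a grouped convolution with an all-ones kernel and `stride = scale` (with `groups = c` so channels stay independent). A single-channel plain-Rust stand-in for that block sum:

```rust
// Sketch: block-sum backward of nearest upsampling on one channel, a plain
// stand-in for the grad.conv1d(&ones, 0, scale, 1, c) call in the diff.
fn upsample_nearest_backward(grad_out: &[f32], scale: usize) -> Vec<f32> {
    grad_out
        .chunks(scale)
        .map(|block| block.iter().sum()) // each input cell fed `scale` outputs
        .collect()
}

fn main() {
    // Input of length 3 upsampled x2 -> output length 6; gradients sum per block.
    let g = upsample_nearest_backward(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 2);
    assert_eq!(g, vec![3.0, 7.0, 11.0]);
}
```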
@@ -489,6 +339,7 @@ impl Tensor {
                         let sum_grad = grads.or_insert(arg)?;
                         *sum_grad = sum_grad.add(&grad)?;
                     }
+                    Op::Cmp(_args, _) => {}
                     Op::Reduce(arg, ReduceOp::Max, reduced_dims) => {
                         let node = broadcast_back(arg, node, reduced_dims)?;
                         let grad = broadcast_back(arg, &grad, reduced_dims)?;
@@ -505,7 +356,7 @@ impl Tensor {
                     }
                     Op::ToDType(arg) => {
                         let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad.to_dtype(arg.dtype())?)?
+                        *sum_grad = sum_grad.add(&grad.to_dtype(node.dtype())?)?
                     }
                     Op::Copy(arg) => {
                         let sum_grad = grads.or_insert(arg)?;
@@ -578,67 +429,20 @@ impl Tensor {
                         let sum_grad = grads.or_insert(arg)?;
                         *sum_grad = sum_grad.add(&arg_grad)?
                    }
-                    Op::Unary(_, UnaryOp::Floor)
-                    | Op::Unary(_, UnaryOp::Round)
-                    | Op::Reduce(_, ReduceOp::ArgMin, _)
-                    | Op::Reduce(_, ReduceOp::ArgMax, _)
-                    | Op::Unary(_, UnaryOp::Sign)
-                    | Op::Cmp(_, _) => {}
+                    Op::Reduce(_, ReduceOp::ArgMin, _) => {}
+                    Op::Reduce(_, ReduceOp::ArgMax, _) => {}
                     Op::Reshape(arg) => {
                         let arg_grad = grad.reshape(arg.dims())?;
                         let sum_grad = grads.or_insert(arg)?;
                         *sum_grad = sum_grad.add(&arg_grad)?
                     }
-                    Op::Unary(_, UnaryOp::Ceil) => Err(Error::BackwardNotSupported { op: "ceil" })?,
-                    Op::Unary(arg, UnaryOp::Gelu) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        let cube = arg.powf(3.)?;
-                        let tanh = (0.0356774 * &cube + (0.797885 * arg)?)?.tanh()?;
-                        let gelu_grad = (((0.5 * &tanh)?
-                            + (0.0535161 * cube + (0.398942 * arg)?)? * (1. - tanh.powf(2.)?))?
-                            + 0.5)?;
-                        *sum_grad = sum_grad.add(&(&grad * gelu_grad)?)?
-                    }
-                    Op::Unary(arg, UnaryOp::Erf) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        // d/dx erf(x) = 2/sqrt(pi) * e^(-x^2)
-                        let erf_grad =
-                            (2. / std::f64::consts::PI.sqrt()) * (arg.sqr()?.neg()?).exp()?;
-                        *sum_grad = sum_grad.add(&(&grad * erf_grad)?)?
-                    }
-                    Op::Unary(arg, UnaryOp::GeluErf) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        // d/dx gelu_erf(x) = 0.5 + 0.398942 e^(-x^2/2) x + 0.5 erf(x/sqrt(2))
-                        let neg_half_square = (arg.sqr()?.neg()? / 2.)?;
-                        let scaled_exp_arg = (0.398942 * neg_half_square.exp()? * arg)?;
-                        let arg_scaled_sqrt = (arg / 2f64.sqrt())?;
-                        let erf_scaled_sqrt = (0.5 * arg_scaled_sqrt.erf()?)?;
-                        let gelu_erf_grad = (0.5 + scaled_exp_arg + erf_scaled_sqrt)?;
-                        *sum_grad = sum_grad.add(&(&grad * gelu_erf_grad)?)?;
-                    }
+                    Op::Unary(_, UnaryOp::Gelu) => Err(Error::BackwardNotSupported { op: "gelu" })?,
                     Op::Unary(arg, UnaryOp::Relu) => {
                         let sum_grad = grads.or_insert(arg)?;
                         let relu_grad = arg.ge(&arg.zeros_like()?)?.to_dtype(arg.dtype())?;
                         *sum_grad = sum_grad.add(&(&grad * relu_grad)?)?
                     }
-                    Op::Unary(arg, UnaryOp::Silu) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        // d/dx silu = sigmoid(x) * (1 + x * (1 - sigmoid(x))) = sigmoid(x) * (1 - node) + node
-                        let sigmoid_arg = (arg.neg()?.exp()? + 1.)?.recip()?;
-                        let silu_grad = &sigmoid_arg * (1. - *node) + *node;
-                        *sum_grad = sum_grad.add(&(&grad * silu_grad)?)?
-                    }
-                    Op::Elu(arg, alpha) => {
-                        // d/dx elu(x) = 1 for x > 0, alpha * e^x for x <= 0
-                        let sum_grad = grads.or_insert(arg)?;
-                        let zeros = arg.zeros_like()?;
-                        let positive_mask = arg.gt(&zeros)?.to_dtype(arg.dtype())?;
-                        let negative_mask = arg.le(&zeros)?.to_dtype(arg.dtype())?;
-                        // node == alpha * (e^x - 1) for x <= 0, reuse it
-                        let negative_exp_mask = (negative_mask * (*node + *alpha))?;
-                        let combined_mask = (positive_mask + negative_exp_mask)?;
-                        *sum_grad = sum_grad.add(&(grad * combined_mask)?)?
-                    }
+                    Op::Elu(..) => Err(Error::BackwardNotSupported { op: "elu" })?,
                     Op::Powf(arg, e) => {
                         let arg_grad = (&(grad * arg.powf(e - 1.)?)? * *e)?;
                         let sum_grad = grads.or_insert(arg)?;
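
The removed SiLU arm leans on rewriting the derivative in terms of the cached forward output (`node`), as its inline comment states. Spelling out that rewrite, with σ the logistic sigmoid and s(x) = x·σ(x):

```latex
% Derivation of the silu gradient rewrite used in the removed arm.
\begin{aligned}
s(x) &= x\,\sigma(x), \qquad \sigma'(x) = \sigma(x)\,(1-\sigma(x)) \\
s'(x) &= \sigma(x) + x\,\sigma(x)\,(1-\sigma(x))
       = \sigma(x)\,\bigl(1 + x\,(1-\sigma(x))\bigr) \\
      &= \sigma(x)\,\bigl(1 - s(x)\bigr) + s(x)
\end{aligned}
```

The ELU arm plays the same trick: for x ≤ 0 the forward value is node = α(eˣ − 1), so the needed α·eˣ is recovered as node + α without re-running exp.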
@@ -713,38 +517,29 @@ impl Tensor {
     }
 }
 
-/// A store for gradients, associating a tensor id to the corresponding gradient tensor, used for back propagation.
-#[derive(Debug)]
 pub struct GradStore(HashMap<TensorId, Tensor>);
 
 impl GradStore {
-    /// Create a new gradient store
     fn new() -> Self {
         GradStore(HashMap::new())
     }
 
-    /// Get the gradient tensor corresponding to the given tensor id
     pub fn get_id(&self, id: TensorId) -> Option<&Tensor> {
         self.0.get(&id)
     }
 
-    /// Get the gradient tensor associated with the given tensor
     pub fn get(&self, tensor: &Tensor) -> Option<&Tensor> {
         self.0.get(&tensor.id())
     }
 
-    /// Remove the gradient tensor associated with the given tensor, returning it if it exists
     pub fn remove(&mut self, tensor: &Tensor) -> Option<Tensor> {
         self.0.remove(&tensor.id())
     }
 
-    /// Insert a gradient tensor associated with the given tensor, returning the previous gradient tensor if it existed
     pub fn insert(&mut self, tensor: &Tensor, grad: Tensor) -> Option<Tensor> {
         self.0.insert(tensor.id(), grad)
     }
 
-    /// Get the gradient tensor associated with the given tensor, or, if it does not exist,
-    /// insert a tensor of zeroes, with the same shape and type as the given tensors and return it
     fn or_insert(&mut self, tensor: &Tensor) -> Result<&mut Tensor> {
         use std::collections::hash_map::Entry;
         let grad = match self.0.entry(tensor.id()) {
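
`or_insert` above is the standard `HashMap` Entry idiom: materialize a zeros tensor on first touch, then accumulate into it. A plain-`HashMap` sketch of the same shape (scalar gradients stand in for tensors; not the library code):

```rust
use std::collections::{hash_map::Entry, HashMap};

fn accumulate(store: &mut HashMap<u64, f64>, id: u64, grad: f64) {
    let slot = match store.entry(id) {
        Entry::Occupied(entry) => entry.into_mut(),
        Entry::Vacant(entry) => entry.insert(0.0), // "tensor of zeroes" analogue
    };
    *slot += grad; // mirrors `*sum_grad = sum_grad.add(&grad)?`
}

fn main() {
    let mut store = HashMap::new();
    accumulate(&mut store, 42, 1.5);
    accumulate(&mut store, 42, 0.5);
    assert_eq!(store[&42], 2.0);
}
```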
@@ -756,9 +551,4 @@ impl GradStore {
         };
         Ok(grad)
     }
-
-    /// Get the tensor ids of the stored gradient tensors
-    pub fn get_ids(&self) -> impl Iterator<Item = &TensorId> {
-        self.0.keys()
-    }
 }
@@ -1,5 +1,3 @@
-//! 1D and 2D Convolutions
-//!
 use crate::{op::BackpropOp, op::Op, Error, Result, Tensor};
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -14,7 +12,6 @@ pub struct ParamsConv1D {
     pub(crate) padding: usize,
     pub(crate) stride: usize,
     pub(crate) dilation: usize,
-    pub(crate) cudnn_fwd_algo: Option<CudnnFwdAlgo>,
 }
 
 impl ParamsConv1D {
@@ -28,46 +25,6 @@ impl ParamsConv1D {
     }
 }
 
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct ParamsConvTranspose1D {
-    pub(crate) b_size: usize,
-    pub(crate) l_in: usize,
-    pub(crate) c_out: usize,
-    pub(crate) c_in: usize,
-    pub(crate) k_size: usize,
-    pub(crate) padding: usize,
-    pub(crate) output_padding: usize,
-    pub(crate) stride: usize,
-    pub(crate) dilation: usize,
-}
-
-impl ParamsConvTranspose1D {
-    pub(crate) fn l_out(&self) -> usize {
-        (self.l_in - 1) * self.stride - 2 * self.padding
-            + self.dilation * (self.k_size - 1)
-            + self.output_padding
-            + 1
-    }
-
-    pub(crate) fn out_dims(&self) -> Vec<usize> {
-        let l_out = self.l_out();
-        vec![self.b_size, self.c_out, l_out]
-    }
-}
-
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum CudnnFwdAlgo {
-    ImplicitGemm,
-    ImplicitPrecompGemm,
-    Gemm,
-    Direct,
-    Fft,
-    FftTiling,
-    Winograd,
-    WinogradNonFused,
-    Count,
-}
-
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ParamsConv2D {
     pub(crate) b_size: usize,
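
The removed `l_out` is the usual transposed-convolution length rule, the inverse of the forward convolution length. Side by side (p padding, d dilation, k kernel size, s stride, p_out output padding):

```latex
% Forward and transposed 1D convolution lengths (from the removed l_out()).
l_{\text{conv}} = \left\lfloor \frac{l_{\text{in}} + 2p - d\,(k-1) - 1}{s} \right\rfloor + 1,
\qquad
l_{\text{convT}} = (l_{\text{in}} - 1)\,s - 2p + d\,(k-1) + p_{\text{out}} + 1 .
```

Because the floor in the forward rule is many-to-one when s > 1, `output_padding` picks which of the candidate lengths the transpose should produce.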
@@ -80,7 +37,6 @@ pub struct ParamsConv2D {
     pub(crate) padding: usize,
     pub(crate) stride: usize,
     pub(crate) dilation: usize,
-    pub cudnn_fwd_algo: Option<CudnnFwdAlgo>,
 }
 
 impl ParamsConv2D {
@@ -175,7 +131,6 @@ impl Tensor {
             padding,
             stride,
             dilation,
-            cudnn_fwd_algo: None,
         };
         if groups == 1 {
             self.conv1d_single_group(kernel, &params)
@@ -191,72 +146,6 @@ impl Tensor {
         }
     }
 
-    fn conv_transpose1d_single_group(
-        &self,
-        kernel: &Self,
-        params: &ParamsConvTranspose1D,
-    ) -> Result<Self> {
-        let storage = self.storage().conv_transpose1d(
-            self.layout(),
-            &kernel.storage(),
-            kernel.layout(),
-            params,
-        )?;
-        let op = BackpropOp::new2(self, kernel, |arg, kernel| Op::ConvTranspose1D {
-            arg,
-            kernel,
-            padding: params.padding,
-            output_padding: params.output_padding,
-            stride: params.stride,
-            dilation: params.dilation,
-        });
-        let out_dims = params.out_dims();
-        Ok(crate::tensor::from_storage(storage, out_dims, op, false))
-    }
-
-    /// Applies a 1D transposed convolution over the input tensor.
-    pub fn conv_transpose1d(
-        &self,
-        kernel: &Self,
-        padding: usize,
-        output_padding: usize,
-        stride: usize,
-        dilation: usize,
-        groups: usize,
-    ) -> Result<Self> {
-        let (c_in_k, c_out, k_size) = kernel.dims3()?;
-        let (b_size, c_in, l_in) = self.dims3()?;
-        if c_in != c_in_k {
-            crate::bail!("in_channel mismatch between input ({c_in}) and kernel ({c_in_k})")
-        }
-        if c_in % groups != 0 {
-            crate::bail!("in_channel {c_in} is not divisible by the number of groups")
-        }
-        let params = ParamsConvTranspose1D {
-            b_size,
-            l_in,
-            k_size,
-            c_out,
-            c_in: c_in / groups,
-            padding,
-            output_padding,
-            stride,
-            dilation,
-        };
-        if groups == 1 {
-            self.conv_transpose1d_single_group(kernel, &params)
-        } else {
-            let blocks = self.chunk(groups, 1)?;
-            let kernel = kernel.chunk(groups, 0)?;
-            let blocks = blocks
-                .iter()
-                .zip(&kernel)
-                .map(|(block, kernel)| block.conv_transpose1d_single_group(kernel, &params))
-                .collect::<Result<Vec<_>>>()?;
-            Tensor::cat(&blocks, 1)
-        }
-    }
-
     fn conv2d_single_group(&self, kernel: &Self, params: &ParamsConv2D) -> Result<Self> {
         let storage =
             self.storage()
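
The removed grouped path is worth noting as a design choice: rather than teaching every backend about groups, it splits the input channels (`chunk(groups, 1)`), splits the kernel along its first dimension (`chunk(groups, 0)`), runs the single-group kernel per pair, and stitches the results with `Tensor::cat`. A minimal sketch of that plumbing, assuming candle_core and using an identity stand-in for the per-group op:

```rust
use candle_core::{Device, Result, Tensor};

fn per_group<F>(input: &Tensor, kernel: &Tensor, groups: usize, f: F) -> Result<Tensor>
where
    F: Fn(&Tensor, &Tensor) -> Result<Tensor>,
{
    let blocks = input.chunk(groups, 1)?; // split the input's channel dim
    let kernels = kernel.chunk(groups, 0)?; // matching split of the kernel
    let outs = blocks
        .iter()
        .zip(kernels.iter())
        .map(|(b, k)| f(b, k))
        .collect::<Result<Vec<_>>>()?;
    Tensor::cat(&outs, 1) // stitch the per-group outputs back together
}

fn main() -> Result<()> {
    let dev = Device::Cpu;
    let x = Tensor::zeros((1, 4, 8), candle_core::DType::F32, &dev)?;
    let k = Tensor::zeros((4, 2, 3), candle_core::DType::F32, &dev)?;
    // Identity stand-in for the per-group op, just to exercise the plumbing.
    let y = per_group(&x, &k, 2, |b, _k| Ok(b.clone()))?;
    assert_eq!(y.dims(), &[1, 4, 8]);
    Ok(())
}
```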
@@ -299,7 +188,6 @@ impl Tensor {
             padding,
             stride,
             dilation,
-            cudnn_fwd_algo: None,
         };
         if groups == 1 {
             self.conv2d_single_group(kernel, &params)
@ -1,763 +0,0 @@
|
|||||||
#![allow(clippy::excessive_precision)]
|
|
||||||
// Code taken from https://github.com/statrs-dev/statrs
|
|
||||||
//! Provides the [error](https://en.wikipedia.org/wiki/Error_function) and
|
|
||||||
//! related functions
|
|
||||||
|
|
||||||
mod evaluate {
|
|
||||||
//! Provides functions that don't have a numerical solution and must
|
|
||||||
//! be solved computationally (e.g. evaluation of a polynomial)
|
|
||||||
|
|
||||||
/// evaluates a polynomial at `z` where `coeff` are the coeffecients
|
|
||||||
/// to a polynomial of order `k` where `k` is the length of `coeff` and the
|
|
||||||
/// coeffecient
|
|
||||||
/// to the `k`th power is the `k`th element in coeff. E.g. [3,-1,2] equates to
|
|
||||||
/// `2z^2 - z + 3`
|
|
||||||
///
|
|
||||||
/// # Remarks
|
|
||||||
///
|
|
||||||
/// Returns 0 for a 0 length coefficient slice
|
|
||||||
pub fn polynomial(z: f64, coeff: &[f64]) -> f64 {
|
|
||||||
let n = coeff.len();
|
|
||||||
if n == 0 {
|
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut sum = *coeff.last().unwrap();
|
|
||||||
for c in coeff[0..n - 1].iter().rev() {
|
|
||||||
sum = *c + z * sum;
|
|
||||||
}
|
|
||||||
sum
|
|
||||||
}
|
|
||||||
}
|
|
||||||
use std::f64;
|
|
||||||
|
|
||||||
/// `erf` calculates the error function at `x`.
|
|
||||||
pub fn erf(x: f64) -> f64 {
|
|
||||||
if x.is_nan() {
|
|
||||||
f64::NAN
|
|
||||||
} else if x >= 0.0 && x.is_infinite() {
|
|
||||||
1.0
|
|
||||||
} else if x <= 0.0 && x.is_infinite() {
|
|
||||||
-1.0
|
|
||||||
} else if x == 0. {
|
|
||||||
0.0
|
|
||||||
} else {
|
|
||||||
erf_impl(x, false)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `erf_inv` calculates the inverse error function
|
|
||||||
/// at `x`.
|
|
||||||
pub fn erf_inv(x: f64) -> f64 {
|
|
||||||
if x == 0.0 {
|
|
||||||
0.0
|
|
||||||
} else if x >= 1.0 {
|
|
||||||
f64::INFINITY
|
|
||||||
} else if x <= -1.0 {
|
|
||||||
f64::NEG_INFINITY
|
|
||||||
} else if x < 0.0 {
|
|
||||||
erf_inv_impl(-x, 1.0 + x, -1.0)
|
|
||||||
} else {
|
|
||||||
erf_inv_impl(x, 1.0 - x, 1.0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `erfc` calculates the complementary error function
|
|
||||||
/// at `x`.
|
|
||||||
pub fn erfc(x: f64) -> f64 {
|
|
||||||
if x.is_nan() {
|
|
||||||
f64::NAN
|
|
||||||
} else if x == f64::INFINITY {
|
|
||||||
0.0
|
|
||||||
} else if x == f64::NEG_INFINITY {
|
|
||||||
2.0
|
|
||||||
} else {
|
|
||||||
erf_impl(x, true)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `erfc_inv` calculates the complementary inverse
|
|
||||||
/// error function at `x`.
|
|
||||||
pub fn erfc_inv(x: f64) -> f64 {
|
|
||||||
if x <= 0.0 {
|
|
||||||
f64::INFINITY
|
|
||||||
} else if x >= 2.0 {
|
|
||||||
f64::NEG_INFINITY
|
|
||||||
} else if x > 1.0 {
|
|
||||||
erf_inv_impl(-1.0 + x, 2.0 - x, -1.0)
|
|
||||||
} else {
|
|
||||||
erf_inv_impl(1.0 - x, x, 1.0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// **********************************************************
|
|
||||||
// ********** Coefficients for erf_impl polynomial **********
|
|
||||||
// **********************************************************
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator of `erf_impl`
|
|
||||||
/// in the interval [1e-10, 0.5].
|
|
||||||
const ERF_IMPL_AN: &[f64] = &[
|
|
||||||
0.00337916709551257388990745,
|
|
||||||
-0.00073695653048167948530905,
|
|
||||||
-0.374732337392919607868241,
|
|
||||||
0.0817442448733587196071743,
|
|
||||||
-0.0421089319936548595203468,
|
|
||||||
0.0070165709512095756344528,
|
|
||||||
-0.00495091255982435110337458,
|
|
||||||
0.000871646599037922480317225,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator of `erf_impl`
|
|
||||||
/// in the interval [1e-10, 0.5]
|
|
||||||
const ERF_IMPL_AD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
-0.218088218087924645390535,
|
|
||||||
0.412542972725442099083918,
|
|
||||||
-0.0841891147873106755410271,
|
|
||||||
0.0655338856400241519690695,
|
|
||||||
-0.0120019604454941768171266,
|
|
||||||
0.00408165558926174048329689,
|
|
||||||
-0.000615900721557769691924509,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [0.5, 0.75].
|
|
||||||
const ERF_IMPL_BN: &[f64] = &[
|
|
||||||
-0.0361790390718262471360258,
|
|
||||||
0.292251883444882683221149,
|
|
||||||
0.281447041797604512774415,
|
|
||||||
0.125610208862766947294894,
|
|
||||||
0.0274135028268930549240776,
|
|
||||||
0.00250839672168065762786937,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [0.5, 0.75].
|
|
||||||
const ERF_IMPL_BD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
1.8545005897903486499845,
|
|
||||||
1.43575803037831418074962,
|
|
||||||
0.582827658753036572454135,
|
|
||||||
0.124810476932949746447682,
|
|
||||||
0.0113724176546353285778481,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [0.75, 1.25].
|
|
||||||
const ERF_IMPL_CN: &[f64] = &[
|
|
||||||
-0.0397876892611136856954425,
|
|
||||||
0.153165212467878293257683,
|
|
||||||
0.191260295600936245503129,
|
|
||||||
0.10276327061989304213645,
|
|
||||||
0.029637090615738836726027,
|
|
||||||
0.0046093486780275489468812,
|
|
||||||
0.000307607820348680180548455,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [0.75, 1.25].
|
|
||||||
const ERF_IMPL_CD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
1.95520072987627704987886,
|
|
||||||
1.64762317199384860109595,
|
|
||||||
0.768238607022126250082483,
|
|
||||||
0.209793185936509782784315,
|
|
||||||
0.0319569316899913392596356,
|
|
||||||
0.00213363160895785378615014,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [1.25, 2.25].
|
|
||||||
const ERF_IMPL_DN: &[f64] = &[
|
|
||||||
-0.0300838560557949717328341,
|
|
||||||
0.0538578829844454508530552,
|
|
||||||
0.0726211541651914182692959,
|
|
||||||
0.0367628469888049348429018,
|
|
||||||
0.00964629015572527529605267,
|
|
||||||
0.00133453480075291076745275,
|
|
||||||
0.778087599782504251917881e-4,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [1.25, 2.25].
|
|
||||||
const ERF_IMPL_DD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
1.75967098147167528287343,
|
|
||||||
1.32883571437961120556307,
|
|
||||||
0.552528596508757581287907,
|
|
||||||
0.133793056941332861912279,
|
|
||||||
0.0179509645176280768640766,
|
|
||||||
0.00104712440019937356634038,
|
|
||||||
-0.106640381820357337177643e-7,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [2.25, 3.5].
|
|
||||||
const ERF_IMPL_EN: &[f64] = &[
|
|
||||||
-0.0117907570137227847827732,
|
|
||||||
0.014262132090538809896674,
|
|
||||||
0.0202234435902960820020765,
|
|
||||||
0.00930668299990432009042239,
|
|
||||||
0.00213357802422065994322516,
|
|
||||||
0.00025022987386460102395382,
|
|
||||||
0.120534912219588189822126e-4,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [2.25, 3.5].
|
|
||||||
const ERF_IMPL_ED: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
1.50376225203620482047419,
|
|
||||||
0.965397786204462896346934,
|
|
||||||
0.339265230476796681555511,
|
|
||||||
0.0689740649541569716897427,
|
|
||||||
0.00771060262491768307365526,
|
|
||||||
0.000371421101531069302990367,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [3.5, 5.25].
|
|
||||||
const ERF_IMPL_FN: &[f64] = &[
|
|
||||||
-0.00546954795538729307482955,
|
|
||||||
0.00404190278731707110245394,
|
|
||||||
0.0054963369553161170521356,
|
|
||||||
0.00212616472603945399437862,
|
|
||||||
0.000394984014495083900689956,
|
|
||||||
0.365565477064442377259271e-4,
|
|
||||||
0.135485897109932323253786e-5,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [3.5, 5.25].
|
|
||||||
const ERF_IMPL_FD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
1.21019697773630784832251,
|
|
||||||
0.620914668221143886601045,
|
|
||||||
0.173038430661142762569515,
|
|
||||||
0.0276550813773432047594539,
|
|
||||||
0.00240625974424309709745382,
|
|
||||||
0.891811817251336577241006e-4,
|
|
||||||
-0.465528836283382684461025e-11,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [5.25, 8].
|
|
||||||
const ERF_IMPL_GN: &[f64] = &[
|
|
||||||
-0.00270722535905778347999196,
|
|
||||||
0.0013187563425029400461378,
|
|
||||||
0.00119925933261002333923989,
|
|
||||||
0.00027849619811344664248235,
|
|
||||||
0.267822988218331849989363e-4,
|
|
||||||
0.923043672315028197865066e-6,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [5.25, 8].
|
|
||||||
const ERF_IMPL_GD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
0.814632808543141591118279,
|
|
||||||
0.268901665856299542168425,
|
|
||||||
0.0449877216103041118694989,
|
|
||||||
0.00381759663320248459168994,
|
|
||||||
0.000131571897888596914350697,
|
|
||||||
0.404815359675764138445257e-11,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [8, 11.5].
|
|
||||||
const ERF_IMPL_HN: &[f64] = &[
|
|
||||||
-0.00109946720691742196814323,
|
|
||||||
0.000406425442750422675169153,
|
|
||||||
0.000274499489416900707787024,
|
|
||||||
0.465293770646659383436343e-4,
|
|
||||||
0.320955425395767463401993e-5,
|
|
||||||
0.778286018145020892261936e-7,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [8, 11.5].
|
|
||||||
const ERF_IMPL_HD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
0.588173710611846046373373,
|
|
||||||
0.139363331289409746077541,
|
|
||||||
0.0166329340417083678763028,
|
|
||||||
0.00100023921310234908642639,
|
|
||||||
0.24254837521587225125068e-4,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [11.5, 17].
|
|
||||||
const ERF_IMPL_IN: &[f64] = &[
|
|
||||||
-0.00056907993601094962855594,
|
|
||||||
0.000169498540373762264416984,
|
|
||||||
0.518472354581100890120501e-4,
|
|
||||||
0.382819312231928859704678e-5,
|
|
||||||
0.824989931281894431781794e-7,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [11.5, 17].
|
|
||||||
const ERF_IMPL_ID: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
0.339637250051139347430323,
|
|
||||||
0.043472647870310663055044,
|
|
||||||
0.00248549335224637114641629,
|
|
||||||
0.535633305337152900549536e-4,
|
|
||||||
-0.117490944405459578783846e-12,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [17, 24].
|
|
||||||
const ERF_IMPL_JN: &[f64] = &[
|
|
||||||
-0.000241313599483991337479091,
|
|
||||||
0.574224975202501512365975e-4,
|
|
||||||
0.115998962927383778460557e-4,
|
|
||||||
0.581762134402593739370875e-6,
|
|
||||||
0.853971555085673614607418e-8,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [17, 24].
|
|
||||||
const ERF_IMPL_JD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
0.233044138299687841018015,
|
|
||||||
0.0204186940546440312625597,
|
|
||||||
0.000797185647564398289151125,
|
|
||||||
0.117019281670172327758019e-4,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [24, 38].
|
|
||||||
const ERF_IMPL_KN: &[f64] = &[
|
|
||||||
-0.000146674699277760365803642,
|
|
||||||
0.162666552112280519955647e-4,
|
|
||||||
0.269116248509165239294897e-5,
|
|
||||||
0.979584479468091935086972e-7,
|
|
||||||
0.101994647625723465722285e-8,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [24, 38].
|
|
||||||
const ERF_IMPL_KD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
0.165907812944847226546036,
|
|
||||||
0.0103361716191505884359634,
|
|
||||||
0.000286593026373868366935721,
|
|
||||||
0.298401570840900340874568e-5,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [38, 60].
|
|
||||||
const ERF_IMPL_LN: &[f64] = &[
|
|
||||||
-0.583905797629771786720406e-4,
|
|
||||||
0.412510325105496173512992e-5,
|
|
||||||
0.431790922420250949096906e-6,
|
|
||||||
0.993365155590013193345569e-8,
|
|
||||||
0.653480510020104699270084e-10,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [38, 60].
|
|
||||||
const ERF_IMPL_LD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
0.105077086072039915406159,
|
|
||||||
0.00414278428675475620830226,
|
|
||||||
0.726338754644523769144108e-4,
|
|
||||||
0.477818471047398785369849e-6,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [60, 85].
|
|
||||||
const ERF_IMPL_MN: &[f64] = &[
|
|
||||||
-0.196457797609229579459841e-4,
|
|
||||||
0.157243887666800692441195e-5,
|
|
||||||
0.543902511192700878690335e-7,
|
|
||||||
0.317472492369117710852685e-9,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [60, 85].
|
|
||||||
const ERF_IMPL_MD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
0.052803989240957632204885,
|
|
||||||
0.000926876069151753290378112,
|
|
||||||
0.541011723226630257077328e-5,
|
|
||||||
0.535093845803642394908747e-15,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator in `erf_impl`
|
|
||||||
/// in the interval [85, 110].
|
|
||||||
const ERF_IMPL_NN: &[f64] = &[
|
|
||||||
-0.789224703978722689089794e-5,
|
|
||||||
0.622088451660986955124162e-6,
|
|
||||||
0.145728445676882396797184e-7,
|
|
||||||
0.603715505542715364529243e-10,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator in `erf_impl`
|
|
||||||
/// in the interval [85, 110].
|
|
||||||
const ERF_IMPL_ND: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
0.0375328846356293715248719,
|
|
||||||
0.000467919535974625308126054,
|
|
||||||
0.193847039275845656900547e-5,
|
|
||||||
];
|
|
||||||
|
|
||||||
// **********************************************************
|
|
||||||
// ********** Coefficients for erf_inv_impl polynomial ******
|
|
||||||
// **********************************************************
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator of `erf_inv_impl`
|
|
||||||
/// in the interval [0, 0.5].
|
|
||||||
const ERF_INV_IMPL_AN: &[f64] = &[
|
|
||||||
-0.000508781949658280665617,
|
|
||||||
-0.00836874819741736770379,
|
|
||||||
0.0334806625409744615033,
|
|
||||||
-0.0126926147662974029034,
|
|
||||||
-0.0365637971411762664006,
|
|
||||||
0.0219878681111168899165,
|
|
||||||
0.00822687874676915743155,
|
|
||||||
-0.00538772965071242932965,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator of `erf_inv_impl`
|
|
||||||
/// in the interval [0, 0.5].
|
|
||||||
const ERF_INV_IMPL_AD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
-0.970005043303290640362,
|
|
||||||
-1.56574558234175846809,
|
|
||||||
1.56221558398423026363,
|
|
||||||
0.662328840472002992063,
|
|
||||||
-0.71228902341542847553,
|
|
||||||
-0.0527396382340099713954,
|
|
||||||
0.0795283687341571680018,
|
|
||||||
-0.00233393759374190016776,
|
|
||||||
0.000886216390456424707504,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.5, 0.75].
|
|
||||||
const ERF_INV_IMPL_BN: &[f64] = &[
|
|
||||||
-0.202433508355938759655,
|
|
||||||
0.105264680699391713268,
|
|
||||||
8.37050328343119927838,
|
|
||||||
17.6447298408374015486,
|
|
||||||
-18.8510648058714251895,
|
|
||||||
-44.6382324441786960818,
|
|
||||||
17.445385985570866523,
|
|
||||||
21.1294655448340526258,
|
|
||||||
-3.67192254707729348546,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.5, 0.75].
|
|
||||||
const ERF_INV_IMPL_BD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
6.24264124854247537712,
|
|
||||||
3.9713437953343869095,
|
|
||||||
-28.6608180499800029974,
|
|
||||||
-20.1432634680485188801,
|
|
||||||
48.5609213108739935468,
|
|
||||||
10.8268667355460159008,
|
|
||||||
-22.6436933413139721736,
|
|
||||||
1.72114765761200282724,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.75, 1] with x less than 3.
|
|
||||||
const ERF_INV_IMPL_CN: &[f64] = &[
|
|
||||||
-0.131102781679951906451,
|
|
||||||
-0.163794047193317060787,
|
|
||||||
0.117030156341995252019,
|
|
||||||
0.387079738972604337464,
|
|
||||||
0.337785538912035898924,
|
|
||||||
0.142869534408157156766,
|
|
||||||
0.0290157910005329060432,
|
|
||||||
0.00214558995388805277169,
|
|
||||||
-0.679465575181126350155e-6,
|
|
||||||
0.285225331782217055858e-7,
|
|
||||||
-0.681149956853776992068e-9,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.75, 1] with x less than 3.
|
|
||||||
const ERF_INV_IMPL_CD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
3.46625407242567245975,
|
|
||||||
5.38168345707006855425,
|
|
||||||
4.77846592945843778382,
|
|
||||||
2.59301921623620271374,
|
|
||||||
0.848854343457902036425,
|
|
||||||
0.152264338295331783612,
|
|
||||||
0.01105924229346489121,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.75, 1] with x between 3 and 6.
|
|
||||||
const ERF_INV_IMPL_DN: &[f64] = &[
|
|
||||||
-0.0350353787183177984712,
|
|
||||||
-0.00222426529213447927281,
|
|
||||||
0.0185573306514231072324,
|
|
||||||
0.00950804701325919603619,
|
|
||||||
0.00187123492819559223345,
|
|
||||||
0.000157544617424960554631,
|
|
||||||
0.460469890584317994083e-5,
|
|
||||||
-0.230404776911882601748e-9,
|
|
||||||
0.266339227425782031962e-11,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.75, 1] with x between 3 and 6.
|
|
||||||
const ERF_INV_IMPL_DD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
1.3653349817554063097,
|
|
||||||
0.762059164553623404043,
|
|
||||||
0.220091105764131249824,
|
|
||||||
0.0341589143670947727934,
|
|
||||||
0.00263861676657015992959,
|
|
||||||
0.764675292302794483503e-4,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.75, 1] with x between 6 and 18.
|
|
||||||
const ERF_INV_IMPL_EN: &[f64] = &[
|
|
||||||
-0.0167431005076633737133,
|
|
||||||
-0.00112951438745580278863,
|
|
||||||
0.00105628862152492910091,
|
|
||||||
0.000209386317487588078668,
|
|
||||||
0.149624783758342370182e-4,
|
|
||||||
0.449696789927706453732e-6,
|
|
||||||
0.462596163522878599135e-8,
|
|
||||||
-0.281128735628831791805e-13,
|
|
||||||
0.99055709973310326855e-16,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.75, 1] with x between 6 and 18.
|
|
||||||
const ERF_INV_IMPL_ED: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
0.591429344886417493481,
|
|
||||||
0.138151865749083321638,
|
|
||||||
0.0160746087093676504695,
|
|
||||||
0.000964011807005165528527,
|
|
||||||
0.275335474764726041141e-4,
|
|
||||||
0.282243172016108031869e-6,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.75, 1] with x between 18 and 44.
|
|
||||||
const ERF_INV_IMPL_FN: &[f64] = &[
|
|
||||||
-0.0024978212791898131227,
|
|
||||||
-0.779190719229053954292e-5,
|
|
||||||
0.254723037413027451751e-4,
|
|
||||||
0.162397777342510920873e-5,
|
|
||||||
0.396341011304801168516e-7,
|
|
||||||
0.411632831190944208473e-9,
|
|
||||||
0.145596286718675035587e-11,
|
|
||||||
-0.116765012397184275695e-17,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.75, 1] with x between 18 and 44.
|
|
||||||
const ERF_INV_IMPL_FD: &[f64] = &[
|
|
||||||
1.0,
|
|
||||||
0.207123112214422517181,
|
|
||||||
0.0169410838120975906478,
|
|
||||||
0.000690538265622684595676,
|
|
||||||
0.145007359818232637924e-4,
|
|
||||||
0.144437756628144157666e-6,
|
|
||||||
0.509761276599778486139e-9,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a numerator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.75, 1] with x greater than 44.
|
|
||||||
const ERF_INV_IMPL_GN: &[f64] = &[
|
|
||||||
-0.000539042911019078575891,
|
|
||||||
-0.28398759004727721098e-6,
|
|
||||||
0.899465114892291446442e-6,
|
|
||||||
0.229345859265920864296e-7,
|
|
||||||
0.225561444863500149219e-9,
|
|
||||||
0.947846627503022684216e-12,
|
|
||||||
0.135880130108924861008e-14,
|
|
||||||
-0.348890393399948882918e-21,
|
|
||||||
];
|
|
||||||
|
|
||||||
/// Polynomial coefficients for a denominator of `erf_inv_impl`
|
|
||||||
/// in the interval [0.75, 1] with x greater than 44.
const ERF_INV_IMPL_GD: &[f64] = &[
    1.0,
    0.0845746234001899436914,
    0.00282092984726264681981,
    0.468292921940894236786e-4,
    0.399968812193862100054e-6,
    0.161809290887904476097e-8,
    0.231558608310259605225e-11,
];

/// `erf_impl` computes the error function at `z`.
/// If `inv` is true, `1 - erf` is calculated as opposed to `erf`.
fn erf_impl(z: f64, inv: bool) -> f64 {
    if z < 0.0 {
        if !inv {
            return -erf_impl(-z, false);
        }
        if z < -0.5 {
            return 2.0 - erf_impl(-z, true);
        }
        return 1.0 + erf_impl(-z, false);
    }

    let result = if z < 0.5 {
        if z < 1e-10 {
            z * 1.125 + z * 0.003379167095512573896158903121545171688
        } else {
            z * 1.125
                + z * evaluate::polynomial(z, ERF_IMPL_AN) / evaluate::polynomial(z, ERF_IMPL_AD)
        }
    } else if z < 110.0 {
        let (r, b) = if z < 0.75 {
            (
                evaluate::polynomial(z - 0.5, ERF_IMPL_BN)
                    / evaluate::polynomial(z - 0.5, ERF_IMPL_BD),
                0.3440242112,
            )
        } else if z < 1.25 {
            (
                evaluate::polynomial(z - 0.75, ERF_IMPL_CN)
                    / evaluate::polynomial(z - 0.75, ERF_IMPL_CD),
                0.419990927,
            )
        } else if z < 2.25 {
            (
                evaluate::polynomial(z - 1.25, ERF_IMPL_DN)
                    / evaluate::polynomial(z - 1.25, ERF_IMPL_DD),
                0.4898625016,
            )
        } else if z < 3.5 {
            (
                evaluate::polynomial(z - 2.25, ERF_IMPL_EN)
                    / evaluate::polynomial(z - 2.25, ERF_IMPL_ED),
                0.5317370892,
            )
        } else if z < 5.25 {
            (
                evaluate::polynomial(z - 3.5, ERF_IMPL_FN)
                    / evaluate::polynomial(z - 3.5, ERF_IMPL_FD),
                0.5489973426,
            )
        } else if z < 8.0 {
            (
                evaluate::polynomial(z - 5.25, ERF_IMPL_GN)
                    / evaluate::polynomial(z - 5.25, ERF_IMPL_GD),
                0.5571740866,
            )
        } else if z < 11.5 {
            (
                evaluate::polynomial(z - 8.0, ERF_IMPL_HN)
                    / evaluate::polynomial(z - 8.0, ERF_IMPL_HD),
                0.5609807968,
            )
        } else if z < 17.0 {
            (
                evaluate::polynomial(z - 11.5, ERF_IMPL_IN)
                    / evaluate::polynomial(z - 11.5, ERF_IMPL_ID),
                0.5626493692,
            )
        } else if z < 24.0 {
            (
                evaluate::polynomial(z - 17.0, ERF_IMPL_JN)
                    / evaluate::polynomial(z - 17.0, ERF_IMPL_JD),
                0.5634598136,
            )
        } else if z < 38.0 {
            (
                evaluate::polynomial(z - 24.0, ERF_IMPL_KN)
                    / evaluate::polynomial(z - 24.0, ERF_IMPL_KD),
                0.5638477802,
            )
        } else if z < 60.0 {
            (
                evaluate::polynomial(z - 38.0, ERF_IMPL_LN)
                    / evaluate::polynomial(z - 38.0, ERF_IMPL_LD),
                0.5640528202,
            )
        } else if z < 85.0 {
            (
                evaluate::polynomial(z - 60.0, ERF_IMPL_MN)
                    / evaluate::polynomial(z - 60.0, ERF_IMPL_MD),
                0.5641309023,
            )
        } else {
            (
                evaluate::polynomial(z - 85.0, ERF_IMPL_NN)
                    / evaluate::polynomial(z - 85.0, ERF_IMPL_ND),
                0.5641584396,
            )
        };
        let g = (-z * z).exp() / z;
        g * b + g * r
    } else {
        0.0
    };

    if inv && z >= 0.5 {
        result
    } else if z >= 0.5 || inv {
        1.0 - result
    } else {
        result
    }
}

// `erf_inv_impl` computes the inverse error function where
// `p`, `q`, and `s` are the first, second, and third intermediate
// parameters respectively.
fn erf_inv_impl(p: f64, q: f64, s: f64) -> f64 {
    let result = if p <= 0.5 {
        let y = 0.0891314744949340820313;
        let g = p * (p + 10.0);
        let r = evaluate::polynomial(p, ERF_INV_IMPL_AN) / evaluate::polynomial(p, ERF_INV_IMPL_AD);
        g * y + g * r
    } else if q >= 0.25 {
        let y = 2.249481201171875;
        let g = (-2.0 * q.ln()).sqrt();
        let xs = q - 0.25;
        let r =
            evaluate::polynomial(xs, ERF_INV_IMPL_BN) / evaluate::polynomial(xs, ERF_INV_IMPL_BD);
        g / (y + r)
    } else {
        let x = (-q.ln()).sqrt();
        if x < 3.0 {
            let y = 0.807220458984375;
            let xs = x - 1.125;
            let r = evaluate::polynomial(xs, ERF_INV_IMPL_CN)
                / evaluate::polynomial(xs, ERF_INV_IMPL_CD);
            y * x + r * x
        } else if x < 6.0 {
            let y = 0.93995571136474609375;
            let xs = x - 3.0;
            let r = evaluate::polynomial(xs, ERF_INV_IMPL_DN)
                / evaluate::polynomial(xs, ERF_INV_IMPL_DD);
            y * x + r * x
        } else if x < 18.0 {
            let y = 0.98362827301025390625;
            let xs = x - 6.0;
            let r = evaluate::polynomial(xs, ERF_INV_IMPL_EN)
                / evaluate::polynomial(xs, ERF_INV_IMPL_ED);
            y * x + r * x
        } else if x < 44.0 {
            let y = 0.99714565277099609375;
            let xs = x - 18.0;
            let r = evaluate::polynomial(xs, ERF_INV_IMPL_FN)
                / evaluate::polynomial(xs, ERF_INV_IMPL_FD);
            y * x + r * x
        } else {
            let y = 0.99941349029541015625;
            let xs = x - 44.0;
            let r = evaluate::polynomial(xs, ERF_INV_IMPL_GN)
                / evaluate::polynomial(xs, ERF_INV_IMPL_GD);
            y * x + r * x
        }
    };
    s * result
}
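// Editor's sketch (not part of the diff): how a public inverse-erf wrapper
// would typically drive `erf_inv_impl` above, passing p, q = 1 - p and a
// sign `s`. The wrapper name and the exact boundary handling here are
// assumptions, not the crate's confirmed API.
#[allow(dead_code)]
fn erf_inv_sketch(x: f64) -> f64 {
    if x == 0.0 {
        0.0
    } else if x >= 1.0 {
        f64::INFINITY
    } else if x <= -1.0 {
        f64::NEG_INFINITY
    } else if x < 0.0 {
        // Negative inputs are mirrored through the sign parameter `s`.
        erf_inv_impl(-x, 1.0 + x, -1.0)
    } else {
        erf_inv_impl(x, 1.0 - x, 1.0)
    }
}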
@ -1,9 +1,5 @@
//! Traits and methods for CPU-backed Tensors

pub mod erf;
pub mod kernels;

#[allow(unused)]
trait Cpu<const ARR: usize> {
    type Unit;
    type Array;
@ -21,7 +17,6 @@ trait Cpu<const ARR: usize> {
    unsafe fn vec_store(mem_addr: *mut f32, a: Self::Unit);
}

#[allow(unused)]
trait CpuF16<const ARR: usize> {
    type Unit;
    type Array;
File diff suppressed because it is too large
@ -1,360 +0,0 @@
/// Helper functions to write CPU kernels.
use crate::backend::BackendStorage;
use crate::{Error, Layout, Result, WithDType};

type C = super::CpuStorage;
pub trait Map1 {
    fn f<T: WithDType>(&self, vs: &[T], layout: &Layout) -> Result<Vec<T>>;

    fn map(&self, vs: &C, layout: &Layout) -> Result<C> {
        match vs {
            C::U8(vs) => Ok(C::U8(self.f(vs, layout)?)),
            C::U32(vs) => Ok(C::U32(self.f(vs, layout)?)),
            C::I64(vs) => Ok(C::I64(self.f(vs, layout)?)),
            C::BF16(vs) => Ok(C::BF16(self.f(vs, layout)?)),
            C::F16(vs) => Ok(C::F16(self.f(vs, layout)?)),
            C::F32(vs) => Ok(C::F32(self.f(vs, layout)?)),
            C::F64(vs) => Ok(C::F64(self.f(vs, layout)?)),
        }
    }
}

pub trait Map1Any {
    fn f<T: WithDType, W: Fn(Vec<T>) -> C>(&self, vs: &[T], layout: &Layout, wrap: W) -> Result<C>;

    fn map(&self, vs: &C, layout: &Layout) -> Result<C> {
        match vs {
            C::U8(vs) => Ok(self.f(vs, layout, C::U8)?),
            C::U32(vs) => Ok(self.f(vs, layout, C::U32)?),
            C::I64(vs) => Ok(self.f(vs, layout, C::I64)?),
            C::BF16(vs) => Ok(self.f(vs, layout, C::BF16)?),
            C::F16(vs) => Ok(self.f(vs, layout, C::F16)?),
            C::F32(vs) => Ok(self.f(vs, layout, C::F32)?),
            C::F64(vs) => Ok(self.f(vs, layout, C::F64)?),
        }
    }
}

pub trait Map2 {
    const OP: &'static str;
    fn f<T: WithDType>(&self, v1: &[T], l1: &Layout, v2: &[T], l2: &Layout) -> Result<Vec<T>>;

    fn map(&self, v1: &C, l1: &Layout, v2: &C, l2: &Layout) -> Result<C> {
        match (v1, v2) {
            (C::U8(v1), C::U8(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
            (C::U32(v1), C::U32(v2)) => Ok(C::U32(self.f(v1, l1, v2, l2)?)),
            (C::I64(v1), C::I64(v2)) => Ok(C::I64(self.f(v1, l1, v2, l2)?)),
            (C::BF16(v1), C::BF16(v2)) => Ok(C::BF16(self.f(v1, l1, v2, l2)?)),
            (C::F16(v1), C::F16(v2)) => Ok(C::F16(self.f(v1, l1, v2, l2)?)),
            (C::F32(v1), C::F32(v2)) => Ok(C::F32(self.f(v1, l1, v2, l2)?)),
            (C::F64(v1), C::F64(v2)) => Ok(C::F64(self.f(v1, l1, v2, l2)?)),
            _ => Err(Error::DTypeMismatchBinaryOp {
                lhs: v1.dtype(),
                rhs: v2.dtype(),
                op: Self::OP,
            }
            .bt()),
        }
    }
}

pub trait Map2U8 {
    const OP: &'static str;
    fn f<T: WithDType>(&self, v1: &[T], l1: &Layout, v2: &[T], l2: &Layout) -> Result<Vec<u8>>;

    fn map(&self, v1: &C, l1: &Layout, v2: &C, l2: &Layout) -> Result<C> {
        match (v1, v2) {
            (C::U8(v1), C::U8(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
            (C::U32(v1), C::U32(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
            (C::I64(v1), C::I64(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
            (C::BF16(v1), C::BF16(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
            (C::F16(v1), C::F16(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
            (C::F32(v1), C::F32(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
            (C::F64(v1), C::F64(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)),
            _ => Err(Error::DTypeMismatchBinaryOp {
                lhs: v1.dtype(),
                rhs: v2.dtype(),
                op: Self::OP,
            }
            .bt()),
        }
    }
}

pub fn binary_map<T: Copy, U: Copy, F: FnMut(T, T) -> U>(
    lhs_l: &Layout,
    rhs_l: &Layout,
    lhs: &[T],
    rhs: &[T],
    mut f: F,
) -> Vec<U> {
    match (lhs_l.contiguous_offsets(), rhs_l.contiguous_offsets()) {
        (Some((o_l1, o_l2)), Some((o_r1, o_r2))) => lhs[o_l1..o_l2]
            .iter()
            .zip(rhs[o_r1..o_r2].iter())
            .map(|(&l, &r)| f(l, r))
            .collect(),
        (Some((o_l1, o_l2)), None) => {
            // TODO: Maybe we want to avoid going through the layout twice.
            match rhs_l.offsets_b() {
                Some(ob) => {
                    let mut i_in_block = 0;
                    let mut i_right_broadcast = 0;
                    lhs[o_l1..o_l2]
                        .iter()
                        .map(|&l| {
                            let r = unsafe { rhs.get_unchecked(i_in_block + ob.start) };
                            i_right_broadcast += 1;
                            if i_right_broadcast >= ob.right_broadcast {
                                i_in_block += 1;
                                i_right_broadcast = 0;
                            }
                            if i_in_block >= ob.len {
                                i_in_block = 0
                            }
                            f(l, *r)
                        })
                        .collect()
                }
                None => lhs_l
                    .strided_index()
                    .zip(rhs_l.strided_index())
                    .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
                    .collect(),
            }
        }
        (None, Some((o_r1, o_r2))) => {
            // TODO: Maybe we want to avoid going through the layout twice.
            match lhs_l.offsets_b() {
                Some(ob) => {
                    let mut i_in_block = 0;
                    let mut i_right_broadcast = 0;
                    rhs[o_r1..o_r2]
                        .iter()
                        .map(|&r| {
                            let l = unsafe { lhs.get_unchecked(i_in_block + ob.start) };
                            i_right_broadcast += 1;
                            if i_right_broadcast >= ob.right_broadcast {
                                i_in_block += 1;
                                i_right_broadcast = 0;
                            }
                            if i_in_block >= ob.len {
                                i_in_block = 0
                            }
                            f(*l, r)
                        })
                        .collect()
                }
                None => lhs_l
                    .strided_index()
                    .zip(rhs_l.strided_index())
                    .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
                    .collect(),
            }
        }
        _ => lhs_l
            .strided_index()
            .zip(rhs_l.strided_index())
            .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
            .collect(),
    }
}

// Similar to binary_map but with vectorized variants.
pub fn binary_map_vec<T: Copy, F: FnMut(T, T) -> T, FV: FnMut(&[T], &[T], &mut [T])>(
    lhs_l: &Layout,
    rhs_l: &Layout,
    lhs: &[T],
    rhs: &[T],
    mut f: F,
    mut f_vec: FV,
) -> Vec<T> {
    let el_count = lhs_l.shape().elem_count();
    match (lhs_l.contiguous_offsets(), rhs_l.contiguous_offsets()) {
        (Some((o_l1, o_l2)), Some((o_r1, o_r2))) => {
            let mut ys: Vec<T> = Vec::with_capacity(el_count);
            let ys_to_set = ys.spare_capacity_mut();
            let ys_to_set = unsafe {
                std::mem::transmute::<&mut [std::mem::MaybeUninit<T>], &mut [T]>(ys_to_set)
            };
            f_vec(&lhs[o_l1..o_l2], &rhs[o_r1..o_r2], ys_to_set);
            // SAFETY: values are all set by f_vec.
            unsafe { ys.set_len(el_count) };
            ys
        }
        (Some((o_l1, o_l2)), None) => match rhs_l.offsets_b() {
            Some(ob) if ob.right_broadcast == 1 => {
                let rhs = &rhs[ob.start..ob.start + ob.len];
                let mut ys: Vec<T> = Vec::with_capacity(el_count);
                let ys_to_set = ys.spare_capacity_mut();
                let ys_to_set = unsafe {
                    std::mem::transmute::<&mut [std::mem::MaybeUninit<T>], &mut [T]>(ys_to_set)
                };
                let mut dst_i = 0;
                for src_i in (o_l1..o_l2).step_by(ob.len) {
                    f_vec(
                        &lhs[src_i..src_i + ob.len],
                        rhs,
                        &mut ys_to_set[dst_i..dst_i + ob.len],
                    );
                    dst_i += ob.len;
                }
                // SAFETY: values are all set by f_vec.
                unsafe { ys.set_len(el_count) };
                ys
            }
            Some(ob) => {
                let rhs = &rhs[ob.start..ob.start + ob.len];
                let mut ys = lhs[o_l1..o_l2].to_vec();
                for idx_l in 0..ob.left_broadcast {
                    let start = idx_l * ob.len * ob.right_broadcast;
                    for (i, &r) in rhs.iter().enumerate() {
                        let start = start + i * ob.right_broadcast;
                        for v in ys[start..start + ob.right_broadcast].iter_mut() {
                            *v = f(*v, r)
                        }
                    }
                }
                ys
            }
            None => lhs_l
                .strided_index()
                .zip(rhs_l.strided_index())
                .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
                .collect(),
        },
        (None, Some((o_r1, o_r2))) => match lhs_l.offsets_b() {
            Some(ob) if ob.right_broadcast == 1 => {
                let lhs = &lhs[ob.start..ob.start + ob.len];
                let mut ys: Vec<T> = Vec::with_capacity(el_count);
                let ys_to_set = ys.spare_capacity_mut();
                let ys_to_set = unsafe {
                    std::mem::transmute::<&mut [std::mem::MaybeUninit<T>], &mut [T]>(ys_to_set)
                };
                let mut dst_i = 0;
                for src_i in (o_r1..o_r2).step_by(ob.len) {
                    f_vec(
                        lhs,
                        &rhs[src_i..src_i + ob.len],
                        &mut ys_to_set[dst_i..dst_i + ob.len],
                    );
                    dst_i += ob.len;
                }
                // SAFETY: values are all set by f_vec.
                unsafe { ys.set_len(el_count) };
                ys
            }
            Some(ob) => {
                let lhs = &lhs[ob.start..ob.start + ob.len];
                let mut ys = rhs[o_r1..o_r2].to_vec();
                for idx_l in 0..ob.left_broadcast {
                    let start = idx_l * ob.len * ob.right_broadcast;
                    for (i, &l) in lhs.iter().enumerate() {
                        let start = start + i * ob.right_broadcast;
                        for v in ys[start..start + ob.right_broadcast].iter_mut() {
                            *v = f(l, *v)
                        }
                    }
                }
                ys
            }
            None => lhs_l
                .strided_index()
                .zip(rhs_l.strided_index())
                .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
                .collect(),
        },
        _ => lhs_l
            .strided_index()
            .zip(rhs_l.strided_index())
            .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
            .collect(),
    }
}

pub fn unary_map<T: Copy, U: Copy, F: FnMut(T) -> U>(
    vs: &[T],
    layout: &Layout,
    mut f: F,
) -> Vec<U> {
    match layout.strided_blocks() {
        crate::StridedBlocks::SingleBlock { start_offset, len } => vs
            [start_offset..start_offset + len]
            .iter()
            .map(|&v| f(v))
            .collect(),
        crate::StridedBlocks::MultipleBlocks {
            block_start_index,
            block_len,
        } => {
            let mut result = Vec::with_capacity(layout.shape().elem_count());
            // Specialize the case where block_len is one to avoid the second loop.
            if block_len == 1 {
                for index in block_start_index {
                    let v = unsafe { vs.get_unchecked(index) };
                    result.push(f(*v))
                }
            } else {
                for index in block_start_index {
                    for offset in 0..block_len {
                        let v = unsafe { vs.get_unchecked(index + offset) };
                        result.push(f(*v))
                    }
                }
            }
            result
        }
    }
}

pub fn unary_map_vec<T: Copy, U: Copy, F: FnMut(T) -> U, FV: FnMut(&[T], &mut [U])>(
    vs: &[T],
    layout: &Layout,
    mut f: F,
    mut f_vec: FV,
) -> Vec<U> {
    match layout.strided_blocks() {
        crate::StridedBlocks::SingleBlock { start_offset, len } => {
            let mut ys: Vec<U> = Vec::with_capacity(len);
            let ys_to_set = ys.spare_capacity_mut();
            let ys_to_set = unsafe {
                std::mem::transmute::<&mut [std::mem::MaybeUninit<U>], &mut [U]>(ys_to_set)
            };
            f_vec(&vs[start_offset..start_offset + len], ys_to_set);
            // SAFETY: values are all set by f_vec.
            unsafe { ys.set_len(len) };
            ys
        }
        crate::StridedBlocks::MultipleBlocks {
            block_start_index,
            block_len,
        } => {
            let el_count = layout.shape().elem_count();
            // Specialize the case where block_len is one to avoid the second loop.
            if block_len == 1 {
                let mut result = Vec::with_capacity(el_count);
                for index in block_start_index {
                    let v = unsafe { vs.get_unchecked(index) };
                    result.push(f(*v))
                }
                result
            } else {
                let mut ys: Vec<U> = Vec::with_capacity(el_count);
                let ys_to_set = ys.spare_capacity_mut();
                let ys_to_set = unsafe {
                    std::mem::transmute::<&mut [std::mem::MaybeUninit<U>], &mut [U]>(ys_to_set)
                };
                let mut dst_index = 0;
                for src_index in block_start_index {
                    let vs = &vs[src_index..src_index + block_len];
                    let ys = &mut ys_to_set[dst_index..dst_index + block_len];
                    f_vec(vs, ys);
                    dst_index += block_len;
                }
                // SAFETY: values are all set by f_vec.
                unsafe { ys.set_len(el_count) };
                ys
            }
        }
    }
}
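// Editor's sketch (not part of the diff): `binary_map` above applied to an
// element-wise f32 add over two same-shape contiguous buffers. The helper
// itself is illustrative; `Layout::contiguous` is used elsewhere in the crate.
#[allow(dead_code)]
fn add_f32_sketch(lhs: &[f32], rhs: &[f32], shape: &crate::Shape) -> Vec<f32> {
    let layout = Layout::contiguous(shape);
    binary_map(&layout, &layout, lhs, rhs, |a, b| a + b)
}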
1989 candle-core/src/cuda_backend.rs Normal file
File diff suppressed because it is too large
@ -1,225 +0,0 @@
use crate::WithDType;
use cudarc;
use cudarc::cudnn::safe::{ConvForward, Cudnn};
use cudarc::driver::{CudaSlice, CudaView, DeviceRepr, ValidAsZeroBits};
use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::Arc;

// The cudnn handles are stored per thread here rather than on the CudaDevice as they are neither
// send nor sync.
thread_local! {
    static CUDNN: RefCell<HashMap<crate::cuda_backend::DeviceId, Arc<Cudnn>>> = HashMap::new().into();
}

impl From<cudarc::cudnn::CudnnError> for crate::Error {
    fn from(err: cudarc::cudnn::CudnnError) -> Self {
        crate::Error::wrap(err)
    }
}

impl From<cudarc::driver::DriverError> for crate::Error {
    fn from(err: cudarc::driver::DriverError) -> Self {
        crate::Error::wrap(err)
    }
}

pub(crate) fn launch_conv2d<
    T: DeviceRepr + WithDType + ValidAsZeroBits + cudarc::cudnn::CudnnDataType,
    Y: cudarc::cudnn::CudnnDataType,
>(
    src: &CudaView<T>,
    src_l: &crate::Layout,
    filter: &CudaView<T>,
    dst: &mut CudaSlice<T>,
    params: &crate::conv::ParamsConv2D,
    dev: &crate::cuda_backend::CudaDevice,
) -> crate::Result<()> {
    use crate::conv::CudnnFwdAlgo as CandleAlgo;
    use cudarc::cudnn::sys::cudnnConvolutionFwdAlgo_t as A;

    let device_id = dev.id();
    let cudnn = CUDNN.with(|cudnn| {
        if let Some(cudnn) = cudnn.borrow().get(&device_id) {
            return Ok(cudnn.clone());
        }
        let c = Cudnn::new(dev.cuda_stream());
        if let Ok(c) = &c {
            cudnn.borrow_mut().insert(device_id, c.clone());
        }
        c
    })?;
    let conv = cudnn.create_conv2d::<Y>(
        /* pad */ [params.padding as i32, params.padding as i32],
        /* stride */ [params.stride as i32, params.stride as i32],
        /* dilation */ [params.dilation as i32, params.dilation as i32],
        cudarc::cudnn::sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
    )?;
    let x_shape = [
        params.b_size as i32,
        params.c_in as i32,
        params.i_h as i32,
        params.i_w as i32,
    ];
    // Note that `src` already starts at the proper offset.
    let x = if src_l.is_contiguous() {
        cudnn.create_4d_tensor::<T>(
            cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
            x_shape,
        )?
    } else {
        let s = src_l.stride();
        cudnn.create_4d_tensor_ex::<T>(
            x_shape,
            [s[0] as i32, s[1] as i32, s[2] as i32, s[3] as i32],
        )?
    };
    let w = cudnn.create_4d_filter::<T>(
        cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
        [
            params.c_out as i32,
            params.c_in as i32,
            params.k_h as i32,
            params.k_w as i32,
        ],
    )?;
    let (w_out, h_out) = (params.out_w() as i32, params.out_h() as i32);
    let y = cudnn.create_4d_tensor::<T>(
        cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
        [params.b_size as i32, params.c_out as i32, h_out, w_out],
    )?;
    let conv2d = ConvForward {
        conv: &conv,
        x: &x,
        w: &w,
        y: &y,
    };
    let alg = match params.cudnn_fwd_algo {
        None => conv2d.pick_algorithm()?,
        Some(CandleAlgo::ImplicitGemm) => A::CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
        Some(CandleAlgo::ImplicitPrecompGemm) => {
            A::CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
        }
        Some(CandleAlgo::Gemm) => A::CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
        Some(CandleAlgo::Direct) => A::CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
        Some(CandleAlgo::Fft) => A::CUDNN_CONVOLUTION_FWD_ALGO_FFT,
        Some(CandleAlgo::FftTiling) => A::CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
        Some(CandleAlgo::Winograd) => A::CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
        Some(CandleAlgo::WinogradNonFused) => A::CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,
        Some(CandleAlgo::Count) => A::CUDNN_CONVOLUTION_FWD_ALGO_COUNT,
    };
    let workspace_size = conv2d.get_workspace_size(alg)?;
    let mut workspace = dev.cuda_stream().alloc_zeros::<u8>(workspace_size)?;
    unsafe {
        conv2d.launch::<CudaSlice<u8>, _, _, _>(
            alg,
            Some(&mut workspace),
            (T::one(), T::zero()),
            src,
            filter,
            dst,
        )?;
    }
    Ok(())
}

pub(crate) fn launch_conv1d<
    T: DeviceRepr + WithDType + ValidAsZeroBits + cudarc::cudnn::CudnnDataType,
    Y: cudarc::cudnn::CudnnDataType,
>(
    src: &CudaView<T>,
    src_l: &crate::Layout,
    filter: &CudaView<T>,
    dst: &mut CudaSlice<T>,
    params: &crate::conv::ParamsConv1D,
    dev: &crate::cuda_backend::CudaDevice,
) -> crate::Result<()> {
    use crate::conv::CudnnFwdAlgo as CandleAlgo;
    use cudarc::cudnn::sys::cudnnConvolutionFwdAlgo_t as A;

    let device_id = dev.id();
    let cudnn = CUDNN.with(|cudnn| {
        if let Some(cudnn) = cudnn.borrow().get(&device_id) {
            return Ok(cudnn.clone());
        }
        let c = Cudnn::new(dev.cuda_stream());
        if let Ok(c) = &c {
            cudnn.borrow_mut().insert(device_id, c.clone());
        }
        c
    })?;
    let conv = cudnn.create_conv2d::<Y>(
        /* pad */ [params.padding as i32, 0],
        /* stride */ [params.stride as i32, 1],
        /* dilation */ [params.dilation as i32, 1],
        cudarc::cudnn::sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
    )?;
    // https://docs.nvidia.com/deeplearning/cudnn/backend/latest/api/cudnn-ops-library.html#cudnnsettensornddescriptor
    // > Tensors are restricted to having at least 4 dimensions, and at most CUDNN_DIM_MAX
    // > dimensions (defined in cudnn.h). When working with lower dimensional data, it is
    // > recommended that the user create a 4D tensor, and set the size along unused dimensions
    // > to 1.
    let x_shape = [
        params.b_size as i32,
        params.c_in as i32,
        params.l_in as i32,
        1,
    ];
    // Note that `src` already starts at the proper offset.
    let x = if src_l.is_contiguous() {
        cudnn.create_4d_tensor::<T>(
            cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
            x_shape,
        )?
    } else {
        let s = src_l.stride();
        cudnn.create_4d_tensor_ex::<T>(x_shape, [s[0] as i32, s[1] as i32, s[2] as i32, 1i32])?
    };
    let w = cudnn.create_4d_filter::<T>(
        cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
        [
            params.c_out as i32,
            params.c_in as i32,
            params.k_size as i32,
            1,
        ],
    )?;
    let l_out = params.l_out() as i32;
    let y = cudnn.create_4d_tensor::<T>(
        cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
        [params.b_size as i32, params.c_out as i32, l_out, 1],
    )?;
    let conv1d = ConvForward {
        conv: &conv,
        x: &x,
        w: &w,
        y: &y,
    };
    let alg = match params.cudnn_fwd_algo {
        None => conv1d.pick_algorithm()?,
        Some(CandleAlgo::ImplicitGemm) => A::CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
        Some(CandleAlgo::ImplicitPrecompGemm) => {
            A::CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
        }
        Some(CandleAlgo::Gemm) => A::CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
        Some(CandleAlgo::Direct) => A::CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
        Some(CandleAlgo::Fft) => A::CUDNN_CONVOLUTION_FWD_ALGO_FFT,
        Some(CandleAlgo::FftTiling) => A::CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
        Some(CandleAlgo::Winograd) => A::CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
        Some(CandleAlgo::WinogradNonFused) => A::CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,
        Some(CandleAlgo::Count) => A::CUDNN_CONVOLUTION_FWD_ALGO_COUNT,
    };
    let workspace_size = conv1d.get_workspace_size(alg)?;
    let mut workspace = dev.cuda_stream().alloc_zeros::<u8>(workspace_size)?;
    unsafe {
        conv1d.launch::<CudaSlice<u8>, _, _, _>(
            alg,
            Some(&mut workspace),
            (T::one(), T::zero()),
            src,
            filter,
            dst,
        )?;
    }
    Ok(())
}
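// Editor's sketch (not part of the diff): the per-thread, per-device handle
// cache used by `CUDNN` above, reduced to a standalone pattern with plain
// types so the caching logic is easier to see in isolation.
thread_local! {
    static HANDLES: std::cell::RefCell<std::collections::HashMap<usize, std::sync::Arc<String>>> =
        std::cell::RefCell::new(std::collections::HashMap::new());
}

#[allow(dead_code)]
fn get_or_create_handle(device_id: usize) -> std::sync::Arc<String> {
    HANDLES.with(|h| {
        // Fast path: reuse the handle created earlier on this thread.
        if let Some(handle) = h.borrow().get(&device_id) {
            return handle.clone();
        }
        // Slow path: create, cache, and return a new handle.
        let handle = std::sync::Arc::new(format!("handle-{device_id}"));
        h.borrow_mut().insert(device_id, handle.clone());
        handle
    })
}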
@ -1,646 +0,0 @@
use crate::backend::BackendDevice;
use crate::{CpuStorage, CpuStorageRef, DType, Layout, Result, Shape};
pub use candle_kernels as kernels;
pub use cudarc;
use cudarc::driver::{CudaFunction, LaunchConfig, PushKernelArg};
use half::{bf16, f16};
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

use super::{CudaError, CudaStorage, CudaStorageSlice, WrapErr};

/// Unique identifier for cuda devices.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct DeviceId(usize);

impl DeviceId {
    fn new() -> Self {
        // https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805
        use std::sync::atomic;
        static COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::new(1);
        Self(COUNTER.fetch_add(1, atomic::Ordering::Relaxed))
    }
}

struct CudaRng(cudarc::curand::CudaRng);
unsafe impl Send for CudaRng {}

pub struct ModuleStore {
    mdls: [Option<Arc<cudarc::driver::CudaModule>>; kernels::ALL_IDS.len()],
}

#[derive(Clone)]
pub struct CudaDevice {
    id: DeviceId,
    context: Arc<cudarc::driver::CudaContext>,
    modules: Arc<std::sync::RwLock<ModuleStore>>,
    custom_modules: Arc<std::sync::RwLock<HashMap<String, Arc<cudarc::driver::CudaModule>>>>,
    stream: Arc<cudarc::driver::CudaStream>,
    pub(crate) blas: Arc<cudarc::cublas::CudaBlas>,
    curand: Arc<Mutex<CudaRng>>,
}

impl std::fmt::Debug for CudaDevice {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "CudaDevice({:?})", self.id)
    }
}

impl CudaDevice {
    #[allow(clippy::missing_safety_doc)]
    pub unsafe fn alloc<T: cudarc::driver::DeviceRepr>(
        &self,
        len: usize,
    ) -> Result<cudarc::driver::CudaSlice<T>> {
        self.stream.alloc::<T>(len).w()
    }

    pub fn alloc_zeros<T: cudarc::driver::DeviceRepr + cudarc::driver::ValidAsZeroBits>(
        &self,
        len: usize,
    ) -> Result<cudarc::driver::CudaSlice<T>> {
        self.stream.alloc_zeros::<T>(len).w()
    }

    pub fn memcpy_htod<
        T: cudarc::driver::DeviceRepr,
        Src: cudarc::driver::HostSlice<T> + ?Sized,
        Dst: cudarc::driver::DevicePtrMut<T>,
    >(
        &self,
        src: &Src,
        dst: &mut Dst,
    ) -> Result<()> {
        self.stream.memcpy_htod(src, dst).w()
    }

    pub fn memcpy_dtov<T: cudarc::driver::DeviceRepr, Src: cudarc::driver::DevicePtr<T>>(
        &self,
        src: &Src,
    ) -> Result<Vec<T>> {
        self.stream.memcpy_dtov(src).w()
    }

    pub fn memcpy_dtod<
        T,
        Src: cudarc::driver::DevicePtr<T>,
        Dst: cudarc::driver::DevicePtrMut<T>,
    >(
        &self,
        src: &Src,
        dst: &mut Dst,
    ) -> Result<()> {
        self.stream.memcpy_dtod(src, dst).w()
    }

    pub fn memcpy_stod<
        T: cudarc::driver::DeviceRepr,
        Src: cudarc::driver::HostSlice<T> + ?Sized,
    >(
        &self,
        src: &Src,
    ) -> Result<cudarc::driver::CudaSlice<T>> {
        self.stream.memcpy_stod(src).w()
    }
}

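// Editor's sketch (not part of the diff): the copy helpers above compose into
// a simple host -> device -> host round-trip; `dev` is assumed to be an
// already-constructed `CudaDevice`.
#[allow(dead_code)]
fn roundtrip_sketch(dev: &CudaDevice, host: &[f32]) -> Result<Vec<f32>> {
    let on_device = dev.memcpy_stod(host)?; // host slice -> fresh device buffer
    dev.memcpy_dtov(&on_device) // device buffer -> host Vec
}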
pub struct CudaFunc {
    func: CudaFunction,
    stream: Arc<cudarc::driver::CudaStream>,
}

impl std::ops::Deref for CudaFunc {
    type Target = CudaFunction;

    fn deref(&self) -> &Self::Target {
        &self.func
    }
}

impl CudaFunc {
    pub fn into_cuda_function(self) -> CudaFunction {
        self.func
    }
}

#[macro_export]
macro_rules! builder_arg {
    ($b:ident, $($arg:expr),*) => {
        $(
            let __arg = $arg;
            $b.arg(&__arg);
        )*
    };
}

impl CudaFunc {
    pub fn builder(&self) -> cudarc::driver::LaunchArgs<'_> {
        self.stream.launch_builder(&self.func)
    }
}

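// Editor's sketch (not part of the diff): `builder_arg!` binds each value to a
// local so that a reference with a long-enough lifetime can be handed to the
// launch builder. A launch through an already-loaded `func: CudaFunc` could
// look like this (kernel arguments and config are illustrative):
//
//     let mut builder = func.builder();
//     builder_arg!(builder, 42usize, 1.5f32);
//     unsafe { builder.launch(LaunchConfig::for_num_elems(42)) }.w()?;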
impl CudaDevice {
    pub fn cuda_stream(&self) -> Arc<cudarc::driver::CudaStream> {
        self.stream.clone()
    }

    #[cfg(not(target_arch = "wasm32"))]
    pub fn compile(
        &self,
        func_name: &'static str,
        kernel: ug::lang::ssa::Kernel,
    ) -> Result<CudaFunc> {
        let mut buf = vec![];
        ug_cuda::code_gen::gen(&mut buf, func_name, &kernel)?;
        let cuda_code = String::from_utf8(buf)?;
        let opts = cudarc::nvrtc::CompileOptions {
            use_fast_math: Some(true),
            ..Default::default()
        };
        let ptx = cudarc::nvrtc::safe::compile_ptx_with_opts(cuda_code, opts).w()?;
        let module = self.context.load_module(ptx).w()?;
        let func = module.load_function(func_name).w()?;
        Ok(CudaFunc {
            func,
            stream: self.stream.clone(),
        })
    }

    pub fn id(&self) -> DeviceId {
        self.id
    }

    fn const_impl(&self, v: f64, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
        let elem_count = shape.elem_count();
        let cfg = LaunchConfig::for_num_elems(elem_count as u32);
        let slice = match dtype {
            DType::U8 => {
                // SAFETY: Set later by running the fill kernel.
                let data = unsafe { self.alloc::<u8>(elem_count)? };
                let func = self.get_or_load_func("fill_u8", &kernels::FILL)?;
                let mut builder = self.stream.launch_builder(&func);
                let v = v as u8;
                builder.arg(&data);
                builder.arg(&v);
                builder.arg(&elem_count);
                unsafe { builder.launch(cfg) }.w()?;
                CudaStorageSlice::U8(data)
            }
            DType::U32 => {
                // SAFETY: Set later by running the fill kernel.
                let data = unsafe { self.alloc::<u32>(elem_count)? };
                let func = self.get_or_load_func("fill_u32", &kernels::FILL)?;
                let mut builder = self.stream.launch_builder(&func);
                let v = v as u32;
                builder.arg(&data);
                builder.arg(&v);
                builder.arg(&elem_count);
                unsafe { builder.launch(cfg) }.w()?;
                CudaStorageSlice::U32(data)
            }
            DType::I64 => {
                // SAFETY: Set later by running the fill kernel.
                let data = unsafe { self.alloc::<i64>(elem_count)? };
                let func = self.get_or_load_func("fill_i64", &kernels::FILL)?;
                let mut builder = self.stream.launch_builder(&func);
                let v = v as i64;
                builder.arg(&data);
                builder.arg(&v);
                builder.arg(&elem_count);
                unsafe { builder.launch(cfg) }.w()?;
                CudaStorageSlice::I64(data)
            }
            DType::BF16 => {
                // SAFETY: Set later by running the fill kernel.
                let data = unsafe { self.alloc::<bf16>(elem_count)? };
                let func = self.get_or_load_func("fill_bf16", &kernels::FILL)?;
                let mut builder = self.stream.launch_builder(&func);
                let v = bf16::from_f64(v);
                builder.arg(&data);
                builder.arg(&v);
                builder.arg(&elem_count);
                unsafe { builder.launch(cfg) }.w()?;
                CudaStorageSlice::BF16(data)
            }
            DType::F16 => {
                // SAFETY: Set later by running the fill kernel.
                let data = unsafe { self.alloc::<f16>(elem_count)? };
                let func = self.get_or_load_func("fill_f16", &kernels::FILL)?;
                let mut builder = self.stream.launch_builder(&func);
                let v = f16::from_f64(v);
                builder.arg(&data);
                builder.arg(&v);
                builder.arg(&elem_count);
                unsafe { builder.launch(cfg) }.w()?;
                CudaStorageSlice::F16(data)
            }
            DType::F32 => {
                // SAFETY: Set later by running the fill kernel.
                let data = unsafe { self.alloc::<f32>(elem_count)? };
                let func = self.get_or_load_func("fill_f32", &kernels::FILL)?;
                let mut builder = self.stream.launch_builder(&func);
                let v = v as f32;
                builder.arg(&data);
                builder.arg(&v);
                builder.arg(&elem_count);
                unsafe { builder.launch(cfg) }.w()?;
                CudaStorageSlice::F32(data)
            }
            DType::F64 => {
                // SAFETY: Set later by running the fill kernel.
                let data = unsafe { self.alloc::<f64>(elem_count) }?;
                let func = self.get_or_load_func("fill_f64", &kernels::FILL)?;
                let mut builder = self.stream.launch_builder(&func);
                builder.arg(&data);
                builder.arg(&v);
                builder.arg(&elem_count);
                unsafe { builder.launch(cfg) }.w()?;
                CudaStorageSlice::F64(data)
            }
        };
        Ok(CudaStorage {
            slice,
            device: self.clone(),
        })
    }

    pub fn get_or_load_custom_func(
        &self,
        fn_name: &str,
        module_name: &str,
        ptx: &str,
    ) -> Result<CudaFunc> {
        let ms = self.custom_modules.read().unwrap();
        if let Some(mdl) = ms.get(module_name).as_ref() {
            let func = mdl.load_function(fn_name).w()?;
            return Ok(CudaFunc {
                func,
                stream: self.stream.clone(),
            });
        }
        drop(ms);
        let mut ms = self.custom_modules.write().unwrap();
        let cuda_module = self.context.load_module(ptx.into()).w()?;
        ms.insert(module_name.to_string(), cuda_module.clone());
        let func = cuda_module.load_function(fn_name).w()?;
        Ok(CudaFunc {
            func,
            stream: self.stream.clone(),
        })
    }

    pub fn get_or_load_func(&self, fn_name: &str, mdl: &kernels::Module) -> Result<CudaFunc> {
        let ms = self.modules.read().unwrap();
        if let Some(mdl) = ms.mdls[mdl.index()].as_ref() {
            let func = mdl.load_function(fn_name).w()?;
            return Ok(CudaFunc {
                func,
                stream: self.stream.clone(),
            });
        }
        drop(ms);
        let mut ms = self.modules.write().unwrap();
        let cuda_module = self.context.load_module(mdl.ptx().into()).w()?;
        ms.mdls[mdl.index()] = Some(cuda_module.clone());
        let func = cuda_module.load_function(fn_name).w()?;
        Ok(CudaFunc {
            func,
            stream: self.stream.clone(),
        })
    }
}

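// Editor's note (not part of the diff): both loaders above use a read-lock
// fast path, then drop it and take the write lock to fill the cache. Two
// threads can race past the read check; each then loads the module and the
// later insert simply overwrites the earlier one, which is harmless here.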
impl CudaDevice {
    pub fn new_with_stream(ordinal: usize) -> Result<Self> {
        let context = cudarc::driver::CudaContext::new(ordinal).w()?;
        let stream = context.new_stream().w()?;
        let blas = cudarc::cublas::CudaBlas::new(stream.clone()).w()?;
        let curand = cudarc::curand::CudaRng::new(299792458, stream.clone()).w()?;
        let module_store = ModuleStore {
            mdls: [const { None }; kernels::ALL_IDS.len()],
        };
        Ok(Self {
            id: DeviceId::new(),
            context,
            stream,
            blas: Arc::new(blas),
            curand: Arc::new(Mutex::new(CudaRng(curand))),
            modules: Arc::new(std::sync::RwLock::new(module_store)),
            custom_modules: Arc::new(std::sync::RwLock::new(HashMap::new())),
        })
    }
}

impl BackendDevice for CudaDevice {
    type Storage = CudaStorage;

    fn new(ordinal: usize) -> Result<Self> {
        let context = cudarc::driver::CudaContext::new(ordinal).w()?;
        let stream = context.default_stream();
        let blas = cudarc::cublas::CudaBlas::new(stream.clone()).w()?;
        let curand = cudarc::curand::CudaRng::new(299792458, stream.clone()).w()?;
        let module_store = ModuleStore {
            mdls: [const { None }; kernels::ALL_IDS.len()],
        };
        Ok(Self {
            id: DeviceId::new(),
            context,
            stream,
            blas: Arc::new(blas),
            curand: Arc::new(Mutex::new(CudaRng(curand))),
            modules: Arc::new(std::sync::RwLock::new(module_store)),
            custom_modules: Arc::new(std::sync::RwLock::new(HashMap::new())),
        })
    }

    fn set_seed(&self, seed: u64) -> Result<()> {
        // We do not call set_seed but instead create a new curand object. This ensures that the
        // state will be identical and the same random numbers will be generated.
        let mut curand = self.curand.lock().unwrap();
        curand.0 = cudarc::curand::CudaRng::new(seed, self.stream.clone()).w()?;
        Ok(())
    }

    fn location(&self) -> crate::DeviceLocation {
        crate::DeviceLocation::Cuda {
            gpu_id: self.context.ordinal(),
        }
    }

    fn same_device(&self, rhs: &Self) -> bool {
        self.id == rhs.id
    }

    fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
        let elem_count = shape.elem_count();
        let slice = match dtype {
            DType::U8 => {
                let data = self.alloc_zeros::<u8>(elem_count)?;
                CudaStorageSlice::U8(data)
            }
            DType::U32 => {
                let data = self.alloc_zeros::<u32>(elem_count)?;
                CudaStorageSlice::U32(data)
            }
            DType::I64 => {
                let data = self.alloc_zeros::<i64>(elem_count)?;
                CudaStorageSlice::I64(data)
            }
            DType::BF16 => {
                let data = self.alloc_zeros::<bf16>(elem_count)?;
                CudaStorageSlice::BF16(data)
            }
            DType::F16 => {
                let data = self.alloc_zeros::<f16>(elem_count)?;
                CudaStorageSlice::F16(data)
            }
            DType::F32 => {
                let data = self.alloc_zeros::<f32>(elem_count)?;
                CudaStorageSlice::F32(data)
            }
            DType::F64 => {
                let data = self.alloc_zeros::<f64>(elem_count)?;
                CudaStorageSlice::F64(data)
            }
        };
        Ok(CudaStorage {
            slice,
            device: self.clone(),
        })
    }

    fn rand_uniform(&self, shape: &Shape, dtype: DType, lo: f64, up: f64) -> Result<CudaStorage> {
        let elem_count = shape.elem_count();
        let curand = self.curand.lock().unwrap();
        let slice = match dtype {
            // TODO: Add support for F16 and BF16 though this is likely to require some upstream
            // cudarc changes.
            DType::U8 | DType::U32 | DType::I64 | DType::F16 | DType::BF16 => {
                Err(CudaError::UnsupportedDtype {
                    dtype,
                    op: "rand_uniform",
                })
                .w()?
            }
            DType::F32 => {
                let mut data = unsafe { self.alloc::<f32>(elem_count)? };
                curand.0.fill_with_uniform(&mut data).w()?;
                CudaStorageSlice::F32(data)
            }
            DType::F64 => {
                let mut data = unsafe { self.alloc::<f64>(elem_count)? };
                curand.0.fill_with_uniform(&mut data).w()?;
                CudaStorageSlice::F64(data)
            }
        };
        let slice = if lo == 0. && up == 1.0 {
            slice
        } else {
            use super::utils::Map1;
            let layout = Layout::contiguous(shape);
            super::Affine(up - lo, lo).map(&slice, self, &layout)?
        };
        Ok(CudaStorage {
            slice,
            device: self.clone(),
        })
    }

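    // Editor's note (not part of the diff): curand fills the buffer with
    // u ~ U[0, 1); the `Affine(up - lo, lo)` pass above rescales this to
    // lo + (up - lo) * u, i.e. a uniform sample over [lo, up).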
    fn rand_normal(&self, shape: &Shape, dtype: DType, mean: f64, std: f64) -> Result<CudaStorage> {
        // TODO: Add support for F16 and BF16 though this is likely to require some upstream
        // cudarc changes.
        let elem_count = shape.elem_count();
        let curand = self.curand.lock().unwrap();
        // curand can only generate an even number of values, so odd element
        // counts are rounded up by one.
        // https://github.com/huggingface/candle/issues/734
        let elem_count_round = if elem_count % 2 == 1 {
            elem_count + 1
        } else {
            elem_count
        };
        let slice = match dtype {
            DType::U8 | DType::U32 | DType::I64 | DType::F16 | DType::BF16 => {
                Err(CudaError::UnsupportedDtype {
                    dtype,
                    op: "rand_normal",
                })
                .w()?
            }
            DType::F32 => {
                let mut data = unsafe { self.alloc::<f32>(elem_count_round)? };
                curand
                    .0
                    .fill_with_normal(&mut data, mean as f32, std as f32)
                    .w()?;
                CudaStorageSlice::F32(data)
            }
            DType::F64 => {
                let mut data = unsafe { self.alloc::<f64>(elem_count_round)? };
                curand.0.fill_with_normal(&mut data, mean, std).w()?;
                CudaStorageSlice::F64(data)
            }
        };
        Ok(CudaStorage {
            slice,
            device: self.clone(),
        })
    }

    fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
        self.const_impl(1., shape, dtype)
    }

    unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result<Self::Storage> {
        let elem_count = shape.elem_count();
        let slice = match dtype {
            DType::U8 => {
                let data = self.alloc::<u8>(elem_count)?;
                CudaStorageSlice::U8(data)
            }
            DType::U32 => {
                let data = self.alloc::<u32>(elem_count)?;
                CudaStorageSlice::U32(data)
            }
            DType::I64 => {
                let data = self.alloc::<i64>(elem_count)?;
                CudaStorageSlice::I64(data)
            }
            DType::BF16 => {
                let data = self.alloc::<bf16>(elem_count)?;
                CudaStorageSlice::BF16(data)
            }
            DType::F16 => {
                let data = self.alloc::<f16>(elem_count)?;
                CudaStorageSlice::F16(data)
            }
            DType::F32 => {
                let data = self.alloc::<f32>(elem_count)?;
                CudaStorageSlice::F32(data)
            }
            DType::F64 => {
                let data = self.alloc::<f64>(elem_count)?;
                CudaStorageSlice::F64(data)
            }
        };
        Ok(CudaStorage {
            slice,
            device: self.clone(),
        })
    }

    fn storage_from_slice<T: crate::WithDType>(&self, s: &[T]) -> Result<Self::Storage> {
        let slice = match T::cpu_storage_ref(s) {
            CpuStorageRef::U8(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::U8(data)
            }
            CpuStorageRef::U32(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::U32(data)
            }
            CpuStorageRef::I64(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::I64(data)
            }
            CpuStorageRef::BF16(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::BF16(data)
            }
            CpuStorageRef::F16(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::F16(data)
            }
            CpuStorageRef::F32(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::F32(data)
            }
            CpuStorageRef::F64(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::F64(data)
            }
        };
        Ok(CudaStorage {
            slice,
            device: self.clone(),
        })
    }

    fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<CudaStorage> {
        let slice = match storage {
            CpuStorage::U8(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::U8(data)
            }
            CpuStorage::U32(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::U32(data)
            }
            CpuStorage::I64(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::I64(data)
            }
            CpuStorage::BF16(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::BF16(data)
            }
            CpuStorage::F16(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::F16(data)
            }
            CpuStorage::F32(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::F32(data)
            }
            CpuStorage::F64(storage) => {
                let data = self.memcpy_stod(storage)?;
                CudaStorageSlice::F64(data)
            }
        };
        Ok(CudaStorage {
            slice,
            device: self.clone(),
        })
    }

    fn storage_from_cpu_storage_owned(&self, storage: CpuStorage) -> Result<CudaStorage> {
        let slice = match storage {
            CpuStorage::U8(storage) => {
                let data = self.memcpy_stod(&storage)?;
                CudaStorageSlice::U8(data)
            }
            CpuStorage::U32(storage) => {
                let data = self.memcpy_stod(&storage)?;
                CudaStorageSlice::U32(data)
            }
            CpuStorage::I64(storage) => {
                let data = self.memcpy_stod(&storage)?;
                CudaStorageSlice::I64(data)
            }
            CpuStorage::BF16(storage) => {
                let data = self.memcpy_stod(&storage)?;
                CudaStorageSlice::BF16(data)
            }
            CpuStorage::F16(storage) => {
                let data = self.memcpy_stod(&storage)?;
                CudaStorageSlice::F16(data)
            }
            CpuStorage::F32(storage) => {
                let data = self.memcpy_stod(&storage)?;
                CudaStorageSlice::F32(data)
            }
            CpuStorage::F64(storage) => {
                let data = self.memcpy_stod(&storage)?;
                CudaStorageSlice::F64(data)
            }
        };
        Ok(CudaStorage {
            slice,
            device: self.clone(),
        })
    }

    fn synchronize(&self) -> Result<()> {
        self.stream.synchronize().map_err(crate::Error::wrap)?;
        Ok(())
    }
}
@ -1,62 +0,0 @@
use crate::{DType, Layout};

/// cudarc related errors
#[derive(thiserror::Error, Debug)]
pub enum CudaError {
    #[error(transparent)]
    Cuda(#[from] cudarc::driver::DriverError),

    #[error(transparent)]
    Compiler(#[from] cudarc::nvrtc::CompileError),

    #[error(transparent)]
    Cublas(#[from] cudarc::cublas::result::CublasError),

    #[error(transparent)]
    Curand(#[from] cudarc::curand::result::CurandError),

    #[error("missing kernel '{module_name}'")]
    MissingKernel { module_name: String },

    #[error("unsupported dtype {dtype:?} for {op}")]
    UnsupportedDtype { dtype: DType, op: &'static str },

    #[error("internal error '{0}'")]
    InternalError(&'static str),

    #[error("matmul is only supported for contiguous tensors lstride: {lhs_stride:?} rstride: {rhs_stride:?} mnk: {mnk:?}")]
    MatMulNonContiguous {
        lhs_stride: Layout,
        rhs_stride: Layout,
        mnk: (usize, usize, usize),
    },

    #[error("{msg}, expected: {expected:?}, got: {got:?}")]
    UnexpectedDType {
        msg: &'static str,
        expected: DType,
        got: DType,
    },

    #[error("{cuda} when loading {module_name}")]
    Load {
        cuda: cudarc::driver::DriverError,
        module_name: String,
    },
}

impl From<CudaError> for crate::Error {
    fn from(val: CudaError) -> Self {
        crate::Error::Cuda(Box::new(val)).bt()
    }
}

pub trait WrapErr<O> {
    fn w(self) -> std::result::Result<O, crate::Error>;
}

impl<O, E: Into<CudaError>> WrapErr<O> for std::result::Result<O, E> {
    fn w(self) -> std::result::Result<O, crate::Error> {
        self.map_err(|e| crate::Error::Cuda(Box::new(e.into())).bt())
    }
}
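// Editor's note (not part of the diff): `WrapErr::w` is what lets the backend
// code above turn any cudarc error into a `crate::Error` with a backtrace,
// i.e. `some_cudarc_call().w()?` instead of a manual `map_err`.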
File diff suppressed because it is too large
@ -1,172 +0,0 @@
|
|||||||
/// Helper functions to plug cuda kernels in candle.
|
|
||||||
use crate::{Layout, Result, Shape, WithDType};
|
|
||||||
pub use cudarc;
|
|
||||||
use cudarc::driver::{CudaSlice, DeviceRepr, ValidAsZeroBits};
|
|
||||||
|
|
||||||
use super::{CudaDevice, CudaError, WrapErr};
|
|
||||||
|
|
||||||
pub type S = super::CudaStorageSlice;
|
|
||||||
|
|
||||||
pub trait Map1 {
|
|
||||||
fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
|
|
||||||
&self,
|
|
||||||
src: &CudaSlice<T>,
|
|
||||||
dev: &CudaDevice,
|
|
||||||
layout: &Layout,
|
|
||||||
) -> Result<CudaSlice<T>>;
|
|
||||||
|
|
||||||
fn map(&self, s: &S, d: &CudaDevice, l: &Layout) -> Result<S> {
|
|
||||||
let out = match s {
|
|
||||||
S::U8(s) => S::U8(self.f(s, d, l)?),
|
|
||||||
S::U32(s) => S::U32(self.f(s, d, l)?),
|
|
||||||
S::I64(s) => S::I64(self.f(s, d, l)?),
|
|
||||||
S::BF16(s) => S::BF16(self.f(s, d, l)?),
|
|
||||||
S::F16(s) => S::F16(self.f(s, d, l)?),
|
|
||||||
S::F32(s) => S::F32(self.f(s, d, l)?),
|
|
||||||
S::F64(s) => S::F64(self.f(s, d, l)?),
|
|
||||||
};
|
|
||||||
Ok(out)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub trait Map2 {
|
|
||||||
fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
|
|
||||||
&self,
|
|
||||||
src1: &CudaSlice<T>,
|
|
||||||
layout1: &Layout,
|
|
||||||
src2: &CudaSlice<T>,
|
|
||||||
layout2: &Layout,
|
|
||||||
dev: &CudaDevice,
|
|
||||||
) -> Result<CudaSlice<T>>;
|
|
||||||
|
|
||||||
fn map(&self, s1: &S, l1: &Layout, s2: &S, l2: &Layout, d: &CudaDevice) -> Result<S> {
|
|
        let out = match (s1, s2) {
            (S::U8(s1), S::U8(s2)) => S::U8(self.f(s1, l1, s2, l2, d)?),
            (S::U32(s1), S::U32(s2)) => S::U32(self.f(s1, l1, s2, l2, d)?),
            (S::I64(s1), S::I64(s2)) => S::I64(self.f(s1, l1, s2, l2, d)?),
            (S::BF16(s1), S::BF16(s2)) => S::BF16(self.f(s1, l1, s2, l2, d)?),
            (S::F16(s1), S::F16(s2)) => S::F16(self.f(s1, l1, s2, l2, d)?),
            (S::F32(s1), S::F32(s2)) => S::F32(self.f(s1, l1, s2, l2, d)?),
            (S::F64(s1), S::F64(s2)) => S::F64(self.f(s1, l1, s2, l2, d)?),
            _ => Err(CudaError::InternalError("dtype mismatch in binary op"))?,
        };
        Ok(out)
    }
}

pub trait Map3 {
    #[allow(clippy::too_many_arguments)]
    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
        &self,
        src1: &CudaSlice<T>,
        layout1: &Layout,
        src2: &CudaSlice<T>,
        layout2: &Layout,
        src3: &CudaSlice<T>,
        layout3: &Layout,
        dev: &CudaDevice,
    ) -> Result<CudaSlice<T>>;

    #[allow(clippy::too_many_arguments)]
    fn map(
        &self,
        s1: &S,
        l1: &Layout,
        s2: &S,
        l2: &Layout,
        s3: &S,
        l3: &Layout,
        d: &CudaDevice,
    ) -> Result<S> {
        let out = match (s1, s2, s3) {
            (S::U8(s1), S::U8(s2), S::U8(s3)) => S::U8(self.f(s1, l1, s2, l2, s3, l3, d)?),
            (S::U32(s1), S::U32(s2), S::U32(s3)) => S::U32(self.f(s1, l1, s2, l2, s3, l3, d)?),
            (S::I64(s1), S::I64(s2), S::I64(s3)) => S::I64(self.f(s1, l1, s2, l2, s3, l3, d)?),
            (S::BF16(s1), S::BF16(s2), S::BF16(s3)) => S::BF16(self.f(s1, l1, s2, l2, s3, l3, d)?),
            (S::F16(s1), S::F16(s2), S::F16(s3)) => S::F16(self.f(s1, l1, s2, l2, s3, l3, d)?),
            (S::F32(s1), S::F32(s2), S::F32(s3)) => S::F32(self.f(s1, l1, s2, l2, s3, l3, d)?),
            (S::F64(s1), S::F64(s2), S::F64(s3)) => S::F64(self.f(s1, l1, s2, l2, s3, l3, d)?),
            _ => Err(CudaError::InternalError("dtype mismatch in ternary op"))?,
        };
        Ok(out)
    }
}

pub trait Map2InPlace {
    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
        &self,
        dst: &mut CudaSlice<T>,
        dst_shape: &Shape,
        src: &CudaSlice<T>,
        src_l: &Layout,
        dev: &CudaDevice,
    ) -> Result<()>;

    fn map(
        &self,
        dst: &mut S,
        dst_s: &Shape,
        src: &S,
        src_l: &Layout,
        d: &CudaDevice,
    ) -> Result<()> {
        match (dst, src) {
            (S::U8(dst), S::U8(src)) => self.f(dst, dst_s, src, src_l, d),
            (S::U32(dst), S::U32(src)) => self.f(dst, dst_s, src, src_l, d),
            (S::I64(dst), S::I64(src)) => self.f(dst, dst_s, src, src_l, d),
            (S::BF16(dst), S::BF16(src)) => self.f(dst, dst_s, src, src_l, d),
            (S::F16(dst), S::F16(src)) => self.f(dst, dst_s, src, src_l, d),
            (S::F32(dst), S::F32(src)) => self.f(dst, dst_s, src, src_l, d),
            (S::F64(dst), S::F64(src)) => self.f(dst, dst_s, src, src_l, d),
            _ => Err(CudaError::InternalError("dtype mismatch in binary op"))?,
        }
    }
}

pub trait Map1Any {
    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits, W: Fn(CudaSlice<T>) -> S>(
        &self,
        src: &CudaSlice<T>,
        dev: &CudaDevice,
        layout: &Layout,
        wrap: W,
    ) -> Result<S>;

    fn map(&self, s: &S, d: &CudaDevice, l: &Layout) -> Result<S> {
        let out = match s {
            S::U8(s) => self.f(s, d, l, S::U8)?,
            S::U32(s) => self.f(s, d, l, S::U32)?,
            S::I64(s) => self.f(s, d, l, S::I64)?,
            S::BF16(s) => self.f(s, d, l, S::BF16)?,
            S::F16(s) => self.f(s, d, l, S::F16)?,
            S::F32(s) => self.f(s, d, l, S::F32)?,
            S::F64(s) => self.f(s, d, l, S::F64)?,
        };
        Ok(out)
    }
}

pub trait Map2Any {
    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
        &self,
        src1: &CudaSlice<T>,
        layout1: &Layout,
        src2: &CudaSlice<T>,
        layout2: &Layout,
        dev: &CudaDevice,
    ) -> Result<S>;

    fn map(&self, s1: &S, l1: &Layout, s2: &S, l2: &Layout, d: &CudaDevice) -> Result<S> {
        let out = match (s1, s2) {
            (S::U8(s1), S::U8(s2)) => self.f(s1, l1, s2, l2, d)?,
            (S::U32(s1), S::U32(s2)) => self.f(s1, l1, s2, l2, d)?,
            (S::I64(s1), S::I64(s2)) => self.f(s1, l1, s2, l2, d)?,
            (S::BF16(s1), S::BF16(s2)) => self.f(s1, l1, s2, l2, d)?,
            (S::F16(s1), S::F16(s2)) => self.f(s1, l1, s2, l2, d)?,
            (S::F32(s1), S::F32(s2)) => self.f(s1, l1, s2, l2, d)?,
            (S::F64(s1), S::F64(s2)) => self.f(s1, l1, s2, l2, d)?,
            _ => Err(CudaError::InternalError("dtype mismatch in binary op")).w()?,
        };
        Ok(out)
    }
}
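All of the Map* helpers above share one enum-dispatch pattern: a generic `f` that runs on a single concrete element type, and a `map` that matches on the storage enum and forwards to `f` with the matching type parameter. A minimal self-contained sketch of that pattern (a toy storage enum instead of candle's `S`, so it runs without a GPU):

// Toy version of the dtype-dispatch used by the Map* traits above.
enum Storage {
    F32(Vec<f32>),
    F64(Vec<f64>),
}

trait Map1 {
    // Generic over the element type, like the `f` methods above.
    fn f<T: Copy + std::ops::Add<Output = T>>(&self, src: &[T]) -> Vec<T>;

    // Dispatch on the enum variant and rewrap the result in the same variant.
    fn map(&self, s: &Storage) -> Storage {
        match s {
            Storage::F32(v) => Storage::F32(self.f(v)),
            Storage::F64(v) => Storage::F64(self.f(v)),
        }
    }
}

struct Double;
impl Map1 for Double {
    fn f<T: Copy + std::ops::Add<Output = T>>(&self, src: &[T]) -> Vec<T> {
        src.iter().map(|&x| x + x).collect()
    }
}

fn main() {
    match Double.map(&Storage::F32(vec![1.0, 2.0])) {
        Storage::F32(v) => assert_eq!(v, vec![2.0, 4.0]),
        _ => unreachable!(),
    }
}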
candle-core/src/cudnn.rs (new file, 107 lines)
@@ -0,0 +1,107 @@
use crate::WithDType;
use cudarc;
use cudarc::cudnn::safe::{Conv2dForward, Cudnn};
use cudarc::driver::{CudaSlice, CudaView, DeviceRepr, ValidAsZeroBits};
use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::Arc;

// The cudnn handles are stored per thread here rather than on the CudaDevice as they are neither
// send nor sync.
thread_local! {
    static CUDNN: RefCell<HashMap<crate::cuda_backend::DeviceId, Arc<Cudnn>>> = HashMap::new().into();
}

impl From<cudarc::cudnn::CudnnError> for crate::Error {
    fn from(err: cudarc::cudnn::CudnnError) -> Self {
        crate::Error::wrap(err)
    }
}

impl From<cudarc::driver::DriverError> for crate::Error {
    fn from(err: cudarc::driver::DriverError) -> Self {
        crate::Error::wrap(err)
    }
}

pub(crate) fn launch_conv2d<
    T: DeviceRepr + WithDType + ValidAsZeroBits + cudarc::cudnn::CudnnDataType,
>(
    src: &CudaView<T>,
    src_l: &crate::Layout,
    filter: &CudaView<T>,
    dst: &mut CudaSlice<T>,
    params: &crate::conv::ParamsConv2D,
    dev: &crate::cuda_backend::CudaDevice,
) -> crate::Result<()> {
    let device_id = dev.id();
    let cudnn = CUDNN.with(|cudnn| {
        if let Some(cudnn) = cudnn.borrow().get(&device_id) {
            return Ok(cudnn.clone());
        }
        let c = Cudnn::new(dev.cuda_device());
        if let Ok(c) = &c {
            cudnn.borrow_mut().insert(device_id, c.clone());
        }
        c
    })?;
    let conv = cudnn.create_conv2d::<T>(
        /* pad */ [params.padding as i32, params.padding as i32],
        /* stride */ [params.stride as i32, params.stride as i32],
        /* dilation */ [params.dilation as i32, params.dilation as i32],
        cudarc::cudnn::sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
    )?;
    let x_shape = [
        params.b_size as i32,
        params.c_in as i32,
        params.i_h as i32,
        params.i_w as i32,
    ];
    // Note that `src` already starts at the proper offset.
    let x = if src_l.is_contiguous() {
        cudnn.create_4d_tensor(
            cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
            x_shape,
        )?
    } else {
        let s = src_l.stride();
        cudnn.create_4d_tensor_ex(
            x_shape,
            [s[0] as i32, s[1] as i32, s[2] as i32, s[3] as i32],
        )?
    };
    let w = cudnn.create_4d_filter(
        cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
        [
            params.c_out as i32,
            params.c_in as i32,
            params.k_h as i32,
            params.k_w as i32,
        ],
    )?;
    let (w_out, h_out) = (params.out_w() as i32, params.out_h() as i32);
    let y = cudnn.create_4d_tensor(
        cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
        [params.b_size as i32, params.c_out as i32, h_out, w_out],
    )?;
    let conv2d = Conv2dForward {
        conv: &conv,
        x: &x,
        w: &w,
        y: &y,
    };
    let alg = conv2d.pick_algorithm()?;
    let workspace_size = conv2d.get_workspace_size(alg)?;
    let mut workspace = dev.cuda_device().alloc_zeros::<u8>(workspace_size)?;
    unsafe {
        conv2d.launch::<CudaSlice<u8>, _, _, _>(
            alg,
            Some(&mut workspace),
            (T::one(), T::zero()),
            src,
            filter,
            dst,
        )?;
    }
    Ok(())
}
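The per-thread handle map in `cudnn.rs` is a useful general pattern whenever a library handle is neither Send nor Sync. A standalone sketch of the same caching scheme with a toy `Handle` type (no CUDA required):

use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::Arc;

// Stand-in for an expensive-to-create handle that is not Send/Sync.
struct Handle(usize);

thread_local! {
    static CACHE: RefCell<HashMap<usize, Arc<Handle>>> = RefCell::new(HashMap::new());
}

fn get_handle(device_id: usize) -> Arc<Handle> {
    CACHE.with(|cache| {
        if let Some(h) = cache.borrow().get(&device_id) {
            return h.clone();
        }
        // Created at most once per (thread, device) pair.
        let h = Arc::new(Handle(device_id));
        cache.borrow_mut().insert(device_id, h.clone());
        h
    })
}

fn main() {
    let a = get_handle(0);
    let b = get_handle(0);
    assert!(Arc::ptr_eq(&a, &b)); // the second lookup hits the cache
}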
@@ -1,490 +0,0 @@
use crate::op::{BackpropOp, Op};
use crate::tensor::from_storage;
use crate::{CpuStorage, CudaStorage, Layout, MetalStorage, Result, Shape, Tensor};
use std::sync::Arc;

/// Unary ops that can be defined in user-land.
pub trait CustomOp1 {
    // Box<dyn> does not support const yet, so use a function to get the name.
    fn name(&self) -> &'static str;

    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)>;

    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result<(CudaStorage, Shape)> {
        Err(crate::Error::Cuda(
            format!("no cuda implementation for {}", self.name()).into(),
        ))
    }

    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn metal_fwd(
        &self,
        _storage: &MetalStorage,
        _layout: &Layout,
    ) -> Result<(MetalStorage, Shape)> {
        Err(crate::Error::Metal(
            format!("no metal implementation for {}", self.name()).into(),
        ))
    }

    /// This function takes as argument the argument `arg` used in the forward pass, the result
    /// produced by the forward operation `res` and the gradient of the result `grad_res`.
    /// The function should return the gradient of the argument.
    fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Result<Option<Tensor>> {
        Err(crate::Error::BackwardNotSupported { op: self.name() })
    }
}

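For illustration, a minimal CPU-only implementation of `CustomOp1` (a hypothetical `Scale` op, not part of this diff): it fills in only `name` and `cpu_fwd`, keeps the default error-returning gpu methods, and bails out on non-contiguous inputs to stay short:

use candle_core::{CpuStorage, CustomOp1, Layout, Result, Shape};

struct Scale(f32);

impl CustomOp1 for Scale {
    fn name(&self) -> &'static str {
        "scale"
    }

    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)> {
        // Only f32 is handled here to keep the sketch short.
        let src = storage.as_slice::<f32>()?;
        let src = match layout.contiguous_offsets() {
            None => candle_core::bail!("scale: input has to be contiguous"),
            Some((o1, o2)) => &src[o1..o2],
        };
        let dst: Vec<f32> = src.iter().map(|&x| x * self.0).collect();
        Ok((CpuStorage::F32(dst), layout.shape().clone()))
    }
}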
pub trait CustomOp2 {
    fn name(&self) -> &'static str;

    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cpu_fwd(
        &self,
        s1: &CpuStorage,
        l1: &Layout,
        s2: &CpuStorage,
        l2: &Layout,
    ) -> Result<(CpuStorage, Shape)>;

    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cuda_fwd(
        &self,
        _: &CudaStorage,
        _: &Layout,
        _: &CudaStorage,
        _: &Layout,
    ) -> Result<(CudaStorage, Shape)> {
        Err(crate::Error::Cuda(
            format!("no cuda implementation for {}", self.name()).into(),
        ))
    }

    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn metal_fwd(
        &self,
        _: &MetalStorage,
        _: &Layout,
        _: &MetalStorage,
        _: &Layout,
    ) -> Result<(MetalStorage, Shape)> {
        Err(crate::Error::Metal(
            format!("no metal implementation for {}", self.name()).into(),
        ))
    }

    fn bwd(
        &self,
        _arg1: &Tensor,
        _arg2: &Tensor,
        _res: &Tensor,
        _grad_res: &Tensor,
    ) -> Result<(Option<Tensor>, Option<Tensor>)> {
        Err(crate::Error::BackwardNotSupported { op: self.name() })
    }
}

pub trait CustomOp3 {
    fn name(&self) -> &'static str;

    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cpu_fwd(
        &self,
        s1: &CpuStorage,
        l1: &Layout,
        s2: &CpuStorage,
        l2: &Layout,
        s3: &CpuStorage,
        l3: &Layout,
    ) -> Result<(CpuStorage, Shape)>;

    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cuda_fwd(
        &self,
        _: &CudaStorage,
        _: &Layout,
        _: &CudaStorage,
        _: &Layout,
        _: &CudaStorage,
        _: &Layout,
    ) -> Result<(CudaStorage, Shape)> {
        Err(crate::Error::Cuda(
            format!("no cuda implementation for {}", self.name()).into(),
        ))
    }

    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn metal_fwd(
        &self,
        _: &MetalStorage,
        _: &Layout,
        _: &MetalStorage,
        _: &Layout,
        _: &MetalStorage,
        _: &Layout,
    ) -> Result<(MetalStorage, Shape)> {
        Err(crate::Error::Metal(
            format!("no metal implementation for {}", self.name()).into(),
        ))
    }

    fn bwd(
        &self,
        _arg1: &Tensor,
        _arg2: &Tensor,
        _arg3: &Tensor,
        _res: &Tensor,
        _grad_res: &Tensor,
    ) -> Result<(Option<Tensor>, Option<Tensor>, Option<Tensor>)> {
        Err(crate::Error::BackwardNotSupported { op: self.name() })
    }
}

impl Tensor {
    /// Applies a unary custom op without backward support
    pub fn apply_op1_no_bwd<C: CustomOp1>(&self, c: &C) -> Result<Self> {
        let (storage, shape) = self.storage().apply_op1(self.layout(), c)?;
        Ok(from_storage(storage, shape, BackpropOp::none(), false))
    }

    /// Applies a binary custom op without backward support
    pub fn apply_op2_no_bwd<C: CustomOp2>(&self, rhs: &Self, c: &C) -> Result<Self> {
        let (storage, shape) =
            self.storage()
                .apply_op2(self.layout(), &rhs.storage(), rhs.layout(), c)?;
        Ok(from_storage(storage, shape, BackpropOp::none(), false))
    }

    /// Applies a ternary custom op without backward support
    pub fn apply_op3_no_bwd<C: CustomOp3>(&self, t2: &Self, t3: &Self, c: &C) -> Result<Self> {
        let (storage, shape) = self.storage().apply_op3(
            self.layout(),
            &t2.storage(),
            t2.layout(),
            &t3.storage(),
            t3.layout(),
            c,
        )?;
        Ok(from_storage(storage, shape, BackpropOp::none(), false))
    }

    /// Applies a unary custom op.
    pub fn apply_op1_arc(&self, c: Arc<Box<dyn CustomOp1 + Send + Sync>>) -> Result<Self> {
        let (storage, shape) = self
            .storage()
            .apply_op1(self.layout(), c.as_ref().as_ref())?;
        let op = BackpropOp::new1(self, |s| Op::CustomOp1(s, c.clone()));
        Ok(from_storage(storage, shape, op, false))
    }

    pub fn apply_op1<C: 'static + CustomOp1 + Send + Sync>(&self, c: C) -> Result<Self> {
        self.apply_op1_arc(Arc::new(Box::new(c)))
    }

    /// Applies a binary custom op.
    pub fn apply_op2_arc(
        &self,
        rhs: &Self,
        c: Arc<Box<dyn CustomOp2 + Send + Sync>>,
    ) -> Result<Self> {
        let (storage, shape) = self.storage().apply_op2(
            self.layout(),
            &rhs.storage(),
            rhs.layout(),
            c.as_ref().as_ref(),
        )?;
        let op = BackpropOp::new2(self, rhs, |t1, t2| Op::CustomOp2(t1, t2, c.clone()));
        Ok(from_storage(storage, shape, op, false))
    }

    pub fn apply_op2<C: 'static + CustomOp2 + Send + Sync>(&self, r: &Self, c: C) -> Result<Self> {
        self.apply_op2_arc(r, Arc::new(Box::new(c)))
    }

    /// Applies a ternary custom op.
    pub fn apply_op3_arc(
        &self,
        t2: &Self,
        t3: &Self,
        c: Arc<Box<dyn CustomOp3 + Send + Sync>>,
    ) -> Result<Self> {
        let (storage, shape) = self.storage().apply_op3(
            self.layout(),
            &t2.storage(),
            t2.layout(),
            &t3.storage(),
            t3.layout(),
            c.as_ref().as_ref(),
        )?;
        let op = BackpropOp::new3(self, t2, t3, |t1, t2, t3| {
            Op::CustomOp3(t1, t2, t3, c.clone())
        });
        Ok(from_storage(storage, shape, op, false))
    }

    pub fn apply_op3<C: 'static + CustomOp3 + Send + Sync>(
        &self,
        t2: &Self,
        t3: &Self,
        c: C,
    ) -> Result<Self> {
        self.apply_op3_arc(t2, t3, Arc::new(Box::new(c)))
    }
}

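A short usage sketch for these entry points, assuming the hypothetical `Scale` op sketched earlier:

use candle_core::{Device, Result, Tensor};

fn main() -> Result<()> {
    let x = Tensor::new(&[1f32, 2., 3.], &Device::Cpu)?;
    // Wires the op into the autograd graph; backward fails unless `bwd` is implemented.
    let y = x.apply_op1(Scale(2.0))?;
    // Skips the backprop bookkeeping entirely.
    let z = x.apply_op1_no_bwd(&Scale(0.5))?;
    println!("{y}\n{z}");
    Ok(())
}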
// In place ops.

/// Unary ops that can be defined in user-land.
/// These ops work in place and as such back-prop is unsupported.
pub trait InplaceOp1 {
    // Box<dyn> does not support const yet, so use a function to get the name.
    fn name(&self) -> &'static str;

    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cpu_fwd(&self, storage: &mut CpuStorage, layout: &Layout) -> Result<()>;

    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cuda_fwd(&self, _storage: &mut CudaStorage, _layout: &Layout) -> Result<()> {
        Err(crate::Error::Cuda(
            format!("no cuda implementation for {}", self.name()).into(),
        ))
    }

    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn metal_fwd(&self, _storage: &mut MetalStorage, _layout: &Layout) -> Result<()> {
        Err(crate::Error::Metal(
            format!("no metal implementation for {}", self.name()).into(),
        ))
    }
}

pub trait InplaceOp2 {
    fn name(&self) -> &'static str;

    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cpu_fwd(&self, s1: &mut CpuStorage, l1: &Layout, s2: &CpuStorage, l2: &Layout)
        -> Result<()>;

    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cuda_fwd(&self, _: &mut CudaStorage, _: &Layout, _: &CudaStorage, _: &Layout) -> Result<()> {
        Err(crate::Error::Cuda(
            format!("no cuda implementation for {}", self.name()).into(),
        ))
    }

    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn metal_fwd(
        &self,
        _: &mut MetalStorage,
        _: &Layout,
        _: &MetalStorage,
        _: &Layout,
    ) -> Result<()> {
        Err(crate::Error::Metal(
            format!("no metal implementation for {}", self.name()).into(),
        ))
    }
}

pub trait InplaceOp3 {
    fn name(&self) -> &'static str;

    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cpu_fwd(
        &self,
        s1: &mut CpuStorage,
        l1: &Layout,
        s2: &CpuStorage,
        l2: &Layout,
        s3: &CpuStorage,
        l3: &Layout,
    ) -> Result<()>;

    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn cuda_fwd(
        &self,
        _: &mut CudaStorage,
        _: &Layout,
        _: &CudaStorage,
        _: &Layout,
        _: &CudaStorage,
        _: &Layout,
    ) -> Result<()> {
        Err(crate::Error::Cuda(
            format!("no cuda implementation for {}", self.name()).into(),
        ))
    }

    /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
    /// offsets etc so the associated layout should be used to access it.
    fn metal_fwd(
        &self,
        _: &mut MetalStorage,
        _: &Layout,
        _: &MetalStorage,
        _: &Layout,
        _: &MetalStorage,
        _: &Layout,
    ) -> Result<()> {
        Err(crate::Error::Metal(
            format!("no metal implementation for {}", self.name()).into(),
        ))
    }
}

impl Tensor {
    /// Applies a unary custom op in place.
    pub fn inplace_op1<C: InplaceOp1>(&self, c: &C) -> Result<()> {
        self.storage_mut().inplace_op1(self.layout(), c)
    }

    /// Applies a binary custom op in place (for the first tensor).
    pub fn inplace_op2<C: InplaceOp2>(&self, rhs: &Self, c: &C) -> Result<()> {
        self.storage_mut()
            .inplace_op2(self.layout(), &rhs.storage(), rhs.layout(), c)
    }

    /// Applies a ternary custom op in place (for the first tensor).
    pub fn inplace_op3<C: InplaceOp3>(&self, t2: &Self, t3: &Self, c: &C) -> Result<()> {
        self.storage_mut().inplace_op3(
            self.layout(),
            &t2.storage(),
            t2.layout(),
            &t3.storage(),
            t3.layout(),
            c,
        )
    }
}

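A matching CPU sketch for the in-place variant (a hypothetical `ScaleInPlace`, again not from this diff). Since `CpuStorage` is an enum with public variants, the mutable data can be matched directly:

use candle_core::{CpuStorage, InplaceOp1, Layout, Result};

struct ScaleInPlace(f32);

impl InplaceOp1 for ScaleInPlace {
    fn name(&self) -> &'static str {
        "scale-in-place"
    }

    fn cpu_fwd(&self, storage: &mut CpuStorage, _layout: &Layout) -> Result<()> {
        // Assumes a contiguous f32 tensor to keep the sketch short.
        match storage {
            CpuStorage::F32(v) => v.iter_mut().for_each(|x| *x *= self.0),
            _ => candle_core::bail!("scale-in-place only supports f32"),
        }
        Ok(())
    }
}

// Usage: t.inplace_op1(&ScaleInPlace(3.0))?;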
pub struct UgIOp1 {
    name: &'static str,
    #[cfg(feature = "cuda")]
    func: cudarc::driver::CudaFunction,
    #[cfg(feature = "metal")]
    func: metal::ComputePipelineState,
}

impl UgIOp1 {
    #[allow(unused)]
    #[cfg(not(target_arch = "wasm32"))]
    pub fn new(
        name: &'static str,
        kernel: ug::lang::ssa::Kernel,
        device: &crate::Device,
    ) -> Result<Self> {
        #[cfg(feature = "cuda")]
        {
            let device = device.as_cuda_device()?;
            let func = device.compile(name, kernel)?;
            Ok(Self {
                name,
                func: func.into_cuda_function(),
            })
        }
        #[cfg(feature = "metal")]
        {
            let device = device.as_metal_device()?;
            let func = device.compile(name, kernel)?;
            Ok(Self { name, func })
        }
        #[cfg(not(any(feature = "cuda", feature = "metal")))]
        {
            Ok(Self { name })
        }
    }
}

impl InplaceOp1 for UgIOp1 {
    fn name(&self) -> &'static str {
        self.name
    }

    fn cpu_fwd(&self, _: &mut CpuStorage, _: &Layout) -> Result<()> {
        crate::bail!("ug ops are only supported on metal/cuda at the moment")
    }

    #[cfg(feature = "metal")]
    fn metal_fwd(&self, sto: &mut MetalStorage, layout: &Layout) -> Result<()> {
        use crate::backend::BackendStorage;
        use candle_metal_kernels::utils::EncoderProvider;

        let elem_count = layout.shape().elem_count();
        if sto.dtype() != crate::DType::F32 {
            // TODO: support more dtypes.
            crate::bail!("input is not a f32 tensor")
        }
        let device = sto.device();
        let command_buffer = device.command_buffer()?;
        let command_buffer = &command_buffer;
        let encoder = command_buffer.encoder();
        let encoder = encoder.as_ref();
        encoder.set_compute_pipeline_state(&self.func);
        let (g, b) = if elem_count % 32 == 0 {
            (elem_count / 32, 32)
        } else {
            (elem_count, 1)
        };
        let grid_dims = metal::MTLSize {
            width: g as u64,
            height: 1,
            depth: 1,
        };
        let group_dims = candle_metal_kernels::utils::get_block_dims(b as u64, 1, 1);
        candle_metal_kernels::utils::set_param(encoder, 0, (sto.buffer(), 0usize));

        encoder.use_resource(sto.buffer(), metal::MTLResourceUsage::Write);
        encoder.dispatch_threads(grid_dims, group_dims);

        Ok(())
    }

    #[cfg(feature = "cuda")]
    fn cuda_fwd(&self, sto: &mut CudaStorage, layout: &Layout) -> Result<()> {
        use crate::cuda_backend::WrapErr;
        use cudarc::driver::PushKernelArg;

        let elem_count = layout.shape().elem_count();
        let stream = sto.device.cuda_stream();
        // TODO: support more dtypes.
        let sto = sto.as_cuda_slice::<f32>()?;
        let sto = match layout.contiguous_offsets() {
            None => crate::bail!("input has to be contiguous"),
            Some((o1, o2)) => sto.slice(o1..o2),
        };
        let (g, b) = if elem_count % 32 == 0 {
            (elem_count / 32, 32)
        } else {
            (elem_count, 1)
        };
        let cfg = cudarc::driver::LaunchConfig {
            grid_dim: (g as u32, 1, 1),
            block_dim: (b as u32, 1, 1),
            shared_mem_bytes: 0,
        };
        let mut builder = stream.launch_builder(&self.func);
        builder.arg(&sto);
        unsafe { builder.launch(cfg) }.w()?;
        Ok(())
    }
}
@@ -8,15 +8,12 @@ use crate::{CpuStorage, DType, Result, Shape, Storage, WithDType};
 pub enum DeviceLocation {
     Cpu,
     Cuda { gpu_id: usize },
-    Metal { gpu_id: usize },
 }
 
-/// Cpu, Cuda, or Metal
 #[derive(Debug, Clone)]
 pub enum Device {
     Cpu,
     Cuda(crate::CudaDevice),
-    Metal(crate::MetalDevice),
 }
 
 pub trait NdArray {
@@ -131,43 +128,10 @@ impl Device {
         Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?))
     }
 
-    pub fn as_cuda_device(&self) -> Result<&crate::CudaDevice> {
-        match self {
-            Self::Cuda(d) => Ok(d),
-            Self::Cpu => crate::bail!("expected a cuda device, got cpu"),
-            Self::Metal(_) => crate::bail!("expected a cuda device, got Metal"),
-        }
-    }
-
-    pub fn as_metal_device(&self) -> Result<&crate::MetalDevice> {
-        match self {
-            Self::Cuda(_) => crate::bail!("expected a metal device, got cuda"),
-            Self::Cpu => crate::bail!("expected a metal device, got cpu"),
-            Self::Metal(d) => Ok(d),
-        }
-    }
-
-    pub fn new_cuda_with_stream(ordinal: usize) -> Result<Self> {
-        Ok(Self::Cuda(crate::CudaDevice::new_with_stream(ordinal)?))
-    }
-
-    pub fn new_metal(ordinal: usize) -> Result<Self> {
-        Ok(Self::Metal(crate::MetalDevice::new(ordinal)?))
-    }
-
-    pub fn set_seed(&self, seed: u64) -> Result<()> {
-        match self {
-            Self::Cpu => CpuDevice.set_seed(seed),
-            Self::Cuda(c) => c.set_seed(seed),
-            Self::Metal(m) => m.set_seed(seed),
-        }
-    }
-
     pub fn same_device(&self, rhs: &Self) -> bool {
         match (self, rhs) {
             (Self::Cpu, Self::Cpu) => true,
             (Self::Cuda(lhs), Self::Cuda(rhs)) => lhs.same_device(rhs),
-            (Self::Metal(lhs), Self::Metal(rhs)) => lhs.same_device(rhs),
             _ => false,
         }
     }
@@ -176,35 +140,20 @@ impl Device {
         match self {
             Self::Cpu => DeviceLocation::Cpu,
             Self::Cuda(device) => device.location(),
-            Device::Metal(device) => device.location(),
         }
     }
 
     pub fn is_cpu(&self) -> bool {
-        matches!(self, Self::Cpu)
-    }
-
-    pub fn is_cuda(&self) -> bool {
-        matches!(self, Self::Cuda(_))
-    }
-
-    pub fn is_metal(&self) -> bool {
-        matches!(self, Self::Metal(_))
-    }
-
-    pub fn supports_bf16(&self) -> bool {
         match self {
-            Self::Cuda(_) | Self::Metal(_) => true,
-            Self::Cpu => false,
+            Self::Cpu => true,
+            Self::Cuda(_) => false,
         }
     }
 
-    /// Return `BF16` for devices that support it, otherwise default to `F32`.
-    pub fn bf16_default_to_f32(&self) -> DType {
-        if self.supports_bf16() {
-            DType::BF16
-        } else {
-            DType::F32
+    pub fn is_cuda(&self) -> bool {
+        match self {
+            Self::Cpu => false,
+            Self::Cuda(_) => true,
         }
     }
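A usage sketch for the helpers on the removed (0.9.0-alpha) side of this hunk:

use candle_core::{Device, Result, Tensor};

fn main() -> Result<()> {
    // Fall back to CPU when no CUDA device is available.
    let device = Device::new_cuda(0).unwrap_or(Device::Cpu);
    device.set_seed(42)?;

    // Pick bf16 only where the backend supports it.
    let dtype = device.bf16_default_to_f32();
    let x = Tensor::zeros((2, 3), dtype, &device)?;
    println!("{x} (cuda: {})", device.is_cuda());
    Ok(())
}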
@@ -229,18 +178,8 @@ impl Device {
                 Ok(Storage::Cpu(storage))
             }
             Device::Cuda(device) => {
-                // TODO: Remove the special case if we start supporting generating f16/bf16 directly.
-                if dtype == DType::F16 || dtype == DType::BF16 {
-                    let storage = device.rand_uniform(shape, DType::F32, lo, up)?;
-                    Storage::Cuda(storage).to_dtype(&crate::Layout::contiguous(shape), dtype)
-                } else {
-                    let storage = device.rand_uniform(shape, dtype, lo, up)?;
-                    Ok(Storage::Cuda(storage))
-                }
-            }
-            Device::Metal(device) => {
                 let storage = device.rand_uniform(shape, dtype, lo, up)?;
-                Ok(Storage::Metal(storage))
+                Ok(Storage::Cuda(storage))
             }
         }
     }
@@ -267,18 +206,8 @@ impl Device {
                 Ok(Storage::Cpu(storage))
             }
             Device::Cuda(device) => {
-                // TODO: Remove the special case if we start supporting generating f16/bf16 directly.
-                if dtype == DType::F16 || dtype == DType::BF16 {
-                    let storage = device.rand_normal(shape, DType::F32, mean, std)?;
-                    Storage::Cuda(storage).to_dtype(&crate::Layout::contiguous(shape), dtype)
-                } else {
-                    let storage = device.rand_normal(shape, dtype, mean, std)?;
-                    Ok(Storage::Cuda(storage))
-                }
-            }
-            Device::Metal(device) => {
                 let storage = device.rand_normal(shape, dtype, mean, std)?;
-                Ok(Storage::Metal(storage))
+                Ok(Storage::Cuda(storage))
             }
         }
     }
@@ -302,10 +231,6 @@ impl Device {
                 let storage = device.ones_impl(shape, dtype)?;
                 Ok(Storage::Cuda(storage))
             }
-            Device::Metal(device) => {
-                let storage = device.ones_impl(shape, dtype)?;
-                Ok(Storage::Metal(storage))
-            }
         }
     }
 
@@ -319,41 +244,6 @@ impl Device {
                 let storage = device.zeros_impl(shape, dtype)?;
                 Ok(Storage::Cuda(storage))
             }
-            Device::Metal(device) => {
-                let storage = device.zeros_impl(shape, dtype)?;
-                Ok(Storage::Metal(storage))
-            }
-        }
-    }
-
-    pub(crate) unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result<Storage> {
-        match self {
-            Device::Cpu => {
-                let storage = CpuDevice.alloc_uninit(shape, dtype)?;
-                Ok(Storage::Cpu(storage))
-            }
-            Device::Cuda(device) => {
-                let storage = device.alloc_uninit(shape, dtype)?;
-                Ok(Storage::Cuda(storage))
-            }
-            Device::Metal(device) => {
-                let storage = device.alloc_uninit(shape, dtype)?;
-                Ok(Storage::Metal(storage))
-            }
-        }
-    }
-
-    pub(crate) fn storage_from_slice<D: WithDType>(&self, data: &[D]) -> Result<Storage> {
-        match self {
-            Device::Cpu => Ok(Storage::Cpu(data.to_cpu_storage())),
-            Device::Cuda(device) => {
-                let storage = device.storage_from_slice(data)?;
-                Ok(Storage::Cuda(storage))
-            }
-            Device::Metal(device) => {
-                let storage = device.storage_from_slice(data)?;
-                Ok(Storage::Metal(storage))
-            }
         }
     }
 
@@ -362,14 +252,9 @@ impl Device {
             Device::Cpu => Ok(Storage::Cpu(array.to_cpu_storage())),
             Device::Cuda(device) => {
                 let storage = array.to_cpu_storage();
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
+                let storage = device.storage_from_cpu_storage(&storage)?;
                 Ok(Storage::Cuda(storage))
             }
-            Device::Metal(device) => {
-                let storage = array.to_cpu_storage();
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
-                Ok(Storage::Metal(storage))
-            }
         }
     }
 
@@ -378,22 +263,9 @@ impl Device {
             Device::Cpu => Ok(Storage::Cpu(S::to_cpu_storage_owned(data))),
             Device::Cuda(device) => {
                 let storage = S::to_cpu_storage_owned(data);
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
+                let storage = device.storage_from_cpu_storage(&storage)?;
                 Ok(Storage::Cuda(storage))
            }
-            Device::Metal(device) => {
-                let storage = S::to_cpu_storage_owned(data);
-                let storage = device.storage_from_cpu_storage_owned(storage)?;
-                Ok(Storage::Metal(storage))
-            }
-        }
-    }
-
-    pub fn synchronize(&self) -> Result<()> {
-        match self {
-            Self::Cpu => Ok(()),
-            Self::Cuda(d) => d.synchronize(),
-            Self::Metal(d) => d.synchronize(),
         }
     }
 }
@@ -1,7 +1,6 @@
-//! Pretty printing of tensors
-//!
-//! This implementation should be in line with the [PyTorch version](https://github.com/pytorch/pytorch/blob/7b419e8513a024e172eae767e24ec1b849976b13/torch/_tensor_str.py).
-//!
+/// Pretty printing of tensors
+/// This implementation should be in line with the PyTorch version.
+/// https://github.com/pytorch/pytorch/blob/7b419e8513a024e172eae767e24ec1b849976b13/torch/_tensor_str.py
 use crate::{DType, Result, Tensor, WithDType};
 use half::{bf16, f16};
 
@@ -15,9 +14,6 @@ impl Tensor {
             crate::DeviceLocation::Cuda { gpu_id } => {
                 format!(", cuda:{}", gpu_id)
             }
-            crate::DeviceLocation::Metal { gpu_id } => {
-                format!(", metal:{}", gpu_id)
-            }
         };
 
         write!(f, "Tensor[")?;
@@ -66,13 +62,12 @@ impl std::fmt::Debug for Tensor {
 }
 
 /// Options for Tensor pretty printing
-#[derive(Debug, Clone)]
 pub struct PrinterOptions {
-    pub precision: usize,
-    pub threshold: usize,
-    pub edge_items: usize,
-    pub line_width: usize,
-    pub sci_mode: Option<bool>,
+    precision: usize,
+    threshold: usize,
+    edge_items: usize,
+    line_width: usize,
+    sci_mode: Option<bool>,
 }
 
 static PRINT_OPTS: std::sync::Mutex<PrinterOptions> =
@@ -91,10 +86,6 @@ impl PrinterOptions {
     }
 }
 
-pub fn print_options() -> &'static std::sync::Mutex<PrinterOptions> {
-    &PRINT_OPTS
-}
-
 pub fn set_print_options(options: PrinterOptions) {
     *PRINT_OPTS.lock().unwrap() = options
 }
@@ -123,26 +114,6 @@ pub fn set_print_options_full() {
     }
 }
 
-pub fn set_line_width(line_width: usize) {
-    PRINT_OPTS.lock().unwrap().line_width = line_width
-}
-
-pub fn set_precision(precision: usize) {
-    PRINT_OPTS.lock().unwrap().precision = precision
-}
-
-pub fn set_edge_items(edge_items: usize) {
-    PRINT_OPTS.lock().unwrap().edge_items = edge_items
-}
-
-pub fn set_threshold(threshold: usize) {
-    PRINT_OPTS.lock().unwrap().threshold = threshold
-}
-
-pub fn set_sci_mode(sci_mode: Option<bool>) {
-    PRINT_OPTS.lock().unwrap().sci_mode = sci_mode
-}
-
 struct FmtSize {
     current_size: usize,
 }
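The removed setters above exist in the 0.9.0-alpha API and mutate the global `PRINT_OPTS`; a small usage sketch (the `candle_core::display` module path is assumed here and may differ across versions):

use candle_core::{Device, Result, Tensor};

fn main() -> Result<()> {
    // Process-wide pretty-printing settings.
    candle_core::display::set_precision(2);
    candle_core::display::set_edge_items(2);

    let t = Tensor::arange(0f32, 100., &Device::Cpu)?;
    println!("{t}"); // abbreviated output such as [0.00, 1.00, ..., 98.00, 99.00]
    Ok(())
}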
@@ -505,9 +476,6 @@ impl std::fmt::Display for Tensor {
             crate::DeviceLocation::Cuda { gpu_id } => {
                 format!(", cuda:{}", gpu_id)
             }
-            crate::DeviceLocation::Metal { gpu_id } => {
-                format!(", metal:{}", gpu_id)
-            }
         };
 
         write!(
@@ -1,7 +1,7 @@
 //! Types for elements that can be stored and manipulated using tensors.
 #![allow(clippy::redundant_closure_call)]
 use crate::backend::BackendStorage;
-use crate::{CpuStorage, CpuStorageRef, Error, Result};
+use crate::{CpuStorage, Error, Result};
 
 /// The different types of elements allowed in tensors.
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
@@ -23,15 +23,7 @@ pub enum DType {
 }
 
 #[derive(Debug, PartialEq, Eq)]
-pub struct DTypeParseError(String);
-
-impl std::fmt::Display for DTypeParseError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "cannot parse '{}' as a dtype", self.0)
-    }
-}
-
-impl std::error::Error for DTypeParseError {}
+pub struct DTypeParseError;
 
 impl std::str::FromStr for DType {
     type Err = DTypeParseError;
@@ -44,7 +36,7 @@ impl std::str::FromStr for DType {
             "f16" => Ok(Self::F16),
             "f32" => Ok(Self::F32),
             "f64" => Ok(Self::F64),
-            _ => Err(DTypeParseError(s.to_string())),
+            _ => Err(DTypeParseError),
         }
     }
 }
@@ -75,20 +67,6 @@ impl DType {
             Self::F64 => 8,
         }
     }
-
-    pub fn is_int(&self) -> bool {
-        match self {
-            Self::U8 | Self::U32 | Self::I64 => true,
-            Self::BF16 | Self::F16 | Self::F32 | Self::F64 => false,
-        }
-    }
-
-    pub fn is_float(&self) -> bool {
-        match self {
-            Self::U8 | Self::U32 | Self::I64 => false,
-            Self::BF16 | Self::F16 | Self::F32 | Self::F64 => true,
-        }
-    }
 }
 
 pub trait WithDType:
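A quick sketch of the dtype helpers on the removed (0.9.0-alpha) side, using the `FromStr` impl above plus the `is_int`/`is_float` predicates:

use candle_core::DType;

fn main() {
    let dt: DType = "f32".parse().unwrap();
    assert!(dt.is_float() && !dt.is_int());
    assert_eq!(dt.size_in_bytes(), 4);
    // Unknown names yield a DTypeParseError carrying the offending string.
    assert!("f8".parse::<DType>().is_err());
}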
@@ -100,14 +78,12 @@ pub trait WithDType:
     + 'static
     + Send
     + Sync
-    + std::any::Any
     + crate::cpu::kernels::VecOps
 {
     const DTYPE: DType;
 
     fn from_f64(v: f64) -> Self;
     fn to_f64(self) -> f64;
-    fn cpu_storage_ref(data: &[Self]) -> CpuStorageRef<'_>;
     fn to_cpu_storage_owned(data: Vec<Self>) -> CpuStorage;
 
     fn to_cpu_storage(data: &[Self]) -> CpuStorage {
@@ -131,10 +107,6 @@ macro_rules! with_dtype {
             $to_f64(self)
         }
 
-        fn cpu_storage_ref(data: &[Self]) -> CpuStorageRef<'_> {
-            CpuStorageRef::$dtype(data)
-        }
-
         fn to_cpu_storage_owned(data: Vec<Self>) -> CpuStorage {
             CpuStorage::$dtype(data)
         }
@@ -1,5 +1,3 @@
-//! Implementation of the Cuda backend when Cuda support has not been compiled in.
-//!
 #![allow(dead_code)]
 use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
 use crate::{CpuStorage, DType, Error, Layout, Result, Shape};
@@ -16,12 +14,6 @@ macro_rules! fail {
     };
 }
 
-impl CudaDevice {
-    pub fn new_with_stream(_: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-}
-
 impl crate::backend::BackendStorage for CudaStorage {
     type Device = CudaDevice;
 
@@ -87,16 +79,6 @@ impl crate::backend::BackendStorage for CudaStorage {
         Err(Error::NotCompiledWithCudaSupport)
     }
 
-    fn conv_transpose1d(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &crate::conv::ParamsConvTranspose1D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
     fn conv2d(
         &self,
         _: &Layout,
@@ -162,19 +144,6 @@ impl crate::backend::BackendStorage for CudaStorage {
         Err(Error::NotCompiledWithCudaSupport)
     }
 
-    fn copy2d(
-        &self,
-        _: &mut Self,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-        _: usize,
-    ) -> Result<()> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
     fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
         Err(Error::NotCompiledWithCudaSupport)
     }
@@ -183,10 +152,6 @@ impl crate::backend::BackendStorage for CudaStorage {
         Err(Error::NotCompiledWithCudaSupport)
     }
 
-    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
     fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self> {
         Err(Error::NotCompiledWithCudaSupport)
     }
@@ -198,10 +163,6 @@ impl crate::backend::BackendDevice for CudaDevice {
         Err(Error::NotCompiledWithCudaSupport)
     }
 
-    fn set_seed(&self, _: u64) -> Result<()> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
     fn location(&self) -> crate::DeviceLocation {
         fail!()
     }
@@ -218,22 +179,10 @@ impl crate::backend::BackendDevice for CudaDevice {
         Err(Error::NotCompiledWithCudaSupport)
     }
 
-    unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    fn storage_from_slice<T: crate::WithDType>(&self, _: &[T]) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
     fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage> {
         Err(Error::NotCompiledWithCudaSupport)
     }
 
-    fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self::Storage> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
     fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
         Err(Error::NotCompiledWithCudaSupport)
     }
@@ -241,38 +190,4 @@ impl crate::backend::BackendDevice for CudaDevice {
     fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
         Err(Error::NotCompiledWithCudaSupport)
     }
-
-    fn synchronize(&self) -> Result<()> {
-        Ok(())
-    }
 }
-
-/// This bool controls whether reduced precision reductions (e.g., with fp16 accumulation type) are
-/// allowed with f16 GEMMs.
-pub fn gemm_reduced_precision_f16() -> bool {
-    true
-}
-
-/// This bool controls whether reduced precision reductions (e.g., with fp16 accumulation type) are
-/// allowed with f16 GEMMs.
-pub fn set_gemm_reduced_precision_f16(_: bool) {}
-
-/// This bool controls whether reduced precision reductions (e.g., with bf16 accumulation type) are
-/// allowed with bf16 GEMMs.
-pub fn gemm_reduced_precision_bf16() -> bool {
-    true
-}
-
-/// This bool controls whether reduced precision reductions (e.g., with bf16 accumulation type) are
-/// allowed with bf16 GEMMs.
-pub fn set_gemm_reduced_precision_bf16(_: bool) {}
-
-/// This bool controls whether reduced precision reductions (e.g., with tf32 accumulation type) are
-/// allowed with f32 GEMMs.
-pub fn gemm_reduced_precision_f32() -> bool {
-    true
-}
-
-/// This bool controls whether reduced precision reductions (e.g., with tf32 accumulation type) are
-/// allowed with f32 GEMMs.
-pub fn set_gemm_reduced_precision_f32(_b: bool) {}
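These toggles mirror cuBLAS's reduced-precision math modes; in the dummy backend above they are no-ops that report `true`. A hedged usage sketch (the `candle_core::cuda` re-export path is assumed, and the setting only has a real effect in a CUDA-enabled build):

fn main() {
    // Trade a little accuracy for speed in f16 matmuls.
    candle_core::cuda::set_gemm_reduced_precision_f16(true);
    assert!(candle_core::cuda::gemm_reduced_precision_f16());
}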
@@ -1,252 +0,0 @@
#![allow(dead_code)]
use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
use crate::{CpuStorage, DType, Error, Layout, Result, Shape};

#[derive(Debug, Clone)]
pub struct MetalDevice;

#[derive(Debug)]
pub struct MetalStorage;

#[derive(thiserror::Error, Debug)]
pub enum MetalError {
    #[error("{0}")]
    Message(String),
}

impl From<String> for MetalError {
    fn from(e: String) -> Self {
        MetalError::Message(e)
    }
}

macro_rules! fail {
    () => {
        unimplemented!("metal support has not been enabled, add `metal` feature to enable.")
    };
}

impl crate::backend::BackendStorage for MetalStorage {
    type Device = MetalDevice;

    fn try_clone(&self, _: &Layout) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn dtype(&self) -> DType {
        fail!()
    }

    fn device(&self) -> &Self::Device {
        fail!()
    }

    fn to_cpu_storage(&self) -> Result<CpuStorage> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn powf(&self, _: &Layout, _: f64) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn elu(&self, _: &Layout, _: f64) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn to_dtype(&self, _: &Layout, _: DType) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn where_cond(&self, _: &Layout, _: &Self, _: &Layout, _: &Self, _: &Layout) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn conv1d(
        &self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: &crate::conv::ParamsConv1D,
    ) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn conv_transpose1d(
        &self,
        _l: &Layout,
        _kernel: &Self,
        _kernel_l: &Layout,
        _params: &crate::conv::ParamsConvTranspose1D,
    ) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn conv2d(
        &self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: &crate::conv::ParamsConv2D,
    ) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn conv_transpose2d(
        &self,
        _l: &Layout,
        _kernel: &Self,
        _kernel_l: &Layout,
        _params: &crate::conv::ParamsConvTranspose2D,
    ) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }
    fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn scatter_add(
        &self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: usize,
    ) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn index_add(
        &self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: &Self,
        _: &Layout,
        _: usize,
    ) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn matmul(
        &self,
        _: &Self,
        _: (usize, usize, usize, usize),
        _: &Layout,
        _: &Layout,
    ) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn copy2d(
        &self,
        _: &mut Self,
        _: usize,
        _: usize,
        _: usize,
        _: usize,
        _: usize,
        _: usize,
    ) -> Result<()> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }
}

impl crate::backend::BackendDevice for MetalDevice {
    type Storage = MetalStorage;
    fn new(_: usize) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn set_seed(&self, _: u64) -> Result<()> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn location(&self) -> crate::DeviceLocation {
        fail!()
    }

    fn same_device(&self, _: &Self) -> bool {
        fail!()
    }

    fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn storage_from_slice<T: crate::WithDType>(&self, _: &[T]) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    fn synchronize(&self) -> Result<()> {
        Ok(())
    }
}
@@ -1,5 +1,4 @@
-//! Candle-specific Error and Result
-use crate::{DType, DeviceLocation, Layout, MetalError, Shape};
+use crate::{DType, DeviceLocation, Layout, Shape};
 
 #[derive(Debug, Clone)]
 pub struct MatMulUnexpectedStriding {
@@ -9,14 +8,8 @@ pub struct MatMulUnexpectedStriding {
     pub msg: &'static str,
 }
 
-impl std::fmt::Debug for Error {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{self}")
-    }
-}
-
 /// Main library error type.
-#[derive(thiserror::Error)]
+#[derive(thiserror::Error, Debug)]
 pub enum Error {
     // === DType Errors ===
     #[error("{msg}, expected: {expected:?}, got: {got:?}")]
@ -149,9 +142,6 @@ pub enum Error {
|
|||||||
#[error("{op} expects at least one tensor")]
|
#[error("{op} expects at least one tensor")]
|
||||||
OpRequiresAtLeastOneTensor { op: &'static str },
|
OpRequiresAtLeastOneTensor { op: &'static str },
|
||||||
|
|
||||||
#[error("{op} expects at least two tensors")]
|
|
||||||
OpRequiresAtLeastTwoTensors { op: &'static str },
|
|
||||||
|
|
||||||
#[error("backward is not supported for {op}")]
|
#[error("backward is not supported for {op}")]
|
||||||
BackwardNotSupported { op: &'static str },
|
BackwardNotSupported { op: &'static str },
|
||||||
|
|
||||||
@ -159,9 +149,6 @@ pub enum Error {
|
|||||||
#[error("the candle crate has not been built with cuda support")]
|
#[error("the candle crate has not been built with cuda support")]
|
||||||
NotCompiledWithCudaSupport,
|
NotCompiledWithCudaSupport,
|
||||||
|
|
||||||
#[error("the candle crate has not been built with metal support")]
|
|
||||||
NotCompiledWithMetalSupport,
|
|
||||||
|
|
||||||
#[error("cannot find tensor {path}")]
|
#[error("cannot find tensor {path}")]
|
||||||
CannotFindTensor { path: String },
|
CannotFindTensor { path: String },
|
||||||
|
|
||||||
@ -169,13 +156,6 @@ pub enum Error {
|
|||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Cuda(Box<dyn std::error::Error + Send + Sync>),
|
Cuda(Box<dyn std::error::Error + Send + Sync>),
|
||||||
|
|
||||||
#[error("Metal error {0}")]
|
|
||||||
Metal(#[from] MetalError),
|
|
||||||
|
|
||||||
#[cfg(not(target_arch = "wasm32"))]
|
|
||||||
#[error(transparent)]
|
|
||||||
Ug(#[from] ug::Error),
|
|
||||||
|
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
TryFromIntError(#[from] core::num::TryFromIntError),
|
TryFromIntError(#[from] core::num::TryFromIntError),
|
||||||
|
|
||||||
@ -190,10 +170,6 @@ pub enum Error {
|
|||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
ParseInt(#[from] std::num::ParseIntError),
|
ParseInt(#[from] std::num::ParseIntError),
|
||||||
|
|
||||||
/// Utf8 parse error.
|
|
||||||
#[error(transparent)]
|
|
||||||
FromUtf8(#[from] std::string::FromUtf8Error),
|
|
||||||
|
|
||||||
/// I/O error.
|
/// I/O error.
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Io(#[from] std::io::Error),
|
Io(#[from] std::io::Error),
|
||||||
@ -206,14 +182,8 @@ pub enum Error {
|
|||||||
UnsupportedSafeTensorDtype(safetensors::Dtype),
|
UnsupportedSafeTensorDtype(safetensors::Dtype),
|
||||||
|
|
||||||
/// Arbitrary errors wrapping.
|
/// Arbitrary errors wrapping.
|
||||||
#[error("{0}")]
|
#[error(transparent)]
|
||||||
Wrapped(Box<dyn std::fmt::Display + Send + Sync>),
|
Wrapped(Box<dyn std::error::Error + Send + Sync>),
|
||||||
|
|
||||||
#[error("{context}\n{inner}")]
|
|
||||||
Context {
|
|
||||||
inner: Box<Self>,
|
|
||||||
context: Box<dyn std::fmt::Display + Send + Sync>,
|
|
||||||
},
|
|
||||||
|
|
||||||
/// Adding path information to an error.
|
/// Adding path information to an error.
|
||||||
#[error("path: {path:?} {inner}")]
|
#[error("path: {path:?} {inner}")]
|
||||||
@ -231,26 +201,19 @@ pub enum Error {
|
|||||||
/// User generated error message, typically created via `bail!`.
|
/// User generated error message, typically created via `bail!`.
|
||||||
#[error("{0}")]
|
#[error("{0}")]
|
||||||
Msg(String),
|
Msg(String),
|
||||||
|
|
||||||
#[error("unwrap none")]
|
|
||||||
UnwrapNone,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub type Result<T> = std::result::Result<T, Error>;
|
pub type Result<T> = std::result::Result<T, Error>;
|
||||||
|
|
||||||
impl Error {
|
impl Error {
|
||||||
pub fn wrap(err: impl std::fmt::Display + Send + Sync + 'static) -> Self {
|
pub fn wrap(err: impl std::error::Error + Send + Sync + 'static) -> Self {
|
||||||
Self::Wrapped(Box::new(err)).bt()
|
Self::Wrapped(Box::new(err)).bt()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn msg(err: impl std::fmt::Display) -> Self {
|
pub fn msg(err: impl std::error::Error + Send + Sync + 'static) -> Self {
|
||||||
Self::Msg(err.to_string()).bt()
|
Self::Msg(err.to_string()).bt()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn debug(err: impl std::fmt::Debug) -> Self {
|
|
||||||
Self::Msg(format!("{err:?}")).bt()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn bt(self) -> Self {
|
pub fn bt(self) -> Self {
|
||||||
let backtrace = std::backtrace::Backtrace::capture();
|
let backtrace = std::backtrace::Backtrace::capture();
|
||||||
match backtrace.status() {
|
match backtrace.status() {
|
||||||
@ -269,13 +232,6 @@ impl Error {
|
|||||||
path: p.as_ref().to_path_buf(),
|
path: p.as_ref().to_path_buf(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn context(self, c: impl std::fmt::Display + Send + Sync + 'static) -> Self {
|
|
||||||
Self::Context {
|
|
||||||
inner: Box::new(self),
|
|
||||||
context: Box::new(c),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[macro_export]
|
#[macro_export]
|
||||||
@ -298,41 +254,3 @@ pub fn zip<T, U>(r1: Result<T>, r2: Result<U>) -> Result<(T, U)> {
|
|||||||
(_, Err(e)) => Err(e),
|
(_, Err(e)) => Err(e),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Taken from anyhow.
|
|
||||||
pub trait Context<T> {
|
|
||||||
/// Wrap the error value with additional context.
|
|
||||||
fn context<C>(self, context: C) -> Result<T>
|
|
||||||
where
|
|
||||||
C: std::fmt::Display + Send + Sync + 'static;
|
|
||||||
|
|
||||||
/// Wrap the error value with additional context that is evaluated lazily
|
|
||||||
/// only once an error does occur.
|
|
||||||
fn with_context<C, F>(self, f: F) -> Result<T>
|
|
||||||
where
|
|
||||||
C: std::fmt::Display + Send + Sync + 'static,
|
|
||||||
F: FnOnce() -> C;
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> Context<T> for Option<T> {
|
|
||||||
fn context<C>(self, context: C) -> Result<T>
|
|
||||||
where
|
|
||||||
C: std::fmt::Display + Send + Sync + 'static,
|
|
||||||
{
|
|
||||||
match self {
|
|
||||||
Some(v) => Ok(v),
|
|
||||||
None => Err(Error::UnwrapNone.context(context).bt()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn with_context<C, F>(self, f: F) -> Result<T>
|
|
||||||
where
|
|
||||||
C: std::fmt::Display + Send + Sync + 'static,
|
|
||||||
F: FnOnce() -> C,
|
|
||||||
{
|
|
||||||
match self {
|
|
||||||
Some(v) => Ok(v),
|
|
||||||
None => Err(Error::UnwrapNone.context(f()).bt()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
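A side note on the error plumbing changed above: the left-hand side pairs #[derive(thiserror::Error)] with a hand-written Debug impl that forwards to Display, so that unwrap() panics print the human-readable message rather than the enum structure. A small self-contained sketch of that combination (assuming the thiserror crate as a dependency; names are illustrative):

use std::fmt;

#[derive(thiserror::Error)]
pub enum Error {
    #[error("{0}")]
    Msg(String),

    #[error(transparent)]
    Io(#[from] std::io::Error),
}

// Forward Debug to the Display impl generated by thiserror, as on the left
// side of the diff; `{e:?}` then prints the friendly message.
impl fmt::Debug for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{self}")
    }
}

fn main() {
    let e = Error::Msg("tensor shape mismatch".to_string());
    println!("{e:?}"); // prints: tensor shape mismatch
}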
@@ -46,31 +46,19 @@ impl Tensor {
                     current_dim += 1;
                     out
                 }
-                TensorIndexer::IndexSelect(indexes) => {
-                    if indexes.rank() != 1 {
-                        crate::bail!("multi-dimensional tensor indexing is not supported")
-                    }
-                    let out = x.index_select(&indexes.to_device(x.device())?, current_dim)?;
-                    current_dim += 1;
-                    out
-                }
-                TensorIndexer::Err(e) => crate::bail!("indexing error {e:?}"),
             };
         }
         Ok(x)
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 /// Generic structure used to index a slice of the tensor
 pub enum TensorIndexer {
-    /// This selects the elements for which an index has some specific value.
+    /// This selects the elemnts for which an index has some specific value.
     Select(usize),
     /// This is a regular slice, purely indexing a chunk of the tensor
     Narrow(Bound<usize>, Bound<usize>),
-    /// Indexing via a 1d tensor
-    IndexSelect(Tensor),
-    Err(Error),
 }
 
 impl From<usize> for TensorIndexer {
@@ -79,55 +67,36 @@ impl From<usize> for TensorIndexer {
     }
 }
 
-impl From<&[u32]> for TensorIndexer {
-    fn from(index: &[u32]) -> Self {
-        match Tensor::new(index, &crate::Device::Cpu) {
-            Ok(tensor) => TensorIndexer::IndexSelect(tensor),
-            Err(e) => TensorIndexer::Err(e),
-        }
-    }
-}
-
-impl From<Vec<u32>> for TensorIndexer {
-    fn from(index: Vec<u32>) -> Self {
-        let len = index.len();
-        match Tensor::from_vec(index, len, &crate::Device::Cpu) {
-            Ok(tensor) => TensorIndexer::IndexSelect(tensor),
-            Err(e) => TensorIndexer::Err(e),
-        }
-    }
-}
-
-impl From<&Tensor> for TensorIndexer {
-    fn from(tensor: &Tensor) -> Self {
-        TensorIndexer::IndexSelect(tensor.clone())
-    }
-}
-
-trait RB: RangeBounds<usize> {}
-impl RB for Range<usize> {}
-impl RB for RangeFrom<usize> {}
-impl RB for RangeFull {}
-impl RB for RangeInclusive<usize> {}
-impl RB for RangeTo<usize> {}
-impl RB for RangeToInclusive<usize> {}
-
-impl<T: RB> From<T> for TensorIndexer {
-    fn from(range: T) -> Self {
-        use std::ops::Bound::*;
-        let start = match range.start_bound() {
-            Included(idx) => Included(*idx),
-            Excluded(idx) => Excluded(*idx),
-            Unbounded => Unbounded,
-        };
-        let end = match range.end_bound() {
-            Included(idx) => Included(*idx),
-            Excluded(idx) => Excluded(*idx),
-            Unbounded => Unbounded,
-        };
-        TensorIndexer::Narrow(start, end)
-    }
-}
+macro_rules! impl_from_range {
+    ($range_type:ty) => {
+        impl From<$range_type> for TensorIndexer {
+            fn from(range: $range_type) -> Self {
+                use std::ops::Bound::*;
+
+                let start = match range.start_bound() {
+                    Included(idx) => Included(*idx),
+                    Excluded(idx) => Excluded(*idx),
+                    Unbounded => Unbounded,
+                };
+
+                let end = match range.end_bound() {
+                    Included(idx) => Included(*idx),
+                    Excluded(idx) => Excluded(*idx),
+                    Unbounded => Unbounded,
+                };
+
+                TensorIndexer::Narrow(start, end)
+            }
+        }
+    };
+}
+
+impl_from_range!(Range<usize>);
+impl_from_range!(RangeFrom<usize>);
+impl_from_range!(RangeFull);
+impl_from_range!(RangeInclusive<usize>);
+impl_from_range!(RangeTo<usize>);
+impl_from_range!(RangeToInclusive<usize>);
 
 /// Trait used to implement multiple signatures for ease of use of the slicing
 /// of a tensor
@@ -141,117 +110,28 @@ impl<T> IndexOp<T> for Tensor
 where
     T: Into<TensorIndexer>,
 {
-    ///```rust
-    /// use candle_core::{Tensor, DType, Device, IndexOp};
-    /// let a = Tensor::new(&[
-    ///     [0., 1.],
-    ///     [2., 3.],
-    ///     [4., 5.]
-    /// ], &Device::Cpu)?;
-    ///
-    /// let b = a.i(0)?;
-    /// assert_eq!(b.shape().dims(), &[2]);
-    /// assert_eq!(b.to_vec1::<f64>()?, &[0., 1.]);
-    ///
-    /// let c = a.i(..2)?;
-    /// assert_eq!(c.shape().dims(), &[2, 2]);
-    /// assert_eq!(c.to_vec2::<f64>()?, &[
-    ///     [0., 1.],
-    ///     [2., 3.]
-    /// ]);
-    ///
-    /// let d = a.i(1..)?;
-    /// assert_eq!(d.shape().dims(), &[2, 2]);
-    /// assert_eq!(d.to_vec2::<f64>()?, &[
-    ///     [2., 3.],
-    ///     [4., 5.]
-    /// ]);
-    /// # Ok::<(), candle_core::Error>(())
-    /// ```
     fn i(&self, index: T) -> Result<Tensor, Error> {
         self.index(&[index.into()])
     }
 }
 
-impl<A> IndexOp<(A,)> for Tensor
-where
-    A: Into<TensorIndexer>,
-{
-    ///```rust
-    /// use candle_core::{Tensor, DType, Device, IndexOp};
-    /// let a = Tensor::new(&[
-    ///     [0f32, 1.],
-    ///     [2. , 3.],
-    ///     [4. , 5.]
-    /// ], &Device::Cpu)?;
-    ///
-    /// let b = a.i((0,))?;
-    /// assert_eq!(b.shape().dims(), &[2]);
-    /// assert_eq!(b.to_vec1::<f32>()?, &[0., 1.]);
-    ///
-    /// let c = a.i((..2,))?;
-    /// assert_eq!(c.shape().dims(), &[2, 2]);
-    /// assert_eq!(c.to_vec2::<f32>()?, &[
-    ///     [0., 1.],
-    ///     [2., 3.]
-    /// ]);
-    ///
-    /// let d = a.i((1..,))?;
-    /// assert_eq!(d.shape().dims(), &[2, 2]);
-    /// assert_eq!(d.to_vec2::<f32>()?, &[
-    ///     [2., 3.],
-    ///     [4., 5.]
-    /// ]);
-    /// # Ok::<(), candle_core::Error>(())
-    /// ```
-    fn i(&self, (a,): (A,)) -> Result<Tensor, Error> {
-        self.index(&[a.into()])
-    }
-}
-#[allow(non_snake_case)]
-impl<A, B> IndexOp<(A, B)> for Tensor
-where
-    A: Into<TensorIndexer>,
-    B: Into<TensorIndexer>,
-{
-    ///```rust
-    /// use candle_core::{Tensor, DType, Device, IndexOp};
-    /// let a = Tensor::new(&[[0f32, 1., 2.], [3., 4., 5.], [6., 7., 8.]], &Device::Cpu)?;
-    ///
-    /// let b = a.i((1, 0))?;
-    /// assert_eq!(b.to_vec0::<f32>()?, 3.);
-    ///
-    /// let c = a.i((..2, 1))?;
-    /// assert_eq!(c.shape().dims(), &[2]);
-    /// assert_eq!(c.to_vec1::<f32>()?, &[1., 4.]);
-    ///
-    /// let d = a.i((2.., ..))?;
-    /// assert_eq!(c.shape().dims(), &[2]);
-    /// assert_eq!(c.to_vec1::<f32>()?, &[1., 4.]);
-    /// # Ok::<(), candle_core::Error>(())
-    /// ```
-    fn i(&self, (a, b): (A, B)) -> Result<Tensor, Error> {
-        self.index(&[a.into(), b.into()])
-    }
-}
-
 macro_rules! index_op_tuple {
-    ($doc:tt, $($t:ident),+) => {
+    ($($t:ident),+) => {
         #[allow(non_snake_case)]
         impl<$($t),*> IndexOp<($($t,)*)> for Tensor
         where
            $($t: Into<TensorIndexer>,)*
        {
-            #[doc=$doc]
             fn i(&self, ($($t,)*): ($($t,)*)) -> Result<Tensor, Error> {
                 self.index(&[$($t.into(),)*])
             }
        }
    };
 }
 
-index_op_tuple!("see [TensorIndex#method.i]", A, B, C);
-index_op_tuple!("see [TensorIndex#method.i]", A, B, C, D);
-index_op_tuple!("see [TensorIndex#method.i]", A, B, C, D, E);
-index_op_tuple!("see [TensorIndex#method.i]", A, B, C, D, E, F);
-index_op_tuple!("see [TensorIndex#method.i]", A, B, C, D, E, F, G);
+index_op_tuple!(A);
+index_op_tuple!(A, B);
+index_op_tuple!(A, B, C);
+index_op_tuple!(A, B, C, D);
+index_op_tuple!(A, B, C, D, E);
+index_op_tuple!(A, B, C, D, E, F);
+index_op_tuple!(A, B, C, D, E, F, G);
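The impl_from_range! macro on the right and the generic impl<T: RB> From<T> on the left do the same thing: copy the borrowed Bound<&usize> values returned by the RangeBounds accessors into owned Bound<usize> values. A std-only sketch of that conversion (hypothetical helper name):

use std::ops::{Bound, RangeBounds};

// `start_bound()`/`end_bound()` hand back references, so each bound is
// pattern-matched and copied into an owned `Bound<usize>`.
fn to_owned_bounds<R: RangeBounds<usize>>(range: R) -> (Bound<usize>, Bound<usize>) {
    let start = match range.start_bound() {
        Bound::Included(i) => Bound::Included(*i),
        Bound::Excluded(i) => Bound::Excluded(*i),
        Bound::Unbounded => Bound::Unbounded,
    };
    let end = match range.end_bound() {
        Bound::Included(i) => Bound::Included(*i),
        Bound::Excluded(i) => Bound::Excluded(*i),
        Bound::Unbounded => Bound::Unbounded,
    };
    (start, end)
}

fn main() {
    assert_eq!(to_owned_bounds(1..4), (Bound::Included(1), Bound::Excluded(4)));
    assert_eq!(to_owned_bounds(..=2), (Bound::Unbounded, Bound::Included(2)));
}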
@@ -1,4 +1,3 @@
-//! Tensor Layouts including contiguous or sparse strides
 use crate::{Error, Result, Shape};
 
 #[derive(Debug, PartialEq, Eq, Clone)]
@@ -36,12 +35,6 @@ impl Layout {
         self.shape.dims()
     }
 
-    /// The dimension size for a specified dimension index.
-    pub fn dim<D: crate::shape::Dim>(&self, dim: D) -> Result<usize> {
-        let dim = dim.to_index(&self.shape, "dim")?;
-        Ok(self.dims()[dim])
-    }
-
     pub fn shape(&self) -> &Shape {
         &self.shape
     }
@@ -77,7 +70,7 @@ impl Layout {
         self.shape.is_fortran_contiguous(&self.stride)
     }
 
-    pub fn narrow(&self, dim: usize, start: usize, len: usize) -> Result<Self> {
+    pub(crate) fn narrow(&self, dim: usize, start: usize, len: usize) -> Result<Self> {
         let dims = self.shape().dims();
         if dim >= dims.len() {
             Err(Error::DimOutOfRange {
@@ -106,7 +99,7 @@ impl Layout {
         })
     }
 
-    pub fn transpose(&self, dim1: usize, dim2: usize) -> Result<Self> {
+    pub(crate) fn transpose(&self, dim1: usize, dim2: usize) -> Result<Self> {
         let rank = self.shape.rank();
         if rank <= dim1 || rank <= dim2 {
             Err(Error::UnexpectedNumberOfDims {
@@ -127,7 +120,7 @@ impl Layout {
         })
     }
 
-    pub fn permute(&self, idxs: &[usize]) -> Result<Self> {
+    pub(crate) fn permute(&self, idxs: &[usize]) -> Result<Self> {
         let is_permutation =
             idxs.len() == self.shape.rank() && (0..idxs.len()).all(|i| idxs.contains(&i));
         if !is_permutation {
|
|||||||
//!
|
//!
|
||||||
//! let a = Tensor::arange(0f32, 6f32, &Device::Cpu)?.reshape((2, 3))?;
|
//! let a = Tensor::arange(0f32, 6f32, &Device::Cpu)?.reshape((2, 3))?;
|
||||||
//! let b = Tensor::arange(0f32, 12f32, &Device::Cpu)?.reshape((3, 4))?;
|
//! let b = Tensor::arange(0f32, 12f32, &Device::Cpu)?.reshape((3, 4))?;
|
||||||
//! let c = a.matmul(&b)?;
|
|
||||||
//!
|
//!
|
||||||
|
//! let c = a.matmul(&b)?;
|
||||||
//! # Ok(())}
|
//! # Ok(())}
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! ## Features
|
//! ## Features
|
||||||
//!
|
//!
|
||||||
//! - Simple syntax (looks and feels like PyTorch)
|
//! - Simple syntax (looks and like PyTorch)
|
||||||
//! - CPU and Cuda backends (and M1 support)
|
//! - CPU and Cuda backends (and M1 support)
|
||||||
//! - Enable serverless (CPU) small and fast deployments
|
//! - Enable serverless (CPU) small and fast deployments
|
||||||
//! - Model training
|
//! - Model training
|
||||||
@ -32,91 +32,60 @@
|
|||||||
//! Python can really add overhead in more complex workflows and the [GIL](https://www.backblaze.com/blog/the-python-gil-past-present-and-future/) is a notorious source of headaches.
|
//! Python can really add overhead in more complex workflows and the [GIL](https://www.backblaze.com/blog/the-python-gil-past-present-and-future/) is a notorious source of headaches.
|
||||||
//!
|
//!
|
||||||
//! Rust is cool, and a lot of the HF ecosystem already has Rust crates [safetensors](https://github.com/huggingface/safetensors) and [tokenizers](https://github.com/huggingface/tokenizers)
|
//! Rust is cool, and a lot of the HF ecosystem already has Rust crates [safetensors](https://github.com/huggingface/safetensors) and [tokenizers](https://github.com/huggingface/tokenizers)
|
||||||
//!
|
|
||||||
//! ## Other Crates
|
|
||||||
//!
|
|
||||||
//! Candle consists of a number of crates. This crate holds core the common data structures but you may wish
|
|
||||||
//! to look at the docs for the other crates which can be found here:
|
|
||||||
//!
|
|
||||||
//! - [candle-core](https://docs.rs/candle-core/). Core Datastructures and DataTypes.
|
|
||||||
//! - [candle-nn](https://docs.rs/candle-nn/). Building blocks for Neural Nets.
|
|
||||||
//! - [candle-datasets](https://docs.rs/candle-datasets/). Rust access to commonly used Datasets like MNIST.
|
|
||||||
//! - [candle-examples](https://docs.rs/candle-examples/). Examples of Candle in Use.
|
|
||||||
//! - [candle-onnx](https://docs.rs/candle-onnx/). Loading and using ONNX models.
|
|
||||||
//! - [candle-pyo3](https://docs.rs/candle-pyo3/). Access to Candle from Python.
|
|
||||||
//! - [candle-transformers](https://docs.rs/candle-transformers/). Candle implemntation of many published transformer models.
|
|
||||||
//!
|
|
||||||
|
|
||||||
#[cfg(feature = "accelerate")]
|
#[cfg(feature = "accelerate")]
|
||||||
mod accelerate;
|
mod accelerate;
|
||||||
pub mod backend;
|
pub mod backend;
|
||||||
pub mod backprop;
|
pub mod backprop;
|
||||||
pub mod conv;
|
mod conv;
|
||||||
mod convert;
|
mod convert;
|
||||||
pub mod cpu;
|
pub mod cpu;
|
||||||
pub mod cpu_backend;
|
pub mod cpu_backend;
|
||||||
#[cfg(feature = "cuda")]
|
#[cfg(feature = "cuda")]
|
||||||
pub mod cuda_backend;
|
pub mod cuda_backend;
|
||||||
mod custom_op;
|
#[cfg(feature = "cudnn")]
|
||||||
|
pub mod cudnn;
|
||||||
mod device;
|
mod device;
|
||||||
pub mod display;
|
pub mod display;
|
||||||
mod dtype;
|
mod dtype;
|
||||||
pub mod dummy_cuda_backend;
|
mod dummy_cuda_backend;
|
||||||
mod dummy_metal_backend;
|
|
||||||
pub mod error;
|
pub mod error;
|
||||||
mod indexer;
|
mod indexer;
|
||||||
pub mod layout;
|
pub mod layout;
|
||||||
#[cfg(feature = "metal")]
|
|
||||||
pub mod metal_backend;
|
|
||||||
#[cfg(feature = "mkl")]
|
#[cfg(feature = "mkl")]
|
||||||
mod mkl;
|
mod mkl;
|
||||||
pub mod npy;
|
pub mod npy;
|
||||||
pub mod op;
|
mod op;
|
||||||
pub mod pickle;
|
pub mod pickle;
|
||||||
pub mod quantized;
|
pub mod quantized;
|
||||||
pub mod safetensors;
|
pub mod safetensors;
|
||||||
pub mod scalar;
|
pub mod scalar;
|
||||||
pub mod shape;
|
pub mod shape;
|
||||||
mod sort;
|
|
||||||
mod storage;
|
mod storage;
|
||||||
pub mod streaming;
|
|
||||||
mod strided_index;
|
mod strided_index;
|
||||||
mod tensor;
|
mod tensor;
|
||||||
mod tensor_cat;
|
|
||||||
pub mod test_utils;
|
pub mod test_utils;
|
||||||
pub mod utils;
|
pub mod utils;
|
||||||
mod variable;
|
mod variable;
|
||||||
|
|
||||||
#[cfg(feature = "cudnn")]
|
pub use cpu_backend::CpuStorage;
|
||||||
pub use cuda_backend::cudnn;
|
pub use device::{Device, DeviceLocation};
|
||||||
|
pub use dtype::{DType, FloatDType, IntDType, WithDType};
|
||||||
pub use cpu_backend::{CpuStorage, CpuStorageRef};
|
pub use error::{Error, Result};
|
||||||
pub use custom_op::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3, UgIOp1};
|
pub use indexer::IndexOp;
|
||||||
pub use device::{Device, DeviceLocation, NdArray};
|
|
||||||
pub use dtype::{DType, DTypeParseError, FloatDType, IntDType, WithDType};
|
|
||||||
pub use error::{Context, Error, Result};
|
|
||||||
pub use indexer::{IndexOp, TensorIndexer};
|
|
||||||
pub use layout::Layout;
|
pub use layout::Layout;
|
||||||
|
pub use op::{CustomOp1, CustomOp2, CustomOp3};
|
||||||
pub use shape::{Shape, D};
|
pub use shape::{Shape, D};
|
||||||
pub use storage::Storage;
|
pub use storage::Storage;
|
||||||
pub use streaming::{StreamTensor, StreamingBinOp, StreamingModule};
|
|
||||||
pub use strided_index::{StridedBlocks, StridedIndex};
|
pub use strided_index::{StridedBlocks, StridedIndex};
|
||||||
pub use tensor::{Tensor, TensorId};
|
pub use tensor::{Tensor, TensorId};
|
||||||
pub use variable::Var;
|
pub use variable::Var;
|
||||||
|
|
||||||
#[cfg(feature = "cuda")]
|
#[cfg(feature = "cuda")]
|
||||||
pub use cuda_backend as cuda;
|
pub use cuda_backend::{CudaDevice, CudaStorage};
|
||||||
|
|
||||||
#[cfg(not(feature = "cuda"))]
|
#[cfg(not(feature = "cuda"))]
|
||||||
pub use dummy_cuda_backend as cuda;
|
pub use dummy_cuda_backend::{CudaDevice, CudaStorage};
|
||||||
|
|
||||||
pub use cuda::{CudaDevice, CudaStorage};
|
|
||||||
|
|
||||||
#[cfg(feature = "metal")]
|
|
||||||
pub use metal_backend::{MetalDevice, MetalError, MetalStorage};
|
|
||||||
|
|
||||||
#[cfg(not(feature = "metal"))]
|
|
||||||
pub use dummy_metal_backend::{MetalDevice, MetalError, MetalStorage};
|
|
||||||
|
|
||||||
#[cfg(feature = "mkl")]
|
#[cfg(feature = "mkl")]
|
||||||
extern crate intel_mkl_src;
|
extern crate intel_mkl_src;
|
||||||
@ -140,34 +109,19 @@ impl ToUsize2 for (usize, usize) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Defining a module with forward method using a single argument.
|
// A simple trait defining a module with forward method using a single argument.
|
||||||
pub trait Module {
|
pub trait Module: std::fmt::Debug {
|
||||||
fn forward(&self, xs: &Tensor) -> Result<Tensor>;
|
fn forward(&self, xs: &Tensor) -> Result<Tensor>;
|
||||||
|
|
||||||
|
/// Change the module to use training mode vs eval mode.
|
||||||
|
///
|
||||||
|
/// The default implementation does nothing as this is only used for a couple modules such as
|
||||||
|
/// dropout or batch-normalization.
|
||||||
|
fn set_training(&mut self, _training: bool) {}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: Fn(&Tensor) -> Result<Tensor>> Module for T {
|
impl Module for quantized::QMatMul {
|
||||||
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
|
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
|
||||||
self(xs)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<M: Module> Module for Option<&M> {
|
|
||||||
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
|
|
||||||
match self {
|
|
||||||
None => Ok(xs.clone()),
|
|
||||||
Some(m) => m.forward(xs),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A single forward method using a single single tensor argument and a flag to
|
|
||||||
/// separate the training and evaluation behaviors.
|
|
||||||
pub trait ModuleT {
|
|
||||||
fn forward_t(&self, xs: &Tensor, train: bool) -> Result<Tensor>;
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<M: Module> ModuleT for M {
|
|
||||||
fn forward_t(&self, xs: &Tensor, _train: bool) -> Result<Tensor> {
|
|
||||||
self.forward(xs)
|
self.forward(xs)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
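The blanket impl<T: Fn(&Tensor) -> Result<Tensor>> Module for T removed in the last hunk is what lets a plain closure act as a layer. A minimal standalone mock of the same trick (illustrative types standing in for Tensor, not candle's):

type Result<T> = std::result::Result<T, String>;

trait Module {
    fn forward(&self, xs: &f32) -> Result<f32>;
}

// Blanket impl: any closure with the right signature is a Module.
impl<T: Fn(&f32) -> Result<f32>> Module for T {
    fn forward(&self, xs: &f32) -> Result<f32> {
        self(xs)
    }
}

fn main() -> Result<()> {
    let relu = |x: &f32| -> Result<f32> { Ok(x.max(0.0)) };
    assert_eq!(relu.forward(&-1.0)?, 0.0);
    Ok(())
}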
@@ -1,340 +0,0 @@
-use crate::{DType, Result};
-use candle_metal_kernels::Kernels;
-use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger};
-use std::collections::HashMap;
-use std::path::Path;
-use std::sync::{Arc, Mutex, RwLock};
-
-use super::MetalError;
-
-/// Unique identifier for cuda devices.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
-pub struct DeviceId(usize);
-
-impl DeviceId {
-    pub(crate) fn new() -> Self {
-        // https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805
-        use std::sync::atomic;
-        static COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::new(1);
-        Self(COUNTER.fetch_add(1, atomic::Ordering::Relaxed))
-    }
-}
-
-type BufferMap = HashMap<(NSUInteger, MTLResourceOptions), Vec<Arc<Buffer>>>;
-pub(crate) struct Commands {
-    /// Single command queue for the entire device.
-    command_queue: CommandQueue,
-    /// One command buffer at a time.
-    /// The scheduler works by allowing multiple
-    /// [ComputeCommandEncoder](https://developer.apple.com/documentation/metal/mtlcomputecommandencoder?language=objc)
-    /// on a single command buffer. Using a single command buffer would be fastest on the GPU but
-    /// prevents overlapping of CPU and GPU commands (because command buffer needs to be committed
-    /// to start to work).
-    /// Despite what the documentation says, command buffers are NOT ordered. They are ordered
-    /// for their START time, but there's no guarantee that command buffer1 will finish before
-    /// command buffer2 starts (or there are metal bugs there)
-    command_buffer: CommandBuffer,
-    /// Keeps track of the current amount of compute command encoders on the current
-    /// command buffer
-    /// Arc, RwLock because of the interior mutability.
-    command_buffer_index: usize,
-    /// The maximum amount of [compute command encoder](https://developer.apple.com/documentation/metal/mtlcomputecommandencoder?language=objc) per [command buffer](https://developer.apple.com/documentation/metal/mtlcommandbuffer?language=objc)
-    compute_per_buffer: usize,
-}
-
-impl Commands {
-    pub(crate) fn new(command_queue: CommandQueue) -> Result<Self> {
-        let command_buffer = command_queue.new_command_buffer().to_owned();
-        command_buffer.enqueue();
-        let compute_per_buffer = match std::env::var("CANDLE_METAL_COMPUTE_PER_BUFFER") {
-            Ok(val) => val.parse()?,
-            _ => 50,
-        };
-        Ok(Self {
-            command_queue,
-            command_buffer,
-            command_buffer_index: 0,
-            compute_per_buffer,
-        })
-    }
-
-    pub fn command_buffer(&mut self) -> Result<(bool, CommandBuffer)> {
-        let mut command_buffer = self.command_buffer.to_owned();
-        let mut flushed = false;
-        if self.command_buffer_index > self.compute_per_buffer {
-            self.command_buffer.commit();
-            command_buffer = self.command_queue.new_command_buffer().to_owned();
-            self.command_buffer = command_buffer.clone();
-            self.command_buffer_index = 0;
-            flushed = true;
-        }
-        self.command_buffer_index += 1;
-        Ok((flushed, command_buffer))
-    }
-
-    pub fn wait_until_completed(&mut self) -> Result<()> {
-        match self.command_buffer.status() {
-            metal::MTLCommandBufferStatus::Committed
-            | metal::MTLCommandBufferStatus::Scheduled
-            | metal::MTLCommandBufferStatus::Completed => {
-                panic!("Already committed");
-            }
-            _ => {}
-        }
-        self.command_buffer.commit();
-        self.command_buffer.wait_until_completed();
-        self.command_buffer = self.command_queue.new_command_buffer().to_owned();
-
-        Ok(())
-    }
-}
-
-#[derive(Clone)]
-pub struct MetalDevice {
-    /// Unique identifier, the registryID is not sufficient as it identifies the GPU rather than
-    /// the device itself.
-    pub(crate) id: DeviceId,
-
-    /// Raw metal device: <https://developer.apple.com/documentation/metal/mtldevice?language=objc>
-    pub(crate) device: metal::Device,
-
-    pub(crate) commands: Arc<RwLock<Commands>>,
-
-    /// Simple allocator struct.
-    /// The buffers are stored in size buckets since ML tends to use similar shapes over and over.
-    /// We store the buffers in [`Arc`] because it's much faster than Obj-c internal ref counting
-    /// (could be linked to FFI communication overhead).
-    ///
-    /// Whenever a buffer has a strong_count==1, we can reuse it, it means it was dropped in the
-    /// graph calculation, and only we the allocator kept a reference to it, therefore it's free
-    /// to be reused. However, in order for this to work, we need to guarantee the order of
-    /// operation, so that this buffer is not being used by another kernel at the same time.
-    /// Arc is the CPU reference count, it doesn't mean anything on the GPU side of things.
-    ///
-    /// Whenever we actually allocate a new buffer, we make a full sweep to clean up unused buffers
-    /// (strong_count = 1).
-    pub(crate) buffers: Arc<RwLock<BufferMap>>,
-
-    /// Simple keeper struct to keep track of the already compiled kernels so we can reuse them.
-    /// Heavily used by [`candle_metal_kernels`]
-    pub(crate) kernels: Arc<Kernels>,
-    /// Seed for random number generation.
-    pub(crate) seed: Arc<Mutex<Buffer>>,
-}
-
-impl std::fmt::Debug for MetalDevice {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "MetalDevice({:?})", self.id)
-    }
-}
-
-impl std::ops::Deref for MetalDevice {
-    type Target = metal::DeviceRef;
-
-    fn deref(&self) -> &Self::Target {
-        &self.device
-    }
-}
-
-impl MetalDevice {
-    #[cfg(not(target_arch = "wasm32"))]
-    pub fn compile(
-        &self,
-        func_name: &'static str,
-        kernel: ug::lang::ssa::Kernel,
-    ) -> Result<metal::ComputePipelineState> {
-        let mut buf = vec![];
-        ug_metal::code_gen::gen(&mut buf, func_name, &kernel)?;
-        let metal_code = String::from_utf8(buf)?;
-        let lib = self
-            .device
-            .new_library_with_source(&metal_code, &metal::CompileOptions::new())
-            .map_err(MetalError::from)?;
-        let func = lib
-            .get_function(func_name, None)
-            .map_err(MetalError::from)?;
-        let pl = self
-            .device
-            .new_compute_pipeline_state_with_function(&func)
-            .map_err(MetalError::from)?;
-        Ok(pl)
-    }
-
-    pub fn id(&self) -> DeviceId {
-        self.id
-    }
-
-    pub fn metal_device(&self) -> &metal::Device {
-        &self.device
-    }
-
-    fn drop_unused_buffers(&self) -> Result<()> {
-        let mut buffers = self.buffers.write().map_err(MetalError::from)?;
-        for subbuffers in buffers.values_mut() {
-            let newbuffers = subbuffers
-                .iter()
-                .filter(|s| Arc::strong_count(*s) > 1)
-                .map(Arc::clone)
-                .collect();
-            *subbuffers = newbuffers;
-        }
-        Ok(())
-    }
-
-    pub fn command_buffer(&self) -> Result<CommandBuffer> {
-        let mut commands = self.commands.write().map_err(MetalError::from)?;
-        let (flushed, command_buffer) = commands.command_buffer()?;
-        if flushed {
-            self.drop_unused_buffers()?
-        }
-        Ok(command_buffer)
-    }
-
-    pub fn wait_until_completed(&self) -> Result<()> {
-        let mut commands = self.commands.write().map_err(MetalError::from)?;
-        commands.wait_until_completed()
-    }
-
-    pub fn kernels(&self) -> &Kernels {
-        &self.kernels
-    }
-
-    pub fn device(&self) -> &metal::Device {
-        &self.device
-    }
-
-    /// Creates a new buffer (not necessarily zeroed).
-    /// The buffer is [MTLPrivate](https://developer.apple.com/documentation/metal/mtlstoragemode)
-    /// This means the buffer data cannot be read on the CPU directly.
-    ///
-    /// [`name`] is only used to keep track of the resource origin in case of bugs
-    pub fn new_buffer(
-        &self,
-        element_count: usize,
-        dtype: DType,
-        name: &str,
-    ) -> Result<Arc<Buffer>> {
-        let size = (element_count * dtype.size_in_bytes()) as NSUInteger;
-        self.allocate_buffer(size, MTLResourceOptions::StorageModePrivate, name)
-    }
-
-    /// Creates a new buffer (not necessarily zeroed).
-    /// The buffer is [MTLManaged](https://developer.apple.com/documentation/metal/mtlstoragemode)
-    /// This means the buffer can be read on the CPU but will require manual
-    /// synchronization when the CPU memory is modified
-    /// Used as a bridge to gather data back from the GPU
-    pub fn new_buffer_managed(&self, size: NSUInteger) -> Result<Arc<Buffer>> {
-        self.allocate_buffer(size, MTLResourceOptions::StorageModeManaged, "managed")
-    }
-
-    /// Creates a new buffer from data.
-    /// The buffer is [MTLManaged](https://developer.apple.com/documentation/metal/mtlstoragemode)
-    ///
-    /// Does not require synchronization, as [newBufferWithBytes](https://developer.apple.com/documentation/metal/mtldevice/1433429-newbufferwithbytes)
-    /// allocates the buffer and copies over the existing data before returning the MTLBuffer.
-    pub fn new_buffer_with_data<T>(&self, data: &[T]) -> Result<Arc<Buffer>> {
-        let size = core::mem::size_of_val(data) as NSUInteger;
-        let new_buffer = self.device.new_buffer_with_data(
-            data.as_ptr().cast(),
-            size,
-            MTLResourceOptions::StorageModeManaged,
-        );
-        let mut buffers = self.buffers.write().map_err(MetalError::from)?;
-
-        let subbuffers = buffers
-            .entry((size, MTLResourceOptions::StorageModeManaged))
-            .or_insert(vec![]);
-
-        let new_buffer = Arc::new(new_buffer);
-        subbuffers.push(new_buffer.clone());
-        Ok(new_buffer)
-    }
-
-    pub fn allocate_zeros(&self, size_in_bytes: usize) -> Result<Arc<Buffer>> {
-        let buffer = self.allocate_buffer(
-            size_in_bytes as NSUInteger,
-            MTLResourceOptions::StorageModePrivate,
-            "allocate_zeros",
-        )?;
-        let command_buffer = self.command_buffer()?;
-        command_buffer.set_label("zeros");
-        let blit = command_buffer.new_blit_command_encoder();
-        blit.fill_buffer(
-            &buffer,
-            metal::NSRange {
-                location: 0,
-                length: buffer.length(),
-            },
-            0,
-        );
-        blit.end_encoding();
-        Ok(buffer)
-    }
-
-    /// The critical allocator algorithm
-    fn allocate_buffer(
-        &self,
-        size: NSUInteger,
-        option: MTLResourceOptions,
-        _name: &str,
-    ) -> Result<Arc<Buffer>> {
-        let mut buffers = self.buffers.write().map_err(MetalError::from)?;
-        if let Some(b) = find_available_buffer(size, option, &buffers) {
-            // Cloning also ensures we increment the strong count
-            return Ok(b.clone());
-        }
-
-        let size = buf_size(size);
-        let subbuffers = buffers.entry((size, option)).or_insert(vec![]);
-
-        let new_buffer = self.device.new_buffer(size as NSUInteger, option);
-        let new_buffer = Arc::new(new_buffer);
-        subbuffers.push(new_buffer.clone());
-
-        Ok(new_buffer)
-    }
-
-    /// Create a metal GPU capture trace on [`path`].
-    pub fn capture<P: AsRef<Path>>(&self, path: P) -> Result<()> {
-        let capture = metal::CaptureManager::shared();
-        let descriptor = metal::CaptureDescriptor::new();
-        descriptor.set_destination(metal::MTLCaptureDestination::GpuTraceDocument);
-        descriptor.set_capture_device(self);
-        // The [set_output_url] call requires an absolute path so we convert it if needed.
-        if path.as_ref().is_absolute() {
-            descriptor.set_output_url(path);
-        } else {
-            let path = std::env::current_dir()?.join(path);
-            descriptor.set_output_url(path);
-        }
-
-        capture
-            .start_capture(&descriptor)
-            .map_err(MetalError::from)?;
-        Ok(())
-    }
-}
-
-fn buf_size(size: NSUInteger) -> NSUInteger {
-    size.saturating_sub(1).next_power_of_two() as NSUInteger
-}
-
-fn find_available_buffer(
-    size: NSUInteger,
-    option: MTLResourceOptions,
-    buffers: &BufferMap,
-) -> Option<Arc<Buffer>> {
-    let mut best_buffer: Option<&Arc<Buffer>> = None;
-    let mut best_buffer_size: NSUInteger = NSUInteger::MAX;
-    for ((buffer_size, buffer_option), subbuffers) in buffers.iter() {
-        if buffer_size >= &size && buffer_size < &best_buffer_size && buffer_option == &option {
-            for sub in subbuffers {
-                if Arc::strong_count(sub) == 1 {
-                    best_buffer = Some(sub);
-                    best_buffer_size = *buffer_size;
-                }
-            }
-        }
-    }
-    best_buffer.cloned()
-}
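The removed file's doc comments describe the allocator policy precisely: buffers are bucketed by rounded-up size, and a buffer whose Arc strong count is 1 is referenced only by the allocator itself, so it is free for reuse. A self-contained sketch of that policy, using plain Vec<u8> buffers in place of Metal buffers (illustrative names):

use std::collections::HashMap;
use std::sync::Arc;

// Bucket sizes to the next power of two, like `buf_size` above.
fn bucket(size: usize) -> usize {
    size.saturating_sub(1).next_power_of_two()
}

fn allocate(pool: &mut HashMap<usize, Vec<Arc<Vec<u8>>>>, size: usize) -> Arc<Vec<u8>> {
    let key = bucket(size);
    let bucket_vec = pool.entry(key).or_default();
    // strong_count == 1 means only the pool still holds the buffer.
    if let Some(buf) = bucket_vec.iter().find(|b| Arc::strong_count(b) == 1) {
        return buf.clone(); // reuse: cloning bumps the strong count
    }
    let buf = Arc::new(vec![0u8; key]);
    bucket_vec.push(buf.clone());
    buf
}

fn main() {
    let mut pool: HashMap<usize, Vec<Arc<Vec<u8>>>> = HashMap::new();
    let a = allocate(&mut pool, 1000); // rounds up to the 1024 bucket
    drop(a); // strong count back to 1 -> reusable
    let b = allocate(&mut pool, 900); // same bucket, reuses the buffer
    assert_eq!(pool[&1024].len(), 1);
    assert_eq!(b.len(), 1024);
}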
File diff suppressed because it is too large
@@ -333,16 +333,6 @@ pub fn vd_tanh_inplace(y: &mut [f64]) {
     unsafe { ffi::vdTanh(y.len() as i32, y.as_ptr(), y.as_mut_ptr()) }
 }
 
-#[inline]
-pub fn vs_exp_inplace(y: &mut [f32]) {
-    unsafe { ffi::vsExp(y.len() as i32, y.as_ptr(), y.as_mut_ptr()) }
-}
-
-#[inline]
-pub fn vd_exp_inplace(y: &mut [f64]) {
-    unsafe { ffi::vdExp(y.len() as i32, y.as_ptr(), y.as_mut_ptr()) }
-}
-
 #[inline]
 pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
     for (&v, y) in vs.iter().zip(ys.iter_mut()) {
@@ -365,28 +355,6 @@ pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
     }
 }
 
-#[inline]
-pub fn vs_silu(vs: &[f32], ys: &mut [f32]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vs_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
-#[inline]
-pub fn vd_silu(vs: &[f64], ys: &mut [f64]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = -v
-    }
-    vd_exp_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = v / (1.0 + *y)
-    }
-}
-
 macro_rules! binary_op {
     ($fn_name:ident, $ty:ty, $mkl_name:ident) => {
         #[inline]
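The removed vs_silu/vd_silu compute silu in two passes so the middle step can use the vectorized MKL exponential: first write -v into the output, exponentiate in place, then divide. A quick standalone check that this equals silu(v) = v * sigmoid(v):

fn silu_two_pass(vs: &[f32], ys: &mut [f32]) {
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
        *y = (-v).exp(); // stands in for the vectorized vsExp call on -v
    }
    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
        *y = v / (1.0 + *y);
    }
}

fn main() {
    let vs = [-2.0f32, 0.0, 3.0];
    let mut ys = [0.0f32; 3];
    silu_two_pass(&vs, &mut ys);
    for (&v, &y) in vs.iter().zip(ys.iter()) {
        let reference = v / (1.0 + (-v).exp()); // v * sigmoid(v)
        assert!((y - reference).abs() < 1e-6);
    }
}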
@@ -250,6 +250,8 @@ impl Tensor {
         if header.fortran_order {
             return Err(Error::Npy("fortran order not supported".to_string()));
         }
+        let mut data: Vec<u8> = vec![];
+        reader.read_to_end(&mut data)?;
         Self::from_reader(header.shape(), header.descr, &mut reader)
     }
 
@@ -330,7 +332,7 @@ impl Tensor {
         path: P,
     ) -> Result<()> {
         let mut zip = zip::ZipWriter::new(File::create(path.as_ref())?);
-        let options: zip::write::FileOptions<()> =
+        let options =
             zip::write::FileOptions::default().compression_method(zip::CompressionMethod::Stored);
 
         for (name, tensor) in ts.iter() {
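For reference, the write path touched above follows the usual zip-crate pattern; a sketch is below. It assumes a 0.6-era zip API where the options type is inferred (matching the right-hand side; the FileOptions<()> annotation on the left comes from newer zip versions where the options type is generic over extra-field data), and the payload bytes are placeholders, not a valid .npy file:

use std::fs::File;
use std::io::Write;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut zip = zip::ZipWriter::new(File::create("demo.npz")?);
    // npz archives store .npy members uncompressed.
    let options =
        zip::write::FileOptions::default().compression_method(zip::CompressionMethod::Stored);
    zip.start_file("tensor.npy", options)?;
    zip.write_all(b"\x93NUMPY...")?; // placeholder bytes
    zip.finish()?;
    Ok(())
}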
@@ -1,7 +1,5 @@
-//! Tensor Opertion Enums and Traits
-//!
 #![allow(clippy::redundant_closure_call)]
-use crate::Tensor;
+use crate::{CpuStorage, CudaStorage, Layout, Result, Shape, Tensor};
 use half::{bf16, f16};
 use num_traits::float::Float;
@@ -60,15 +58,8 @@ pub enum UnaryOp {
     Sqr,
     Sqrt,
     Gelu,
-    GeluErf,
-    Erf,
     Relu,
-    Silu,
     Tanh,
-    Floor,
-    Ceil,
-    Round,
-    Sign,
 }
 
 #[derive(Clone)]
@@ -94,16 +85,6 @@ pub enum Op {
         dilation: usize,
     },
 
-    #[allow(dead_code)]
-    ConvTranspose1D {
-        arg: Tensor,
-        kernel: Tensor,
-        padding: usize,
-        output_padding: usize,
-        stride: usize,
-        dilation: usize,
-    },
-
     #[allow(dead_code)]
     Conv2D {
         arg: Tensor,
@@ -135,15 +116,7 @@ pub enum Op {
         stride: (usize, usize),
     },
 
-    UpsampleNearest1D {
-        arg: Tensor,
-        target_size: usize,
-    },
-    UpsampleNearest2D {
-        arg: Tensor,
-        target_h: usize,
-        target_w: usize,
-    },
+    UpsampleNearest2D(Tensor),
 
     Cat(Vec<Tensor>, usize),
 
@@ -157,30 +130,132 @@ pub enum Op {
     Copy(Tensor),
     Broadcast(Tensor),
     Narrow(Tensor, usize, usize, usize),
-    SliceScatter0(Tensor, Tensor, usize),
     Reshape(Tensor),
     ToDevice(Tensor),
     Transpose(Tensor, usize, usize),
     Permute(Tensor, Vec<usize>),
     Elu(Tensor, f64),
     Powf(Tensor, f64),
-    CustomOp1(
-        Tensor,
-        std::sync::Arc<Box<dyn crate::CustomOp1 + Send + Sync>>,
-    ),
+    CustomOp1(Tensor, std::sync::Arc<Box<dyn CustomOp1 + Send + Sync>>),
     CustomOp2(
         Tensor,
         Tensor,
-        std::sync::Arc<Box<dyn crate::CustomOp2 + Send + Sync>>,
+        std::sync::Arc<Box<dyn CustomOp2 + Send + Sync>>,
     ),
     CustomOp3(
         Tensor,
         Tensor,
         Tensor,
-        std::sync::Arc<Box<dyn crate::CustomOp3 + Send + Sync>>,
+        std::sync::Arc<Box<dyn CustomOp3 + Send + Sync>>,
     ),
 }
 
+/// Unary ops that can be defined in user-land.
+pub trait CustomOp1 {
+    // Box<dyn> does not support const yet, so use a function to get the name.
+    fn name(&self) -> &'static str;
+
+    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)>;
+
+    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result<(CudaStorage, Shape)> {
+        Err(crate::Error::Cuda(
+            format!("no cuda implementation for {}", self.name()).into(),
+        ))
+    }
+
+    /// This function takes as argument the argument `arg` used in the forward pass, the result
+    /// produced by the forward operation `res` and the gradient of the result `grad_res`.
+    /// The function should return the gradient of the argument.
+    fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Result<Option<Tensor>> {
+        Err(crate::Error::BackwardNotSupported { op: self.name() })
+    }
+}
+
+pub trait CustomOp2 {
+    fn name(&self) -> &'static str;
+
+    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cpu_fwd(
+        &self,
+        s1: &CpuStorage,
+        l1: &Layout,
+        s2: &CpuStorage,
+        l2: &Layout,
+    ) -> Result<(CpuStorage, Shape)>;
+
+    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cuda_fwd(
+        &self,
+        _: &CudaStorage,
+        _: &Layout,
+        _: &CudaStorage,
+        _: &Layout,
+    ) -> Result<(CudaStorage, Shape)> {
+        Err(crate::Error::Cuda(
+            format!("no cuda implementation for {}", self.name()).into(),
+        ))
+    }
+
+    fn bwd(
+        &self,
+        _arg1: &Tensor,
+        _arg2: &Tensor,
+        _res: &Tensor,
+        _grad_res: &Tensor,
+    ) -> Result<(Option<Tensor>, Option<Tensor>)> {
+        Err(crate::Error::BackwardNotSupported { op: self.name() })
+    }
+}
+
+pub trait CustomOp3 {
+    fn name(&self) -> &'static str;
+
+    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cpu_fwd(
+        &self,
+        s1: &CpuStorage,
+        l1: &Layout,
+        s2: &CpuStorage,
+        l2: &Layout,
+        s3: &CpuStorage,
+        l3: &Layout,
+    ) -> Result<(CpuStorage, Shape)>;
+
+    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
+    /// offsets etc so the associated layout should be used to access it.
+    fn cuda_fwd(
+        &self,
+        _: &CudaStorage,
+        _: &Layout,
+        _: &CudaStorage,
+        _: &Layout,
+        _: &CudaStorage,
+        _: &Layout,
+    ) -> Result<(CudaStorage, Shape)> {
+        Err(crate::Error::Cuda(
+            format!("no cuda implementation for {}", self.name()).into(),
+        ))
+    }
+
+    fn bwd(
+        &self,
+        _arg1: &Tensor,
+        _arg2: &Tensor,
+        _arg3: &Tensor,
+        _res: &Tensor,
+        _grad_res: &Tensor,
+    ) -> Result<(Option<Tensor>, Option<Tensor>, Option<Tensor>)> {
+        Err(crate::Error::BackwardNotSupported { op: self.name() })
+    }
+}
+
 pub trait UnaryOpT {
     const NAME: &'static str;
     const KERNEL: &'static str;
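For orientation, here is a sketch of what implementing the CustomOp1 trait added above could look like, written against the signatures shown in this hunk (illustrative only; it assumes a contiguous f32 input and leaves cuda_fwd/bwd to their default error implementations):

use candle_core::{CpuStorage, CustomOp1, Layout, Result, Shape};

// Toy op: negate an f32 tensor on CPU. A real implementation should honor the
// layout's offsets/strides, as the doc comment above requires.
struct Neg1;

impl CustomOp1 for Neg1 {
    fn name(&self) -> &'static str {
        "neg1"
    }

    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)> {
        let data = match storage {
            CpuStorage::F32(vs) => vs.iter().map(|&v| -v).collect::<Vec<_>>(),
            _ => candle_core::bail!("neg1 only supports f32"),
        };
        Ok((CpuStorage::F32(data), layout.shape().clone()))
    }
}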
@@ -249,15 +324,8 @@ pub(crate) struct Recip;
 pub(crate) struct Sqr;
 pub(crate) struct Sqrt;
 pub(crate) struct Gelu;
-pub(crate) struct GeluErf;
-pub(crate) struct Erf;
 pub(crate) struct Relu;
-pub(crate) struct Silu;
 pub(crate) struct Tanh;
-pub(crate) struct Floor;
-pub(crate) struct Ceil;
-pub(crate) struct Round;
-pub(crate) struct Sign;
 
 macro_rules! bin_op {
     ($op:ident, $name: literal, $e: expr, $f32_vec: ident, $f64_vec: ident) => {
@@ -456,20 +524,13 @@ unary_op!(Log, "log", v, v.ln(), vs_ln, vd_ln);
 unary_op!(Sin, "sin", v, v.sin(), vs_sin, vd_sin);
 unary_op!(Cos, "cos", v, v.cos(), vs_cos, vd_cos);
 unary_op!(Tanh, "tanh", v, v.tanh(), vs_tanh, vd_tanh);
+unary_op!(Abs, "abs", v, v.abs());
 unary_op!(Neg, "neg", v, -v);
 unary_op!(Recip, "recip", v, v.recip());
 unary_op!(Sqr, "sqr", v, v * v, vs_sqr, vd_sqr);
 unary_op!(Sqrt, "sqrt", v, v.sqrt(), vs_sqrt, vd_sqrt);
 
-// Hardcode the value for sqrt(2/pi)
-// https://github.com/huggingface/candle/issues/1982
-#[allow(clippy::excessive_precision)]
-const SQRT_TWO_OVER_PI_F32: f32 = 0.79788456080286535587989211986876373;
-#[allow(clippy::excessive_precision)]
-const SQRT_TWO_OVER_PI_F64: f64 = 0.79788456080286535587989211986876373;
-
-/// Tanh based approximation of the `gelu` operation
-/// GeluErf is the more precise one.
+/// `gelu` operation
 /// <https://en.wikipedia.org/wiki/Activation_function#Comparison_of_activation_functions>
 impl UnaryOpT for Gelu {
     const NAME: &'static str = "gelu";
@@ -480,7 +541,7 @@ impl UnaryOpT for Gelu {
             * v
             * (bf16::ONE
                 + bf16::tanh(
-                    bf16::from_f32_const(SQRT_TWO_OVER_PI_F32)
+                    (bf16::from_f32_const(2.0) / bf16::PI).sqrt()
                         * v
                         * (bf16::ONE + bf16::from_f32_const(0.044715) * v * v),
                 ))
@@ -491,18 +552,22 @@ impl UnaryOpT for Gelu {
             * v
             * (f16::ONE
                 + f16::tanh(
-                    f16::from_f32_const(SQRT_TWO_OVER_PI_F32)
+                    (f16::from_f32_const(2.0) / f16::PI).sqrt()
                         * v
                         * (f16::ONE + f16::from_f32_const(0.044715) * v * v),
                 ))
     }
     #[inline(always)]
     fn f32(v: f32) -> f32 {
-        0.5 * v * (1.0 + f32::tanh(SQRT_TWO_OVER_PI_F32 * v * (1.0 + 0.044715 * v * v)))
+        0.5 * v
+            * (1.0
+                + f32::tanh((2.0f32 / std::f32::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)))
     }
     #[inline(always)]
     fn f64(v: f64) -> f64 {
-        0.5 * v * (1.0 + f64::tanh(SQRT_TWO_OVER_PI_F64 * v * (1.0 + 0.044715 * v * v)))
+        0.5 * v
+            * (1.0
+                + f64::tanh((2.0f64 / std::f64::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)))
     }
     #[inline(always)]
     fn u8(_: u8) -> u8 {
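A quick numeric check of the change above: hardcoding sqrt(2/pi) versus computing it at runtime gives the same tanh-approximated gelu up to float rounding (standalone; tolerances are illustrative):

const SQRT_TWO_OVER_PI_F32: f32 = 0.79788456080286535587989211986876373;

fn gelu_const(v: f32) -> f32 {
    0.5 * v * (1.0 + f32::tanh(SQRT_TWO_OVER_PI_F32 * v * (1.0 + 0.044715 * v * v)))
}

fn gelu_runtime(v: f32) -> f32 {
    0.5 * v
        * (1.0 + f32::tanh((2.0f32 / std::f32::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)))
}

fn main() {
    for &v in &[-3.0f32, -0.5, 0.0, 0.5, 3.0] {
        assert!((gelu_const(v) - gelu_runtime(v)).abs() < 1e-6);
    }
    // Sanity check against the well-known value gelu(1) ~= 0.8412.
    assert!((gelu_const(1.0) - 0.8412).abs() < 1e-3);
}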
@@ -535,301 +600,6 @@ impl UnaryOpT for Gelu {
     fn f64_vec(xs: &[f64], ys: &mut [f64]) {
         crate::mkl::vd_gelu(xs, ys)
     }
-
-    #[cfg(feature = "accelerate")]
-    const F32_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
-        crate::accelerate::vs_gelu(xs, ys)
-    }
-
-    #[cfg(feature = "accelerate")]
-    const F64_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
-        crate::accelerate::vd_gelu(xs, ys)
-    }
-}
-
-/// `erf` operation
-/// <https://en.wikipedia.org/wiki/Error_function>
-impl UnaryOpT for Erf {
-    const NAME: &'static str = "erf";
-    const KERNEL: &'static str = "uerf";
-    const V: Self = Erf;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        bf16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        f16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        Self::f64(v as f64) as f32
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        crate::cpu::erf::erf(v)
-    }
-    #[inline(always)]
-    fn u8(_: u8) -> u8 {
-        0
-    }
-    #[inline(always)]
-    fn u32(_: u32) -> u32 {
-        0
-    }
-    #[inline(always)]
-    fn i64(_: i64) -> i64 {
-        0
-    }
-}
-
-/// Silu operation
-impl UnaryOpT for Silu {
-    const NAME: &'static str = "silu";
-    const V: Self = Silu;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v / (bf16::ONE + (-v).exp())
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v / (f16::ONE + (-v).exp())
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v / (1.0 + (-v).exp())
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v / (1.0 + (-v).exp())
-    }
-    #[inline(always)]
-    fn u8(_: u8) -> u8 {
-        0
-    }
-    #[inline(always)]
-    fn u32(_: u32) -> u32 {
-        0
-    }
-    #[inline(always)]
-    fn i64(_: i64) -> i64 {
-        0
-    }
-    const KERNEL: &'static str = "usilu";
-
-    #[cfg(feature = "mkl")]
-    const F32_VEC: bool = true;
-
-    #[cfg(feature = "mkl")]
-    #[inline(always)]
-    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
-        crate::mkl::vs_silu(xs, ys)
-    }
-
-    #[cfg(feature = "mkl")]
-    const F64_VEC: bool = true;
-
-    #[cfg(feature = "mkl")]
-    #[inline(always)]
-    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
-        crate::mkl::vd_silu(xs, ys)
-    }
-
-    #[cfg(feature = "accelerate")]
-    const F32_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
-        crate::accelerate::vs_silu(xs, ys)
-    }
-
-    #[cfg(feature = "accelerate")]
-    const F64_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
-        crate::accelerate::vd_silu(xs, ys)
-    }
-}
-
-impl UnaryOpT for Abs {
-    const NAME: &'static str = "abs";
-    const KERNEL: &'static str = "uabs";
-    const V: Self = Abs;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.abs()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v.abs()
-    }
-}
-
-impl UnaryOpT for Ceil {
-    const NAME: &'static str = "ceil";
-    const KERNEL: &'static str = "uceil";
-    const V: Self = Ceil;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v
-    }
-}
-
-impl UnaryOpT for Floor {
-    const NAME: &'static str = "floor";
-    const KERNEL: &'static str = "ufloor";
-    const V: Self = Floor;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v
-    }
-}
-
-impl UnaryOpT for Round {
-    const NAME: &'static str = "round";
-    const KERNEL: &'static str = "uround";
-    const V: Self = Round;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.round()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.round()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.round()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.round()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v
-    }
-}
-
-impl UnaryOpT for GeluErf {
-    const NAME: &'static str = "gelu_erf";
-    const KERNEL: &'static str = "ugelu_erf";
-    const V: Self = GeluErf;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        bf16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        f16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        Self::f64(v as f64) as f32
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        (crate::cpu::erf::erf(v / 2f64.sqrt()) + 1.) * 0.5 * v
-    }
-    #[inline(always)]
-    fn u8(_: u8) -> u8 {
-        0
-    }
-    #[inline(always)]
-    fn u32(_: u32) -> u32 {
-        0
-    }
-    #[inline(always)]
-    fn i64(_: i64) -> i64 {
-        0
-    }
-}
 }
 
 impl UnaryOpT for Relu {
@@ -917,10 +687,6 @@ impl BackpropOp {
         };
         Self(op)
     }
-
-    pub(crate) fn is_none(&self) -> bool {
-        self.0.is_none()
-    }
 }
 
 impl std::ops::Deref for BackpropOp {
@@ -929,37 +695,3 @@ impl std::ops::Deref for BackpropOp {
         &self.0
     }
 }
-
-impl UnaryOpT for Sign {
-    const NAME: &'static str = "sign";
-    const KERNEL: &'static str = "usign";
-    const V: Self = Sign;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        bf16::from((v > bf16::ZERO) as i8) - bf16::from((v < bf16::ZERO) as i8)
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        f16::from((v > f16::ZERO) as i8) - f16::from((v < f16::ZERO) as i8)
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        f32::from(v > 0.) - f32::from(v < 0.)
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        f64::from(v > 0.) - f64::from(v < 0.)
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        u8::min(1, v)
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        u32::min(1, v)
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        (v > 0) as i64 - (v < 0) as i64
-    }
-}
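
The removed Sign impl computes sign(x) branchlessly as (x > 0) minus (x < 0), cast through an integer; for floats this also maps NaN to 0 since both comparisons are false. A standalone sketch of the same trick:

    // Branchless sign, as in the removed Sign impl.
    fn sign_f64(v: f64) -> f64 {
        f64::from(v > 0.) - f64::from(v < 0.)
    }

    fn main() {
        assert_eq!(sign_f64(3.5), 1.0);
        assert_eq!(sign_f64(-2.0), -1.0);
        assert_eq!(sign_f64(0.0), 0.0);
        assert_eq!(sign_f64(f64::NAN), 0.0); // both comparisons are false for NaN
    }
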
diff --git a/candle-core/src/pickle.rs b/candle-core/src/pickle.rs
@@ -1,7 +1,7 @@
-//! Just enough pickle support to be able to read PyTorch checkpoints.
+// Just enough pickle support to be able to read PyTorch checkpoints.
 // This hardcodes objects that are required for tensor reading, we may want to make this a bit more
 // composable/tensor agnostic at some point.
-use crate::{Context, DType, Error as E, Layout, Result, Tensor};
+use crate::{DType, Error as E, Layout, Result, Tensor};
 use byteorder::{LittleEndian, ReadBytesExt};
 use std::collections::HashMap;
 use std::io::BufRead;
@@ -42,10 +42,9 @@ pub enum OpCode {
     Stop = b'.',
     NewObj = 0x81,
     EmptyList = b']',
-    BinFloat = b'G',
+    BinFloat = b'g',
     Append = b'a',
     Appends = b'e',
-    Long1 = 0x8a,
 }
 
 // Avoid using FromPrimitive so as not to drag another dependency.
@@ -85,7 +84,6 @@ impl TryFrom<u8> for OpCode {
             b'G' => Ok(Self::BinFloat),
             b'a' => Ok(Self::Append),
             b'e' => Ok(Self::Appends),
-            0x8a => Ok(Self::Long1),
             value => Err(value),
         }
     }
@@ -108,7 +106,6 @@ pub enum Object {
         class_name: String,
     },
     Int(i32),
-    Long(i64),
     Float(f64),
     Unicode(String),
     Bool(bool),
@@ -173,14 +170,6 @@ impl Object {
         }
     }
 
-    pub fn int_or_long(self) -> OResult<i64> {
-        match self {
-            Self::Int(t) => Ok(t as i64),
-            Self::Long(t) => Ok(t),
-            _ => Err(self),
-        }
-    }
-
     pub fn tuple(self) -> OResult<Vec<Self>> {
         match self {
             Self::Tuple(t) => Ok(t),
@@ -204,55 +193,6 @@ impl Object {
             _ => Err(self),
         }
     }
-
-    pub fn into_tensor_info(
-        self,
-        name: Self,
-        dir_name: &std::path::Path,
-    ) -> Result<Option<TensorInfo>> {
-        let name = match name.unicode() {
-            Ok(name) => name,
-            Err(_) => return Ok(None),
-        };
-        let (callable, args) = match self.reduce() {
-            Ok(callable_args) => callable_args,
-            _ => return Ok(None),
-        };
-        let (callable, args) = match callable {
-            Object::Class {
-                module_name,
-                class_name,
-            } if module_name == "torch._tensor" && class_name == "_rebuild_from_type_v2" => {
-                let mut args = args.tuple()?;
-                let callable = args.remove(0);
-                let args = args.remove(1);
-                (callable, args)
-            }
-            Object::Class {
-                module_name,
-                class_name,
-            } if module_name == "torch._utils" && class_name == "_rebuild_parameter" => {
-                let mut args = args.tuple()?;
-                args.remove(0).reduce()?
-            }
-            _ => (callable, args),
-        };
-        match callable {
-            Object::Class {
-                module_name,
-                class_name,
-            } if module_name == "torch._utils" && class_name == "_rebuild_tensor_v2" => {}
-            _ => return Ok(None),
-        };
-        let (layout, dtype, file_path, storage_size) = rebuild_args(args)?;
-        Ok(Some(TensorInfo {
-            name,
-            dtype,
-            layout,
-            path: format!("{}/{}", dir_name.to_string_lossy(), file_path),
-            storage_size,
-        }))
-    }
 }
 
 impl TryFrom<Object> for String {
@@ -361,10 +301,8 @@ impl Stack {
                 module_name,
                 class_name,
             } => {
-                if module_name == "collections"
-                    && (class_name == "OrderedDict" || class_name == "defaultdict")
-                {
-                    // TODO: have a separate ordered dict and a separate default dict.
+                if module_name == "collections" && class_name == "OrderedDict" {
+                    // TODO: have a separate ordered dict.
                     Some(Object::Dict(vec![]))
                 } else {
                     None
@@ -473,10 +411,7 @@ impl Stack {
                 self.push(Object::Int(arg))
             }
             OpCode::BinFloat => {
-                // Somehow floats are encoded using BigEndian whereas int types use LittleEndian.
-                // https://github.com/python/cpython/blob/0c80da4c14d904a367968955544dd6ae58c8101c/Lib/pickletools.py#L855
-                // https://github.com/pytorch/pytorch/blob/372d078f361e726bb4ac0884ac334b04c58179ef/torch/_weights_only_unpickler.py#L243
-                let arg = r.read_f64::<byteorder::BigEndian>()?;
+                let arg = r.read_f64::<LittleEndian>()?;
                 self.push(Object::Float(arg))
            }
             OpCode::BinUnicode => {
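
The removed comment explains the quirk this hunk is about: pickle's BINFLOAT payload is big-endian even though the integer opcodes are little-endian, hence the switch from LittleEndian to byteorder::BigEndian on the base branch. A standalone sketch of the difference using the same byteorder crate:

    use byteorder::{BigEndian, LittleEndian, ReadBytesExt};

    fn main() -> std::io::Result<()> {
        // 1.0f64 as pickled by BINFLOAT: big-endian IEEE-754 bytes.
        let payload = [0x3f, 0xf0, 0, 0, 0, 0, 0, 0];
        let mut r: &[u8] = &payload;
        let be = r.read_f64::<BigEndian>()?;
        let mut r: &[u8] = &payload;
        let le = r.read_f64::<LittleEndian>()?;
        assert_eq!(be, 1.0); // correct decoding
        assert_ne!(le, 1.0); // byte-swapped value, not 1.0
        Ok(())
    }
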
@@ -548,7 +483,7 @@ impl Stack {
                     crate::bail!("setitems: not an even number of objects")
                 }
                 while let Some(value) = objs.pop() {
-                    let key = objs.pop().context("empty objs")?;
+                    let key = objs.pop().unwrap();
                     d.push((key, value))
                 }
             } else {
@@ -568,7 +503,7 @@ impl Stack {
                     crate::bail!("setitems: not an even number of objects")
                 }
                 while let Some(value) = objs.pop() {
-                    let key = objs.pop().context("empty objs")?;
+                    let key = objs.pop().unwrap();
                     pydict.push((key, value))
                 }
                 self.push(Object::Dict(pydict))
@@ -601,15 +536,6 @@ impl Stack {
                 let obj = self.new_obj(class, args)?;
                 self.push(obj)
            }
-            OpCode::Long1 => {
-                let n_bytes = r.read_u8()?;
-                let mut v = 0;
-                // Decode the next n bytes in little endian
-                for i in 0..n_bytes {
-                    v |= (r.read_u8()? as i64) << (i * 8);
-                }
-                self.push(Object::Long(v))
-            }
         }
         Ok(false)
     }
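
The removed LONG1 handler reads a one-byte length n and then assembles the integer from the next n little-endian bytes. A standalone sketch of that loop (like the removed code, it does not sign-extend negative pickled longs and assumes at most eight payload bytes):

    // Little-endian LONG1 payload decoding, mirroring the removed opcode handler.
    // Shifting past bit 63 would overflow, so payloads longer than 8 bytes are out of scope.
    fn decode_long1(bytes: &[u8]) -> i64 {
        let mut v = 0i64;
        for (i, b) in bytes.iter().enumerate() {
            v |= (*b as i64) << (i * 8);
        }
        v
    }

    fn main() {
        // 0x0201 stored little-endian as [0x01, 0x02].
        assert_eq!(decode_long1(&[0x01, 0x02]), 0x0201);
    }
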
@@ -627,10 +553,10 @@ fn rebuild_args(args: Object) -> Result<(Layout, DType, String, usize)> {
     let mut args = args.tuple()?;
     let stride = Vec::<usize>::try_from(args.remove(3))?;
     let size = Vec::<usize>::try_from(args.remove(2))?;
-    let offset = args.remove(1).int_or_long()? as usize;
+    let offset = args.remove(1).int()? as usize;
     let storage = args.remove(0).persistent_load()?;
     let mut storage = storage.tuple()?;
-    let storage_size = storage.remove(4).int_or_long()? as usize;
+    let storage_size = storage.remove(4).int()? as usize;
     let path = storage.remove(2).unicode()?;
     let (_module_name, class_name) = storage.remove(1).class()?;
     let dtype = match class_name.as_str() {
@@ -639,16 +565,11 @@ fn rebuild_args(args: Object) -> Result<(Layout, DType, String, usize)> {
         "HalfStorage" => DType::F16,
         "BFloat16Storage" => DType::BF16,
         "ByteStorage" => DType::U8,
-        "LongStorage" => DType::I64,
         other => {
            crate::bail!("unsupported storage type {other}")
         }
     };
-    let layout = Layout::new(
-        crate::Shape::from(size),
-        stride,
-        offset * dtype.size_in_bytes(),
-    );
+    let layout = Layout::new(crate::Shape::from(size), stride, offset);
     Ok((layout, dtype, path, storage_size))
 }
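
The layout change above is about units: the einsum-cus side stores the pickled storage offset as-is, while the base branch multiplies by dtype.size_in_bytes() so the offset can later be consumed as a byte count when skipping into the zip entry (see the PthTensors::get hunk further down). A sketch of that skip-without-seek idiom on a plain std::io::Read:

    use std::io::Read;

    // Discard `start_offset` bytes from a non-seekable stream, as the base
    // branch does before Tensor::from_reader.
    fn skip_bytes<R: Read>(reader: &mut R, start_offset: u64) -> std::io::Result<u64> {
        std::io::copy(&mut reader.by_ref().take(start_offset), &mut std::io::sink())
    }

    fn main() -> std::io::Result<()> {
        let data = [0u8; 16];
        let mut reader: &[u8] = &data;
        let skipped = skip_bytes(&mut reader, 4)?;
        assert_eq!(skipped, 4);
        assert_eq!(reader.len(), 12); // 12 bytes left to read
        Ok(())
    }
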
@@ -661,16 +582,9 @@ pub struct TensorInfo {
     pub storage_size: usize,
 }
 
-/// Read the tensor info from a .pth file.
-///
-/// # Arguments
-/// * `file` - The path to the .pth file.
-/// * `verbose` - Whether to print debug information.
-/// * `key` - Optional key to retrieve `state_dict` from the pth file.
 pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
     file: P,
     verbose: bool,
-    key: Option<&str>,
 ) -> Result<Vec<TensorInfo>> {
     let file = std::fs::File::open(file)?;
     let zip_reader = std::io::BufReader::new(file);
@@ -685,16 +599,15 @@ pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
         if !file_name.ends_with("data.pkl") {
             continue;
         }
-        let dir_name = std::path::PathBuf::from(file_name.strip_suffix(".pkl").context("no .pkl")?);
+        let dir_name = std::path::PathBuf::from(file_name.strip_suffix(".pkl").unwrap());
         let reader = zip.by_name(file_name)?;
         let mut reader = std::io::BufReader::new(reader);
         let mut stack = Stack::empty();
         stack.read_loop(&mut reader)?;
         let obj = stack.finalize()?;
         if VERBOSE || verbose {
-            println!("{obj:#?}");
+            println!("{obj:?}");
         }
 
         let obj = match obj {
             Object::Build { callable, args } => match *callable {
                 Object::Reduce { callable, args: _ } => match *callable {
@@ -708,30 +621,52 @@ pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
             },
             obj => obj,
         };
 
-        // If key is provided, then we need to extract the state_dict from the object.
-        let obj = if let Some(key) = key {
-            if let Object::Dict(key_values) = obj {
-                key_values
-                    .into_iter()
-                    .find(|(k, _)| *k == Object::Unicode(key.to_owned()))
-                    .map(|(_, v)| v)
-                    .ok_or_else(|| E::Msg(format!("key {key} not found")))?
-            } else {
-                obj
-            }
-        } else {
-            obj
-        };
-
-        // If the object is a dict, then we can extract the tensor info from it.
-        // NOTE: We are assuming that the `obj` is state_dict by this stage.
         if let Object::Dict(key_values) = obj {
             for (name, value) in key_values.into_iter() {
-                match value.into_tensor_info(name, &dir_name) {
-                    Ok(Some(tensor_info)) => tensor_infos.push(tensor_info),
-                    Ok(None) => {}
-                    Err(err) => eprintln!("skipping: {err:?}"),
-                }
+                let name = match name.unicode() {
+                    Ok(name) => name,
+                    Err(_) => continue,
+                };
+                let (callable, args) = match value.reduce() {
+                    Ok(callable_args) => callable_args,
+                    _ => continue,
+                };
+                let (callable, args) = match callable {
+                    Object::Class {
+                        module_name,
+                        class_name,
+                    } if module_name == "torch._tensor"
+                        && class_name == "_rebuild_from_type_v2" =>
+                    {
+                        let mut args = args.tuple()?;
+                        let callable = args.remove(0);
+                        let args = args.remove(1);
+                        (callable, args)
+                    }
+                    _ => (callable, args),
+                };
+                match callable {
+                    Object::Class {
+                        module_name,
+                        class_name,
+                    } if module_name == "torch._utils" && class_name == "_rebuild_tensor_v2" => {}
+                    _ => continue,
+                };
+                match rebuild_args(args) {
+                    Ok((layout, dtype, file_path, storage_size)) => {
+                        let mut path = dir_name.clone();
+                        path.push(file_path);
+                        tensor_infos.push(TensorInfo {
+                            name,
+                            dtype,
+                            layout,
+                            path: path.to_string_lossy().into_owned(),
+                            storage_size,
+                        })
+                    }
+                    Err(err) => {
+                        eprintln!("skipping {name}: {err:?}")
+                    }
+                }
             }
         }
     }
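
The removed key handling scans the decoded Object::Dict for the entry whose key matches the requested state_dict name and keeps only its value. Stripped of the pickle types, it is a find over (key, value) pairs; a generic sketch:

    // Generic version of the removed state_dict lookup: find a value by string
    // key in a decoded (key, value) list, mirroring Object::Dict's layout.
    fn find_by_key<V>(entries: Vec<(String, V)>, key: &str) -> Option<V> {
        entries.into_iter().find(|(k, _)| k == key).map(|(_, v)| v)
    }

    fn main() {
        let dict = vec![("epoch".to_string(), 3), ("step".to_string(), 1200)];
        assert_eq!(find_by_key(dict, "step"), Some(1200));
    }
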
@@ -748,8 +683,8 @@ pub struct PthTensors {
 }
 
 impl PthTensors {
-    pub fn new<P: AsRef<std::path::Path>>(path: P, key: Option<&str>) -> Result<Self> {
-        let tensor_infos = read_pth_tensor_info(path.as_ref(), false, key)?;
+    pub fn new<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
+        let tensor_infos = read_pth_tensor_info(path.as_ref(), false)?;
         let tensor_infos = tensor_infos
             .into_iter()
             .map(|ti| (ti.name.to_string(), ti))
@@ -763,7 +698,6 @@ impl PthTensors {
     }
 
     pub fn get(&self, name: &str) -> Result<Option<Tensor>> {
-        use std::io::Read;
         let tensor_info = match self.tensor_infos.get(name) {
             None => return Ok(None),
             Some(tensor_info) => tensor_info,
@@ -772,70 +706,20 @@ impl PthTensors {
         let zip_reader = std::io::BufReader::new(std::fs::File::open(&self.path)?);
         let mut zip = zip::ZipArchive::new(zip_reader)?;
         let mut reader = zip.by_name(&tensor_info.path)?;
-        let is_fortran_contiguous = tensor_info.layout.is_fortran_contiguous();
-        let rank = tensor_info.layout.shape().rank();
 
-        // Reading the data is a bit tricky as it can be strided, for now only support the basic
-        // case and when the tensor is fortran contiguous.
-        if !tensor_info.layout.is_contiguous() && !is_fortran_contiguous {
+        // Reading the data is a bit tricky as it can be strided, use an offset, etc.
+        // For now only support the basic case.
+        if tensor_info.layout.start_offset() != 0 || !tensor_info.layout.is_contiguous() {
             crate::bail!(
                 "cannot retrieve non-contiguous tensors {:?}",
                 tensor_info.layout
             )
         }
-        let start_offset = tensor_info.layout.start_offset();
-        if start_offset > 0 {
-            std::io::copy(
-                &mut reader.by_ref().take(start_offset as u64),
-                &mut std::io::sink(),
-            )?;
-        }
         let tensor = Tensor::from_reader(
             tensor_info.layout.shape().clone(),
             tensor_info.dtype,
             &mut reader,
         )?;
-        if rank > 1 && is_fortran_contiguous {
-            // Reverse the shape, e.g. Shape(2, 3, 4) -> Shape(4, 3, 2)
-            let shape_reversed: Vec<_> = tensor_info.layout.dims().iter().rev().cloned().collect();
-            let tensor = tensor.reshape(shape_reversed)?;
-
-            // Permute (transpose) the dimensions, e.g. Shape(4, 3, 2) -> Shape(2, 3, 4)
-            let dim_indeces_reversed: Vec<_> = (0..rank).rev().collect();
-            let tensor = tensor.permute(dim_indeces_reversed)?;
-            Ok(Some(tensor))
-        } else {
-            Ok(Some(tensor))
-        }
+        Ok(Some(tensor))
     }
 }
 
-/// Read all the tensors from a PyTorch pth file with a given key.
-///
-/// # Arguments
-/// * `path` - Path to the pth file.
-/// * `key` - Optional key to retrieve `state_dict` from the pth file. Sometimes the pth file
-///   contains multiple objects and the state_dict is the one we are interested in.
-pub fn read_all_with_key<P: AsRef<std::path::Path>>(
-    path: P,
-    key: Option<&str>,
-) -> Result<Vec<(String, Tensor)>> {
-    let pth = PthTensors::new(path, key)?;
-    let tensor_names = pth.tensor_infos.keys();
-    let mut tensors = Vec::with_capacity(tensor_names.len());
-    for name in tensor_names {
-        if let Some(tensor) = pth.get(name)? {
-            tensors.push((name.to_string(), tensor))
-        }
-    }
-    Ok(tensors)
-}
-
-/// Read all the tensors from a PyTorch pth file.
-///
-/// # Arguments
-/// * `path` - Path to the pth file.
-pub fn read_all<P: AsRef<std::path::Path>>(path: P) -> Result<Vec<(String, Tensor)>> {
-    read_all_with_key(path, None)
-}
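
The removed fortran-contiguous path reads a column-major buffer with its dimensions reversed and then permutes the axes back, which amounts to transposing the raw data. A small candle sketch of that reverse-then-permute round trip (assuming the usual candle_core Tensor API; the extra contiguous call just materializes the permuted view):

    use candle_core::{Device, Result, Tensor};

    fn main() -> Result<()> {
        // A 2x3 tensor stored column-major lays out as [0, 3, 1, 4, 2, 5].
        let raw = [0f32, 3., 1., 4., 2., 5.];
        // Read it with the shape reversed (3, 2)...
        let t = Tensor::from_slice(&raw, (3, 2), &Device::Cpu)?;
        // ...then permute the axes back to (2, 3).
        let t = t.permute((1, 0))?.contiguous()?;
        assert_eq!(t.to_vec2::<f32>()?, vec![vec![0., 1., 2.], vec![3., 4., 5.]]);
        Ok(())
    }
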
diff --git a/candle-core/src/quantized/avx.rs b/candle-core/src/quantized/avx.rs
@@ -50,9 +50,14 @@ pub(crate) unsafe fn mul_sum_i8_pairs_float(x: __m256i, y: __m256i) -> __m256 {
 #[inline(always)]
 pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
     let qk = QK8_0;
+    let nb = n / qk;
     if n % QK8_0 != 0 {
         crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
     }
+    if nb % 2 != 0 {
+        crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
+    }
+
     unsafe {
         let mut acc = _mm256_setzero_ps();
         for (x, y) in xs.iter().zip(ys.iter()) {
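
For reference, the quantity this kernel accumulates is a per-block scaled integer dot product: a Q4_0 block packs 32 weights as 4-bit values biased by 8 next to one scale, and a Q8_0 block holds 32 i8 values and a scale. A scalar sketch of the same reduction (the struct layouts below follow the usual ggml-style convention and are assumptions here, with f32 scales instead of f16 for brevity):

    // Scalar model of vec_dot_q4_0_q8_0: sum over blocks of
    // d_x * d_y * sum_i (nibble_i - 8) * y_i, with 32 elements per block.
    struct BlockQ4_0 { d: f32, qs: [u8; 16] } // 32 4-bit values packed in 16 bytes
    struct BlockQ8_0 { d: f32, qs: [i8; 32] }

    fn vec_dot_q4_0_q8_0_scalar(xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> f32 {
        let mut acc = 0f32;
        for (x, y) in xs.iter().zip(ys.iter()) {
            let mut sumi = 0i32;
            for j in 0..16 {
                let lo = (x.qs[j] & 0x0f) as i32 - 8; // low nibble -> element j
                let hi = (x.qs[j] >> 4) as i32 - 8;   // high nibble -> element j + 16
                sumi += lo * y.qs[j] as i32 + hi * y.qs[j + 16] as i32;
            }
            acc += x.d * y.d * sumi as f32;
        }
        acc
    }

    fn main() {
        let x = BlockQ4_0 { d: 1.0, qs: [0x88; 16] }; // all nibbles decode to 0
        let y = BlockQ8_0 { d: 1.0, qs: [1; 32] };
        assert_eq!(vec_dot_q4_0_q8_0_scalar(&[x], &[y]), 0.0);
    }
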
@@ -353,7 +358,7 @@ pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Res
             q3 = q3.add(32);
 
             // Prepare low and high bits
-            // We hardcode the shifts here to avoid loading them into a separate register
+            // We hardcode the shifts here to avoid loading them into a seperate register
             let q3l_0 = _mm256_and_si256(q3bits, m3);
             let q3h_0 = if j == 0 {
                 _mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, 0)), 0)
@@ -586,7 +591,7 @@ pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Res
             let q5bits = _mm256_loadu_si256(q5 as *const __m256i);
             q5 = q5.add(32);
 
-            //Similar to q3k we hardcode the shifts here to avoid loading them into a separate register
+            //Similar to q3k we hardcode the shifts here to avoid loading them into a seperate register
             let q5l_0 = _mm256_and_si256(q5bits, m4);
             let q5l_0_shift_input = _mm256_and_si256(hbits, hmask);
             let q5l_0_right_shift = match j {
@@ -633,35 +638,3 @@ pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Res
         Ok(hsum_float_8(acc) + summs)
     }
 }
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Result<f32> {
-    let qk = QK_K;
-    if n % qk != 0 {
-        crate::bail!("vec_dot_q8k_8k: {n} is not divisible by {qk}")
-    }
-
-    unsafe {
-        let mut acc = _mm256_setzero_ps();
-        for (xs, ys) in xs.iter().zip(ys.iter()) {
-            let mut sumi = _mm256_setzero_si256();
-            let x_qs = xs.qs.as_ptr();
-            let y_qs = ys.qs.as_ptr();
-            for j in (0..QK_K).step_by(32) {
-                let xs = _mm256_loadu_si256(x_qs.add(j) as *const __m256i);
-                let ys = _mm256_loadu_si256(y_qs.add(j) as *const __m256i);
-
-                let xs0 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(xs, 0));
-                let ys0 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(ys, 0));
-                sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(xs0, ys0));
-
-                let xs1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(xs, 1));
-                let ys1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(ys, 1));
-                sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(xs1, ys1));
-            }
-            let d = _mm256_set1_ps(xs.d * ys.d);
-            acc = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi), acc);
-        }
-        Ok(hsum_float_8(acc))
-    }
-}
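
The removed vec_dot_q8k_q8k widens each 32-byte chunk of i8 values to i16, multiplies pairwise with _mm256_madd_epi16, accumulates into i32 lanes, and finally scales by the product of the two block scales. The scalar computation it vectorizes is simply (field names assumed, mirroring the AVX code):

    // Scalar model of the removed vec_dot_q8k_q8k: one f32 scale and QK_K i8
    // values per superblock (QK_K = 256 in the k-quant layout).
    const QK_K: usize = 256;
    struct BlockQ8K { d: f32, qs: [i8; QK_K] }

    fn vec_dot_q8k_q8k_scalar(xs: &[BlockQ8K], ys: &[BlockQ8K]) -> f32 {
        xs.iter()
            .zip(ys.iter())
            .map(|(x, y)| {
                let sumi: i32 =
                    x.qs.iter().zip(y.qs.iter()).map(|(&a, &b)| a as i32 * b as i32).sum();
                x.d * y.d * sumi as f32
            })
            .sum()
    }

    fn main() {
        let x = BlockQ8K { d: 0.5, qs: [2; QK_K] };
        let y = BlockQ8K { d: 2.0, qs: [3; QK_K] };
        // 256 elements * (2 * 3) * (0.5 * 2.0) = 1536
        assert_eq!(vec_dot_q8k_q8k_scalar(&[x], &[y]), 1536.0);
    }
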
diff --git a/candle-core/src/quantized/cuda.rs b/candle-core/src/quantized/cuda.rs
@@ -1,737 +0,0 @@
-use super::{GgmlDType, QStorage};
-use crate::quantized::k_quants::GgmlType;
-use crate::{backend::BackendDevice, cuda_backend::WrapErr};
-use crate::{builder_arg as barg, CudaDevice, CudaStorage, Result};
-use half::f16;
-
-use cudarc::driver::{CudaSlice, CudaView, PushKernelArg};
-
-#[derive(Clone, Debug)]
-struct PaddedCudaSlice {
-    inner: CudaSlice<u8>,
-    len: usize,
-}
-
-#[derive(Clone, Debug)]
-pub struct QCudaStorage {
-    data: PaddedCudaSlice,
-    dtype: GgmlDType,
-    device: CudaDevice,
-}
-
-static FORCE_DMMV: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
-
-pub fn set_force_dmmv(f: bool) {
-    FORCE_DMMV.store(f, std::sync::atomic::Ordering::Relaxed)
-}
-
-pub const WARP_SIZE: usize = 32;
-pub const MMQ_X_Q4_0_AMPERE: usize = 4;
-pub const MMQ_Y_Q4_0_AMPERE: usize = 32;
-pub const NWARPS_Q4_0_AMPERE: usize = 4;
-pub const GGML_CUDA_MMV_X: usize = 32;
-pub const GGML_CUDA_MMV_Y: usize = 1;
-pub const CUDA_QUANTIZE_BLOCK_SIZE: usize = 256;
-pub const CUDA_DEQUANTIZE_BLOCK_SIZE: usize = 256;
-pub const MATRIX_ROW_PADDING: usize = 512;
-
-fn ceil_div(p: usize, q: usize) -> usize {
-    p.div_ceil(q)
-}
-
-fn pad(p: usize, q: usize) -> usize {
-    ceil_div(p, q) * q
-}
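
ceil_div and pad round a size up to the next multiple of a quantum; with MATRIX_ROW_PADDING = 512 a 1500-element row pads to 1536. A quick standalone check of the arithmetic:

    fn ceil_div(p: usize, q: usize) -> usize { p.div_ceil(q) }
    fn pad(p: usize, q: usize) -> usize { ceil_div(p, q) * q }

    fn main() {
        assert_eq!(ceil_div(1500, 512), 3);
        assert_eq!(pad(1500, 512), 1536); // 3 * 512
        assert_eq!(pad(1024, 512), 1024); // already aligned
    }
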
|
|
||||||
fn quantize_q8_1(
|
|
||||||
src: &CudaView<f32>,
|
|
||||||
dst: &mut CudaSlice<u8>,
|
|
||||||
elem_count: usize,
|
|
||||||
ky: usize,
|
|
||||||
dev: &CudaDevice,
|
|
||||||
) -> Result<()> {
|
|
||||||
let kx = elem_count;
|
|
||||||
let kx_padded = pad(kx, MATRIX_ROW_PADDING);
|
|
||||||
let num_blocks = ceil_div(kx_padded, CUDA_QUANTIZE_BLOCK_SIZE);
|
|
||||||
let func = dev.get_or_load_func("quantize_q8_1", &candle_kernels::QUANTIZED)?;
|
|
||||||
let cfg = cudarc::driver::LaunchConfig {
|
|
||||||
grid_dim: (num_blocks as u32, ky as u32, 1),
|
|
||||||
block_dim: (CUDA_QUANTIZE_BLOCK_SIZE as u32, 1, 1),
|
|
||||||
shared_mem_bytes: 0,
|
|
||||||
};
|
|
||||||
let mut builder = func.builder();
|
|
||||||
builder.arg(src);
|
|
||||||
builder.arg(dst);
|
|
||||||
barg!(builder, kx as i32, kx_padded as i32);
|
|
||||||
unsafe { builder.launch(cfg) }.w()?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn dequantize_f32(
|
|
||||||
data: &PaddedCudaSlice,
|
|
||||||
dtype: GgmlDType,
|
|
||||||
elem_count: usize,
|
|
||||||
dev: &CudaDevice,
|
|
||||||
) -> Result<CudaStorage> {
|
|
||||||
let nb = elem_count.div_ceil(256);
|
|
||||||
let (kernel_name, is_k, block_dim, num_blocks) = match dtype {
|
|
||||||
GgmlDType::Q4_0 => ("dequantize_block_q4_0_f32", false, 32, nb),
|
|
||||||
GgmlDType::Q4_1 => ("dequantize_block_q4_1_f32", false, 32, nb),
|
|
||||||
GgmlDType::Q5_0 => (
|
|
||||||
"dequantize_block_q5_0_f32",
|
|
||||||
false,
|
|
||||||
CUDA_DEQUANTIZE_BLOCK_SIZE,
|
|
||||||
ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
|
|
||||||
),
|
|
||||||
GgmlDType::Q5_1 => (
|
|
||||||
"dequantize_block_q5_1_f32",
|
|
||||||
false,
|
|
||||||
CUDA_DEQUANTIZE_BLOCK_SIZE,
|
|
||||||
ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
|
|
||||||
),
|
|
||||||
GgmlDType::Q8_0 => ("dequantize_block_q8_0_f32", false, 32, nb),
|
|
||||||
GgmlDType::Q2K => ("dequantize_block_q2_K_f32", true, 64, nb),
|
|
||||||
GgmlDType::Q3K => ("dequantize_block_q3_K_f32", true, 64, nb),
|
|
||||||
GgmlDType::Q4K => ("dequantize_block_q4_K_f32", true, 32, nb),
|
|
||||||
GgmlDType::Q5K => ("dequantize_block_q5_K_f32", true, 64, nb),
|
|
||||||
GgmlDType::Q6K => ("dequantize_block_q6_K_f32", true, 64, nb),
|
|
||||||
GgmlDType::Q8K => ("dequantize_block_q8_K_f32", true, 32, nb),
|
|
||||||
_ => crate::bail!("unsupported dtype for dequantize {dtype:?}"),
|
|
||||||
};
|
|
||||||
let func = dev.get_or_load_func(kernel_name, &candle_kernels::QUANTIZED)?;
|
|
||||||
let dst = unsafe { dev.alloc::<f32>(elem_count)? };
|
|
||||||
// See e.g.
|
|
||||||
// https://github.com/ggerganov/llama.cpp/blob/cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b/ggml-cuda.cu#L7270
|
|
||||||
let cfg = cudarc::driver::LaunchConfig {
|
|
||||||
grid_dim: (num_blocks as u32, 1, 1),
|
|
||||||
block_dim: (block_dim as u32, 1, 1),
|
|
||||||
shared_mem_bytes: 0,
|
|
||||||
};
|
|
||||||
|
|
||||||
if is_k {
|
|
||||||
let mut builder = func.builder();
|
|
||||||
builder.arg(&data.inner);
|
|
||||||
builder.arg(&dst);
|
|
||||||
unsafe { builder.launch(cfg) }.w()?;
|
|
||||||
} else {
|
|
||||||
let nb32 = match dtype {
|
|
||||||
GgmlDType::Q5_0 | GgmlDType::Q5_1 => elem_count,
|
|
||||||
_ => elem_count / 32,
|
|
||||||
};
|
|
||||||
let mut builder = func.builder();
|
|
||||||
builder.arg(&data.inner);
|
|
||||||
builder.arg(&dst);
|
|
||||||
barg!(builder, nb32 as i32);
|
|
||||||
unsafe { builder.launch(cfg) }.w()?;
|
|
||||||
}
|
|
||||||
Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn dequantize_f16(
|
|
||||||
data: &PaddedCudaSlice,
|
|
||||||
dtype: GgmlDType,
|
|
||||||
elem_count: usize,
|
|
||||||
dev: &CudaDevice,
|
|
||||||
) -> Result<CudaStorage> {
|
|
||||||
let nb = elem_count.div_ceil(256);
|
|
||||||
let (kernel_name, is_k, block_dim, num_blocks) = match dtype {
|
|
||||||
GgmlDType::Q4_0 => ("dequantize_block_q4_0_f16", false, 32, nb),
|
|
||||||
GgmlDType::Q4_1 => ("dequantize_block_q4_1_f16", false, 32, nb),
|
|
||||||
GgmlDType::Q5_0 => (
|
|
||||||
"dequantize_block_q5_0_f16",
|
|
||||||
false,
|
|
||||||
CUDA_DEQUANTIZE_BLOCK_SIZE,
|
|
||||||
ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
|
|
||||||
),
|
|
||||||
GgmlDType::Q5_1 => (
|
|
||||||
"dequantize_block_q5_1_f16",
|
|
||||||
false,
|
|
||||||
CUDA_DEQUANTIZE_BLOCK_SIZE,
|
|
||||||
ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
|
|
||||||
),
|
|
||||||
GgmlDType::Q8_0 => ("dequantize_block_q8_0_f16", false, 32, nb),
|
|
||||||
GgmlDType::Q2K => ("dequantize_block_q2_K_f16", true, 64, nb),
|
|
||||||
GgmlDType::Q3K => ("dequantize_block_q3_K_f16", true, 64, nb),
|
|
||||||
GgmlDType::Q4K => ("dequantize_block_q4_K_f16", true, 32, nb),
|
|
||||||
GgmlDType::Q5K => ("dequantize_block_q5_K_f16", true, 64, nb),
|
|
||||||
GgmlDType::Q6K => ("dequantize_block_q6_K_f16", true, 64, nb),
|
|
||||||
GgmlDType::Q8K => ("dequantize_block_q8_K_f16", true, 32, nb),
|
|
||||||
_ => crate::bail!("unsupported dtype for dequantize {dtype:?}"),
|
|
||||||
};
|
|
||||||
let func = dev.get_or_load_func(kernel_name, &candle_kernels::QUANTIZED)?;
|
|
||||||
let dst = unsafe { dev.alloc::<f16>(elem_count)? };
|
|
||||||
// See e.g.
|
|
||||||
// https://github.com/ggerganov/llama.cpp/blob/cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b/ggml-cuda.cu#L7270
|
|
||||||
let cfg = cudarc::driver::LaunchConfig {
|
|
||||||
grid_dim: (num_blocks as u32, 1, 1),
|
|
||||||
block_dim: (block_dim as u32, 1, 1),
|
|
||||||
shared_mem_bytes: 0,
|
|
||||||
};
|
|
||||||
|
|
||||||
if is_k {
|
|
||||||
let mut builder = func.builder();
|
|
||||||
builder.arg(&data.inner);
|
|
||||||
builder.arg(&dst);
|
|
||||||
unsafe { builder.launch(cfg) }.w()?;
|
|
||||||
} else {
|
|
||||||
let nb32 = match dtype {
|
|
||||||
GgmlDType::Q5_0 | GgmlDType::Q5_1 => elem_count,
|
|
||||||
_ => elem_count / 32,
|
|
||||||
};
|
|
||||||
let mut builder = func.builder();
|
|
||||||
builder.arg(&data.inner);
|
|
||||||
builder.arg(&dst);
|
|
||||||
barg!(builder, nb32 as i32);
|
|
||||||
unsafe { builder.launch(cfg) }.w()?;
|
|
||||||
}
|
|
||||||
Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn dequantize_mul_mat_vec(
|
|
||||||
data: &PaddedCudaSlice,
|
|
||||||
y: &CudaView<f32>,
|
|
||||||
dtype: GgmlDType,
|
|
||||||
ncols: usize,
|
|
||||||
nrows: usize,
|
|
||||||
dev: &CudaDevice,
|
|
||||||
) -> Result<CudaStorage> {
|
|
||||||
let data_elems = data.len / dtype.type_size() * dtype.block_size();
|
|
||||||
if data_elems < ncols * nrows {
|
|
||||||
crate::bail!("unexpected data size {}, ncols {ncols} {nrows}", data_elems)
|
|
||||||
}
|
|
||||||
if y.len() != ncols {
|
|
||||||
crate::bail!("unexpected y size {}, ncols {ncols} {nrows}", y.len())
|
|
||||||
}
|
|
||||||
let kernel_name = match dtype {
|
|
||||||
GgmlDType::Q4_0 => "dequantize_mul_mat_vec_q4_0_cuda",
|
|
||||||
GgmlDType::Q4_1 => "dequantize_mul_mat_vec_q4_1_cuda",
|
|
||||||
GgmlDType::Q5_0 => "dequantize_mul_mat_vec_q5_0_cuda",
|
|
||||||
GgmlDType::Q5_1 => "dequantize_mul_mat_vec_q5_1_cuda",
|
|
||||||
GgmlDType::Q8_0 => "dequantize_mul_mat_vec_q8_0_cuda",
|
|
||||||
GgmlDType::Q2K => "dequantize_mul_mat_vec_q2_k",
|
|
||||||
GgmlDType::Q3K => "dequantize_mul_mat_vec_q3_k",
|
|
||||||
GgmlDType::Q4K => "dequantize_mul_mat_vec_q4_k",
|
|
||||||
GgmlDType::Q5K => "dequantize_mul_mat_vec_q5_k",
|
|
||||||
GgmlDType::Q6K => "dequantize_mul_mat_vec_q6_k",
|
|
||||||
_ => crate::bail!("unsupported dtype for quantized matmul {dtype:?}"),
|
|
||||||
};
|
|
||||||
let func = dev.get_or_load_func(kernel_name, &candle_kernels::QUANTIZED)?;
|
|
||||||
let dst = unsafe { dev.alloc::<f32>(nrows)? };
|
|
||||||
let block_num_y = ceil_div(nrows, GGML_CUDA_MMV_Y);
|
|
||||||
let cfg = cudarc::driver::LaunchConfig {
|
|
||||||
grid_dim: (block_num_y as u32, 1, 1),
|
|
||||||
block_dim: (WARP_SIZE as u32, GGML_CUDA_MMV_Y as u32, 1),
|
|
||||||
shared_mem_bytes: 0,
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut builder = func.builder();
|
|
||||||
builder.arg(&data.inner);
|
|
||||||
builder.arg(y);
|
|
||||||
builder.arg(&dst);
|
|
||||||
barg!(builder, ncols as i32, nrows as i32);
|
|
||||||
unsafe { builder.launch(cfg) }.w()?;
|
|
||||||
Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn mul_mat_vec_via_q8_1(
|
|
||||||
data: &PaddedCudaSlice,
|
|
||||||
y: &CudaView<f32>,
|
|
||||||
dtype: GgmlDType,
|
|
||||||
ncols: usize,
|
|
||||||
nrows: usize,
|
|
||||||
b_size: usize,
|
|
||||||
dev: &CudaDevice,
|
|
||||||
) -> Result<CudaStorage> {
|
|
||||||
let data_elems = data.len / dtype.type_size() * dtype.block_size();
|
|
||||||
if data_elems < ncols * nrows {
|
|
||||||
crate::bail!("unexpected data size {}, ncols {ncols} {nrows}", data_elems)
|
|
||||||
}
|
|
||||||
if y.len() != ncols * b_size {
|
|
||||||
crate::bail!("unexpected y size {}, ncols {ncols} {nrows}", y.len())
|
|
||||||
}
|
|
||||||
if b_size == 0 || b_size > 8 {
|
|
||||||
crate::bail!("only bsize between 1 and 8 are supported, got {b_size}")
|
|
||||||
}
|
|
||||||
// Start by quantizing y
|
|
||||||
let ncols_padded = pad(ncols, MATRIX_ROW_PADDING);
|
|
||||||
let y_size_in_bytes =
|
|
||||||
b_size * ncols_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size();
|
|
||||||
let mut y_q8_1 = unsafe { dev.alloc::<u8>(y_size_in_bytes)? };
|
|
||||||
quantize_q8_1(y, &mut y_q8_1, ncols, b_size, dev)?;
|
|
||||||
|
|
||||||
let kernel_name = match dtype {
|
|
||||||
GgmlDType::Q4_0 => "mul_mat_vec_q4_0_q8_1_cuda",
|
|
||||||
GgmlDType::Q4_1 => "mul_mat_vec_q4_1_q8_1_cuda",
|
|
||||||
GgmlDType::Q5_0 => "mul_mat_vec_q5_0_q8_1_cuda",
|
|
||||||
GgmlDType::Q5_1 => "mul_mat_vec_q5_1_q8_1_cuda",
|
|
||||||
GgmlDType::Q8_0 => "mul_mat_vec_q8_0_q8_1_cuda",
|
|
||||||
GgmlDType::Q2K => "mul_mat_vec_q2_K_q8_1_cuda",
|
|
||||||
GgmlDType::Q3K => "mul_mat_vec_q3_K_q8_1_cuda",
|
|
||||||
GgmlDType::Q4K => "mul_mat_vec_q4_K_q8_1_cuda",
|
|
||||||
GgmlDType::Q5K => "mul_mat_vec_q5_K_q8_1_cuda",
|
|
||||||
GgmlDType::Q6K => "mul_mat_vec_q6_K_q8_1_cuda",
|
|
||||||
_ => crate::bail!("unsupported dtype for quantized matmul {dtype:?}"),
|
|
||||||
};
|
|
||||||
let kernel_name = format!("{kernel_name}{b_size}");
|
|
||||||
let func = dev.get_or_load_func(&kernel_name, &candle_kernels::QUANTIZED)?;
|
|
||||||
let dst = unsafe { dev.alloc::<f32>(nrows * b_size)? };
|
|
||||||
// https://github.com/ggerganov/llama.cpp/blob/facb8b56f8fd3bb10a693bf0943ae9d69d0828ef/ggml-cuda/mmvq.cu#L98
|
|
||||||
let (nblocks, nwarps) = match b_size {
|
|
||||||
1 => (nrows as u32, 4),
|
|
||||||
2..=4 => ((nrows as u32).div_ceil(2), 4),
|
|
||||||
5..=8 => ((nrows as u32).div_ceil(2), 2),
|
|
||||||
_ => crate::bail!("unexpected bsize {b_size}"),
|
|
||||||
};
|
|
||||||
let cfg = cudarc::driver::LaunchConfig {
|
|
||||||
grid_dim: (nblocks, 1, 1),
|
|
||||||
block_dim: (WARP_SIZE as u32, nwarps, 1),
|
|
||||||
shared_mem_bytes: 0,
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut builder = func.builder();
|
|
||||||
builder.arg(&data.inner);
|
|
||||||
builder.arg(&y_q8_1);
|
|
||||||
builder.arg(&dst);
|
|
||||||
barg!(
|
|
||||||
builder,
|
|
||||||
/* ncols_x */ ncols as i32,
|
|
||||||
/* nrows_x */ nrows as i32,
|
|
||||||
/* nrows_y */ ncols_padded as i32,
|
|
||||||
/* nrows_dst */ nrows as i32
|
|
||||||
);
|
|
||||||
unsafe { builder.launch(cfg) }.w()?;
|
|
||||||
Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
|
||||||
fn mul_mat_via_q8_1(
|
|
||||||
data: &PaddedCudaSlice,
|
|
||||||
y: &CudaView<f32>,
|
|
||||||
dtype: GgmlDType,
|
|
||||||
x_rows: usize,
|
|
||||||
x_cols: usize,
|
|
||||||
y_rows: usize,
|
|
||||||
y_cols: usize,
|
|
||||||
dev: &CudaDevice,
|
|
||||||
) -> Result<CudaStorage> {
|
|
||||||
let data_elems = data.len / dtype.type_size() * dtype.block_size();
|
|
||||||
if data_elems < x_rows * x_cols {
|
|
||||||
crate::bail!("unexpected lhs size {}, {x_rows} {x_cols}", data_elems)
|
|
||||||
}
|
|
||||||
if y.len() != y_rows * y_cols {
|
|
||||||
crate::bail!("unexpected y size {}, {y_rows} {y_cols}", y.len())
|
|
||||||
}
|
|
||||||
if x_cols != y_rows {
|
|
||||||
crate::bail!("unexpected x/y size {x_rows} {x_cols} {y_rows} {y_cols}")
|
|
||||||
}
|
|
||||||
let k = x_cols;
|
|
||||||
// Start by quantizing y
|
|
||||||
let k_padded = pad(k, MATRIX_ROW_PADDING);
|
|
||||||
let y_size_in_bytes =
|
|
||||||
k_padded * y_cols * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size();
|
|
||||||
let mut y_q8_1 = unsafe { dev.alloc::<u8>(y_size_in_bytes)? };
|
|
||||||
quantize_q8_1(y, &mut y_q8_1, k, y_cols, dev)?;
|
|
||||||
|
|
||||||
let (kernel_name, mmq_x, mmq_y) = match dtype {
|
|
||||||
GgmlDType::Q4_0 => ("mul_mat_q4_0", 64, 128),
|
|
||||||
GgmlDType::Q4_1 => ("mul_mat_q4_1", 64, 128),
|
|
||||||
GgmlDType::Q5_0 => ("mul_mat_q5_0", 128, 64),
|
|
||||||
GgmlDType::Q5_1 => ("mul_mat_q5_1", 128, 64),
|
|
||||||
GgmlDType::Q8_0 => ("mul_mat_q8_0", 128, 64),
|
|
||||||
GgmlDType::Q2K => ("mul_mat_q2_K", 64, 128),
|
|
||||||
GgmlDType::Q3K => ("mul_mat_q3_K", 128, 128),
|
|
||||||
GgmlDType::Q4K => ("mul_mat_q4_K", 64, 128),
|
|
||||||
GgmlDType::Q5K => ("mul_mat_q5_K", 64, 128),
|
|
||||||
GgmlDType::Q6K => ("mul_mat_q6_K", 64, 64),
|
|
||||||
_ => crate::bail!("unsupported dtype for quantized matmul {dtype:?}"),
|
|
||||||
};
|
|
||||||
let func = dev.get_or_load_func(kernel_name, &candle_kernels::QUANTIZED)?;
|
|
||||||
let dst = unsafe { dev.alloc::<f32>(x_rows * y_cols)? };
|
|
||||||
let cfg = cudarc::driver::LaunchConfig {
|
|
||||||
grid_dim: (
|
|
||||||
ceil_div(x_rows, mmq_y) as u32,
|
|
||||||
ceil_div(y_cols, mmq_x) as u32,
|
|
||||||
1,
|
|
||||||
),
|
|
||||||
block_dim: (WARP_SIZE as u32, 4, 1),
|
|
||||||
shared_mem_bytes: 0,
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut builder = func.builder();
|
|
||||||
builder.arg(/* vx */ &data.inner);
|
|
||||||
builder.arg(/* vy */ &y_q8_1);
|
|
||||||
builder.arg(/* dst */ &dst);
|
|
||||||
barg!(
|
|
||||||
builder,
|
|
||||||
/* ncols_x */ x_cols as i32,
|
|
||||||
/* nrows_x */ x_rows as i32,
|
|
||||||
/* ncols_y */ y_cols as i32,
|
|
||||||
/* nrows_y */ k_padded as i32,
|
|
||||||
/* nrows_dst */ x_rows as i32
|
|
||||||
);
|
|
||||||
unsafe { builder.launch(cfg) }.w()?;
|
|
||||||
Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
|
|
||||||
}
|
|
||||||
|
|
||||||
impl QCudaStorage {
|
|
||||||
pub fn zeros(device: &CudaDevice, el_count: usize, dtype: GgmlDType) -> Result<Self> {
|
|
||||||
let size_in_bytes = ceil_div(el_count, dtype.block_size()) * dtype.type_size();
|
|
||||||
let padded_size_in_bytes =
|
|
||||||
ceil_div(el_count + MATRIX_ROW_PADDING, dtype.block_size()) * dtype.type_size();
|
|
||||||
let inner = device.alloc_zeros::<u8>(padded_size_in_bytes)?;
|
|
||||||
Ok(QCudaStorage {
|
|
||||||
data: PaddedCudaSlice {
|
|
||||||
inner,
|
|
||||||
len: size_in_bytes,
|
|
||||||
},
|
|
||||||
device: device.clone(),
|
|
||||||
dtype,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn dtype(&self) -> GgmlDType {
|
|
||||||
self.dtype
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn device(&self) -> &CudaDevice {
|
|
||||||
&self.device
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn dequantize(&self, elem_count: usize) -> Result<CudaStorage> {
|
|
||||||
fn deq<T: GgmlType>(buffer: &[u8], n: usize, dst: &mut [f32]) -> Result<()> {
|
|
||||||
let slice = unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const T, n) };
|
|
||||||
let vec = slice.to_vec();
|
|
||||||
T::to_float(&vec, dst)
|
|
||||||
}
|
|
||||||
|
|
||||||
let fast_kernel = matches!(
|
|
||||||
self.dtype,
|
|
||||||
GgmlDType::Q4_0
|
|
||||||
| GgmlDType::Q4_1
|
|
||||||
| GgmlDType::Q5_0
|
|
||||||
| GgmlDType::Q5_1
|
|
||||||
| GgmlDType::Q8_0
|
|
||||||
| GgmlDType::Q2K
|
|
||||||
| GgmlDType::Q3K
|
|
||||||
| GgmlDType::Q4K
|
|
||||||
| GgmlDType::Q5K
|
|
||||||
| GgmlDType::Q6K
|
|
||||||
| GgmlDType::Q8K
|
|
||||||
);
|
|
||||||
if fast_kernel {
|
|
||||||
return dequantize_f32(&self.data, self.dtype, elem_count, self.device());
|
|
||||||
}
|
|
||||||
// Run the dequantization on cpu.
|
|
||||||
|
|
||||||
let buffer = self
|
|
||||||
.device
|
|
||||||
.memcpy_dtov(&self.data.inner.slice(..self.data.len))?;
|
|
||||||
let mut out = vec![0.0; elem_count];
|
|
||||||
let block_len = elem_count / self.dtype.block_size();
|
|
||||||
match self.dtype {
|
|
||||||
GgmlDType::F32 => deq::<f32>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::F16 => deq::<half::f16>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q4_0 => deq::<crate::quantized::BlockQ4_0>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q4_1 => deq::<crate::quantized::BlockQ4_1>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q5_0 => deq::<crate::quantized::BlockQ5_0>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q5_1 => deq::<crate::quantized::BlockQ5_1>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q8_0 => deq::<crate::quantized::BlockQ8_0>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q8_1 => deq::<crate::quantized::BlockQ8_1>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q2K => deq::<crate::quantized::BlockQ2K>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q3K => deq::<crate::quantized::BlockQ3K>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q4K => deq::<crate::quantized::BlockQ4K>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q5K => deq::<crate::quantized::BlockQ5K>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q6K => deq::<crate::quantized::BlockQ6K>(&buffer, block_len, &mut out)?,
|
|
||||||
GgmlDType::Q8K => deq::<crate::quantized::BlockQ8K>(&buffer, block_len, &mut out)?,
|
|
||||||
}
|
|
||||||
|
|
||||||
self.device
|
|
||||||
.storage_from_cpu_storage(&crate::CpuStorage::F32(out))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn dequantize_f16(&self, elem_count: usize) -> Result<CudaStorage> {
|
|
||||||
dequantize_f16(&self.data, self.dtype, elem_count, self.device())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn quantize(&mut self, src: &CudaStorage) -> Result<()> {
|
|
||||||
// Run the quantization on cpu.
|
|
||||||
let src = match &src.slice {
|
|
||||||
crate::cuda_backend::CudaStorageSlice::F32(data) => self.device.memcpy_dtov(data)?,
|
|
||||||
_ => crate::bail!("only f32 can be quantized"),
|
|
||||||
};
|
|
||||||
let src_len = src.len();
|
|
||||||
let src = crate::Storage::Cpu(crate::CpuStorage::F32(src));
|
|
||||||
let mut qcpu_storage = crate::Device::Cpu.qzeros(src_len, self.dtype)?;
|
|
||||||
qcpu_storage.quantize(&src)?;
|
|
||||||
let data = qcpu_storage.data()?;
|
|
||||||
let padded_len =
|
|
||||||
data.len() + MATRIX_ROW_PADDING * self.dtype.type_size() / self.dtype.block_size();
|
|
||||||
let mut inner = unsafe { self.device.alloc::<u8>(padded_len)? };
|
|
||||||
self.device
|
|
||||||
.memcpy_htod(data.as_ref(), &mut inner.slice_mut(..data.len()))?;
|
|
||||||
self.data = PaddedCudaSlice {
|
|
||||||
inner,
|
|
||||||
len: data.len(),
|
|
||||||
};
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn storage_size_in_bytes(&self) -> usize {
|
|
||||||
self.data.len
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn fwd(
|
|
||||||
&self,
|
|
||||||
self_shape: &crate::Shape,
|
|
||||||
storage: &CudaStorage,
|
|
||||||
layout: &crate::Layout,
|
|
||||||
) -> Result<(CudaStorage, crate::Shape)> {
|
|
||||||
let max_bm = if FORCE_DMMV.load(std::sync::atomic::Ordering::Relaxed) {
|
|
||||||
1
|
|
||||||
} else {
|
|
||||||
8
|
|
||||||
};
|
|
||||||
let use_vec_kernel = match layout.shape().dims() {
|
|
||||||
[b, m, _k] => b * m <= max_bm,
|
|
||||||
[b, _k] => *b <= max_bm,
|
|
||||||
_ => false,
|
|
||||||
};
|
|
||||||
if use_vec_kernel {
|
|
||||||
self.dequantize_matmul_vec(self_shape, storage, layout)
|
|
||||||
} else {
|
|
||||||
self.dequantize_matmul(self_shape, storage, layout)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl QCudaStorage {
|
|
||||||
fn dequantize_matmul_vec(
|
|
||||||
&self,
|
|
||||||
self_shape: &crate::Shape,
|
|
||||||
rhs: &CudaStorage,
|
|
||||||
rhs_l: &crate::Layout,
    ) -> Result<(CudaStorage, crate::Shape)> {
        let (nrows, ncols) = self_shape.dims2()?;
        let rhs = rhs.as_cuda_slice::<f32>()?;
        let rhs = match rhs_l.contiguous_offsets() {
            Some((o1, o2)) => rhs.slice(o1..o2),
            None => Err(crate::Error::RequiresContiguous { op: "dmmv" }.bt())?,
        };
        let (b_size, k) = match rhs_l.shape().dims() {
            [b, m, k] => (b * m, *k),
            [b, k] => (*b, *k),
            _ => crate::bail!("unexpected rhs shape in dmmv {:?}", rhs_l.shape()),
        };
        if ncols != k {
            crate::bail!("mismatch on matmul dim {self_shape:?} {:?}", rhs_l.shape())
        }

        let out = if FORCE_DMMV.load(std::sync::atomic::Ordering::Relaxed) {
            dequantize_mul_mat_vec(&self.data, &rhs, self.dtype, ncols, nrows, self.device())?
        } else {
            mul_mat_vec_via_q8_1(
                &self.data,
                &rhs,
                self.dtype,
                ncols,
                nrows,
                b_size,
                self.device(),
            )?
        };
        let mut out_shape = rhs_l.shape().dims().to_vec();
        out_shape.pop();
        out_shape.push(nrows);
        Ok((out, out_shape.into()))
    }

    fn dequantize_matmul(
        &self,
        self_shape: &crate::Shape,
        storage: &CudaStorage,
        layout: &crate::Layout,
    ) -> Result<(CudaStorage, crate::Shape)> {
        use crate::backend::BackendStorage;
        let (n, k) = self_shape.dims2()?;
        let (b, m, k2) = match layout.shape().dims() {
            &[b, m, k2] => (b, m, k2),
            &[m, k2] => (1, m, k2),
            s => crate::bail!("unexpected shape for input {s:?}"),
        };
        if k2 != k {
            crate::bail!("mismatch on matmul dim {self_shape:?} {:?}", layout.shape())
        }

        let out = if FORCE_DMMV.load(std::sync::atomic::Ordering::Relaxed) {
            let data_f32 = self.dequantize(n * k)?;
            let rhs_l = crate::Layout::new((k, n).into(), vec![1, k], 0).broadcast_as((b, k, n))?;
            storage.matmul(&data_f32, (b, m, n, k), layout, &rhs_l)?
        } else {
            let storage = storage.as_cuda_slice::<f32>()?;
            let storage = match layout.contiguous_offsets() {
                Some((o1, o2)) => storage.slice(o1..o2),
                None => Err(crate::Error::RequiresContiguous {
                    op: "quantized-matmul",
                }
                .bt())?,
            };
            mul_mat_via_q8_1(
                &self.data,
                &storage,
                self.dtype,
                /* x_rows */ n,
                /* x_cols */ k,
                /* y_rows */ k,
                /* y_cols */ b * m,
                self.device(),
            )?
        };
        let mut out_shape = layout.shape().dims().to_vec();
        out_shape.pop();
        out_shape.push(n);
        Ok((out, out_shape.into()))
    }
}

pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
    device: &CudaDevice,
    data: &[T],
) -> Result<super::QStorage> {
    let data = unsafe {
        std::slice::from_raw_parts(data.as_ptr() as *const u8, core::mem::size_of_val(data))
    };
    let dtype = T::DTYPE;
    let padded_len = data.len() + MATRIX_ROW_PADDING * dtype.type_size() / dtype.block_size();
    let mut inner = unsafe { device.alloc::<u8>(padded_len)? };
    device.memcpy_htod(data, &mut inner.slice_mut(..data.len()))?;
    Ok(QStorage::Cuda(QCudaStorage {
        data: PaddedCudaSlice {
            inner,
            len: data.len(),
        },
        device: device.clone(),
        dtype,
    }))
}
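
A note on the `padded_len` computation above: the q8_1 kernels read whole padded rows, so the buffer reserves room for `MATRIX_ROW_PADDING` extra elements past the payload, converted to bytes through the dtype's type/block sizes. A quick standalone sketch of that arithmetic (the Q4_0 sizes and the padding value of 512 are assumptions here, matching the conventions used elsewhere in this file):

// Hypothetical standalone check of the row-padding arithmetic, not part of the diff.
fn main() {
    let matrix_row_padding = 512usize; // assumed value of MATRIX_ROW_PADDING
    let (type_size, block_size) = (18usize, 32usize); // Q4_0: 2-byte f16 scale + 16 quant bytes per 32 elems
    let data_len = 4096 / block_size * type_size; // 4096 Q4_0 elements -> 2304 bytes
    let padded_len = data_len + matrix_row_padding * type_size / block_size;
    assert_eq!(padded_len, 2304 + 288); // 512 extra elements -> 288 extra bytes
}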

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn cuda_quantize_q8_1() -> Result<()> {
        let dev = CudaDevice::new(0)?;
        let el = 256;
        let el_padded = pad(el, MATRIX_ROW_PADDING);
        let y_size_in_bytes =
            el_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size();
        let mut y_q8_1 = unsafe { dev.alloc::<u8>(y_size_in_bytes)? };
        let vs: Vec<f32> = (0..el).map(|v| v as f32).collect();
        let y = dev.memcpy_stod(&vs)?;
        quantize_q8_1(&y.slice(..), &mut y_q8_1, el, 1, &dev)?;
        Ok(())
    }

    #[test]
    fn cuda_mmv_q8_1() -> Result<()> {
        let dev = CudaDevice::new(0)?;
        let ncols = 256;
        let vs: Vec<f32> = (0..ncols).map(|v| v as f32).collect();
        let y = dev.memcpy_stod(&vs)?;
        let mut xs = QCudaStorage::zeros(&dev, ncols, GgmlDType::Q4_0)?;
        xs.quantize(&CudaStorage::wrap_cuda_slice(y.clone(), dev.clone()))?;
        let cuda_storage = mul_mat_vec_via_q8_1(
            &xs.data,
            &y.slice(..),
            /* dtype */ GgmlDType::Q4_0,
            /* ncols */ ncols,
            /* nrows */ 1,
            /* b_size */ 1,
            &dev,
        )?;
        let vs = cuda_storage.as_cuda_slice::<f32>()?;
        let vs = dev.memcpy_dtov(&vs.slice(..))?;
        assert_eq!(vs.len(), 1);
        // For n = 255, n*(n+1)*(2n+1) / 6 = 5559680.
        // Q8 means 1/256 precision.
        assert_eq!(vs[0], 5561664.5);

        let cuda_storage = dequantize_mul_mat_vec(
            &xs.data,
            &y.slice(..),
            /* dtype */ GgmlDType::Q4_0,
            /* ncols */ ncols,
            /* nrows */ 1,
            &dev,
        )?;
        let vs = cuda_storage.as_cuda_slice::<f32>()?;
        let vs = dev.memcpy_dtov(&vs.slice(..))?;
        assert_eq!(vs.len(), 1);
        assert_eq!(vs[0], 5561851.0);
        Ok(())
    }
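
The asserted values come from the ramp's dot product with itself: the exact sum of squares for v = 0..255 is 255·256·511/6 = 5,559,680, and both quantized kernels land within about 0.04% of it. A minimal standalone sketch checking both claims (not part of the diff):

// Hypothetical standalone check of the expected test values above.
fn main() {
    let exact: f64 = (0..256u32).map(|v| (v as f64) * (v as f64)).sum();
    assert_eq!(exact, 5_559_680.0); // n*(n+1)*(2n+1)/6 with n = 255
    for quantized in [5_561_664.5_f64, 5_561_851.0] {
        assert!(((quantized - exact) / exact).abs() < 4e-4); // within Q4_0/Q8 precision
    }
}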

    #[test]
    fn cuda_mm_q8_1() -> Result<()> {
        let dev = CudaDevice::new(0)?;
        let ncols = 256;
        let vs: Vec<f32> = (0..ncols * 4).map(|v| v as f32 / 4.).collect();
        let y = dev.memcpy_stod(&vs)?;
        let mut xs = QCudaStorage::zeros(&dev, ncols * 4, GgmlDType::Q4_0)?;
        xs.quantize(&CudaStorage::wrap_cuda_slice(y.clone(), dev.clone()))?;
        let cuda_storage = mul_mat_via_q8_1(
            &xs.data,
            &y.slice(..),
            /* dtype */ GgmlDType::Q4_0,
            /* x_rows */ 4,
            /* x_cols */ ncols,
            /* y_rows */ ncols,
            /* y_cols */ 4,
            &dev,
        )?;
        let vs = cuda_storage.as_cuda_slice::<f32>()?;
        let vs = dev.memcpy_dtov(&vs.slice(..))?;

        /*
        x = torch.tensor([float(v) for v in range(1024)]).reshape(4, 256)
        x @ x.t() / 16
        tensor([[  347480.0000,   869720.0000,  1391960.0000,  1914200.0000],
                [  869720.0000,  2440536.0000,  4011352.0000,  5582166.5000],
                [ 1391960.0000,  4011352.0000,  6630742.0000,  9250132.0000],
                [ 1914200.0000,  5582166.5000,  9250132.0000, 12918099.0000]])
        */
        assert_eq!(vs.len(), 16);
        assert_eq!(vs[0], 347604.0);
        assert_eq!(vs[1], 888153.06);
        assert_eq!(vs[4], 869780.7);
        assert_eq!(vs[5], 2483145.0);
        assert_eq!(vs[11], 9407368.0);
        assert_eq!(vs[14], 9470856.0);
        assert_eq!(vs[15], 13138824.0);
        Ok(())
    }

    // The following test used to fail under compute-sanitizer until #2526.
    #[test]
    fn cuda_mm_q8_1_pad() -> Result<()> {
        let dev = CudaDevice::new(0)?;
        let (x_rows, ncols, y_cols) = (4, 16, 2048);
        let vs: Vec<f32> = (0..ncols * y_cols).map(|v| v as f32 / 256.).collect();
        let y = dev.memcpy_stod(&vs)?;
        let mut xs = QCudaStorage::zeros(&dev, ncols * x_rows, GgmlDType::Q4_0)?;
        xs.quantize(&CudaStorage::wrap_cuda_slice(y.clone(), dev.clone()))?;
        let cuda_storage = mul_mat_via_q8_1(
            &xs.data,
            &y.slice(..),
            /* dtype */ GgmlDType::Q4_0,
            /* x_rows */ x_rows,
            /* x_cols */ ncols,
            /* y_rows */ ncols,
            /* y_cols */ y_cols,
            &dev,
        )?;
        let vs = cuda_storage.as_cuda_slice::<f32>()?;
        let _vs = dev.memcpy_dtov(&vs.slice(..))?;
        Ok(())
    }
}

@@ -1,54 +0,0 @@
#![allow(unused)]
use super::GgmlDType;
use crate::{CudaDevice, CudaStorage, Error, Result};

pub struct QCudaStorage {
    dtype: GgmlDType,
    device: CudaDevice,
}

impl QCudaStorage {
    pub fn zeros(_: &CudaDevice, _: usize, _: GgmlDType) -> Result<Self> {
        Err(Error::NotCompiledWithCudaSupport)
    }

    pub fn dtype(&self) -> GgmlDType {
        self.dtype
    }

    pub fn device(&self) -> &CudaDevice {
        &self.device
    }

    pub fn dequantize(&self, _elem_count: usize) -> Result<CudaStorage> {
        Err(Error::NotCompiledWithCudaSupport)
    }

    pub fn dequantize_f16(&self, _elem_count: usize) -> Result<CudaStorage> {
        Err(Error::NotCompiledWithCudaSupport)
    }

    pub fn quantize(&mut self, _src: &CudaStorage) -> Result<()> {
        Err(Error::NotCompiledWithCudaSupport)
    }

    pub fn storage_size_in_bytes(&self) -> usize {
        0
    }

    pub fn fwd(
        &self,
        _self_shape: &crate::Shape,
        _storage: &CudaStorage,
        _layout: &crate::Layout,
    ) -> Result<(CudaStorage, crate::Shape)> {
        Err(Error::NotCompiledWithCudaSupport)
    }
}

pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
    _device: &CudaDevice,
    _data: &[T],
) -> Result<super::QStorage> {
    Err(Error::NotCompiledWithCudaSupport)
}

@@ -1,50 +0,0 @@
#![allow(unused)]
use super::GgmlDType;
use crate::{Error, MetalDevice, MetalStorage, Result};

pub struct QMetalStorage {
    dtype: GgmlDType,
    device: MetalDevice,
}

impl QMetalStorage {
    pub fn zeros(_: &MetalDevice, _: usize, _: GgmlDType) -> Result<Self> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    pub fn dtype(&self) -> GgmlDType {
        self.dtype
    }

    pub fn device(&self) -> &MetalDevice {
        &self.device
    }

    pub fn dequantize(&self, _elem_count: usize) -> Result<MetalStorage> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    pub fn quantize(&mut self, _src: &MetalStorage) -> Result<()> {
        Err(Error::NotCompiledWithMetalSupport)
    }

    pub fn storage_size_in_bytes(&self) -> usize {
        0
    }

    pub fn fwd(
        &self,
        _self_shape: &crate::Shape,
        _storage: &MetalStorage,
        _layout: &crate::Layout,
    ) -> Result<(MetalStorage, crate::Shape)> {
        Err(Error::NotCompiledWithMetalSupport)
    }
}

pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
    _device: &MetalDevice,
    _data: &[T],
) -> Result<super::QStorage> {
    Err(Error::NotCompiledWithMetalSupport)
}
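
Both dummy modules mirror the real backends' public API and only fail at runtime, so the rest of the crate compiles unchanged when the `cuda`/`metal` features are off (the `mod.rs` hunk further down wires this up). A minimal self-contained sketch of the same pattern, with made-up names:

// Sketch of the feature-gated dummy-backend pattern; names are illustrative.
mod dummy_backend {
    pub fn load() -> Result<(), String> {
        Err("not compiled with backend support".to_string())
    }
}

#[cfg(not(feature = "backend"))]
mod backend {
    pub use super::dummy_backend::*; // same API surface, runtime error instead
}

fn main() {
    #[cfg(not(feature = "backend"))]
    assert!(backend::load().is_err());
}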

@@ -1,7 +1,7 @@
 //! Support for the GGML file format.
-use super::{k_quants, GgmlDType, QStorage};
+use super::{k_quants, GgmlDType};
-use crate::{Device, Result};
+use crate::Result;
 use byteorder::{LittleEndian, ReadBytesExt};
 use std::collections::HashMap;

@@ -121,68 +121,35 @@ fn from_raw_data<T: super::GgmlType + Send + Sync + 'static>(
     raw_data: &[u8],
     size_in_bytes: usize,
     dims: Vec<usize>,
-    device: &Device,
 ) -> Result<super::QTensor> {
     let raw_data_ptr = raw_data.as_ptr();
     let n_blocks = size_in_bytes / std::mem::size_of::<T>();
     let data = unsafe { std::slice::from_raw_parts(raw_data_ptr as *const T, n_blocks) };
-    let data: QStorage = match device {
-        Device::Cpu => QStorage::Cpu(Box::new(data.to_vec())),
-        Device::Metal(metal) => super::metal::load_quantized(metal, data)?,
-        Device::Cuda(cuda) => super::cuda::load_quantized(cuda, data)?,
-    };
-    super::QTensor::new(data, dims)
+    super::QTensor::new(data.to_vec(), dims)
 }

-/// Creates a Tensor from a raw GGML tensor.
+/// Creates a [Tensor] from a raw GGML tensor.
 pub fn qtensor_from_ggml(
     ggml_dtype: GgmlDType,
     raw_data: &[u8],
     dims: Vec<usize>,
-    device: &Device,
 ) -> Result<super::QTensor> {
     let tensor_elems = dims.iter().product::<usize>();
-    let block_size = ggml_dtype.block_size();
-    if tensor_elems % block_size != 0 {
-        crate::bail!(
-            "the number of elements {tensor_elems} is not divisible by the block size {block_size}"
-        )
-    }
-    let size_in_bytes = tensor_elems / block_size * ggml_dtype.type_size();
+    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();

     match ggml_dtype {
-        GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims, device),
-        GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims, device),
-        GgmlDType::Q4_0 => {
-            from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q4_1 => {
-            from_raw_data::<k_quants::BlockQ4_1>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5_0 => {
-            from_raw_data::<k_quants::BlockQ5_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5_1 => {
-            from_raw_data::<k_quants::BlockQ5_1>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q8_0 => {
-            from_raw_data::<k_quants::BlockQ8_0>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q2K => {
-            from_raw_data::<k_quants::BlockQ2K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q3K => {
-            from_raw_data::<k_quants::BlockQ3K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q4K => {
-            from_raw_data::<k_quants::BlockQ4K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q5K => {
-            from_raw_data::<k_quants::BlockQ5K>(raw_data, size_in_bytes, dims, device)
-        }
-        GgmlDType::Q6K => {
-            from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims, device)
-        }
+        GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims),
+        GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4_0 => from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4_1 => from_raw_data::<k_quants::BlockQ4_1>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5_0 => from_raw_data::<k_quants::BlockQ5_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5_1 => from_raw_data::<k_quants::BlockQ5_1>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q8_0 => from_raw_data::<k_quants::BlockQ8_0>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q2K => from_raw_data::<k_quants::BlockQ2K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q3K => from_raw_data::<k_quants::BlockQ3K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q4K => from_raw_data::<k_quants::BlockQ4K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q5K => from_raw_data::<k_quants::BlockQ5K>(raw_data, size_in_bytes, dims),
+        GgmlDType::Q6K => from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims),
         _ => crate::bail!("quantized type {ggml_dtype:?} is not supported yet"),
     }
 }
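
Both sides of this hunk compute the same byte count whenever the element count is a multiple of the block size; the left-hand version simply checks divisibility first and divides before multiplying. A standalone check using Q4_0's sizes (32 elements per 18-byte block), not part of the diff:

// Hypothetical check that the two size computations agree.
fn main() {
    let (tensor_elems, type_size, block_size) = (4096usize, 18usize, 32usize);
    assert_eq!(tensor_elems % block_size, 0);
    assert_eq!(
        tensor_elems / block_size * type_size, // divide-first form (left side)
        tensor_elems * type_size / block_size, // multiply-first form (right side)
    ); // both give 2304 bytes
}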
@@ -190,7 +157,6 @@ pub fn qtensor_from_ggml(
 fn read_one_tensor<R: std::io::Seek + std::io::Read>(
     reader: &mut R,
     magic: VersionedMagic,
-    device: &Device,
 ) -> Result<(String, super::QTensor)> {
     let n_dims = reader.read_u32::<LittleEndian>()?;
     let name_len = reader.read_u32::<LittleEndian>()?;
@@ -211,11 +177,11 @@ fn read_one_tensor<R: std::io::Seek + std::io::Read>(
     }
     let dims = dims.iter().map(|&u| u as usize).collect::<Vec<_>>();
     let tensor_elems = dims.iter().product::<usize>();
-    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.block_size();
+    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();
     // TODO: Mmap version to avoid copying the data around?
     let mut raw_data = vec![0u8; size_in_bytes];
     reader.read_exact(&mut raw_data)?;
-    match qtensor_from_ggml(ggml_dtype, &raw_data, dims, device) {
+    match qtensor_from_ggml(ggml_dtype, &raw_data, dims) {
         Ok(tensor) => Ok((name, tensor)),
         Err(e) => crate::bail!("Error creating tensor {name}: {e}"),
     }
@@ -226,14 +192,10 @@ pub struct Content {
     pub hparams: HParams,
     pub vocab: Vocab,
     pub tensors: HashMap<String, super::QTensor>,
-    pub device: Device,
 }

 impl Content {
-    pub fn read<R: std::io::Seek + std::io::Read>(
-        reader: &mut R,
-        device: &Device,
-    ) -> Result<Content> {
+    pub fn read<R: std::io::Seek + std::io::Read>(reader: &mut R) -> Result<Content> {
         // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.cpp#L505
         let last_position = reader.seek(std::io::SeekFrom::End(0))?;
         reader.seek(std::io::SeekFrom::Start(0))?;
@@ -243,16 +205,14 @@ impl Content {
         let mut tensors = HashMap::new();

         while reader.stream_position()? != last_position {
-            let (name, tensor) = read_one_tensor(reader, magic, device)?;
+            let (name, tensor) = read_one_tensor(reader, magic)?;
             tensors.insert(name, tensor);
         }
-        let device = device.clone();
         Ok(Self {
             magic,
             hparams,
             vocab,
             tensors,
-            device,
         })
     }

@@ -1,8 +1,9 @@
-//! Support for the [GGUF file format](https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md).
+//! Support for the GGUF file format.
 //!
+//! Spec: https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md

 use super::{GgmlDType, QTensor};
-use crate::{Context, Device, Result};
+use crate::Result;
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use std::collections::HashMap;

@@ -28,7 +29,6 @@ impl TryFrom<u32> for Magic {
 pub enum VersionedMagic {
     GgufV1,
     GgufV2,
-    GgufV3,
 }

 impl VersionedMagic {
@@ -39,8 +39,7 @@ impl VersionedMagic {
         let versioned_magic = match (magic, version) {
             (Magic::Gguf, 1) => Self::GgufV1,
             (Magic::Gguf, 2) => Self::GgufV2,
-            (Magic::Gguf, 3) => Self::GgufV3,
-            _ => crate::bail!("gguf: unsupported magic/version {magic:?}/{version}"),
+            _ => crate::bail!("ggml: unsupported magic/version {magic:?}/{version}"),
         };
         Ok(versioned_magic)
     }
@@ -58,25 +57,14 @@ impl TensorInfo {
         &self,
         reader: &mut R,
         tensor_data_offset: u64,
-        device: &Device,
     ) -> Result<QTensor> {
         let tensor_elems = self.shape.elem_count();
-        let block_size = self.ggml_dtype.block_size();
-        if tensor_elems % block_size != 0 {
-            crate::bail!(
-            "the number of elements {tensor_elems} is not divisible by the block size {block_size}"
-            )
-        }
-        let size_in_bytes = tensor_elems / block_size * self.ggml_dtype.type_size();
+        let size_in_bytes =
+            tensor_elems * self.ggml_dtype.type_size() / self.ggml_dtype.blck_size();
         let mut raw_data = vec![0u8; size_in_bytes];
         reader.seek(std::io::SeekFrom::Start(tensor_data_offset + self.offset))?;
         reader.read_exact(&mut raw_data)?;
-        super::ggml_file::qtensor_from_ggml(
-            self.ggml_dtype,
-            &raw_data,
-            self.shape.dims().to_vec(),
-            device,
-        )
+        super::ggml_file::qtensor_from_ggml(self.ggml_dtype, &raw_data, self.shape.dims().to_vec())
     }
 }

@@ -91,9 +79,7 @@ pub struct Content {
 fn read_string<R: std::io::Read>(reader: &mut R, magic: &VersionedMagic) -> Result<String> {
     let len = match magic {
         VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-        VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-            reader.read_u64::<LittleEndian>()? as usize
-        }
+        VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
     };
     let mut v = vec![0u8; len];
     reader.read_exact(&mut v)?;
@@ -134,6 +120,7 @@ pub enum ValueType {
     // The value is a UTF-8 non-null-terminated string, with length prepended.
     String,
     // The value is an array of other values, with the length and type prepended.
+    ///
     // Arrays can be nested, and the length of the array is the number of elements in the array, not the number of bytes.
     Array,
 }
@@ -216,16 +203,10 @@ impl Value {
         }
     }

-    /// This will also automatically upcast any integral types which will not truncate.
     pub fn to_u64(&self) -> Result<u64> {
         match self {
             Self::U64(v) => Ok(*v),
-            // Autoupcast cases here
-            Self::U8(v) => Ok(*v as u64),
-            Self::U16(v) => Ok(*v as u64),
-            Self::U32(v) => Ok(*v as u64),
-            Self::Bool(v) => Ok(*v as u64),
-            v => crate::bail!("not a u64 or upcastable to u64 {v:?}"),
+            v => crate::bail!("not a u64 {v:?}"),
         }
     }

@@ -298,9 +279,7 @@ impl Value {
             let value_type = ValueType::from_u32(value_type)?;
             let len = match magic {
                 VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-                VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-                    reader.read_u64::<LittleEndian>()? as usize
-                }
+                VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
             };
             let mut vs = Vec::with_capacity(len);
             for _ in 0..len {
@@ -338,7 +317,7 @@ impl Value {
                 if value_type.len() != 1 {
                     crate::bail!("multiple value-types in the same array {value_type:?}")
                 }
-                value_type.into_iter().next().context("empty value_type")?
+                value_type.into_iter().next().unwrap()
             };
             w.write_u32::<LittleEndian>(value_type.to_u32())?;
             w.write_u64::<LittleEndian>(v.len() as u64)?;
@@ -397,15 +376,11 @@ impl Content {

         let tensor_count = match magic {
             VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-            VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-                reader.read_u64::<LittleEndian>()? as usize
-            }
+            VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
         };
         let metadata_kv_count = match magic {
             VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-            VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
-                reader.read_u64::<LittleEndian>()? as usize
-            }
+            VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
         };

         let mut metadata = HashMap::new();
@@ -427,7 +402,7 @@ impl Content {
                 reader.read_u32_into::<LittleEndian>(&mut dimensions)?;
                 dimensions.into_iter().map(|c| c as usize).collect()
             }
-            VersionedMagic::GgufV2 | VersionedMagic::GgufV3 => {
+            VersionedMagic::GgufV2 => {
                 let mut dimensions = vec![0; n_dimensions as usize];
                 reader.read_u64_into::<LittleEndian>(&mut dimensions)?;
                 dimensions.into_iter().map(|c| c as usize).collect()
@@ -457,7 +432,7 @@ impl Content {
             Some(Value::I32(v)) if *v >= 0 => *v as u64,
             _ => DEFAULT_ALIGNMENT,
         };
-        let tensor_data_offset = position.div_ceil(alignment) * alignment;
+        let tensor_data_offset = (position + alignment - 1) / alignment * alignment;
         Ok(Self {
             magic,
             metadata,
@@ -470,13 +445,12 @@ impl Content {
         &self,
         reader: &mut R,
         name: &str,
-        device: &Device,
     ) -> Result<QTensor> {
         let tensor_info = match self.tensor_infos.get(name) {
             Some(tensor_info) => tensor_info,
-            None => crate::bail!("cannot find tensor info for {name}"),
+            None => crate::bail!("cannot find tensor-infor for {name}"),
         };
-        tensor_info.read(reader, self.tensor_data_offset, device)
+        tensor_info.read(reader, self.tensor_data_offset)
     }
 }

@@ -528,9 +502,10 @@ pub fn write<W: std::io::Seek + std::io::Write>(
                 "internal error, unexpected current position {tensor_start_pos} {offset} {pos}"
             )
         }
-        let data = tensor.data()?;
-        let size_in_bytes = data.len();
-        w.write_all(&data)?;
+        let data_ptr = tensor.as_ptr();
+        let size_in_bytes = tensor.storage_size_in_bytes();
+        let data = unsafe { std::slice::from_raw_parts(data_ptr, size_in_bytes) };
+        w.write_all(data)?;
        let padding = 31 - (31 + size_in_bytes) % 32;
         w.write_all(&vec![0u8; padding])?;
     }
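
The `padding` expression in the `write` hunk above rounds each tensor payload up to a 32-byte boundary without branching: `31 - (31 + n) % 32` is the distance from `n` to the next multiple of 32, and zero when `n` is already aligned. A quick standalone check of that identity (not part of the diff):

// Hypothetical check of the branch-free padding identity used in `write`.
fn main() {
    for n in 0usize..=128 {
        let padding = 31 - (31 + n) % 32;
        assert_eq!((n + padding) % 32, 0); // padded size lands on a 32-byte boundary
        assert!(padding < 32); // an already-aligned payload gets no extra block
    }
}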
@@ -34,9 +34,6 @@ pub trait GgmlType: Sized + Clone + Send + Sync {
     /// Dot product used as a building block for quantized mat-mul.
     /// n is the number of elements to be considered.
     fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32>;

-    /// Generic implementation of the dot product without simd optimizations.
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32>;
 }

 #[derive(Debug, Clone, PartialEq)]
@@ -228,17 +225,15 @@ impl GgmlType for BlockQ4_0 {
         #[cfg(target_feature = "neon")]
         return super::neon::vec_dot_q4_0_q8_0(n, xs, ys);

-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q4_0_q8_0(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         let qk = QK8_0;
+        let nb = n / qk;
         if n % QK8_0 != 0 {
             crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
         }
+        if nb % 2 != 0 {
+            crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
+        }

         // Generic implementation.
         let mut sumf = 0f32;
         for (xs, ys) in xs.iter().zip(ys.iter()) {
@@ -260,10 +255,6 @@ impl GgmlType for BlockQ4_1 {
     type VecDotType = BlockQ8_1;

     fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         // ggml_vec_dot_q4_1_q8_1
         let qk = QK8_1;
         if n % qk != 0 {
@@ -363,10 +354,7 @@ impl GgmlType for BlockQ5_0 {
         if nb % 2 != 0 {
             crate::bail!("vec_dot_q5_0_q8_0: {n}, nb is not divisible by 2")
         }
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(_n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         // Generic implementation.
         let mut sumf = 0f32;

@@ -457,10 +445,6 @@ impl GgmlType for BlockQ5_1 {
     type VecDotType = BlockQ8_1;

     fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         let qk = Self::BLCK_SIZE;
         if n % Self::BLCK_SIZE != 0 {
             crate::bail!("vec_dot_q5_1_q8_1: {n} is not divisible by {qk}")
@@ -622,13 +606,6 @@ impl GgmlType for BlockQ8_0 {
         #[cfg(target_feature = "neon")]
         return super::neon::vec_dot_q8_0_q8_0(n, xs, ys);

-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q8_0_q8_0(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         let qk = QK8_0;
         if n % QK8_0 != 0 {
             crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
@@ -654,11 +631,7 @@ impl GgmlType for BlockQ8_1 {
     const BLCK_SIZE: usize = QK8_1;
     type VecDotType = BlockQ8_1;

-    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(_n: usize, _xs: &[Self], _ys: &[Self::VecDotType]) -> Result<f32> {
+    fn vec_dot(_n: usize, _xs: &[Self], _ys: &[Self::VecDotType]) -> Result<f32> {
         unimplemented!("no support for vec-dot on Q8_1")
     }

@@ -708,13 +681,6 @@ impl GgmlType for BlockQ2K {
         #[cfg(target_feature = "neon")]
         return super::neon::vec_dot_q2k_q8k(n, xs, ys);

-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q2k_q8k(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         if n % QK_K != 0 {
             crate::bail!("vec_dot_q2k_q8k: {n} is not divisible by {QK_K}")
         }
@@ -735,17 +701,18 @@ impl GgmlType for BlockQ2K {

         let mut isum = 0;
         let mut is = 0;
+        let mut d;
         for _ in 0..(QK_K / 128) {
             let mut shift = 0;
             for _ in 0..4 {
-                let d = (sc[is] & 0xF) as i32;
+                d = (sc[is] & 0xF) as i32;
                 is += 1;
                 let mut isuml = 0;
                 for l in 0..16 {
                     isuml += q8[l] as i32 * (((q2[l] >> shift) & 3) as i32);
                 }
                 isum += d * isuml;
-                let d = (sc[is] & 0xF) as i32;
+                d = (sc[is] & 0xF) as i32;
                 is += 1;
                 isuml = 0;
                 for l in 16..32 {
@@ -884,10 +851,6 @@ impl GgmlType for BlockQ3K {
         #[cfg(target_feature = "neon")]
         return super::neon::vec_dot_q3k_q8k(n, xs, ys);

-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         if n % QK_K != 0 {
             crate::bail!("vec_dot_q3k_q8k: {n} is not divisible by {QK_K}")
         }
@@ -1114,6 +1077,7 @@ impl GgmlType for BlockQ3K {
         let d_all = block.d.to_f32();
         let mut m = 1;
         let mut is = 0;
+        let mut dl;

         // Dequantize both 128 long blocks
         // 32 qs values per 128 long block
@@ -1124,7 +1088,7 @@ impl GgmlType for BlockQ3K {
                 for (scale_index, scale_scoped_y) in
                     shift_scoped_y.chunks_exact_mut(16).enumerate()
                 {
-                    let dl = d_all * (scales[is] as f32 - 32.0);
+                    dl = d_all * (scales[is] as f32 - 32.0);
                     for (i, inner_y) in scale_scoped_y.iter_mut().enumerate() {
                         let new_y = dl
                             * (((qs[i + 16 * scale_index] >> shift) & 3) as i8
@@ -1162,13 +1126,6 @@ impl GgmlType for BlockQ4K {
         #[cfg(target_feature = "neon")]
         return super::neon::vec_dot_q4k_q8k(n, xs, ys);

-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q4k_q8k(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         if n % QK_K != 0 {
             crate::bail!("vec_dot_q4k_q8k: {n} is not divisible by {QK_K}")
         }
@@ -1355,10 +1312,6 @@ impl GgmlType for BlockQ5K {
         #[cfg(target_feature = "neon")]
         return super::neon::vec_dot_q5k_q8k(n, xs, ys);

-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         if n % QK_K != 0 {
             crate::bail!("vec_dot_q5k_q8k: {n} is not divisible by {QK_K}")
         }
@@ -1545,13 +1498,13 @@ impl GgmlType for BlockQ5K {
             let d2 = d * sc as f32;
             let m2 = min * m as f32;
             for (ql, qh) in ql.iter().zip(qh) {
-                let to_add = if qh & u1 != 0 { 16f32 } else { 0f32 };
-                y[ys_index] = d1 * ((ql & 0xF) as f32 + to_add) - m1;
+                let to_add = if qh & u1 != 0 { 16 } else { 1 };
+                y[ys_index] = d1 * ((ql & 0xF) + to_add) as f32 - m1;
                 ys_index += 1;
             }
             for (ql, qh) in ql.iter().zip(qh) {
-                let to_add = if qh & u2 != 0 { 16f32 } else { 0f32 };
-                y[ys_index] = d2 * ((ql >> 4) as f32 + to_add) - m2;
+                let to_add = if qh & u2 != 0 { 16 } else { 1 };
+                y[ys_index] = d2 * ((ql >> 4) + to_add) as f32 - m2;
                 ys_index += 1;
             }
             is += 2;
@@ -1576,13 +1529,6 @@ impl GgmlType for BlockQ6K {
         #[cfg(target_feature = "neon")]
         return super::neon::vec_dot_q6k_q8k(n, xs, ys);

-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q6k_q8k(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         if n % QK_K != 0 {
             crate::bail!("vec_dot_q6k_q8k: {n} is not divisible by {QK_K}")
         }
@@ -1751,38 +1697,8 @@ impl GgmlType for BlockQ8K {
     const BLCK_SIZE: usize = QK_K;
     type VecDotType = BlockQ8K;

-    #[allow(unreachable_code)]
-    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        #[cfg(target_feature = "avx")]
-        return super::avx::vec_dot_q8k_q8k(n, xs, ys);
-
-        #[cfg(target_feature = "neon")]
-        return super::neon::vec_dot_q8k_q8k(n, xs, ys);
-
-        #[cfg(target_feature = "simd128")]
-        return super::simd128::vec_dot_q8k_q8k(n, xs, ys);
-
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        let qk = QK_K;
-        if n % QK_K != 0 {
-            crate::bail!("vec_dot_q8k_q8k: {n} is not divisible by {qk}")
-        }
-
-        // Generic implementation.
-        let mut sumf = 0f32;
-        for (xs, ys) in xs.iter().zip(ys.iter()) {
-            let sum_i = xs
-                .qs
-                .iter()
-                .zip(ys.qs.iter())
-                .map(|(&x, &y)| x as i32 * y as i32)
-                .sum::<i32>();
-            sumf += sum_i as f32 * xs.d * ys.d
-        }
-        Ok(sumf)
+    fn vec_dot(_n: usize, _xs: &[Self], _ys: &[Self::VecDotType]) -> Result<f32> {
+        unreachable!()
     }

     fn from_float(xs: &[f32], ys: &mut [Self]) -> Result<()> {
@@ -1850,8 +1766,8 @@ pub fn matmul<T: GgmlType>(
         crate::bail!("unexpected lhs length {} {mkn:?}", lhs.len());
     }

-    let k_in_lhs_blocks = k.div_ceil(T::BLCK_SIZE);
-    let k_in_rhs_blocks = k.div_ceil(T::VecDotType::BLCK_SIZE);
+    let k_in_lhs_blocks = (k + T::BLCK_SIZE - 1) / T::BLCK_SIZE;
+    let k_in_rhs_blocks = (k + T::VecDotType::BLCK_SIZE - 1) / T::VecDotType::BLCK_SIZE;
     // TODO: Do not make this copy if the DotType is f32.
     // TODO: Pre-allocate this.
     let mut lhs_b = vec![T::VecDotType::zeros(); m * k_in_lhs_blocks];
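
The two forms in this hunk are equivalent: `k.div_ceil(b)` and the manual `(k + b - 1) / b` both count how many b-element blocks are needed to cover `k` elements. A standalone check of the equivalence (assuming Rust 1.73+ for `usize::div_ceil`; not part of the diff):

// Hypothetical check that the two block-count forms agree.
fn main() {
    let b = 32usize; // e.g. T::BLCK_SIZE for the 32-element block types
    for k in 1usize..=1024 {
        assert_eq!(k.div_ceil(b), (k + b - 1) / b);
    }
}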
@@ -1888,10 +1804,6 @@ impl GgmlType for f32 {
     type VecDotType = f32;

     fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         if xs.len() < n {
             crate::bail!("size mismatch {} < {n}", xs.len())
         }
@@ -1926,10 +1838,6 @@ impl GgmlType for f16 {
     type VecDotType = f16;

     fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
-        Self::vec_dot_unopt(n, xs, ys)
-    }
-
-    fn vec_dot_unopt(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
         if xs.len() < n {
             crate::bail!("size mismatch {} < {n}", xs.len())
         }
@@ -1,230 +0,0 @@
use super::{GgmlDType, QStorage};
use crate::backend::BackendStorage;
use crate::{DType, MetalDevice, MetalStorage, Result, Shape};
use metal::Buffer;
use std::sync::Arc;

pub struct QMetalStorage {
    dtype: GgmlDType,
    device: MetalDevice,
    buffer: Arc<Buffer>,
}

impl QMetalStorage {
    pub fn zeros(device: &MetalDevice, elem_count: usize, dtype: GgmlDType) -> Result<Self> {
        let size = elem_count * dtype.type_size() / dtype.block_size();
        let buffer = device.allocate_zeros(size)?;
        Ok(Self {
            buffer,
            device: device.clone(),
            dtype,
        })
    }

    pub fn dtype(&self) -> GgmlDType {
        self.dtype
    }

    pub fn device(&self) -> &MetalDevice {
        &self.device
    }

    pub fn buffer(&self) -> &Buffer {
        &self.buffer
    }

    pub fn dequantize(&self, elem_count: usize) -> Result<MetalStorage> {
        use crate::quantized::k_quants::GgmlType;

        let buffer = self.device.new_buffer_managed(self.buffer.length())?;
        let command_buffer = self.device.command_buffer()?;
        command_buffer.set_label("to_cpu");
        let blit = command_buffer.new_blit_command_encoder();
        blit.set_label("blit_to_cpu");
        blit.copy_from_buffer(&self.buffer, 0, &buffer, 0, self.buffer.length());
        blit.end_encoding();
        self.device.wait_until_completed()?;
        let mut out = vec![0.0; elem_count];
        let block_len = elem_count / self.dtype.block_size();
        match self.dtype {
            GgmlDType::F32 => {
                let vec: Vec<f32> = read_to_vec(&buffer, block_len);
                f32::to_float(&vec, &mut out)?;
            }
            GgmlDType::F16 => {
                let vec: Vec<half::f16> = read_to_vec(&buffer, block_len);
                half::f16::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q4_0 => {
                let vec: Vec<crate::quantized::BlockQ4_0> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ4_0::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q4_1 => {
                let vec: Vec<crate::quantized::BlockQ4_1> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ4_1::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q5_0 => {
                let vec: Vec<crate::quantized::BlockQ5_0> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ5_0::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q5_1 => {
                let vec: Vec<crate::quantized::BlockQ5_1> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ5_1::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q8_0 => {
                let vec: Vec<crate::quantized::BlockQ8_0> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ8_0::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q8_1 => {
                let vec: Vec<crate::quantized::BlockQ8_1> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ8_1::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q2K => {
                let vec: Vec<crate::quantized::BlockQ2K> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ2K::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q3K => {
                let vec: Vec<crate::quantized::BlockQ3K> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ3K::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q4K => {
                let vec: Vec<crate::quantized::BlockQ4K> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ4K::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q5K => {
                let vec: Vec<crate::quantized::BlockQ5K> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ5K::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q6K => {
                let vec: Vec<crate::quantized::BlockQ6K> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ6K::to_float(&vec, &mut out)?;
            }
            GgmlDType::Q8K => {
                let vec: Vec<crate::quantized::BlockQ8K> = read_to_vec(&buffer, block_len);
                crate::quantized::BlockQ8K::to_float(&vec, &mut out)?;
            }
        }

        let buffer = self.device.new_buffer_with_data(&out)?;
        Ok(MetalStorage::new(
            buffer,
            self.device.clone(),
            elem_count,
            DType::F32,
        ))
    }

    pub fn quantize(&mut self, src: &MetalStorage) -> Result<()> {
        // Quantization only happens on CPU for now.
        let src = src.to_cpu::<f32>()?;
        let elem_count = src.len();
        let src = crate::Storage::Cpu(crate::CpuStorage::F32(src));
        let mut qcpu_storage = crate::Device::Cpu.qzeros(elem_count, self.dtype)?;
        qcpu_storage.quantize(&src)?;
        let buffer = self.device.new_buffer_with_data(&qcpu_storage.data()?)?;
        self.buffer = buffer;
        Ok(())
    }

    pub fn storage_size_in_bytes(&self) -> usize {
        self.buffer.length() as usize
    }

    pub fn fwd(
        &self,
        self_shape: &Shape,
        storage: &MetalStorage,
        layout: &crate::Layout,
    ) -> Result<(MetalStorage, Shape)> {
        use crate::MetalError;

        if !layout.is_contiguous() {
            crate::bail!("input tensor is not contiguous {layout:?}")
        }
        let src_shape = layout.shape();
        // self is transposed so n is first then k.
        if src_shape.rank() < 2 {
            crate::bail!("input tensor has only one dimension {layout:?}")
        }
        let (n, k) = self_shape.dims2()?;
        let mut dst_shape = src_shape.dims().to_vec();

        // We always use a single batch dimension and stack all the tensors in the batch on the
        // second dimension as the implementation in candle-metal-kernels doesn't handle batch
        // properly.
        let m = match dst_shape.len() {
            3 => dst_shape[0] * dst_shape[1],
            2 => dst_shape[0],
            n => crate::bail!("Invalid rank {n} for quantized matmul metal"),
        };
        let last_k = dst_shape.pop().unwrap();
        if last_k != k {
            crate::bail!("input tensor {layout:?} incompatible with {:?}", self_shape)
        }
        dst_shape.push(n);
        let dst_shape = Shape::from(dst_shape);
        let device = storage.device().clone();
        let dst = device.new_buffer(dst_shape.elem_count(), DType::F32, "qmatmul")?;
        let command_buffer = device.command_buffer()?;
        // In some cases it would be better to use the mm variant, though it has its drawbacks
        // around memory alignment.
        for batch_id in 0..m {
            candle_metal_kernels::call_quantized_matmul_mv_t(
                device.device(),
                &command_buffer,
                device.kernels(),
                self.dtype.into(),
                (1, 1, n, k),
                storage.buffer(),
                (layout.start_offset() + batch_id * k) * storage.dtype().size_in_bytes(),
                &self.buffer,
                batch_id * n * DType::F32.size_in_bytes(),
                &dst,
            )
            .map_err(MetalError::from)?;
        }
        let dst_storage = crate::MetalStorage::new(dst, device, dst_shape.elem_count(), DType::F32);
        Ok((dst_storage, dst_shape))
    }
}

pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
    device: &MetalDevice,
    data: &[T],
) -> Result<QStorage> {
    let buffer = device.new_buffer_with_data(data)?;
    let device = device.clone();
    Ok(QStorage::Metal(QMetalStorage {
        dtype: T::DTYPE,
        device,
        buffer,
    }))
}

fn read_to_vec<T: Clone>(buffer: &Buffer, n: usize) -> Vec<T> {
    let ptr = buffer.contents() as *const T;
    assert!(!ptr.is_null());
    let slice = unsafe { std::slice::from_raw_parts(ptr, n) };
    slice.to_vec()
}

impl From<GgmlDType> for candle_metal_kernels::GgmlDType {
    fn from(value: GgmlDType) -> Self {
        match value {
            GgmlDType::Q4_0 => candle_metal_kernels::GgmlDType::Q4_0,
            GgmlDType::Q4_1 => candle_metal_kernels::GgmlDType::Q4_1,
            GgmlDType::Q5_0 => candle_metal_kernels::GgmlDType::Q5_0,
            GgmlDType::Q5_1 => candle_metal_kernels::GgmlDType::Q5_1,
            GgmlDType::Q8_0 => candle_metal_kernels::GgmlDType::Q8_0,
            GgmlDType::Q8_1 => candle_metal_kernels::GgmlDType::Q8_1,
            GgmlDType::Q2K => candle_metal_kernels::GgmlDType::Q2K,
            GgmlDType::Q3K => candle_metal_kernels::GgmlDType::Q3K,
            GgmlDType::Q4K => candle_metal_kernels::GgmlDType::Q4K,
            GgmlDType::Q5K => candle_metal_kernels::GgmlDType::Q5K,
            GgmlDType::Q6K => candle_metal_kernels::GgmlDType::Q6K,
            GgmlDType::Q8K => candle_metal_kernels::GgmlDType::Q8K,
            GgmlDType::F16 => candle_metal_kernels::GgmlDType::F16,
            GgmlDType::F32 => candle_metal_kernels::GgmlDType::F32,
        }
    }
}
@ -1,135 +1,21 @@
|
|||||||
//! Code for GGML and GGUF files
|
use crate::{Device, Result, Shape, Tensor};
|
||||||
use crate::{Context, CpuStorage, DType, Device, Result, Shape, Storage, Tensor};
|
|
||||||
use k_quants::*;
|
|
||||||
use std::borrow::Cow;
|
|
||||||
|
|
||||||
#[cfg(target_feature = "avx")]
|
#[cfg(target_feature = "avx")]
|
||||||
pub mod avx;
|
pub mod avx;
|
||||||
mod dummy_cuda;
|
|
||||||
mod dummy_metal;
|
|
||||||
pub mod ggml_file;
|
pub mod ggml_file;
|
||||||
pub mod gguf_file;
|
pub mod gguf_file;
|
||||||
pub mod k_quants;
|
pub mod k_quants;
|
||||||
#[cfg(feature = "metal")]
|
|
||||||
pub mod metal;
|
|
||||||
#[cfg(not(feature = "metal"))]
|
|
||||||
mod metal {
|
|
||||||
pub use super::dummy_metal::*;
|
|
||||||
}
|
|
||||||
#[cfg(feature = "cuda")]
|
|
||||||
pub mod cuda;
|
|
||||||
#[cfg(not(feature = "cuda"))]
|
|
-mod cuda {
-    pub use super::dummy_cuda::*;
-}
 
 #[cfg(target_feature = "neon")]
 pub mod neon;
-#[cfg(target_feature = "simd128")]
-pub mod simd128;
 pub mod utils;
-use half::f16;
 
 pub use k_quants::GgmlType;
 
 pub struct QTensor {
-    storage: QStorage,
+    data: Box<dyn QuantizedType>,
     shape: Shape,
 }
 
-impl Device {
-    fn qzeros(&self, elem_count: usize, dtype: GgmlDType) -> Result<QStorage> {
-        match self {
-            Device::Cpu => {
-                let storage = dtype.cpu_zeros(elem_count);
-                Ok(QStorage::Cpu(storage))
-            }
-            Device::Metal(metal) => {
-                let storage = metal::QMetalStorage::zeros(metal, elem_count, dtype)?;
-                Ok(QStorage::Metal(storage))
-            }
-            Device::Cuda(cuda) => {
-                let storage = cuda::QCudaStorage::zeros(cuda, elem_count, dtype)?;
-                Ok(QStorage::Cuda(storage))
-            }
-        }
-    }
-}
-
-pub enum QStorage {
-    Cpu(Box<dyn QuantizedType>),
-    Metal(metal::QMetalStorage),
-    Cuda(cuda::QCudaStorage),
-}
-
-impl QStorage {
-    fn block_size(&self) -> usize {
-        match self {
-            QStorage::Cpu(storage) => storage.block_size(),
-            QStorage::Metal(storage) => storage.dtype().block_size(),
-            QStorage::Cuda(storage) => storage.dtype().block_size(),
-        }
-    }
-
-    fn dtype(&self) -> GgmlDType {
-        match self {
-            QStorage::Cpu(storage) => storage.dtype(),
-            QStorage::Metal(storage) => storage.dtype(),
-            QStorage::Cuda(storage) => storage.dtype(),
-        }
-    }
-
-    fn device(&self) -> Device {
-        match self {
-            QStorage::Cpu(_storage) => Device::Cpu,
-            QStorage::Metal(storage) => Device::Metal(storage.device().clone()),
-            QStorage::Cuda(storage) => Device::Cuda(storage.device().clone()),
-        }
-    }
-
-    fn size_in_bytes(&self) -> usize {
-        match self {
-            QStorage::Cpu(storage) => storage.storage_size_in_bytes(),
-            QStorage::Metal(storage) => storage.storage_size_in_bytes(),
-            QStorage::Cuda(storage) => storage.storage_size_in_bytes(),
-        }
-    }
-
-    fn quantize(&mut self, src: &Storage) -> Result<()> {
-        match (self, src) {
-            (QStorage::Cpu(storage), Storage::Cpu(src)) => {
-                storage.from_float(src.as_slice::<f32>()?)?;
-            }
-            (QStorage::Metal(storage), Storage::Metal(src)) => storage.quantize(src)?,
-            (QStorage::Cuda(storage), Storage::Cuda(src)) => storage.quantize(src)?,
-            _ => crate::bail!("Invalid dequantize storage locations do not match"),
-        }
-        Ok(())
-    }
-
-    fn dequantize(&self, elem_count: usize) -> Result<Storage> {
-        match self {
-            QStorage::Cpu(storage) => Ok(Storage::Cpu(storage.dequantize(elem_count)?)),
-            QStorage::Metal(storage) => Ok(Storage::Metal(storage.dequantize(elem_count)?)),
-            QStorage::Cuda(storage) => Ok(Storage::Cuda(storage.dequantize(elem_count)?)),
-        }
-    }
-
-    fn data(&self) -> Result<Cow<[u8]>> {
-        match self {
-            QStorage::Cpu(storage) => {
-                let data_ptr = storage.as_ptr();
-                let size_in_bytes = storage.storage_size_in_bytes();
-                let data = unsafe { std::slice::from_raw_parts(data_ptr, size_in_bytes) };
-                Ok(Cow::from(data))
-            }
-            QStorage::Metal(_) | QStorage::Cuda(_) => {
-                crate::bail!("not implemented");
-            }
-        }
-    }
-}
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum GgmlDType {
     F32,
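Note: to make the `QStorage` plumbing above concrete, here is a minimal usage sketch against the left-hand (0.9.0-alpha) API of this diff; the shape and dtype are illustrative only.

```rust
use candle_core::quantized::{GgmlDType, QTensor};
use candle_core::{Device, Tensor};

fn roundtrip() -> candle_core::Result<()> {
    let dev = Device::Cpu;
    // 8 x 32 = 256 elements; 32 divides the last dim, so the shape check passes.
    let xs = Tensor::arange(0f32, 256f32, &dev)?.reshape((8, 32))?;
    // Internally: Device::qzeros allocates a QStorage for the target dtype,
    // then QStorage::quantize dispatches to the matching backend (Cpu here).
    let q = QTensor::quantize(&xs, GgmlDType::Q4_0)?;
    // QStorage::dequantize goes back to f32 storage on the same device.
    let ys = q.dequantize(&dev)?;
    assert_eq!(ys.dims(), &[8, 32]);
    Ok(())
}
```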
@@ -189,25 +75,6 @@ impl GgmlDType {
         }
     }
 
-    /// The block dtype
-    pub fn cpu_zeros(&self, elem_count: usize) -> Box<dyn QuantizedType> {
-        match self {
-            Self::F32 => Box::new(vec![f32::zeros(); elem_count]),
-            Self::F16 => Box::new(vec![f16::zeros(); elem_count]),
-            Self::Q4_0 => Box::new(vec![BlockQ4_0::zeros(); elem_count / BlockQ4_0::BLCK_SIZE]),
-            Self::Q4_1 => Box::new(vec![BlockQ4_1::zeros(); elem_count / BlockQ4_1::BLCK_SIZE]),
-            Self::Q5_0 => Box::new(vec![BlockQ5_0::zeros(); elem_count / BlockQ5_0::BLCK_SIZE]),
-            Self::Q5_1 => Box::new(vec![BlockQ5_1::zeros(); elem_count / BlockQ5_1::BLCK_SIZE]),
-            Self::Q8_0 => Box::new(vec![BlockQ8_0::zeros(); elem_count / BlockQ8_0::BLCK_SIZE]),
-            Self::Q8_1 => Box::new(vec![BlockQ8_1::zeros(); elem_count / BlockQ8_1::BLCK_SIZE]),
-            Self::Q2K => Box::new(vec![BlockQ2K::zeros(); elem_count / BlockQ2K::BLCK_SIZE]),
-            Self::Q3K => Box::new(vec![BlockQ3K::zeros(); elem_count / BlockQ3K::BLCK_SIZE]),
-            Self::Q4K => Box::new(vec![BlockQ4K::zeros(); elem_count / BlockQ4K::BLCK_SIZE]),
-            Self::Q5K => Box::new(vec![BlockQ5K::zeros(); elem_count / BlockQ5K::BLCK_SIZE]),
-            Self::Q6K => Box::new(vec![BlockQ6K::zeros(); elem_count / BlockQ6K::BLCK_SIZE]),
-            Self::Q8K => Box::new(vec![BlockQ8K::zeros(); elem_count / BlockQ8K::BLCK_SIZE]),
-        }
-    }
     /// The type size for blocks in bytes.
     pub fn type_size(&self) -> usize {
         use k_quants::*;
@@ -231,7 +98,7 @@ impl GgmlDType {
     }
 
     /// The block size, i.e. the number of elements stored in each block.
-    pub fn block_size(&self) -> usize {
+    pub fn blck_size(&self) -> usize {
         match self {
             Self::F32 => 1,
             Self::F16 => 1,
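Note: `block_size` and `type_size` together determine the quantized footprint. A small sketch of that arithmetic, using the GGML Q4_0 block layout (32 elements per block, 18 bytes per block: a 2-byte f16 scale plus 16 bytes of 4-bit quants); the helper name is hypothetical:

```rust
// Hypothetical helper: bytes needed to store `elem_count` values as Q4_0,
// i.e. (elem_count / block_size) blocks of type_size bytes (4.5 bits/weight).
fn q4_0_storage_bytes(elem_count: usize) -> usize {
    let block_size = 32; // GgmlDType::Q4_0.block_size()
    let type_size = 18; // GgmlDType::Q4_0.type_size()
    assert_eq!(elem_count % block_size, 0);
    (elem_count / block_size) * type_size
}
```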
@@ -250,13 +117,9 @@ impl GgmlDType {
 pub trait QuantizedType: Send + Sync {
     fn dtype(&self) -> GgmlDType;
     fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()>;
-    fn dequantize(&self, elem_count: usize) -> Result<CpuStorage>;
+    fn to_float(&self, ys: &mut [f32]) -> Result<()>;
     fn storage_size_in_bytes(&self) -> usize;
     fn as_ptr(&self) -> *const u8;
-    fn block_size(&self) -> usize;
-    #[allow(clippy::wrong_self_convention)]
-    fn from_float(&mut self, xs: &[f32]) -> Result<()>;
-    fn size(&self) -> usize;
 }
 
 impl<T: k_quants::GgmlType + Send + Sync> QuantizedType for Vec<T> {
@@ -264,26 +127,12 @@ impl<T: k_quants::GgmlType + Send + Sync> QuantizedType for Vec<T> {
         k_quants::matmul(mkn, lhs, self.as_slice(), dst)
     }
 
-    fn size(&self) -> usize {
-        self.len() * core::mem::size_of::<T>()
-    }
-
-    fn from_float(&mut self, xs: &[f32]) -> Result<()> {
-        T::from_float(xs, self)
-    }
-
     fn dtype(&self) -> GgmlDType {
         T::DTYPE
     }
 
-    fn block_size(&self) -> usize {
-        T::BLCK_SIZE
-    }
-
-    fn dequantize(&self, elem_count: usize) -> Result<CpuStorage> {
-        let mut ys = vec![0.0f32; elem_count];
-        T::to_float(self.as_slice(), &mut ys)?;
-        Ok(CpuStorage::F32(ys))
+    fn to_float(&self, ys: &mut [f32]) -> Result<()> {
+        T::to_float(self.as_slice(), ys)
     }
 
     fn storage_size_in_bytes(&self) -> usize {
@@ -301,53 +150,56 @@ impl std::fmt::Debug for QTensor {
     }
 }
 
-fn check_shape(shape: &Shape, block_size: usize) -> Result<()> {
+fn check_shape<T: k_quants::GgmlType>(shape: &Shape) -> Result<()> {
     let dims = shape.dims();
     if dims.is_empty() {
         crate::bail!("scalar tensor cannot be quantized {shape:?}")
     }
-    if dims[dims.len() - 1] % block_size != 0 {
+    if dims[dims.len() - 1] % T::BLCK_SIZE != 0 {
         crate::bail!(
             "quantized tensor must have their last dim divisible by block size {shape:?} {}",
-            block_size
+            T::BLCK_SIZE
         )
     }
     Ok(())
 }
 
 impl QTensor {
-    pub fn new<S: Into<Shape>>(storage: QStorage, shape: S) -> Result<Self> {
+    pub fn new<S: Into<Shape>, T: k_quants::GgmlType + Send + Sync + 'static>(
+        data: Vec<T>,
+        shape: S,
+    ) -> Result<Self> {
         let shape = shape.into();
-        check_shape(&shape, storage.block_size())?;
-        Ok(Self { storage, shape })
+        check_shape::<T>(&shape)?;
+        Ok(Self {
+            data: Box::new(data),
+            shape,
+        })
     }
 
-    pub fn quantize(src: &Tensor, dtype: GgmlDType) -> Result<Self> {
+    pub fn quantize<T: k_quants::GgmlType + Send + Sync + 'static>(src: &Tensor) -> Result<Self> {
         let shape = src.shape();
-        let block_size = dtype.block_size();
-        check_shape(shape, block_size)?;
-        let src = src.to_dtype(crate::DType::F32)?.flatten_all()?;
-        let elem_count = shape.elem_count();
-        if elem_count % block_size != 0 {
+        check_shape::<T>(shape)?;
+        let src = src
+            .to_dtype(crate::DType::F32)?
+            .flatten_all()?
+            .to_vec1::<f32>()?;
+        if src.len() % T::BLCK_SIZE != 0 {
             crate::bail!(
                 "tensor size ({shape:?}) is not divisible by block size {}",
-                block_size
+                T::BLCK_SIZE
             )
         }
-        let mut storage = src.device().qzeros(elem_count, dtype)?;
-        storage.quantize(&src.storage())?;
+        let mut data = vec![T::zeros(); src.len() / T::BLCK_SIZE];
+        T::from_float(&src, &mut data)?;
         Ok(Self {
-            storage,
+            data: Box::new(data),
            shape: shape.clone(),
         })
     }
 
     pub fn dtype(&self) -> GgmlDType {
-        self.storage.dtype()
-    }
-
-    pub fn device(&self) -> Device {
-        self.storage.device()
+        self.data.dtype()
     }
 
     pub fn rank(&self) -> usize {
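Note: the practical effect of `check_shape` is that only the innermost dimension must be a multiple of the block size. A hedged sketch against the left-hand (0.9.0-alpha) API:

```rust
use candle_core::quantized::{GgmlDType, QTensor};
use candle_core::{DType, Device, Tensor};

fn shape_rules() -> candle_core::Result<()> {
    let dev = Device::Cpu;
    // Last dim 64 is a multiple of Q8_0's block size (32): accepted.
    let ok = Tensor::zeros((3, 64), DType::F32, &dev)?;
    assert!(QTensor::quantize(&ok, GgmlDType::Q8_0).is_ok());
    // Last dim 30 is not a multiple of 32: check_shape bails.
    let bad = Tensor::zeros((3, 30), DType::F32, &dev)?;
    assert!(QTensor::quantize(&bad, GgmlDType::Q8_0).is_err());
    Ok(())
}
```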
@@ -359,105 +211,38 @@ impl QTensor {
     }
 
     pub fn dequantize(&self, device: &Device) -> Result<Tensor> {
-        let storage = self.storage.dequantize(self.shape.elem_count())?;
-        let none = crate::op::BackpropOp::none();
-        crate::tensor::from_storage(storage, self.shape.clone(), none, false).to_device(device)
+        let mut f32_data = vec![0f32; self.shape.elem_count()];
+        self.data.to_float(&mut f32_data)?;
+        Tensor::from_vec(f32_data, &self.shape, device)
     }
 
-    pub fn dequantize_f16(&self, device: &Device) -> Result<Tensor> {
-        // In the CUDA case, we have a specialized kernel as this can be useful for volta
-        // architectures. https://github.com/huggingface/candle/issues/2136
-        match &self.storage {
-            QStorage::Cuda(s) => {
-                let s = s.dequantize_f16(self.shape.elem_count())?;
-                let none = crate::op::BackpropOp::none();
-                crate::tensor::from_storage(Storage::Cuda(s), self.shape.clone(), none, false)
-                    .to_device(device)
-            }
-            _ => {
-                let s = self.dequantize(device)?.to_dtype(crate::DType::F16)?;
-                Ok(s)
-            }
-        }
+    pub fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()> {
+        self.data.matmul_t(mkn, lhs, dst)
     }
 
     pub fn storage_size_in_bytes(&self) -> usize {
-        self.storage.size_in_bytes()
+        self.data.storage_size_in_bytes()
     }
 
-    pub fn data(&self) -> Result<Cow<'_, [u8]>> {
-        self.storage.data()
+    pub fn as_ptr(&self) -> *const u8 {
+        self.data.as_ptr()
     }
 }
 
-#[derive(Clone, Debug)]
-pub enum QMatMul {
-    QTensor(std::sync::Arc<QTensor>),
-    Tensor(Tensor),
-    TensorF16(Tensor),
-}
-
-thread_local! {
-    static DEQUANTIZE_ALL: bool = {
-        match std::env::var("CANDLE_DEQUANTIZE_ALL") {
-            Ok(s) => {
-                !s.is_empty() && s != "0"
-            },
-            Err(_) => false,
-        }
-    }
-}
-
-thread_local! {
-    static DEQUANTIZE_ALL_F16: bool = {
-        match std::env::var("CANDLE_DEQUANTIZE_ALL_F16") {
-            Ok(s) => {
-                !s.is_empty() && s != "0"
-            },
-            Err(_) => false,
-        }
-    }
-}
+#[derive(Debug)]
+pub struct QMatMul(std::sync::Arc<QTensor>);
 
 impl QMatMul {
-    pub fn from_arc(qtensor: std::sync::Arc<QTensor>) -> Result<Self> {
-        let dequantize = match qtensor.dtype() {
-            GgmlDType::F32 | GgmlDType::F16 => true,
-            _ => DEQUANTIZE_ALL.with(|b| *b),
-        };
-        let t = if dequantize {
-            let tensor = qtensor.dequantize(&qtensor.device())?;
-            Self::Tensor(tensor)
-        } else if DEQUANTIZE_ALL_F16.with(|b| *b) {
-            let tensor = qtensor.dequantize_f16(&qtensor.device())?;
-            Self::TensorF16(tensor)
-        } else {
-            Self::QTensor(qtensor)
-        };
-        Ok(t)
+    pub fn from_arc(qtensor: std::sync::Arc<QTensor>) -> Self {
+        Self(qtensor)
     }
 
-    pub fn from_qtensor(qtensor: QTensor) -> Result<Self> {
-        Self::from_arc(std::sync::Arc::new(qtensor))
+    pub fn from_qtensor(qtensor: QTensor) -> Self {
+        Self(std::sync::Arc::new(qtensor))
     }
 
-    pub fn dequantize_f16(&self) -> Result<Tensor> {
-        match self {
-            Self::QTensor(t) => t.dequantize_f16(&t.device()),
-            Self::Tensor(t) => t.to_dtype(DType::F16),
-            Self::TensorF16(t) => Ok(t.clone()),
-        }
-    }
-
-    pub fn forward_via_f16(&self, xs: &Tensor) -> Result<Tensor> {
-        let w = self.dequantize_f16()?;
-        let in_dtype = xs.dtype();
-        let w = match *xs.dims() {
-            [b1, b2, _, _] => w.broadcast_left((b1, b2))?.t()?,
-            [bsize, _, _] => w.broadcast_left(bsize)?.t()?,
-            _ => w.t()?,
-        };
-        xs.to_dtype(DType::F16)?.matmul(&w)?.to_dtype(in_dtype)
+    pub fn inner(&self) -> &std::sync::Arc<QTensor> {
+        &self.0
     }
 }
 
@@ -481,70 +266,27 @@ impl crate::CustomOp1 for QTensor {
             crate::bail!("input tensor has only one dimension {layout:?}")
         }
         let mut dst_shape = src_shape.dims().to_vec();
-        let last_k = dst_shape.pop().context("empty dst_shape")?;
+        let last_k = dst_shape.pop().unwrap();
         if last_k != k {
             crate::bail!("input tensor {layout:?} incompatible with {:?}", self.shape)
        }
         dst_shape.push(n);
         let dst_shape = Shape::from(dst_shape);
-        #[allow(clippy::infallible_destructuring_match)]
-        let self_storage = match &self.storage {
-            QStorage::Cpu(storage) => storage,
-            QStorage::Metal(_) | QStorage::Cuda(_) => crate::bail!("Invalid storage"),
-        };
-        let slice = storage.as_slice::<f32>()?;
-        let slice = &slice[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
+        let storage = storage.as_slice::<f32>()?;
+        let storage =
+            &storage[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
         let mut dst_storage = vec![0f32; dst_shape.elem_count()];
-        self_storage.matmul_t((dst_shape.elem_count() / n, k, n), slice, &mut dst_storage)?;
+        self.matmul_t(
+            (dst_shape.elem_count() / n, k, n),
+            storage,
+            &mut dst_storage,
+        )?;
         Ok((crate::CpuStorage::F32(dst_storage), dst_shape))
     }
 
-    fn metal_fwd(
-        &self,
-        storage: &crate::MetalStorage,
-        layout: &crate::Layout,
-    ) -> Result<(crate::MetalStorage, Shape)> {
-        let self_storage = match &self.storage {
-            QStorage::Metal(metal) => metal,
-            _ => unreachable!("Cannot call metal matmul on non metal QTensor"),
-        };
-        self_storage.fwd(&self.shape, storage, layout)
-    }
-
-    fn cuda_fwd(
-        &self,
-        storage: &crate::CudaStorage,
-        layout: &crate::Layout,
-    ) -> Result<(crate::CudaStorage, Shape)> {
-        let self_storage = match &self.storage {
-            QStorage::Cuda(cuda) => cuda,
-            _ => unreachable!("Cannot call cuda matmul on non cuda QTensor"),
-        };
-        self_storage.fwd(&self.shape, storage, layout)
-    }
 }
 
-impl crate::Module for QMatMul {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        match self {
-            Self::QTensor(t) => xs.apply_op1_no_bwd(t.as_ref()),
-            Self::Tensor(w) => {
-                let w = match *xs.dims() {
-                    [b1, b2, _, _] => w.broadcast_left((b1, b2))?.t()?,
-                    [bsize, _, _] => w.broadcast_left(bsize)?.t()?,
-                    _ => w.t()?,
-                };
-                xs.matmul(&w)
-            }
-            Self::TensorF16(w) => {
-                let in_dtype = xs.dtype();
-                let w = match *xs.dims() {
-                    [b1, b2, _, _] => w.broadcast_left((b1, b2))?.t()?,
-                    [bsize, _, _] => w.broadcast_left(bsize)?.t()?,
-                    _ => w.t()?,
-                };
-                xs.to_dtype(DType::F16)?.matmul(&w)?.to_dtype(in_dtype)
-            }
-        }
+impl QMatMul {
+    pub fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        xs.apply_op1_no_bwd(self.0.as_ref())
     }
 }
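Note: the `QMatMul` enum on the left side picks its execution strategy once, at construction time. A minimal sketch of how a caller would drive it (left-hand API; the helper name is hypothetical):

```rust
use candle_core::quantized::{QMatMul, QTensor};
use candle_core::{Module, Tensor};
use std::sync::Arc;

// from_arc resolves to one of three variants:
// - Tensor: F32/F16 weights, or CANDLE_DEQUANTIZE_ALL set, dequantized up front;
// - TensorF16: CANDLE_DEQUANTIZE_ALL_F16 set, kept as an f16 copy;
// - QTensor: the quantized kernel runs via apply_op1_no_bwd on each forward.
fn quantized_linear(w: Arc<QTensor>, xs: &Tensor) -> candle_core::Result<Tensor> {
    let mm = QMatMul::from_arc(w)?;
    mm.forward(xs)
}
```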
candle-core/src/quantized/neon.rs
@@ -12,14 +12,6 @@ use core::arch::arm::*;
 #[cfg(target_arch = "aarch64")]
 use core::arch::aarch64::*;
 
-#[inline(always)]
-unsafe fn vdotq_s32(a: int8x16_t, b: int8x16_t) -> int32x4_t {
-    // TODO: dotprod
-    let p0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
-    let p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-    vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))
-}
-
 #[inline(always)]
 pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
     let qk = QK8_0;
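Note: the deleted `vdotq_s32` helper emulates the AArch64 `SDOT` instruction with widening multiplies and pairwise adds. Its lane layout differs from a real `SDOT`, but every caller reduces with `vaddvq_s32`, so only the total matters; a scalar reference of that total:

```rust
// Scalar reference: the sum of all 16 pairwise i8 products, which is what
// vaddvq_f32-style reductions of vdotq_s32(a, b) ultimately accumulate.
fn dot_i8x16(a: [i8; 16], b: [i8; 16]) -> i32 {
    a.iter().zip(b.iter()).map(|(&x, &y)| x as i32 * y as i32).sum()
}
```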
@@ -27,39 +19,71 @@ pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
     if n % QK8_0 != 0 {
         crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
     }
+    if nb % 2 != 0 {
+        crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
+    }
+
     unsafe {
         let mut sumv0 = vdupq_n_f32(0.0f32);
-        for i in 0..nb {
+        let mut sumv1 = vdupq_n_f32(0.0f32);
+        for i in (0..nb).step_by(2) {
             let x0 = &xs[i];
+            let x1 = &xs[i + 1];
             let y0 = &ys[i];
+            let y1 = &ys[i + 1];
 
             let m4b = vdupq_n_u8(0x0F);
             let s8b = vdupq_n_s8(0x8);
 
             let v0_0 = vld1q_u8(x0.qs.as_ptr());
+            let v0_1 = vld1q_u8(x1.qs.as_ptr());
+
             // 4-bit -> 8-bit
             let v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
             let v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+            let v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
+            let v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
             // sub 8
             let v0_0ls = vsubq_s8(v0_0l, s8b);
             let v0_0hs = vsubq_s8(v0_0h, s8b);
+            let v0_1ls = vsubq_s8(v0_1l, s8b);
+            let v0_1hs = vsubq_s8(v0_1h, s8b);
+
             // load y
             let v1_0l = vld1q_s8(y0.qs.as_ptr());
             let v1_0h = vld1q_s8(y0.qs.as_ptr().add(16));
+            let v1_1l = vld1q_s8(y1.qs.as_ptr());
+            let v1_1h = vld1q_s8(y1.qs.as_ptr().add(16));
+
+            // TODO: Support dotprod when it's available outside of nightly.
+            let pl0l = vmull_s8(vget_low_s8(v0_0ls), vget_low_s8(v1_0l));
+            let pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
+            let ph0l = vmull_s8(vget_low_s8(v0_0hs), vget_low_s8(v1_0h));
+            let ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
+
+            let pl1l = vmull_s8(vget_low_s8(v0_1ls), vget_low_s8(v1_1l));
+            let pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
+            let ph1l = vmull_s8(vget_low_s8(v0_1hs), vget_low_s8(v1_1h));
+            let ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
+
+            let pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
+            let ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
+            let pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
+            let ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
 
-            let pl0 = vdotq_s32(v0_0ls, v1_0l);
-            let ph0 = vdotq_s32(v0_0hs, v1_0h);
             sumv0 = vmlaq_n_f32(
                 sumv0,
                 vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
                 x0.d.to_f32() * y0.d.to_f32(),
             );
+            sumv1 = vmlaq_n_f32(
+                sumv1,
+                vcvtq_f32_s32(vaddq_s32(pl1, ph1)),
+                x1.d.to_f32() * y1.d.to_f32(),
+            );
         }
-        Ok(vaddvq_f32(sumv0))
+        Ok(vaddvq_f32(sumv0) + vaddvq_f32(sumv1))
     }
 }
 
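Note: both sides of this hunk compute the same block dot product; the right side merely unrolls two blocks per iteration. As a sketch, a scalar reference for one Q4_0 x Q8_0 block pair (32 elements; low nibble `i` pairs with `y[i]`, high nibble `i` with `y[i + 16]`, both offset by 8, matching the NEON lane layout above):

```rust
// Scalar reference for one quantized block pair.
fn q4_0_q8_0_block_dot(d_x: f32, qs_x: &[u8; 16], d_y: f32, qs_y: &[i8; 32]) -> f32 {
    let mut sum = 0i32;
    for i in 0..16 {
        let lo = (qs_x[i] & 0x0F) as i32 - 8;
        let hi = (qs_x[i] >> 4) as i32 - 8;
        sum += lo * qs_y[i] as i32 + hi * qs_y[i + 16] as i32;
    }
    d_x * d_y * sum as f32
}
```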
@@ -70,58 +94,60 @@ pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) -> Result<f32> {
         crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
     }
     let nb = n / QK8_0;
+    if nb % 2 != 0 {
+        crate::bail!("vec_dot_q8_0_q8_0: {nb} is not even")
+    }
     unsafe {
         let mut sumv0 = vdupq_n_f32(0.0f32);
-        for i in 0..nb {
+        let mut sumv1 = vdupq_n_f32(0.0f32);
+        for i in (0..nb).step_by(2) {
             let x0 = &xs[i];
+            let x1 = &xs[i + 1];
             let y0 = &ys[i];
+            let y1 = &ys[i + 1];
 
             let x0_0 = vld1q_s8(x0.qs.as_ptr());
             let x0_1 = vld1q_s8(x0.qs.as_ptr().add(16));
+            let x1_0 = vld1q_s8(x1.qs.as_ptr());
+            let x1_1 = vld1q_s8(x1.qs.as_ptr().add(16));
+
             // load y
             let y0_0 = vld1q_s8(y0.qs.as_ptr());
             let y0_1 = vld1q_s8(y0.qs.as_ptr().add(16));
+            let y1_0 = vld1q_s8(y1.qs.as_ptr());
+            let y1_1 = vld1q_s8(y1.qs.as_ptr().add(16));
 
-            let p0 = vdotq_s32(x0_0, y0_0);
-            let p1 = vdotq_s32(x0_1, y0_1);
+            // TODO dotprod once this is the intrinsics are.
+            let p0_0 = vmull_s8(vget_low_s8(x0_0), vget_low_s8(y0_0));
+            let p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
+            let p0_2 = vmull_s8(vget_low_s8(x0_1), vget_low_s8(y0_1));
+            let p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
+
+            let p1_0 = vmull_s8(vget_low_s8(x1_0), vget_low_s8(y1_0));
+            let p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
+            let p1_2 = vmull_s8(vget_low_s8(x1_1), vget_low_s8(y1_1));
+            let p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
+
+            let p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
+            let p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
+            let p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
+            let p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
+
             sumv0 = vmlaq_n_f32(
                 sumv0,
                 vcvtq_f32_s32(vaddq_s32(p0, p1)),
                 x0.d.to_f32() * y0.d.to_f32(),
             );
+            sumv1 = vmlaq_n_f32(
+                sumv1,
+                vcvtq_f32_s32(vaddq_s32(p2, p3)),
+                x1.d.to_f32() * y1.d.to_f32(),
+            );
         }
-        Ok(vaddvq_f32(sumv0))
+        Ok(vaddvq_f32(sumv0) + vaddvq_f32(sumv1))
     }
 }
 
-#[inline(always)]
-pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Result<f32> {
-    let qk = QK_K;
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q8k_q8k: {n} is not divisible by {qk}")
-    }
-
-    let mut sumf = 0f32;
-    for (xs, ys) in xs.iter().zip(ys.iter()) {
-        unsafe {
-            let mut sum_i = vdupq_n_s32(0);
-            let scale = xs.d * ys.d;
-            let xs = xs.qs.as_ptr();
-            let ys = ys.qs.as_ptr();
-            for i in (0..QK_K).step_by(16) {
-                let xs = vld1q_s8(xs.add(i));
-                let ys = vld1q_s8(ys.add(i));
-                let xy = vdotq_s32(xs, ys);
-                sum_i = vaddq_s32(sum_i, xy)
-            }
-            sumf += vaddvq_s32(sum_i) as f32 * scale
-        }
-    }
-    Ok(sumf)
-}
-
 #[inline(always)]
 pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Result<f32> {
     if n % QK_K != 0 {
@@ -183,16 +209,30 @@ pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Result<f32> {
                 let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.2, m4b), q6h_2));
                 let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.3, m4b), q6h_3));
 
-                let p0 = vdotq_s32(q6bytes_0, q8bytes.0);
-                let p1 = vdotq_s32(q6bytes_1, q8bytes.1);
+                // TODO: dotprod
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q6bytes_0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q6bytes_1), vget_high_s8(q8bytes.1)),
+                );
                 let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p0) * scale0 + vaddvq_s32(p1) * scale1;
+                isum += vaddvq_s16(p0) as i32 * scale0 + vaddvq_s16(p1) as i32 * scale1;
                 scale = scale.add(2);
 
-                let p2 = vdotq_s32(q6bytes_2, q8bytes.2);
-                let p3 = vdotq_s32(q6bytes_3, q8bytes.3);
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_2), vget_low_s8(q8bytes.2)),
+                    vmull_s8(vget_high_s8(q6bytes_2), vget_high_s8(q8bytes.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_3), vget_low_s8(q8bytes.3)),
+                    vmull_s8(vget_high_s8(q6bytes_3), vget_high_s8(q8bytes.3)),
+                );
                 let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p2) * scale0 + vaddvq_s32(p3) * scale1;
+                isum += vaddvq_s16(p2) as i32 * scale0 + vaddvq_s16(p3) as i32 * scale1;
                 scale = scale.add(2);
 
                 let q8bytes = vld1q_s8_x4(q8);
@@ -212,16 +252,29 @@ pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Result<f32> {
                 let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.2, 4), q6h_2));
                 let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.3, 4), q6h_3));
 
-                let p0 = vdotq_s32(q6bytes_0, q8bytes.0);
-                let p1 = vdotq_s32(q6bytes_1, q8bytes.1);
+                // TODO: dotprod case.
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q6bytes_0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q6bytes_1), vget_high_s8(q8bytes.1)),
+                );
                 let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p0) * scale0 + vaddvq_s32(p1) * scale1;
+                isum += vaddvq_s16(p0) as i32 * scale0 + vaddvq_s16(p1) as i32 * scale1;
                 scale = scale.add(2);
 
-                let p2 = vdotq_s32(q6bytes_2, q8bytes.2);
-                let p3 = vdotq_s32(q6bytes_3, q8bytes.3);
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_2), vget_low_s8(q8bytes.2)),
+                    vmull_s8(vget_high_s8(q6bytes_2), vget_high_s8(q8bytes.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q6bytes_3), vget_low_s8(q8bytes.3)),
+                    vmull_s8(vget_high_s8(q6bytes_3), vget_high_s8(q8bytes.3)),
+                );
                 let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s32(p2) * scale0 + vaddvq_s32(p3) * scale1;
+                isum += vaddvq_s16(p2) as i32 * scale0 + vaddvq_s16(p3) as i32 * scale1;
                 scale = scale.add(2);
             }
             sum += d_all * y.d * ((isum - 32 * isum_mins) as f32);
@@ -298,14 +351,28 @@ pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Result<f32> {
                 let q5bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.0, 4), q5h_2));
                 let q5bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.1, 4), q5h_3));
 
-                let p0 = vdotq_s32(q5bytes_0, q8bytes.0);
-                let p1 = vdotq_s32(q5bytes_1, q8bytes.1);
-                sumi += vaddvq_s32(vaddq_s32(p0, p1)) * *scales as i32;
+                // TODO: dotprod
+
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q5bytes_0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q5bytes_1), vget_high_s8(q8bytes.1)),
+                );
+                sumi += vaddvq_s16(vaddq_s16(p0, p1)) as i32 * *scales as i32;
                 scales = scales.add(1);
 
-                let p2 = vdotq_s32(q5bytes_2, q8bytes.2);
-                let p3 = vdotq_s32(q5bytes_3, q8bytes.3);
-                sumi += vaddvq_s32(vaddq_s32(p2, p3)) * *scales as i32;
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_2), vget_low_s8(q8bytes.2)),
+                    vmull_s8(vget_high_s8(q5bytes_2), vget_high_s8(q8bytes.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q5bytes_3), vget_low_s8(q8bytes.3)),
+                    vmull_s8(vget_high_s8(q5bytes_3), vget_high_s8(q8bytes.3)),
+                );
+                sumi += vaddvq_s16(vaddq_s16(p2, p3)) as i32 * *scales as i32;
                 scales = scales.add(1);
             }
             sumf += d * sumi as f32 - dmin * sumi_mins as f32;
@@ -368,15 +435,22 @@ pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Result<f32> {
             for j in 0..QK_K / 64 {
                 let q4bits = vld1q_u8_x2(q4);
                 q4 = q4.add(32);
+                // TODO: dotprod
                 let q8bytes = vld1q_s8_x2(q8);
                 q8 = q8.add(32);
                 let q4bytes = int8x16x2_t(
                     vreinterpretq_s8_u8(vandq_u8(q4bits.0, m4b)),
                     vreinterpretq_s8_u8(vandq_u8(q4bits.1, m4b)),
                 );
-                let p0 = vdotq_s32(q4bytes.0, q8bytes.0);
-                let p1 = vdotq_s32(q4bytes.1, q8bytes.1);
-                sumi1 += vaddvq_s32(vaddq_s32(p0, p1)) * scales[2 * j] as i32;
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q4bytes.0), vget_high_s8(q8bytes.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q4bytes.1), vget_high_s8(q8bytes.1)),
+                );
+                sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) as i32 * scales[2 * j] as i32;
+
                 let q8bytes = vld1q_s8_x2(q8);
                 q8 = q8.add(32);
@@ -384,9 +458,15 @@ pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Result<f32> {
                     vreinterpretq_s8_u8(vshrq_n_u8(q4bits.0, 4)),
                     vreinterpretq_s8_u8(vshrq_n_u8(q4bits.1, 4)),
                 );
-                let p2 = vdotq_s32(q4bytes.0, q8bytes.0);
-                let p3 = vdotq_s32(q4bytes.1, q8bytes.1);
-                sumi2 += vaddvq_s32(vaddq_s32(p2, p3)) * scales[2 * j + 1] as i32;
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.0), vget_low_s8(q8bytes.0)),
+                    vmull_s8(vget_high_s8(q4bytes.0), vget_high_s8(q8bytes.0)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q4bytes.1), vget_low_s8(q8bytes.1)),
+                    vmull_s8(vget_high_s8(q4bytes.1), vget_high_s8(q8bytes.1)),
+                );
+                sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) as i32 * scales[2 * j + 1] as i32;
             }
             sumf += d * (sumi1 + sumi2) as f32;
         }
@@ -464,14 +544,27 @@ pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Result<f32> {
                     vreinterpretq_s8_u8(q3h_3),
                 );
 
-                let p0 = vdotq_s32(q3bytes_0, q8bytes_1.0);
-                let p1 = vdotq_s32(q3bytes_1, q8bytes_1.1);
-                let p2 = vdotq_s32(q3bytes_2, q8bytes_1.2);
-                let p3 = vdotq_s32(q3bytes_3, q8bytes_1.3);
-                isum += vaddvq_s32(p0) * *scale as i32
-                    + vaddvq_s32(p1) * *scale.add(1) as i32
-                    + vaddvq_s32(p2) * *scale.add(2) as i32
-                    + vaddvq_s32(p3) * *scale.add(3) as i32;
+                // TODO: dotprod
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_0), vget_low_s8(q8bytes_1.0)),
+                    vmull_s8(vget_high_s8(q3bytes_0), vget_high_s8(q8bytes_1.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_1), vget_low_s8(q8bytes_1.1)),
+                    vmull_s8(vget_high_s8(q3bytes_1), vget_high_s8(q8bytes_1.1)),
+                );
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_2), vget_low_s8(q8bytes_1.2)),
+                    vmull_s8(vget_high_s8(q3bytes_2), vget_high_s8(q8bytes_1.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_3), vget_low_s8(q8bytes_1.3)),
+                    vmull_s8(vget_high_s8(q3bytes_3), vget_high_s8(q8bytes_1.3)),
+                );
+                isum += vaddvq_s16(p0) as i32 * *scale as i32
+                    + vaddvq_s16(p1) as i32 * *scale.add(1) as i32
+                    + vaddvq_s16(p2) as i32 * *scale.add(2) as i32
+                    + vaddvq_s16(p3) as i32 * *scale.add(3) as i32;
                 scale = scale.add(4);
 
                 let q3h_0 = vbicq_u8(m2, qhbits.0);
@@ -496,14 +589,27 @@ pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Result<f32> {
                     vreinterpretq_s8_u8(q3h_3),
                 );
 
-                let p0 = vdotq_s32(q3bytes_0, q8bytes_2.0);
-                let p1 = vdotq_s32(q3bytes_1, q8bytes_2.1);
-                let p2 = vdotq_s32(q3bytes_2, q8bytes_2.2);
-                let p3 = vdotq_s32(q3bytes_3, q8bytes_2.3);
-                isum += vaddvq_s32(p0) * *scale as i32
-                    + vaddvq_s32(p1) * *scale.add(1) as i32
-                    + vaddvq_s32(p2) * *scale.add(2) as i32
-                    + vaddvq_s32(p3) * *scale.add(3) as i32;
+                // TODO: dotprod
+                let p0 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_0), vget_low_s8(q8bytes_2.0)),
+                    vmull_s8(vget_high_s8(q3bytes_0), vget_high_s8(q8bytes_2.0)),
+                );
+                let p1 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_1), vget_low_s8(q8bytes_2.1)),
+                    vmull_s8(vget_high_s8(q3bytes_1), vget_high_s8(q8bytes_2.1)),
+                );
+                let p2 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_2), vget_low_s8(q8bytes_2.2)),
+                    vmull_s8(vget_high_s8(q3bytes_2), vget_high_s8(q8bytes_2.2)),
+                );
+                let p3 = vaddq_s16(
+                    vmull_s8(vget_low_s8(q3bytes_3), vget_low_s8(q8bytes_2.3)),
+                    vmull_s8(vget_high_s8(q3bytes_3), vget_high_s8(q8bytes_2.3)),
+                );
+                isum += vaddvq_s16(p0) as i32 * *scale as i32
+                    + vaddvq_s16(p1) as i32 * *scale.add(1) as i32
+                    + vaddvq_s16(p2) as i32 * *scale.add(2) as i32
+                    + vaddvq_s16(p3) as i32 * *scale.add(3) as i32;
                 scale = scale.add(4);
 
                 if j == 0 {
@@ -561,6 +667,7 @@ pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]) -> Result<f32> {
             let mut is = 0usize;
 
             // TODO: dotprod
+
             for _j in 0..QK_K / 128 {
                 let q2bits = vld1q_u8_x2(q2);
                 q2 = q2.add(32);
@@ -607,7 +714,14 @@ unsafe fn multiply_accum_with_scale(
     q2bytes: int8x16x2_t,
     q8bytes: int8x16x2_t,
 ) -> i32 {
-    let p1 = vdotq_s32(q2bytes.0, q8bytes.0);
-    let p2 = vdotq_s32(q2bytes.1, q8bytes.1);
-    vaddvq_s32(p1) * aux[is + index] as i32 + vaddvq_s32(p2) * aux[is + 1 + index] as i32
+    let p1 = vaddq_s16(
+        vmull_s8(vget_low_s8(q2bytes.0), vget_low_s8(q8bytes.0)),
+        vmull_s8(vget_high_s8(q2bytes.0), vget_high_s8(q8bytes.0)),
+    );
+    let p2 = vaddq_s16(
+        vmull_s8(vget_low_s8(q2bytes.1), vget_low_s8(q8bytes.1)),
+        vmull_s8(vget_high_s8(q2bytes.1), vget_high_s8(q8bytes.1)),
+    );
+    vaddvq_s16(p1) as i32 * aux[is + index] as i32
+        + vaddvq_s16(p2) as i32 * aux[is + 1 + index] as i32
 }
candle-core/src/quantized/simd128.rs (file removed on the einsum-cus side)
@@ -1,419 +0,0 @@
-use super::k_quants::{BlockQ2K, BlockQ4K, BlockQ4_0, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K};
-use crate::Result;
-use byteorder::{ByteOrder, LittleEndian};
-use half::f16;
-
-use core::arch::wasm32::*;
-
-#[inline(always)]
-pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
-    let qk = QK8_0;
-    if n % QK8_0 != 0 {
-        crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
-    }
-    unsafe {
-        let mut acc = f32x4_splat(0.0f32);
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let x1234 = v128_load(x.qs.as_ptr() as *const v128);
-            let x12 = v128_and(x1234, u8x16_splat(0x0F));
-            let x12 = i8x16_sub(x12, i8x16_splat(8));
-            let x34 = u8x16_shr(x1234, 4);
-            let x34 = i8x16_sub(x34, i8x16_splat(8));
-
-            let x1 = i16x8_extend_low_i8x16(x12);
-            let y1 = i16x8_load_extend_i8x8(y.qs.as_ptr());
-            let sum_xy = i32x4_dot_i16x8(x1, y1);
-
-            let x2 = i16x8_extend_high_i8x16(x12);
-            let y2 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(8));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x2, y2));
-
-            let x3 = i16x8_extend_low_i8x16(x34);
-            let y3 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(16));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x3, y3));
-
-            let x4 = i16x8_extend_high_i8x16(x34);
-            let y4 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(24));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x4, y4));
-
-            let sum_xy = f32x4_convert_i32x4(sum_xy);
-
-            // f32x4_relaxed_madd is nightly only.
-            let d = f32x4_splat(f16::to_f32(x.d) * f16::to_f32(y.d));
-            let scaled = f32x4_mul(sum_xy, d);
-            acc = f32x4_add(acc, scaled)
-        }
-        let res = f32x4_extract_lane::<0>(acc)
-            + f32x4_extract_lane::<1>(acc)
-            + f32x4_extract_lane::<2>(acc)
-            + f32x4_extract_lane::<3>(acc);
-        Ok(res)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) -> Result<f32> {
-    let qk = QK8_0;
-    if n % QK8_0 != 0 {
-        crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
-    }
-    unsafe {
-        let mut acc = f32x4_splat(0.0f32);
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let x1 = i16x8_load_extend_i8x8(x.qs.as_ptr());
-            let y1 = i16x8_load_extend_i8x8(y.qs.as_ptr());
-            let sum_xy = i32x4_dot_i16x8(x1, y1);
-
-            let x2 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(8));
-            let y2 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(8));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x2, y2));
-
-            let x3 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(16));
-            let y3 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(16));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x3, y3));
-
-            let x4 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(24));
-            let y4 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(24));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x4, y4));
-
-            let sum_xy = f32x4_convert_i32x4(sum_xy);
-
-            // f32x4_relaxed_madd is nightly only.
-            let d = f32x4_splat(f16::to_f32(x.d) * f16::to_f32(y.d));
-            let scaled = f32x4_mul(sum_xy, d);
-            acc = f32x4_add(acc, scaled)
-        }
-        let res = f32x4_extract_lane::<0>(acc)
-            + f32x4_extract_lane::<1>(acc)
-            + f32x4_extract_lane::<2>(acc)
-            + f32x4_extract_lane::<3>(acc);
-        Ok(res)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q2k_q8k: {n} is not divisible by {QK_K}")
-    }
-    unsafe {
-        let mut sumf = f32x4_splat(0f32);
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let mut q2: &[_] = &x.qs;
-            let mut q8: &[_] = &y.qs;
-            let sc = &x.scales;
-
-            let mut summs = i32x4_splat(0);
-            for i in (0..(QK_K / 16)).step_by(4) {
-                let bsums = i32x4_load_extend_i16x4(y.bsums.as_ptr().add(i));
-                let scales = i32x4_shr(
-                    i32x4(
-                        sc[i] as i32,
-                        sc[i + 1] as i32,
-                        sc[i + 2] as i32,
-                        sc[i + 3] as i32,
-                    ),
-                    4,
-                );
-                summs = i32x4_add(summs, i32x4_mul(bsums, scales))
-            }
-            let summs = f32x4_convert_i32x4(summs);
-
-            let dall = y.d * x.d.to_f32();
-            let dmin = y.d * x.dmin.to_f32();
-
-            let mut isum = i32x4_splat(0);
-            let mut is = 0;
-            for _ in 0..(QK_K / 128) {
-                let mut shift = 0;
-                for _ in 0..4 {
-                    let d = (sc[is] & 0xF) as i32;
-                    is += 1;
-                    let mut isuml = i16x8_splat(0);
-                    for l in (0..16).step_by(8) {
-                        let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(l));
-                        let q2 = i16x8_load_extend_u8x8(q2.as_ptr().add(l));
-                        let q2 = v128_and(i16x8_shr(q2, shift), i16x8_splat(3));
-                        isuml = i16x8_add(isuml, i16x8_mul(q2, q8))
-                    }
-                    let dd = i32x4_splat(d);
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_low_i16x8(isuml), dd));
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_high_i16x8(isuml), dd));
-                    let d = (sc[is] & 0xF) as i32;
-                    is += 1;
-                    let mut isuml = i16x8_splat(0);
-                    for l in (16..32).step_by(8) {
-                        let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(l));
-                        let q2 = i16x8_load_extend_u8x8(q2.as_ptr().add(l));
-                        let q2 = v128_and(i16x8_shr(q2, shift), i16x8_splat(3));
-                        isuml = i16x8_add(isuml, i16x8_mul(q2, q8))
-                    }
-                    let dd = i32x4_splat(d);
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_low_i16x8(isuml), dd));
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_high_i16x8(isuml), dd));
-                    shift += 2;
-                    // adjust the indexing
-                    q8 = &q8[32..];
-                }
-                // adjust the indexing
-                q2 = &q2[32..];
-            }
-            let isum = f32x4_convert_i32x4(isum);
-            sumf = f32x4_add(
-                sumf,
-                f32x4_sub(
-                    f32x4_mul(isum, f32x4_splat(dall)),
-                    f32x4_mul(summs, f32x4_splat(dmin)),
-                ),
-            );
-        }
-        let sumf = f32x4_extract_lane::<0>(sumf)
-            + f32x4_extract_lane::<1>(sumf)
-            + f32x4_extract_lane::<2>(sumf)
-            + f32x4_extract_lane::<3>(sumf);
-        Ok(sumf)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q4k_q8k: {n} is not divisible by {QK_K}")
-    }
-
-    const KMASK1: u32 = 0x3f3f3f3f;
-    const KMASK2: u32 = 0x0f0f0f0f;
-    const KMASK3: u32 = 0x03030303;
-
-    let mut utmp: [u32; 4] = [0; 4];
-    let mut scales: [u8; 8] = [0; 8];
-    let mut mins: [u8; 8] = [0; 8];
-
-    let mut aux8: [u8; QK_K] = [0; QK_K];
-    let mut sums = f32x4_splat(0f32);
-    unsafe {
-        for (y, x) in ys.iter().zip(xs.iter()) {
-            let q4 = &x.qs;
-            let q8 = &y.qs;
-
-            for j in 0..QK_K / 64 {
-                let q4_1 = v128_load(q4.as_ptr().add(32 * j) as *const v128);
-                let q4_2 = v128_load(q4.as_ptr().add(32 * j + 16) as *const v128);
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j) as *mut v128,
-                    v128_and(q4_1, u8x16_splat(0x0F)),
-                );
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j + 16) as *mut v128,
-                    v128_and(q4_2, u8x16_splat(0x0F)),
-                );
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j + 32) as *mut v128,
-                    u8x16_shr(q4_1, 4),
-                );
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j + 48) as *mut v128,
-                    u8x16_shr(q4_2, 4),
-                );
-            }
-
-            LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
-
-            utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4);
-            let uaux = utmp[1] & KMASK1;
-            utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
-            utmp[2] = uaux;
-            utmp[0] &= KMASK1;
-
-            //extract scales and mins
-            LittleEndian::write_u32_into(&utmp[0..2], &mut scales);
-            LittleEndian::write_u32_into(&utmp[2..4], &mut mins);
-
-            let mut sumi = i32x4_splat(0);
-            for j in (0..QK_K / 16).step_by(4) {
-                let bsums = i32x4_load_extend_i16x4(y.bsums.as_ptr().add(j));
-                let (m1, m2) = (mins[j / 2] as i32, mins[j / 2 + 1] as i32);
-                let mins = i32x4(m1, m1, m2, m2);
-                sumi = i32x4_add(sumi, i32x4_mul(bsums, mins));
-            }
-
-            let mut aux32 = i32x4_splat(0i32);
-            for (scale_i, scale) in scales.iter().enumerate() {
-                let scale = i32x4_splat(*scale as i32);
-                for j in 0..4 {
-                    let i = 32 * scale_i + 8 * j;
-                    let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(i));
-                    let aux8 = i16x8_load_extend_u8x8(aux8.as_ptr().add(i));
-                    let aux16 = i16x8_mul(q8, aux8);
-                    aux32 = i32x4_add(aux32, i32x4_mul(scale, i32x4_extend_low_i16x8(aux16)));
-                    aux32 = i32x4_add(aux32, i32x4_mul(scale, i32x4_extend_high_i16x8(aux16)));
-                }
-            }
-            let aux32 = f32x4_convert_i32x4(aux32);
-            let d = f32x4_splat(x.d.to_f32() * y.d);
-            sums = f32x4_add(sums, f32x4_mul(aux32, d));
-            let dmin = x.dmin.to_f32() * y.d;
-            let dmin = f32x4_splat(dmin);
-            let sumi = f32x4_convert_i32x4(sumi);
-            sums = f32x4_sub(sums, f32x4_mul(sumi, dmin));
-        }
-        let sums = f32x4_extract_lane::<0>(sums)
-            + f32x4_extract_lane::<1>(sums)
-            + f32x4_extract_lane::<2>(sums)
-            + f32x4_extract_lane::<3>(sums);
-        Ok(sums)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q6k_q8k: {n} is not divisible by {QK_K}")
-    }
-
-    let mut aux8 = [0i8; QK_K];
-    unsafe {
-        let mut sums = f32x4_splat(0f32);
-
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let q4 = &x.ql;
-            let qh = &x.qh;
-            let q8 = &y.qs;
-            let mut aux32 = f32x4_splat(0f32);
-
-            for j in (0..QK_K).step_by(128) {
-                let aux8 = aux8.as_mut_ptr().add(j);
-                let q4 = &q4.as_ptr().add(j / 2);
-                let qh = &qh.as_ptr().add(j / 4);
-                for l in (0..32).step_by(16) {
-                    // aux8[l] = (((q4[l] & 0xF) | ((qh[l] & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        v128_and(v128_load(q4.add(l) as *const v128), u8x16_splat(0xF)),
-                        u8x16_shl(
-                            v128_and(v128_load(qh.add(l) as *const v128), u8x16_splat(3)),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-
-                    // aux8[l + 32] =
-                    //     (((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        v128_and(v128_load(q4.add(l + 32) as *const v128), u8x16_splat(0xF)),
-                        u8x16_shl(
-                            v128_and(
-                                u8x16_shr(v128_load(qh.add(l) as *const v128), 2),
-                                u8x16_splat(3),
-                            ),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l + 32) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-
-                    // aux8[l + 64] = (((q4[l] >> 4) | (((qh[l] >> 4) & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        u8x16_shr(v128_load(q4.add(l) as *const v128), 4),
-                        u8x16_shl(
-                            v128_and(
-                                u8x16_shr(v128_load(qh.add(l) as *const v128), 4),
-                                u8x16_splat(3),
-                            ),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l + 64) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-
-                    // aux8[l + 96] =
-                    //     (((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        u8x16_shr(v128_load(q4.add(l + 32) as *const v128), 4),
-                        u8x16_shl(
-                            v128_and(
-                                u8x16_shr(v128_load(qh.add(l) as *const v128), 6),
-                                u8x16_splat(3),
-                            ),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l + 96) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-                }
-            }
-
-            for (j, &scale) in x.scales.iter().enumerate() {
-                let scale = f32x4_splat(scale as f32);
-                for offset in [0, 8] {
-                    let aux16 = i16x8_mul(
-                        i16x8_load_extend_i8x8(q8.as_ptr().add(16 * j + offset)),
-                        i16x8_load_extend_i8x8(aux8.as_ptr().add(16 * j + offset)),
-                    );
-                    aux32 = f32x4_add(
-                        aux32,
-                        f32x4_mul(f32x4_convert_i32x4(i32x4_extend_low_i16x8(aux16)), scale),
-                    );
-                    aux32 = f32x4_add(
-                        aux32,
-                        f32x4_mul(f32x4_convert_i32x4(i32x4_extend_high_i16x8(aux16)), scale),
-                    );
-                }
-            }
-
-            let d = f32x4_splat(x.d.to_f32() * y.d);
-            sums = f32x4_add(sums, f32x4_mul(aux32, d));
-        }
-        let sums = f32x4_extract_lane::<0>(sums)
-            + f32x4_extract_lane::<1>(sums)
-            + f32x4_extract_lane::<2>(sums)
-            + f32x4_extract_lane::<3>(sums);
-        Ok(sums)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Result<f32> {
-    let qk = QK_K;
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q8k_q8k: {n} is not divisible by {qk}")
-    }
-
-    unsafe {
-        let mut acc = f32x4_splat(0.0f32);
-        for (xs, ys) in xs.iter().zip(ys.iter()) {
-            let x_qs = xs.qs.as_ptr();
-            let y_qs = ys.qs.as_ptr();
-            let mut sumi = i32x4_splat(0);
-            for j in (0..QK_K).step_by(8) {
-                let xs = i16x8_load_extend_i8x8(x_qs.add(j));
-                let ys = i16x8_load_extend_i8x8(y_qs.add(j));
-                let sum_xy = i32x4_dot_i16x8(xs, ys);
-                sumi = i32x4_add(sumi, sum_xy)
-            }
-            let d = f32x4_splat(xs.d * ys.d);
-            acc = f32x4_add(acc, f32x4_mul(f32x4_convert_i32x4(sumi), d))
-        }
-        let res = f32x4_extract_lane::<0>(acc)
-            + f32x4_extract_lane::<1>(acc)
-            + f32x4_extract_lane::<2>(acc)
-            + f32x4_extract_lane::<3>(acc);
-        Ok(res)
-    }
-}
candle-core/src/quantized/utils.rs
@@ -17,7 +17,7 @@ pub(super) fn group_for_quantization<'a, 'b, T: super::k_quants::GgmlType>(
     let expected_blocks = xs.len() / block_size;
     let actual_blocks = ys.len();
 
-    // Validate that the input is the right size
+    //validate that the input is the right size
     if expected_blocks != actual_blocks {
         crate::bail!("quantize {dtype:?}: expected {expected_blocks} blocks but only {actual_blocks} were provided!")
     }
@@ -37,12 +37,12 @@ pub(super) fn group_for_dequantization<'a, 'b, T: super::k_quants::GgmlType>(
 
     let actual_output_len = ys.len();
     let expected_output_len = xs.len() * block_size;
-    // Validate that the output is the right size
+    //validate that the output is the right size
     if expected_output_len != actual_output_len {
         crate::bail!("dequantize {dtype:?}: ys (len = {actual_output_len}) does not match the expected length of {expected_output_len}!")
     }
 
-    // Zip the blocks and outputs together
+    //zip the blocks and outputs together
     Ok(xs.iter().zip(ys.chunks_exact_mut(block_size)).collect())
 }
 
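Note: both grouping helpers in utils.rs reduce to the same zip-and-chunk pattern; a standalone sketch of the shape invariant they enforce (names are illustrative):

```rust
// Each quantized block maps to one block_size-wide window of the f32 buffer.
fn group_blocks<'a, T>(
    xs: &'a [T],
    ys: &'a mut [f32],
    block_size: usize,
) -> Vec<(&'a T, &'a mut [f32])> {
    assert_eq!(ys.len(), xs.len() * block_size);
    xs.iter().zip(ys.chunks_exact_mut(block_size)).collect()
}
```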
candle-core/src/safetensors.rs
@@ -1,14 +1,3 @@
-//! Module to load `safetensor` files into CPU/GPU memory.
-//!
-//! There are multiple ways to load tensors from safetensor files:
-//! - `load` function for loading directly into memory and returning a HashMap of tensors
-//! - `MmapedSafetensors` for memory mapping files and avoiding full allocation
-//! - `SliceSafetensors` for working with in-memory buffers
-//! - `BufferedSafetensors` for owning a buffer of data
-//!
-//! Tensors can also be serialized to safetensor format using the `save` function or
-//! `Tensor::save_safetensors` method.
-//!
 use crate::{DType, Device, Error, Result, Tensor, WithDType};
 use safetensors::tensor as st;
 use safetensors::tensor::SafeTensors;
@@ -89,7 +78,11 @@ impl st::View for &Tensor {
 }
 
 impl Tensor {
-    pub fn save_safetensors<P: AsRef<Path>>(&self, name: &str, filename: P) -> Result<()> {
+    pub fn save_safetensors<P: AsRef<std::path::Path>>(
+        &self,
+        name: &str,
+        filename: P,
+    ) -> Result<()> {
         let data = [(name, self.clone())];
         Ok(st::serialize_to_file(data, &None, filename.as_ref())?)
     }
@ -182,7 +175,7 @@ pub trait Load {
|
|||||||
fn load(&self, device: &Device) -> Result<Tensor>;
|
fn load(&self, device: &Device) -> Result<Tensor>;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Load for st::TensorView<'_> {
|
impl<'a> Load for st::TensorView<'a> {
|
||||||
fn load(&self, device: &Device) -> Result<Tensor> {
|
fn load(&self, device: &Device) -> Result<Tensor> {
|
||||||
convert(self, device)
|
convert(self, device)
|
||||||
}
|
}
|
||||||
@ -262,158 +255,6 @@ pub fn save<K: AsRef<str> + Ord + std::fmt::Display, P: AsRef<Path>>(
|
|||||||
Ok(st::serialize_to_file(tensors, &None, filename.as_ref())?)
|
Ok(st::serialize_to_file(tensors, &None, filename.as_ref())?)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(yoke::Yokeable)]
struct SafeTensors_<'a>(SafeTensors<'a>);

pub struct MmapedSafetensors {
    safetensors: Vec<yoke::Yoke<SafeTensors_<'static>, memmap2::Mmap>>,
    routing: Option<HashMap<String, usize>>,
}

impl MmapedSafetensors {
    /// Creates a wrapper around a memory mapped file and deserializes the safetensors header.
    ///
    /// # Safety
    ///
    /// The unsafe is inherited from [`memmap2::MmapOptions`].
    pub unsafe fn new<P: AsRef<Path>>(p: P) -> Result<Self> {
        let p = p.as_ref();
        let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
        let file = memmap2::MmapOptions::new()
            .map(&file)
            .map_err(|e| Error::from(e).with_path(p))?;
        let safetensors = yoke::Yoke::<SafeTensors_<'static>, memmap2::Mmap>::try_attach_to_cart(
            file,
            |data: &[u8]| {
                let st = safetensors::SafeTensors::deserialize(data)
                    .map_err(|e| Error::from(e).with_path(p))?;
                Ok::<_, Error>(SafeTensors_(st))
            },
        )?;
        Ok(Self {
            safetensors: vec![safetensors],
            routing: None,
        })
    }

    /// Creates a wrapper around multiple memory mapped files and deserializes the safetensors headers.
    ///
    /// If a tensor name appears in multiple files, the last entry is returned.
    ///
    /// # Safety
    ///
    /// The unsafe is inherited from [`memmap2::MmapOptions`].
    pub unsafe fn multi<P: AsRef<Path>>(paths: &[P]) -> Result<Self> {
        let mut routing = HashMap::new();
        let mut safetensors = vec![];
        for (index, p) in paths.iter().enumerate() {
            let p = p.as_ref();
            let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
            let file = memmap2::MmapOptions::new()
                .map(&file)
                .map_err(|e| Error::from(e).with_path(p))?;
            let data = yoke::Yoke::<SafeTensors_<'static>, memmap2::Mmap>::try_attach_to_cart(
                file,
                |data: &[u8]| {
                    let st = safetensors::SafeTensors::deserialize(data)
                        .map_err(|e| Error::from(e).with_path(p))?;
                    Ok::<_, Error>(SafeTensors_(st))
                },
            )?;
            for k in data.get().0.names() {
                routing.insert(k.to_string(), index);
            }
            safetensors.push(data)
        }
        Ok(Self {
            safetensors,
            routing: Some(routing),
        })
    }

    pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
        self.get(name)?.load(dev)
    }

    pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
        let mut tensors = vec![];
        for safetensors in self.safetensors.iter() {
            tensors.push(safetensors.get().0.tensors())
        }
        tensors.into_iter().flatten().collect()
    }

    pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
        let index = match &self.routing {
            None => 0,
            Some(routing) => {
                let index = routing.get(name).ok_or_else(|| {
                    Error::CannotFindTensor {
                        path: name.to_string(),
                    }
                    .bt()
                })?;
                *index
            }
        };
        Ok(self.safetensors[index].get().0.tensor(name)?)
    }
}

pub struct SliceSafetensors<'a> {
    safetensors: SafeTensors<'a>,
}

impl<'a> SliceSafetensors<'a> {
    /// Creates a wrapper around a binary buffer and deserializes the safetensors header.
    pub fn new(buffer: &'a [u8]) -> Result<Self> {
        let safetensors = safetensors::SafeTensors::deserialize(buffer)?;
        Ok(Self { safetensors })
    }

    pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
        self.safetensors.tensor(name)?.load(dev)
    }

    pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
        self.safetensors.tensors()
    }

    pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
        Ok(self.safetensors.tensor(name)?)
    }
}

pub struct BufferedSafetensors {
    safetensors: yoke::Yoke<SafeTensors_<'static>, Vec<u8>>,
}

impl BufferedSafetensors {
    /// Creates a wrapper around a binary buffer and deserializes the safetensors header.
    pub fn new(buffer: Vec<u8>) -> Result<Self> {
        let safetensors = yoke::Yoke::<SafeTensors_<'static>, Vec<u8>>::try_attach_to_cart(
            buffer,
            |data: &[u8]| {
                let st = safetensors::SafeTensors::deserialize(data)?;
                Ok::<_, Error>(SafeTensors_(st))
            },
        )?;
        Ok(Self { safetensors })
    }

    pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
        self.get(name)?.load(dev)
    }

    pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
        self.safetensors.get().0.tensors()
    }

    pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
        Ok(self.safetensors.get().0.tensor(name)?)
    }
}

pub struct MmapedFile {
    path: std::path::PathBuf,
    inner: memmap2::Mmap,
@@ -426,7 +267,7 @@ impl MmapedFile {
     /// # Safety
     ///
     /// The unsafe is inherited from [`memmap2::MmapOptions`].
-    pub unsafe fn new<P: AsRef<Path>>(p: P) -> Result<Self> {
+    pub unsafe fn new<P: AsRef<std::path::Path>>(p: P) -> Result<Self> {
         let p = p.as_ref();
         let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
         let inner = memmap2::MmapOptions::new()
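A short usage sketch for the mmap-based loader removed above; the file and tensor names are placeholders:

// Memory-map a checkpoint and load a single tensor onto the CPU.
use candle_core::{safetensors::MmapedSafetensors, Device, Result};

fn load_one_weight() -> Result<()> {
    // Safety: inherited from memmap2, the file must not be modified while mapped.
    let st = unsafe { MmapedSafetensors::new("model.safetensors")? };
    let w = st.load("weight", &Device::Cpu)?;
    println!("{:?}", w.shape());
    Ok(())
}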

@@ -1,5 +1,3 @@
-//! TensorScalar Enum and Trait
-//!
 use crate::{Result, Tensor, WithDType};

 pub enum TensorScalar {
@@ -43,22 +43,43 @@ impl From<usize> for Shape {
     }
 }

-macro_rules! impl_from_tuple {
-    ($tuple:ty, $($index:tt),+) => {
-        impl From<$tuple> for Shape {
-            fn from(d: $tuple) -> Self {
-                Self(vec![$(d.$index,)+])
-            }
-        }
-    }
-}
-
-impl_from_tuple!((usize,), 0);
-impl_from_tuple!((usize, usize), 0, 1);
-impl_from_tuple!((usize, usize, usize), 0, 1, 2);
-impl_from_tuple!((usize, usize, usize, usize), 0, 1, 2, 3);
-impl_from_tuple!((usize, usize, usize, usize, usize), 0, 1, 2, 3, 4);
-impl_from_tuple!((usize, usize, usize, usize, usize, usize), 0, 1, 2, 3, 4, 5);
+impl From<(usize,)> for Shape {
+    fn from(d1: (usize,)) -> Self {
+        Self(vec![d1.0])
+    }
+}
+
+impl From<(usize, usize)> for Shape {
+    fn from(d12: (usize, usize)) -> Self {
+        Self(vec![d12.0, d12.1])
+    }
+}
+
+impl From<(usize, usize, usize)> for Shape {
+    fn from(d123: (usize, usize, usize)) -> Self {
+        Self(vec![d123.0, d123.1, d123.2])
+    }
+}
+
+impl From<(usize, usize, usize, usize)> for Shape {
+    fn from(d1234: (usize, usize, usize, usize)) -> Self {
+        Self(vec![d1234.0, d1234.1, d1234.2, d1234.3])
+    }
+}
+
+impl From<(usize, usize, usize, usize, usize)> for Shape {
+    fn from(d12345: (usize, usize, usize, usize, usize)) -> Self {
+        Self(vec![d12345.0, d12345.1, d12345.2, d12345.3, d12345.4])
+    }
+}
+
+impl From<(usize, usize, usize, usize, usize, usize)> for Shape {
+    fn from(d123456: (usize, usize, usize, usize, usize, usize)) -> Self {
+        Self(vec![
+            d123456.0, d123456.1, d123456.2, d123456.3, d123456.4, d123456.5,
+        ])
+    }
+}

 impl From<Vec<usize>> for Shape {
     fn from(dims: Vec<usize>) -> Self {
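For reference, an invocation such as `impl_from_tuple!((usize, usize), 0, 1);` expands to roughly the following, so both sides of this hunk define the same conversions:

impl From<(usize, usize)> for Shape {
    fn from(d: (usize, usize)) -> Self {
        // d.0 and d.1 are the tuple fields selected by the `0, 1` indices.
        Self(vec![d.0, d.1])
    }
}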
@@ -121,12 +142,6 @@ impl Shape {
         &self.0
     }

-    /// The dimension size for a specified dimension index.
-    pub fn dim<D: Dim>(&self, dim: D) -> Result<usize> {
-        let dim = dim.to_index(self, "dim")?;
-        Ok(self.dims()[dim])
-    }
-
     /// The total number of elements, this is the product of all dimension sizes.
     pub fn elem_count(&self) -> usize {
         self.0.iter().product()
@@ -156,7 +171,7 @@ impl Shape {
         }
         let mut acc = 1;
         for (&stride, &dim) in stride.iter().zip(self.0.iter()).rev() {
-            if dim > 1 && stride != acc {
+            if stride != acc {
                 return false;
             }
             acc *= dim;
@@ -171,7 +186,7 @@ impl Shape {
         }
         let mut acc = 1;
         for (&stride, &dim) in stride.iter().zip(self.0.iter()) {
-            if dim > 1 && stride != acc {
+            if stride != acc {
                 return false;
             }
             acc *= dim;
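The `dim > 1 &&` guard on the removed side treats size-1 dimensions as contiguous regardless of their stride, while the plain check requires every stride to match. A standalone sketch of the guarded version:

// Shape (2, 1, 3) with strides (3, 7, 1): accepted with the guard, since the
// size-1 middle dimension is skipped; rejected by the plain `stride != acc` check.
fn is_contiguous(dims: &[usize], stride: &[usize]) -> bool {
    let mut acc = 1;
    for (&s, &d) in stride.iter().zip(dims.iter()).rev() {
        if d > 1 && s != acc {
            return false;
        }
        acc *= d;
    }
    true
}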
@@ -188,7 +203,7 @@ impl Shape {

     /// Check whether the two shapes are compatible for broadcast, and if it is the case return the
     /// broadcasted shape. This is to be used for binary pointwise ops.
-    pub fn broadcast_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<Shape> {
+    pub(crate) fn broadcast_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<Shape> {
         let lhs = self;
         let lhs_dims = lhs.dims();
         let rhs_dims = rhs.dims();
@@ -289,7 +304,6 @@ impl Dim for usize {
 pub enum D {
     Minus1,
     Minus2,
-    Minus(usize),
 }

 impl D {
@@ -297,7 +311,6 @@ impl D {
         let dim = match self {
             Self::Minus1 => -1,
             Self::Minus2 => -2,
-            Self::Minus(u) => -(*u as i32),
         };
         Error::DimOutOfRange {
             shape: shape.clone(),
@@ -314,7 +327,6 @@ impl Dim for D {
         match self {
             Self::Minus1 if rank >= 1 => Ok(rank - 1),
             Self::Minus2 if rank >= 2 => Ok(rank - 2),
-            Self::Minus(u) if *u > 0 && rank >= *u => Ok(rank - *u),
             _ => Err(self.out_of_range(shape, op)),
         }
     }
@@ -324,7 +336,6 @@ impl Dim for D {
         match self {
             Self::Minus1 => Ok(rank),
             Self::Minus2 if rank >= 1 => Ok(rank - 1),
-            Self::Minus(u) if *u > 0 && rank + 1 >= *u => Ok(rank + 1 - *u),
             _ => Err(self.out_of_range(shape, op)),
         }
     }
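`D::Minus(u)`, present only on the left side, generalizes `Minus1`/`Minus2` to the u-th dimension from the end. A usage sketch, assuming `Tensor::dim` accepts any `Dim` implementation as elsewhere in this crate:

use candle_core::{Device, Result, Tensor, D};

fn demo_dim() -> Result<()> {
    let t = Tensor::zeros((2, 3, 4), candle_core::DType::F32, &Device::Cpu)?;
    // D::Minus1 is the last dimension (size 4); D::Minus(3) is the first (size 2).
    assert_eq!(t.dim(D::Minus1)?, 4);
    assert_eq!(t.dim(D::Minus(3))?, 2);
    Ok(())
}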
@@ -433,18 +444,6 @@ impl<D1: Dim, D2: Dim, D3: Dim, D4: Dim, D5: Dim> Dims for (D1, D2, D3, D4, D5)
     }
 }

-impl<D1: Dim, D2: Dim, D3: Dim, D4: Dim, D5: Dim, D6: Dim> Dims for (D1, D2, D3, D4, D5, D6) {
-    fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
-        let d0 = self.0.to_index(shape, op)?;
-        let d1 = self.1.to_index(shape, op)?;
-        let d2 = self.2.to_index(shape, op)?;
-        let d3 = self.3.to_index(shape, op)?;
-        let d4 = self.4.to_index(shape, op)?;
-        let d5 = self.5.to_index(shape, op)?;
-        Ok(vec![d0, d1, d2, d3, d4, d5])
-    }
-}
-
 extract_dims!(dims0, 0, |_: &[usize]| (), ());
 extract_dims!(dims1, 1, |d: &[usize]| d[0], usize);
 extract_dims!(dims2, 2, |d: &[usize]| (d[0], d[1]), (usize, usize));
@@ -467,6 +466,23 @@ extract_dims!(
     (usize, usize, usize, usize, usize)
 );

+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn stride() {
+        let shape = Shape::from(());
+        assert_eq!(shape.stride_contiguous(), Vec::<usize>::new());
+        let shape = Shape::from(42);
+        assert_eq!(shape.stride_contiguous(), [1]);
+        let shape = Shape::from((42, 1337));
+        assert_eq!(shape.stride_contiguous(), [1337, 1]);
+        let shape = Shape::from((299, 792, 458));
+        assert_eq!(shape.stride_contiguous(), [458 * 792, 458, 1]);
+    }
+}
+
 pub trait ShapeWithOneHole {
     fn into_shape(self, el_count: usize) -> Result<Shape>;
 }
@@ -483,152 +499,154 @@ impl ShapeWithOneHole for ((),) {
     }
 }

-fn hole_size(el_count: usize, prod_d: usize, s: &dyn std::fmt::Debug) -> Result<usize> {
-    if prod_d == 0 {
-        crate::bail!("cannot reshape tensor of {el_count} elements to {s:?}")
-    }
-    if el_count % prod_d != 0 {
-        crate::bail!("cannot reshape tensor with {el_count} elements to {s:?}")
-    }
-    Ok(el_count / prod_d)
-}
-
 impl ShapeWithOneHole for ((), usize) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let ((), d1) = self;
-        Ok((hole_size(el_count, d1, &self)?, d1).into())
+        if el_count % d1 != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d1}")
+        }
+        Ok((el_count / d1, d1).into())
     }
 }

 impl ShapeWithOneHole for (usize, ()) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let (d1, ()) = self;
-        Ok((d1, hole_size(el_count, d1, &self)?).into())
+        if el_count % d1 != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d1}")
+        }
+        Ok((d1, el_count / d1).into())
     }
 }

 impl ShapeWithOneHole for ((), usize, usize) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let ((), d1, d2) = self;
-        Ok((hole_size(el_count, d1 * d2, &self)?, d1, d2).into())
+        let d = d1 * d2;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((el_count / d, d1, d2).into())
     }
 }

 impl ShapeWithOneHole for (usize, (), usize) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let (d1, (), d2) = self;
-        Ok((d1, hole_size(el_count, d1 * d2, &self)?, d2).into())
+        let d = d1 * d2;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, el_count / d, d2).into())
     }
 }

 impl ShapeWithOneHole for (usize, usize, ()) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let (d1, d2, ()) = self;
-        Ok((d1, d2, hole_size(el_count, d1 * d2, &self)?).into())
+        let d = d1 * d2;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, el_count / d).into())
     }
 }

 impl ShapeWithOneHole for ((), usize, usize, usize) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let ((), d1, d2, d3) = self;
-        let d = hole_size(el_count, d1 * d2 * d3, &self)?;
-        Ok((d, d1, d2, d3).into())
+        let d = d1 * d2 * d3;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((el_count / d, d1, d2, d3).into())
     }
 }

 impl ShapeWithOneHole for (usize, (), usize, usize) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let (d1, (), d2, d3) = self;
-        let d = hole_size(el_count, d1 * d2 * d3, &self)?;
-        Ok((d1, d, d2, d3).into())
+        let d = d1 * d2 * d3;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, el_count / d, d2, d3).into())
     }
 }

 impl ShapeWithOneHole for (usize, usize, (), usize) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let (d1, d2, (), d3) = self;
-        let d = hole_size(el_count, d1 * d2 * d3, &self)?;
-        Ok((d1, d2, d, d3).into())
+        let d = d1 * d2 * d3;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, el_count / d, d3).into())
     }
 }

 impl ShapeWithOneHole for (usize, usize, usize, ()) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let (d1, d2, d3, ()) = self;
-        let d = hole_size(el_count, d1 * d2 * d3, &self)?;
-        Ok((d1, d2, d3, d).into())
+        let d = d1 * d2 * d3;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, d3, el_count / d).into())
     }
 }

 impl ShapeWithOneHole for ((), usize, usize, usize, usize) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let ((), d1, d2, d3, d4) = self;
-        let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
-        Ok((d, d1, d2, d3, d4).into())
+        let d = d1 * d2 * d3 * d4;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((el_count / d, d1, d2, d3, d4).into())
     }
 }

 impl ShapeWithOneHole for (usize, (), usize, usize, usize) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let (d1, (), d2, d3, d4) = self;
-        let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
-        Ok((d1, d, d2, d3, d4).into())
+        let d = d1 * d2 * d3 * d4;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, el_count / d, d2, d3, d4).into())
     }
 }

 impl ShapeWithOneHole for (usize, usize, (), usize, usize) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let (d1, d2, (), d3, d4) = self;
-        let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
-        Ok((d1, d2, d, d3, d4).into())
+        let d = d1 * d2 * d3 * d4;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, el_count / d, d3, d4).into())
     }
 }

 impl ShapeWithOneHole for (usize, usize, usize, (), usize) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let (d1, d2, d3, (), d4) = self;
-        let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
-        Ok((d1, d2, d3, d, d4).into())
+        let d = d1 * d2 * d3 * d4;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, d3, el_count / d, d4).into())
     }
 }

 impl ShapeWithOneHole for (usize, usize, usize, usize, ()) {
     fn into_shape(self, el_count: usize) -> Result<Shape> {
         let (d1, d2, d3, d4, ()) = self;
-        let d = hole_size(el_count, d1 * d2 * d3 * d4, &self)?;
-        Ok((d1, d2, d3, d4, d).into())
+        let d = d1 * d2 * d3 * d4;
+        if el_count % d != 0 {
+            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
+        }
+        Ok((d1, d2, d3, d4, el_count / d).into())
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn stride() {
-        let shape = Shape::from(());
-        assert_eq!(shape.stride_contiguous(), Vec::<usize>::new());
-        let shape = Shape::from(42);
-        assert_eq!(shape.stride_contiguous(), [1]);
-        let shape = Shape::from((42, 1337));
-        assert_eq!(shape.stride_contiguous(), [1337, 1]);
-        let shape = Shape::from((299, 792, 458));
-        assert_eq!(shape.stride_contiguous(), [458 * 792, 458, 1]);
-    }
-
-    #[test]
-    fn test_from_tuple() {
-        let shape = Shape::from((2,));
-        assert_eq!(shape.dims(), &[2]);
-        let shape = Shape::from((2, 3));
-        assert_eq!(shape.dims(), &[2, 3]);
-        let shape = Shape::from((2, 3, 4));
-        assert_eq!(shape.dims(), &[2, 3, 4]);
-        let shape = Shape::from((2, 3, 4, 5));
-        assert_eq!(shape.dims(), &[2, 3, 4, 5]);
-        let shape = Shape::from((2, 3, 4, 5, 6));
-        assert_eq!(shape.dims(), &[2, 3, 4, 5, 6]);
-        let shape = Shape::from((2, 3, 4, 5, 6, 7));
-        assert_eq!(shape.dims(), &[2, 3, 4, 5, 6, 7]);
-    }
-}
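These impls back reshaping with a single inferred dimension, where `()` marks the hole. A minimal sketch of how they are reached through `Tensor::reshape`:

use candle_core::{Device, DType, Result, Tensor};

fn demo_reshape() -> Result<()> {
    let t = Tensor::zeros((2, 3, 4), DType::F32, &Device::Cpu)?;
    // 24 elements with the second dim inferred: (2, ()) resolves to (2, 12).
    let r = t.reshape((2, ()))?;
    assert_eq!(r.dims(), &[2, 12]);
    Ok(())
}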
@@ -1,250 +0,0 @@
use crate::{Result, Tensor};
use rayon::prelude::*;

#[derive(Debug, Clone, Copy)]
struct ArgSort {
    asc: bool,
    last_dim: usize,
}

impl ArgSort {
    fn asort<T: crate::WithDType>(&self, vs: &[T], layout: &crate::Layout) -> Vec<u32> {
        #[allow(clippy::uninit_vec)]
        // Safety: indexes are set later in the parallelized section.
        let mut sort_indexes = unsafe {
            let el_count = layout.shape().elem_count();
            let mut v = Vec::with_capacity(el_count);
            v.set_len(el_count);
            v
        };
        if self.asc {
            sort_indexes
                .par_chunks_exact_mut(self.last_dim)
                .zip(vs.par_chunks_exact(self.last_dim))
                .for_each(|(indexes, vs)| {
                    indexes
                        .iter_mut()
                        .enumerate()
                        .for_each(|(i, v)| *v = i as u32);
                    indexes.sort_by(|&i, &j| {
                        vs[i as usize]
                            .partial_cmp(&vs[j as usize])
                            .unwrap_or(std::cmp::Ordering::Greater)
                    })
                });
        } else {
            sort_indexes
                .par_chunks_exact_mut(self.last_dim)
                .zip(vs.par_chunks_exact(self.last_dim))
                .for_each(|(indexes, vs)| {
                    indexes
                        .iter_mut()
                        .enumerate()
                        .for_each(|(i, v)| *v = i as u32);
                    indexes.sort_by(|&j, &i| {
                        vs[i as usize]
                            .partial_cmp(&vs[j as usize])
                            .unwrap_or(std::cmp::Ordering::Greater)
                    })
                });
        }
        sort_indexes
    }
}

#[cfg(feature = "cuda")]
mod cuda {
    use super::*;
    use crate::cuda_backend::cudarc::driver::{
        CudaSlice, DeviceRepr, LaunchConfig, ValidAsZeroBits,
    };
    use crate::cuda_backend::{kernel_name, kernels, CudaStorageSlice as S, WrapErr};
    use crate::{CudaDevice, WithDType};

    impl crate::cuda_backend::Map1Any for ArgSort {
        fn f<T: DeviceRepr + WithDType + ValidAsZeroBits, W: Fn(CudaSlice<T>) -> S>(
            &self,
            src: &CudaSlice<T>,
            dev: &CudaDevice,
            layout: &crate::Layout,
            _wrap: W,
        ) -> Result<S> {
            use cudarc::driver::PushKernelArg;

            let slice = match layout.contiguous_offsets() {
                None => crate::bail!("input has to be contiguous"),
                Some((o1, o2)) => src.slice(o1..o2),
            };
            let elem_count = layout.shape().elem_count();
            let dst = unsafe { dev.alloc::<u32>(elem_count)? };
            let func = if self.asc {
                dev.get_or_load_func(&kernel_name::<T>("asort_asc"), &kernels::SORT)?
            } else {
                dev.get_or_load_func(&kernel_name::<T>("asort_desc"), &kernels::SORT)?
            };
            let ncols = self.last_dim;
            let nrows = elem_count / ncols;
            let ncols_pad = next_power_of_2(ncols);
            let cfg = LaunchConfig {
                grid_dim: (1, nrows as u32, 1),
                block_dim: (ncols_pad as u32, 1, 1),
                shared_mem_bytes: (ncols_pad * std::mem::size_of::<u32>()) as u32,
            };
            let stream = dev.cuda_stream();
            let mut builder = stream.launch_builder(&func);
            let ncols = ncols as i32;
            let ncols_pad = ncols_pad as i32;
            builder.arg(&slice).arg(&dst).arg(&ncols).arg(&ncols_pad);
            unsafe { builder.launch(cfg) }.w()?;
            Ok(S::U32(dst))
        }
    }
}

impl crate::CustomOp1 for ArgSort {
    fn name(&self) -> &'static str {
        "argsort"
    }

    fn cpu_fwd(
        &self,
        storage: &crate::CpuStorage,
        layout: &crate::Layout,
    ) -> Result<(crate::CpuStorage, crate::Shape)> {
        let sort_indexes = match storage {
            crate::CpuStorage::U8(vs) => self.asort(vs, layout),
            crate::CpuStorage::U32(vs) => self.asort(vs, layout),
            crate::CpuStorage::I64(vs) => self.asort(vs, layout),
            crate::CpuStorage::BF16(vs) => self.asort(vs, layout),
            crate::CpuStorage::F16(vs) => self.asort(vs, layout),
            crate::CpuStorage::F32(vs) => self.asort(vs, layout),
            crate::CpuStorage::F64(vs) => self.asort(vs, layout),
        };
        let sort_indexes = crate::CpuStorage::U32(sort_indexes);
        Ok((sort_indexes, layout.shape().into()))
    }

    #[cfg(feature = "cuda")]
    fn cuda_fwd(
        &self,
        storage: &crate::CudaStorage,
        layout: &crate::Layout,
    ) -> Result<(crate::CudaStorage, crate::Shape)> {
        use crate::backend::BackendStorage;
        use crate::cuda_backend::Map1Any;
        let dev = storage.device();
        let slice = self.map(&storage.slice, dev, layout)?;
        let dst = crate::cuda_backend::CudaStorage {
            slice,
            device: dev.clone(),
        };
        Ok((dst, layout.shape().clone()))
    }

    #[cfg(feature = "metal")]
    fn metal_fwd(
        &self,
        storage: &crate::MetalStorage,
        layout: &crate::Layout,
    ) -> Result<(crate::MetalStorage, crate::Shape)> {
        use crate::backend::BackendStorage;
        use crate::DType;

        let name = {
            if self.asc {
                match storage.dtype() {
                    DType::BF16 => "asort_asc_bf16",
                    DType::F16 => "asort_asc_f16",
                    DType::F32 => "asort_asc_f32",
                    DType::F64 => "asort_asc_f64",
                    DType::U8 => "asort_asc_u8",
                    DType::U32 => "asort_asc_u32",
                    DType::I64 => "asort_asc_i64",
                }
            } else {
                match storage.dtype() {
                    DType::BF16 => "asort_desc_bf16",
                    DType::F16 => "asort_desc_f16",
                    DType::F32 => "asort_desc_f32",
                    DType::F64 => "asort_desc_f64",
                    DType::U8 => "asort_desc_u8",
                    DType::U32 => "asort_desc_u32",
                    DType::I64 => "asort_desc_i64",
                }
            }
        };
        let device = storage.device();
        let kernels = device.kernels();
        let command_buffer = device.command_buffer()?;
        let el = layout.shape().elem_count();
        let ncols = self.last_dim;
        let nrows = el / ncols;
        let src = crate::metal_backend::buffer_o(storage.buffer(), layout, storage.dtype());
        let dst = device.new_buffer(el, DType::U32, "asort")?;
        let mut ncols_pad = 1;
        while ncols_pad < ncols {
            ncols_pad *= 2;
        }
        candle_metal_kernels::call_arg_sort(
            device.metal_device(),
            &command_buffer,
            kernels,
            name,
            nrows,
            ncols,
            ncols_pad,
            src,
            &dst,
        )
        .map_err(crate::Error::wrap)?;
        let dst = crate::MetalStorage::new(dst, device.clone(), el, DType::U32);
        Ok((dst, layout.shape().clone()))
    }
}

#[allow(unused)]
fn next_power_of_2(x: usize) -> usize {
    let mut n = 1;
    while n < x {
        n *= 2
    }
    n
}

impl Tensor {
    /// Returns the indices that sort the tensor along the last dimension.
    ///
    /// If `asc` is `true`, sorting is in ascending order. Otherwise sorting is performed in
    /// descending order. The sort is unstable so there are no guarantees on the final order when
    /// it comes to ties.
    pub fn arg_sort_last_dim(&self, asc: bool) -> Result<Tensor> {
        if !self.is_contiguous() {
            return Err(crate::Error::RequiresContiguous {
                op: "arg_sort_last_dim",
            });
        }
        let last_dim = match self.dims().last() {
            None => crate::bail!("empty last-dim in arg-sort"),
            Some(last_dim) => *last_dim,
        };
        // No need for a backward pass for arg sort.
        self.apply_op1_no_bwd(&ArgSort { asc, last_dim })
    }

    /// Sorts the tensor along the last dimension, returns the sorted tensor together with the
    /// sorted indexes.
    ///
    /// If `asc` is `true`, sorting is in ascending order. Otherwise sorting is performed in
    /// descending order. The sort is unstable so there are no guarantees on the final order when
    /// it comes to ties.
    pub fn sort_last_dim(&self, asc: bool) -> Result<(Tensor, Tensor)> {
        if !self.is_contiguous() {
            return Err(crate::Error::RequiresContiguous {
                op: "sort_last_dim",
            });
        }
        let asort = self.arg_sort_last_dim(asc)?;
        let sorted = self.gather(&asort, crate::D::Minus1)?;
        Ok((sorted, asort))
    }
}
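A usage sketch for the sort API defined in the removed file above:

use candle_core::{Device, Result, Tensor};

fn demo_sort() -> Result<()> {
    let t = Tensor::new(&[3f32, 1., 2.], &Device::Cpu)?;
    let (sorted, indexes) = t.sort_last_dim(true)?;
    // sorted is [1., 2., 3.]; indexes is [1, 2, 0], u32 positions in the input.
    println!("{sorted} {indexes}");
    Ok(())
}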
@ -1,7 +1,6 @@
|
|||||||
use crate::backend::BackendStorage;
|
use crate::backend::BackendStorage;
|
||||||
use crate::op::{self, CmpOp, ReduceOp};
|
use crate::op::{self, CmpOp, CustomOp1, CustomOp2, CustomOp3, ReduceOp};
|
||||||
use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage, Result, Shape};
|
use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, Result, Shape};
|
||||||
use crate::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3};
|
|
||||||
|
|
||||||
// We do not want to implement Clone on Storage as cloning may fail because of
|
// We do not want to implement Clone on Storage as cloning may fail because of
|
||||||
// out of memory. Instead try_clone should be used.
|
// out of memory. Instead try_clone should be used.
|
||||||
@ -9,7 +8,6 @@ use crate::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3}
|
|||||||
pub enum Storage {
|
pub enum Storage {
|
||||||
Cpu(CpuStorage),
|
Cpu(CpuStorage),
|
||||||
Cuda(CudaStorage),
|
Cuda(CudaStorage),
|
||||||
Metal(MetalStorage),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Storage {
|
impl Storage {
|
||||||
@ -20,10 +18,6 @@ impl Storage {
|
|||||||
let storage = storage.try_clone(layout)?;
|
let storage = storage.try_clone(layout)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
Self::Metal(storage) => {
|
|
||||||
let storage = storage.try_clone(layout)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -31,7 +25,6 @@ impl Storage {
|
|||||||
match self {
|
match self {
|
||||||
Self::Cpu(_) => Device::Cpu,
|
Self::Cpu(_) => Device::Cpu,
|
||||||
Self::Cuda(storage) => Device::Cuda(storage.device().clone()),
|
Self::Cuda(storage) => Device::Cuda(storage.device().clone()),
|
||||||
Self::Metal(storage) => Device::Metal(storage.device().clone()),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -39,24 +32,13 @@ impl Storage {
|
|||||||
match self {
|
match self {
|
||||||
Self::Cpu(storage) => storage.dtype(),
|
Self::Cpu(storage) => storage.dtype(),
|
||||||
Self::Cuda(storage) => storage.dtype(),
|
Self::Cuda(storage) => storage.dtype(),
|
||||||
Self::Metal(storage) => storage.dtype(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn same_device(&self, rhs: &Self, op: &'static str) -> Result<()> {
|
pub(crate) fn same_device(&self, rhs: &Self, op: &'static str) -> Result<()> {
|
||||||
let lhs_device = self.device();
|
let lhs = self.device().location();
|
||||||
let rhs_device = rhs.device();
|
let rhs = rhs.device().location();
|
||||||
let lhs = lhs_device.location();
|
if lhs != rhs {
|
||||||
let rhs = rhs_device.location();
|
|
||||||
let same_device = if self.device().is_metal() {
|
|
||||||
// On metal, we require the device to be exactly the same rather than
|
|
||||||
// having the same location. In cuda this is not necessary as all CudaDevice on the
|
|
||||||
// same GPU will use the same cuda stream.
|
|
||||||
lhs_device.same_device(&rhs_device)
|
|
||||||
} else {
|
|
||||||
lhs == rhs
|
|
||||||
};
|
|
||||||
if !same_device {
|
|
||||||
Err(Error::DeviceMismatchBinaryOp { lhs, rhs, op }.bt())
|
Err(Error::DeviceMismatchBinaryOp { lhs, rhs, op }.bt())
|
||||||
} else {
|
} else {
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -83,10 +65,6 @@ impl Storage {
|
|||||||
let storage = storage.affine(layout, mul, add)?;
|
let storage = storage.affine(layout, mul, add)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
Self::Metal(storage) => {
|
|
||||||
let storage = storage.affine(layout, mul, add)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -100,10 +78,6 @@ impl Storage {
|
|||||||
let storage = storage.powf(layout, alpha)?;
|
let storage = storage.powf(layout, alpha)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
Self::Metal(storage) => {
|
|
||||||
let storage = storage.powf(layout, alpha)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -117,10 +91,6 @@ impl Storage {
|
|||||||
let storage = storage.elu(layout, alpha)?;
|
let storage = storage.elu(layout, alpha)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
Self::Metal(storage) => {
|
|
||||||
let storage = storage.elu(layout, alpha)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -142,10 +112,6 @@ impl Storage {
|
|||||||
let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
|
let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
(Self::Metal(lhs), Self::Metal(rhs)) => {
|
|
||||||
let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
(lhs, rhs) => {
|
(lhs, rhs) => {
|
||||||
// Should not happen because of the same device check above but we're defensive
|
// Should not happen because of the same device check above but we're defensive
|
||||||
// anyway.
|
// anyway.
|
||||||
@ -169,10 +135,6 @@ impl Storage {
|
|||||||
let storage = storage.reduce_op(op, layout, s)?;
|
let storage = storage.reduce_op(op, layout, s)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
Self::Metal(storage) => {
|
|
||||||
let storage = storage.reduce_op(op, layout, s)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -186,10 +148,6 @@ impl Storage {
|
|||||||
let storage = storage.to_dtype(layout, dtype)?;
|
let storage = storage.to_dtype(layout, dtype)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
Self::Metal(storage) => {
|
|
||||||
let storage = storage.to_dtype(layout, dtype)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -203,10 +161,6 @@ impl Storage {
|
|||||||
let (storage, shape) = c.cuda_fwd(storage, l)?;
|
let (storage, shape) = c.cuda_fwd(storage, l)?;
|
||||||
Ok((Self::Cuda(storage), shape))
|
Ok((Self::Cuda(storage), shape))
|
||||||
}
|
}
|
||||||
Self::Metal(storage) => {
|
|
||||||
let (storage, shape) = c.metal_fwd(storage, l)?;
|
|
||||||
Ok((Self::Metal(storage), shape))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -227,10 +181,6 @@ impl Storage {
|
|||||||
let (s, shape) = c.cuda_fwd(s1, l1, s2, l2)?;
|
let (s, shape) = c.cuda_fwd(s1, l1, s2, l2)?;
|
||||||
Ok((Self::Cuda(s), shape))
|
Ok((Self::Cuda(s), shape))
|
||||||
}
|
}
|
||||||
(Self::Metal(s1), Self::Metal(s2)) => {
|
|
||||||
let (s, shape) = c.metal_fwd(s1, l1, s2, l2)?;
|
|
||||||
Ok((Self::Metal(s), shape))
|
|
||||||
}
|
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -255,55 +205,6 @@ impl Storage {
|
|||||||
let (s, shape) = c.cuda_fwd(s1, l1, s2, l2, s3, l3)?;
|
let (s, shape) = c.cuda_fwd(s1, l1, s2, l2, s3, l3)?;
|
||||||
Ok((Self::Cuda(s), shape))
|
Ok((Self::Cuda(s), shape))
|
||||||
}
|
}
|
||||||
(Self::Metal(s1), Self::Metal(s2), Self::Metal(s3)) => {
|
|
||||||
let (s, shape) = c.metal_fwd(s1, l1, s2, l2, s3, l3)?;
|
|
||||||
Ok((Self::Metal(s), shape))
|
|
||||||
}
|
|
||||||
_ => unreachable!(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn inplace_op1(&mut self, l: &Layout, c: &dyn InplaceOp1) -> Result<()> {
|
|
||||||
match self {
|
|
||||||
Self::Cpu(storage) => c.cpu_fwd(storage, l),
|
|
||||||
Self::Cuda(storage) => c.cuda_fwd(storage, l),
|
|
||||||
Self::Metal(storage) => c.metal_fwd(storage, l),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn inplace_op2(
|
|
||||||
&mut self,
|
|
||||||
l1: &Layout,
|
|
||||||
t2: &Self,
|
|
||||||
l2: &Layout,
|
|
||||||
c: &dyn InplaceOp2,
|
|
||||||
) -> Result<()> {
|
|
||||||
self.same_device(t2, c.name())?;
|
|
||||||
match (self, t2) {
|
|
||||||
(Self::Cpu(s1), Self::Cpu(s2)) => c.cpu_fwd(s1, l1, s2, l2),
|
|
||||||
(Self::Cuda(s1), Self::Cuda(s2)) => c.cuda_fwd(s1, l1, s2, l2),
|
|
||||||
(Self::Metal(s1), Self::Metal(s2)) => c.metal_fwd(s1, l1, s2, l2),
|
|
||||||
_ => unreachable!(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn inplace_op3(
|
|
||||||
&mut self,
|
|
||||||
l1: &Layout,
|
|
||||||
t2: &Self,
|
|
||||||
l2: &Layout,
|
|
||||||
t3: &Self,
|
|
||||||
l3: &Layout,
|
|
||||||
c: &dyn InplaceOp3,
|
|
||||||
) -> Result<()> {
|
|
||||||
self.same_device(t2, c.name())?;
|
|
||||||
self.same_device(t3, c.name())?;
|
|
||||||
match (self, t2, t3) {
|
|
||||||
(Self::Cpu(s1), Self::Cpu(s2), Self::Cpu(s3)) => c.cpu_fwd(s1, l1, s2, l2, s3, l3),
|
|
||||||
(Self::Cuda(s1), Self::Cuda(s2), Self::Cuda(s3)) => c.cuda_fwd(s1, l1, s2, l2, s3, l3),
|
|
||||||
(Self::Metal(s1), Self::Metal(s2), Self::Metal(s3)) => {
|
|
||||||
c.metal_fwd(s1, l1, s2, l2, s3, l3)
|
|
||||||
}
|
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -318,10 +219,6 @@ impl Storage {
|
|||||||
let storage = storage.unary_impl::<B>(layout)?;
|
let storage = storage.unary_impl::<B>(layout)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
Self::Metal(storage) => {
|
|
||||||
let storage = storage.unary_impl::<B>(layout)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -342,10 +239,6 @@ impl Storage {
|
|||||||
let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
|
let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
(Self::Metal(lhs), Self::Metal(rhs)) => {
|
|
||||||
let storage = lhs.binary_impl::<B>(rhs, lhs_layout, rhs_layout)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
(lhs, rhs) => {
|
(lhs, rhs) => {
|
||||||
// Should not happen because of the same device check above but we're defensive
|
// Should not happen because of the same device check above but we're defensive
|
||||||
// anyway.
|
// anyway.
|
||||||
@ -377,10 +270,6 @@ impl Storage {
|
|||||||
let s = inp.conv1d(l, kernel, kernel_l, params)?;
|
let s = inp.conv1d(l, kernel, kernel_l, params)?;
|
||||||
Ok(Self::Cuda(s))
|
Ok(Self::Cuda(s))
|
||||||
}
|
}
|
||||||
(Storage::Metal(inp), Storage::Metal(kernel)) => {
|
|
||||||
let s = inp.conv1d(l, kernel, kernel_l, params)?;
|
|
||||||
Ok(Self::Metal(s))
|
|
||||||
}
|
|
||||||
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
|
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
|
||||||
lhs: lhs.device().location(),
|
lhs: lhs.device().location(),
|
||||||
rhs: rhs.device().location(),
|
rhs: rhs.device().location(),
|
||||||
@ -390,37 +279,6 @@ impl Storage {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn conv_transpose1d(
|
|
||||||
&self,
|
|
||||||
l: &Layout,
|
|
||||||
kernel: &Self,
|
|
||||||
kernel_l: &Layout,
|
|
||||||
params: &crate::conv::ParamsConvTranspose1D,
|
|
||||||
) -> Result<Self> {
|
|
||||||
self.same_device(kernel, "conv-transpose1d")?;
|
|
||||||
self.same_dtype(kernel, "conv-transpose1d")?;
|
|
||||||
match (self, &kernel) {
|
|
||||||
(Storage::Cpu(inp), Storage::Cpu(kernel)) => {
|
|
||||||
let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
|
|
||||||
Ok(Self::Cpu(s))
|
|
||||||
}
|
|
||||||
(Storage::Cuda(inp), Storage::Cuda(kernel)) => {
|
|
||||||
let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
|
|
||||||
Ok(Self::Cuda(s))
|
|
||||||
}
|
|
||||||
(Storage::Metal(inp), Storage::Metal(kernel)) => {
|
|
||||||
let s = inp.conv_transpose1d(l, kernel, kernel_l, params)?;
|
|
||||||
Ok(Self::Metal(s))
|
|
||||||
}
|
|
||||||
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
|
|
||||||
lhs: lhs.device().location(),
|
|
||||||
rhs: rhs.device().location(),
|
|
||||||
op: "conv-transpose1d",
|
|
||||||
}
|
|
||||||
.bt()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn conv2d(
|
pub(crate) fn conv2d(
|
||||||
&self,
|
&self,
|
||||||
l: &Layout,
|
l: &Layout,
|
||||||
@ -439,10 +297,6 @@ impl Storage {
|
|||||||
let s = inp.conv2d(l, kernel, kernel_l, params)?;
|
let s = inp.conv2d(l, kernel, kernel_l, params)?;
|
||||||
Ok(Self::Cuda(s))
|
Ok(Self::Cuda(s))
|
||||||
}
|
}
|
||||||
(Storage::Metal(inp), Storage::Metal(kernel)) => {
|
|
||||||
let s = inp.conv2d(l, kernel, kernel_l, params)?;
|
|
||||||
Ok(Self::Metal(s))
|
|
||||||
}
|
|
||||||
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
|
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
|
||||||
lhs: lhs.device().location(),
|
lhs: lhs.device().location(),
|
||||||
rhs: rhs.device().location(),
|
rhs: rhs.device().location(),
|
||||||
@ -470,10 +324,6 @@ impl Storage {
|
|||||||
let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
|
let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
|
||||||
Ok(Self::Cuda(s))
|
Ok(Self::Cuda(s))
|
||||||
}
|
}
|
||||||
(Storage::Metal(inp), Storage::Metal(kernel)) => {
|
|
||||||
let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
|
|
||||||
Ok(Self::Metal(s))
|
|
||||||
}
|
|
||||||
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
|
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
|
||||||
lhs: lhs.device().location(),
|
lhs: lhs.device().location(),
|
||||||
rhs: rhs.device().location(),
|
rhs: rhs.device().location(),
|
||||||
@ -498,10 +348,6 @@ impl Storage {
|
|||||||
let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
|
let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
Self::Metal(storage) => {
|
|
||||||
let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -520,27 +366,6 @@ impl Storage {
|
|||||||
let storage = storage.max_pool2d(layout, kernel_size, stride)?;
|
let storage = storage.max_pool2d(layout, kernel_size, stride)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
Self::Metal(storage) => {
|
|
||||||
let storage = storage.max_pool2d(layout, kernel_size, stride)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn upsample_nearest1d(&self, layout: &Layout, sz: usize) -> Result<Self> {
|
|
||||||
match self {
|
|
||||||
Storage::Cpu(storage) => {
|
|
||||||
let storage = storage.upsample_nearest1d(layout, sz)?;
|
|
||||||
Ok(Self::Cpu(storage))
|
|
||||||
}
|
|
||||||
Self::Cuda(storage) => {
|
|
||||||
let storage = storage.upsample_nearest1d(layout, sz)?;
|
|
||||||
Ok(Self::Cuda(storage))
|
|
||||||
}
|
|
||||||
Self::Metal(storage) => {
|
|
||||||
let storage = storage.upsample_nearest1d(layout, sz)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -554,10 +379,6 @@ impl Storage {
|
|||||||
let storage = storage.upsample_nearest2d(layout, h, w)?;
|
let storage = storage.upsample_nearest2d(layout, h, w)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
Self::Metal(storage) => {
|
|
||||||
let storage = storage.upsample_nearest2d(layout, h, w)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -581,10 +402,6 @@ impl Storage {
|
|||||||
let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
|
let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
(Self::Metal(cond), Self::Metal(t), Self::Metal(f)) => {
|
|
||||||
let storage = cond.where_cond(layout, t, layout_t, f, layout_f)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
(_, lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
|
(_, lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
|
||||||
lhs: lhs.device().location(),
|
lhs: lhs.device().location(),
|
||||||
rhs: rhs.device().location(),
|
rhs: rhs.device().location(),
|
||||||
@ -611,10 +428,6 @@ impl Storage {
|
|||||||
let storage = s.gather(l, indexes, indexes_l, d)?;
|
let storage = s.gather(l, indexes, indexes_l, d)?;
|
||||||
Ok(Self::Cuda(storage))
|
Ok(Self::Cuda(storage))
|
||||||
}
|
}
|
||||||
(Self::Metal(s), Self::Metal(indexes)) => {
|
|
||||||
let storage = s.gather(l, indexes, indexes_l, d)?;
|
|
||||||
Ok(Self::Metal(storage))
|
|
||||||
}
|
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -639,10 +452,6 @@ impl Storage {
|
                let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
                Ok(Self::Cuda(storage))
            }
-            (Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
-                let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Metal(storage))
-            }
             _ => unreachable!(),
         }
     }
@@ -667,10 +476,6 @@ impl Storage {
                 let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
                 Ok(Self::Cuda(storage))
             }
-            (Self::Metal(s), Self::Metal(indexes), Self::Metal(source)) => {
-                let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Metal(storage))
-            }
             _ => unreachable!(),
         }
     }
@@ -692,10 +497,6 @@ impl Storage {
                 let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
                 Ok(Self::Cuda(storage))
             }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
-                Ok(Self::Metal(storage))
-            }
             (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                 lhs: lhs.device().location(),
                 rhs: rhs.device().location(),
@@ -723,10 +524,6 @@ impl Storage {
                 let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
                 Ok(Self::Cuda(storage))
            }
-            (Self::Metal(lhs), Self::Metal(rhs)) => {
-                let storage = lhs.matmul(rhs, bmnk, lhs_layout, rhs_layout)?;
-                Ok(Self::Metal(storage))
-            }
             (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                 lhs: lhs.device().location(),
                 rhs: rhs.device().location(),
@@ -746,9 +543,6 @@ impl Storage {
         match (self, dst) {
             (Self::Cpu(src), Self::Cpu(dst)) => src.copy_strided_src(dst, dst_offset, src_l),
             (Self::Cuda(src), Self::Cuda(dst)) => Ok(src.copy_strided_src(dst, dst_offset, src_l)?),
-            (Self::Metal(src), Self::Metal(dst)) => {
-                Ok(src.copy_strided_src(dst, dst_offset, src_l)?)
-            }
             (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                 lhs: lhs.device().location(),
                 rhs: rhs.device().location(),
@@ -757,32 +551,4 @@ impl Storage {
             .bt()),
         }
     }
-
-    #[allow(clippy::too_many_arguments)]
-    pub(crate) fn copy2d(
-        &self,
-        dst: &mut Self,
-        d1: usize,
-        d2: usize,
-        src_s: usize,
-        dst_s: usize,
-        src_o: usize,
-        dst_o: usize,
-    ) -> Result<()> {
-        match (self, dst) {
-            (Self::Cpu(src), Self::Cpu(dst)) => src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o),
-            (Self::Cuda(src), Self::Cuda(dst)) => {
-                Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?)
-            }
-            (Self::Metal(src), Self::Metal(dst)) => {
-                Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?)
-            }
-            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
-                lhs: lhs.device().location(),
-                rhs: rhs.device().location(),
-                op: "copy2d",
-            }
-            .bt()),
-        }
-    }
 }
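The hunks above all follow the same shape: `Storage` is an enum over per-device backends, every op matches on same-device variant pairs, and mixing devices surfaces as a `DeviceMismatchBinaryOp` error rather than a panic. A minimal sketch of that dispatch pattern, using simplified stand-in types rather than candle's actual definitions:

// Minimal sketch of the per-device dispatch pattern shown in the diff above;
// the types here are simplified stand-ins, not candle's real Storage/Error.
#[derive(Debug)]
enum Storage {
    Cpu(Vec<f32>),
    Cuda(/* device buffer handle */ u64),
}

#[derive(Debug)]
enum Error {
    DeviceMismatchBinaryOp { op: &'static str },
}

impl Storage {
    fn add(&self, rhs: &Self) -> Result<Self, Error> {
        match (self, rhs) {
            (Self::Cpu(a), Self::Cpu(b)) => {
                Ok(Self::Cpu(a.iter().zip(b).map(|(x, y)| x + y).collect()))
            }
            // A real backend would launch a kernel here.
            (Self::Cuda(a), Self::Cuda(_b)) => Ok(Self::Cuda(*a)),
            // Mixing devices is a user error, reported instead of unreachable!().
            _ => Err(Error::DeviceMismatchBinaryOp { op: "add" }),
        }
    }
}

fn main() {
    let a = Storage::Cpu(vec![1.0, 2.0]);
    let b = Storage::Cpu(vec![3.0, 4.0]);
    println!("{:?}", a.add(&b)); // Ok(Cpu([4.0, 6.0]))
    let c = Storage::Cuda(0);
    println!("{:?}", a.add(&c)); // Err(DeviceMismatchBinaryOp { op: "add" })
}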
@@ -1,208 +0,0 @@
-//! StreamTensor useful for streaming ops.
-//!
-use crate::{Result, Shape, Tensor};
-
-pub trait Dim: crate::shape::Dim + Copy {}
-impl<T: crate::shape::Dim + Copy> Dim for T {}
-
-/// A stream tensor is used in streaming module. It can either contain an actual tensor or be
-/// empty.
-#[derive(Clone)]
-pub struct StreamTensor(Option<Tensor>);
-
-impl std::fmt::Debug for StreamTensor {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match &self.0 {
-            Some(t) => write!(f, "{:?}", t.shape()),
-            None => write!(f, "Empty"),
-        }
-    }
-}
-
-impl std::convert::From<Option<Tensor>> for StreamTensor {
-    fn from(value: Option<Tensor>) -> Self {
-        Self(value)
-    }
-}
-
-impl std::convert::From<Tensor> for StreamTensor {
-    fn from(value: Tensor) -> Self {
-        Self(Some(value))
-    }
-}
-
-impl std::convert::From<()> for StreamTensor {
-    fn from(_value: ()) -> Self {
-        Self(None)
-    }
-}
-
-impl StreamTensor {
-    pub fn empty() -> Self {
-        Self(None)
-    }
-
-    pub fn from_tensor(tensor: Tensor) -> Self {
-        Self(Some(tensor))
-    }
-
-    pub fn shape(&self) -> Option<&Shape> {
-        self.0.as_ref().map(|t| t.shape())
-    }
-
-    pub fn cat2<D: Dim>(&self, rhs: &Self, dim: D) -> Result<Self> {
-        let xs = match (&self.0, &rhs.0) {
-            (Some(lhs), Some(rhs)) => {
-                let xs = Tensor::cat(&[lhs, rhs], dim)?;
-                Some(xs)
-            }
-            (Some(xs), None) | (None, Some(xs)) => Some(xs.clone()),
-            (None, None) => None,
-        };
-        Ok(Self(xs))
-    }
-
-    pub fn seq_len<D: Dim>(&self, dim: D) -> Result<usize> {
-        match &self.0 {
-            None => Ok(0),
-            Some(v) => v.dim(dim),
-        }
-    }
-
-    pub fn reset(&mut self) {
-        self.0 = None
-    }
-
-    pub fn narrow<D: Dim>(&self, dim: D, offset: usize, len: usize) -> Result<StreamTensor> {
-        let t = match &self.0 {
-            None => None,
-            Some(t) => {
-                let seq_len = t.dim(dim)?;
-                if seq_len <= offset {
-                    None
-                } else {
-                    let t = t.narrow(dim, offset, usize::min(len, seq_len - offset))?;
-                    Some(t)
-                }
-            }
-        };
-        Ok(Self(t))
-    }
-
-    /// Splits the Streaming Tensor on the time axis `dim` with the first `lhs_len` elements
-    /// returned in the first output and the remaining in the second output.
-    pub fn split<D: Dim>(&self, dim: D, lhs_len: usize) -> Result<(Self, Self)> {
-        match &self.0 {
-            None => Ok((Self::empty(), Self::empty())),
-            Some(t) => {
-                let seq_len = t.dim(dim)?;
-                let lhs_len = usize::min(seq_len, lhs_len);
-                if lhs_len == 0 {
-                    Ok((Self::empty(), t.clone().into()))
-                } else {
-                    let lhs = Self::from_tensor(t.narrow(dim, 0, lhs_len)?);
-                    let rhs_len = seq_len - lhs_len;
-                    let rhs = if rhs_len == 0 {
-                        Self::empty()
-                    } else {
-                        Self::from_tensor(t.narrow(dim, lhs_len, rhs_len)?)
-                    };
-                    Ok((lhs, rhs))
-                }
-            }
-        }
-    }
-
-    pub fn as_option(&self) -> Option<&Tensor> {
-        self.0.as_ref()
-    }
-
-    pub fn apply<M: crate::Module>(&self, m: &M) -> Result<Self> {
-        match &self.0 {
-            None => Ok(Self::empty()),
-            Some(t) => Ok(Self::from_tensor(t.apply(m)?)),
-        }
-    }
-}
-
-/// Streaming modules take as input a stream tensor and return a stream tensor. They may perform
-/// some internal buffering so that enough data has been received for the module to be able to
-/// perform some operations.
-pub trait StreamingModule {
-    // TODO: Should we also have a flush method?
-    fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor>;
-    fn reset_state(&mut self);
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum BinOp {
-    Add,
-    Mul,
-    Sub,
-    Div,
-}
-
-#[derive(Debug, Clone)]
-pub struct StreamingBinOp {
-    prev_lhs: StreamTensor,
-    prev_rhs: StreamTensor,
-    pub op: BinOp,
-    pub dim: crate::D,
-}
-
-impl StreamingBinOp {
-    pub fn new(op: BinOp, dim: crate::D) -> Self {
-        Self {
-            prev_lhs: StreamTensor::empty(),
-            prev_rhs: StreamTensor::empty(),
-            op,
-            dim,
-        }
-    }
-
-    pub fn reset_state(&mut self) {
-        self.prev_lhs.reset();
-        self.prev_rhs.reset();
-    }
-
-    pub fn forward(&self, lhs: &Tensor, rhs: &Tensor) -> Result<Tensor> {
-        match self.op {
-            BinOp::Add => Tensor::add(lhs, rhs),
-            BinOp::Mul => Tensor::mul(lhs, rhs),
-            BinOp::Sub => Tensor::sub(lhs, rhs),
-            BinOp::Div => Tensor::div(lhs, rhs),
-        }
-    }
-
-    pub fn step(&mut self, lhs: &StreamTensor, rhs: &StreamTensor) -> Result<StreamTensor> {
-        let lhs = StreamTensor::cat2(&self.prev_lhs, lhs, self.dim)?;
-        let rhs = StreamTensor::cat2(&self.prev_rhs, rhs, self.dim)?;
-        let lhs_len = lhs.seq_len(self.dim)?;
-        let rhs_len = rhs.seq_len(self.dim)?;
-        let common_len = usize::min(lhs_len, rhs_len);
-        let (lhs, prev_lhs) = lhs.split(self.dim, common_len)?;
-        let (rhs, prev_rhs) = rhs.split(self.dim, common_len)?;
-        let ys = match (lhs.0, rhs.0) {
-            (Some(lhs), Some(rhs)) => {
-                let ys = self.forward(&lhs, &rhs)?;
-                StreamTensor::from_tensor(ys)
-            }
-            (None, None) => StreamTensor::empty(),
-            (lhs, rhs) => crate::bail!("INTERNAL ERROR inconsistent lhs and rhs {lhs:?} {rhs:?}"),
-        };
-        self.prev_lhs = prev_lhs;
-        self.prev_rhs = prev_rhs;
-        Ok(ys)
-    }
-}
-
-/// Simple wrapper that doesn't do any buffering.
-pub struct Map<T: crate::Module>(T);
-
-impl<T: crate::Module> StreamingModule for Map<T> {
-    fn reset_state(&mut self) {}
-
-    fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {
-        xs.apply(&self.0)
-    }
-}
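The buffering contract in the deleted `StreamingBinOp::step` is: concatenate the pending prefix with the new chunk on each side, apply the op over the common length, and carry the remainder forward. A minimal sketch of the same idea over plain `Vec<f32>` (an illustration of the contract, not candle's API):

// Sketch of the StreamingBinOp buffering idea using plain vectors.
struct StreamingAdd {
    prev_lhs: Vec<f32>,
    prev_rhs: Vec<f32>,
}

impl StreamingAdd {
    fn step(&mut self, lhs: &[f32], rhs: &[f32]) -> Vec<f32> {
        // Prepend whatever was buffered from earlier steps.
        self.prev_lhs.extend_from_slice(lhs);
        self.prev_rhs.extend_from_slice(rhs);
        // Only the common prefix can be combined now.
        let n = self.prev_lhs.len().min(self.prev_rhs.len());
        let ys: Vec<f32> = self.prev_lhs[..n]
            .iter()
            .zip(&self.prev_rhs[..n])
            .map(|(a, b)| a + b)
            .collect();
        // The tails stay buffered for the next step, like prev_lhs/prev_rhs.
        self.prev_lhs.drain(..n);
        self.prev_rhs.drain(..n);
        ys
    }
}

fn main() {
    let mut op = StreamingAdd { prev_lhs: vec![], prev_rhs: vec![] };
    assert_eq!(op.step(&[1., 2., 3.], &[10.]), vec![11.]); // 2., 3. buffered
    assert_eq!(op.step(&[], &[20., 30.]), vec![22., 33.]);
}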
@@ -32,11 +32,14 @@ impl<'a> StridedIndex<'a> {
     }
 }

-impl Iterator for StridedIndex<'_> {
+impl<'a> Iterator for StridedIndex<'a> {
     type Item = usize;

     fn next(&mut self) -> Option<Self::Item> {
-        let storage_index = self.next_storage_index?;
+        let storage_index = match self.next_storage_index {
+            None => return None,
+            Some(storage_index) => storage_index,
+        };
         let mut updated = false;
         let mut next_storage_index = storage_index;
         for ((multi_i, max_i), stride_i) in self
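The only substantive change in this hunk is stylistic: on `Option`, the `?` operator is exactly the explicit `match` that returns `None` early. A tiny self-contained illustration:

// `?` on Option is sugar for the explicit match it replaces in the hunk above.
fn first_doubled(xs: &[i32]) -> Option<i32> {
    let x = xs.first()?; // returns None early if the slice is empty
    Some(x * 2)
}

fn first_doubled_explicit(xs: &[i32]) -> Option<i32> {
    let x = match xs.first() {
        None => return None,
        Some(x) => x,
    };
    Some(x * 2)
}

fn main() {
    assert_eq!(first_doubled(&[3, 4]), first_doubled_explicit(&[3, 4]));
    assert_eq!(first_doubled(&[]), None);
}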
(File diff suppressed because it is too large.)
@@ -1,303 +0,0 @@
-use crate::{shape::Dim, Context, Error, Result, Shape, Tensor};
-
-impl Tensor {
-    /// Concatenates two or more tensors along a particular dimension.
-    ///
-    /// All tensors must be of the same rank, and the output will have
-    /// the same rank
-    ///
-    /// ```rust
-    /// # use candle_core::{Tensor, DType, Device};
-    /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
-    /// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
-    ///
-    /// let c = Tensor::cat(&[&a, &b], 0)?;
-    /// assert_eq!(c.shape().dims(), &[4, 3]);
-    ///
-    /// let c = Tensor::cat(&[&a, &b], 1)?;
-    /// assert_eq!(c.shape().dims(), &[2, 6]);
-    /// # Ok::<(), candle_core::Error>(())
-    /// ```
-    pub fn cat<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
-        if args.is_empty() {
-            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
-        }
-        let arg0 = args[0].as_ref();
-        if args.len() == 1 {
-            return Ok(arg0.clone());
-        }
-        let dim = dim.to_index(arg0.shape(), "cat")?;
-        for arg in args {
-            arg.as_ref().check_dim(dim, "cat")?;
-        }
-        for (arg_idx, arg) in args.iter().enumerate() {
-            let arg = arg.as_ref();
-            if arg0.rank() != arg.rank() {
-                Err(Error::UnexpectedNumberOfDims {
-                    expected: arg0.rank(),
-                    got: arg.rank(),
-                    shape: arg.shape().clone(),
-                }
-                .bt())?
-            }
-            for (dim_idx, (v1, v2)) in arg0
-                .shape()
-                .dims()
-                .iter()
-                .zip(arg.shape().dims().iter())
-                .enumerate()
-            {
-                if dim_idx != dim && v1 != v2 {
-                    Err(Error::ShapeMismatchCat {
-                        dim: dim_idx,
-                        first_shape: arg0.shape().clone(),
-                        n: arg_idx + 1,
-                        nth_shape: arg.shape().clone(),
-                    }
-                    .bt())?
-                }
-            }
-        }
-        let all_contiguous = args.iter().all(|v| v.as_ref().is_contiguous());
-        if all_contiguous {
-            Self::cat_contiguous(args, dim)
-        } else if dim == 0 {
-            Self::cat0(args)
-        } else {
-            let args: Vec<Tensor> = args
-                .iter()
-                .map(|a| a.as_ref().transpose(0, dim))
-                .collect::<Result<Vec<_>>>()?;
-            let cat = Self::cat0(&args)?;
-            cat.transpose(0, dim)
-        }
-    }
-
-    fn cat0<A: AsRef<Tensor>>(args: &[A]) -> Result<Self> {
-        if args.is_empty() {
-            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
-        }
-        let arg0 = args[0].as_ref();
-        if args.len() == 1 {
-            return Ok(arg0.clone());
-        }
-        let rank = arg0.rank();
-        let device = arg0.device();
-        let dtype = arg0.dtype();
-        let first_dims = arg0.shape().dims();
-        let mut cat_dims = first_dims.to_vec();
-        cat_dims[0] = 0;
-        let mut offsets = vec![0usize];
-        for (arg_idx, arg) in args.iter().enumerate() {
-            let arg = arg.as_ref();
-            if arg.dtype() != dtype {
-                Err(Error::DTypeMismatchBinaryOp {
-                    lhs: dtype,
-                    rhs: arg.dtype(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if arg.device().location() != device.location() {
-                Err(Error::DeviceMismatchBinaryOp {
-                    lhs: device.location(),
-                    rhs: arg.device().location(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if rank != arg.rank() {
-                Err(Error::UnexpectedNumberOfDims {
-                    expected: rank,
-                    got: arg.rank(),
-                    shape: arg.shape().clone(),
-                }
-                .bt())?
-            }
-            for (dim_idx, (v1, v2)) in arg0
-                .shape()
-                .dims()
-                .iter()
-                .zip(arg.shape().dims().iter())
-                .enumerate()
-            {
-                if dim_idx == 0 {
-                    cat_dims[0] += v2;
-                }
-                if dim_idx != 0 && v1 != v2 {
-                    Err(Error::ShapeMismatchCat {
-                        dim: dim_idx,
-                        first_shape: arg0.shape().clone(),
-                        n: arg_idx + 1,
-                        nth_shape: arg.shape().clone(),
-                    }
-                    .bt())?
-                }
-            }
-            let next_offset = offsets.last().context("empty offsets")? + arg.elem_count();
-            offsets.push(next_offset);
-        }
-        let shape = Shape::from(cat_dims);
-        let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, 0));
-        let mut storage = unsafe { device.alloc_uninit(&shape, dtype)? };
-        for (arg, &offset) in args.iter().zip(offsets.iter()) {
-            let arg = arg.as_ref();
-            arg.storage()
-                .copy_strided_src(&mut storage, offset, arg.layout())?;
-        }
-        Ok(crate::tensor::from_storage(storage, shape, op, false))
-    }
-
-    fn cat_contiguous<A: AsRef<Tensor>>(args: &[A], dim: usize) -> Result<Self> {
-        if args.is_empty() {
-            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
-        }
-        let arg0 = args[0].as_ref();
-        if args.len() == 1 {
-            return Ok(arg0.clone());
-        }
-        let rank = arg0.rank();
-        let device = arg0.device();
-        let dtype = arg0.dtype();
-        let first_dims = arg0.shape().dims();
-        let mut cat_dims = first_dims.to_vec();
-        cat_dims[dim] = 0;
-        for (arg_idx, arg) in args.iter().enumerate() {
-            let arg = arg.as_ref();
-            if arg.dtype() != dtype {
-                Err(Error::DTypeMismatchBinaryOp {
-                    lhs: dtype,
-                    rhs: arg.dtype(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if arg.device().location() != device.location() {
-                Err(Error::DeviceMismatchBinaryOp {
-                    lhs: device.location(),
-                    rhs: arg.device().location(),
-                    op: "cat",
-                }
-                .bt())?
-            }
-            if rank != arg.rank() {
-                Err(Error::UnexpectedNumberOfDims {
-                    expected: rank,
-                    got: arg.rank(),
-                    shape: arg.shape().clone(),
-                }
-                .bt())?
-            }
-            for (dim_idx, (v1, v2)) in arg0
-                .shape()
-                .dims()
-                .iter()
-                .zip(arg.shape().dims().iter())
-                .enumerate()
-            {
-                if dim_idx == dim {
-                    cat_dims[dim] += v2;
-                }
-                if dim_idx != dim && v1 != v2 {
-                    Err(Error::ShapeMismatchCat {
-                        dim: dim_idx,
-                        first_shape: arg0.shape().clone(),
-                        n: arg_idx + 1,
-                        nth_shape: arg.shape().clone(),
-                    }
-                    .bt())?
-                }
-            }
-        }
-        let cat_target_dim_len = cat_dims[dim];
-        let block_size: usize = cat_dims.iter().skip(1 + dim).product();
-        let shape = Shape::from(cat_dims);
-        let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, dim));
-        let mut storage = unsafe { device.alloc_uninit(&shape, dtype)? };
-        let mut dst_o = 0;
-        for arg in args.iter() {
-            let arg = arg.as_ref();
-            let arg_dims = arg.shape().dims();
-            let d1: usize = arg_dims.iter().take(dim).product();
-            let d2 = block_size * arg_dims[dim];
-            let dst_s = block_size * cat_target_dim_len;
-            let src_o = arg.layout().start_offset();
-            arg.storage().copy2d(
-                &mut storage,
-                d1,
-                d2,
-                /* src_s */ d2,
-                dst_s,
-                src_o,
-                dst_o,
-            )?;
-            dst_o += d2;
-        }
-        Ok(crate::tensor::from_storage(storage, shape, op, false))
-    }
-
-    /// Set the values on `self` using values from `src`. The copy starts at the specified
-    /// `offset` for the target dimension `dim` on `self`.
-    /// `self` and `src` must have the same shape except on dimension `dim` where the `self` size
-    /// has to be greater than or equal to `offset` plus the `src` size.
-    ///
-    /// Note that this modifies `self` in place and as such is not compatible with
-    /// back-propagation.
-    pub fn slice_set<D: Dim>(&self, src: &Self, dim: D, offset: usize) -> Result<()> {
-        let dim = dim.to_index(self.shape(), "slice-set")?;
-        if !self.is_contiguous() || !src.is_contiguous() {
-            Err(Error::RequiresContiguous { op: "slice-set" }.bt())?
-        }
-        if self.same_storage(src) {
-            crate::bail!("cannot use slice_set when self and src share their storage")
-        }
-        if self.dtype() != src.dtype() {
-            Err(Error::DTypeMismatchBinaryOp {
-                lhs: self.dtype(),
-                rhs: src.dtype(),
-                op: "slice-set",
-            }
-            .bt())?
-        }
-        if self.device().location() != src.device().location() {
-            Err(Error::DeviceMismatchBinaryOp {
-                lhs: self.device().location(),
-                rhs: src.device().location(),
-                op: "slice-set",
-            }
-            .bt())?
-        }
-        if self.rank() != src.rank() {
-            Err(Error::UnexpectedNumberOfDims {
-                expected: self.rank(),
-                got: src.rank(),
-                shape: self.shape().clone(),
-            }
-            .bt())?
-        }
-        for (dim_idx, (v1, v2)) in self.dims().iter().zip(src.dims().iter()).enumerate() {
-            if dim_idx == dim && *v2 + offset > *v1 {
-                crate::bail!("shape mismatch on target dim, dst: {v1}, src: {v2} + {offset}")
-            }
-            if dim_idx != dim && v1 != v2 {
-                crate::bail!("shape mismatch on dim {dim_idx}, {v1} <> {v2}")
-            }
-        }
-        let block_size: usize = src.dims().iter().skip(1 + dim).product();
-        let d1: usize = src.dims().iter().take(dim).product();
-        let d2 = block_size * src.dims()[dim];
-        let dst_o = self.layout().start_offset() + offset * block_size;
-        let src_o = src.layout().start_offset();
-        src.storage().copy2d(
-            &mut self.storage_mut(),
-            d1,
-            d2,
-            /* src_s */ d2,
-            /* dst_s */ block_size * self.dims()[dim],
-            src_o,
-            dst_o,
-        )?;
-
-        Ok(())
-    }
-}
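The deleted `cat_contiguous` drives `copy2d` with `d1` rows of `d2` contiguous elements per argument, where `block_size` is the product of the dims after the cat axis and `dst_s` is the destination row stride. A small worked check of that arithmetic, with assumed example shapes (two (2, 3) tensors concatenated on dim 1):

// Worked example of the cat_contiguous offset arithmetic; hypothetical
// shapes, mirroring the d1/d2/dst_s computation in the deleted code above.
fn main() {
    let dims = [2usize, 3];
    let dim = 1; // cat axis
    let cat_target_dim_len = 6; // 3 + 3
    let block_size: usize = dims.iter().skip(1 + dim).product(); // 1
    let d1: usize = dims.iter().take(dim).product(); // 2 rows to copy
    let d2 = block_size * dims[dim]; // 3 elements per row per source
    let dst_s = block_size * cat_target_dim_len; // destination row stride 6
    assert_eq!((d1, d2, dst_s), (2, 3, 6));
    // The first source writes at dst_o = 0 (rows land at offsets 0..3 and
    // 6..9), the second at dst_o = 3 (rows at 3..6 and 9..12), interleaving
    // the narrow source rows into the wider destination rows.
}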
@@ -4,7 +4,7 @@ use crate::{Result, Tensor};
 macro_rules! test_device {
     // TODO: Switch to generating the two last arguments automatically once concat_idents is
     // stable. https://github.com/rust-lang/rust/issues/29599
-    ($fn_name: ident, $test_cpu: ident, $test_cuda: ident, $test_metal: ident) => {
+    ($fn_name: ident, $test_cpu: ident, $test_cuda: ident) => {
         #[test]
         fn $test_cpu() -> Result<()> {
             $fn_name(&Device::Cpu)
@@ -15,24 +15,9 @@ macro_rules! test_device {
         fn $test_cuda() -> Result<()> {
             $fn_name(&Device::new_cuda(0)?)
         }
-
-        #[cfg(feature = "metal")]
-        #[test]
-        fn $test_metal() -> Result<()> {
-            $fn_name(&Device::new_metal(0)?)
-        }
     };
 }

-pub fn assert_tensor_eq(t1: &Tensor, t2: &Tensor) -> Result<()> {
-    assert_eq!(t1.shape(), t2.shape());
-    // Default U8 may not be large enough to hold the sum (`t.sum_all` defaults to the dtype of `t`)
-    let eq_tensor = t1.eq(t2)?.to_dtype(crate::DType::U32)?;
-    let all_equal = eq_tensor.sum_all()?;
-    assert_eq!(all_equal.to_scalar::<u32>()?, eq_tensor.elem_count() as u32);
-    Ok(())
-}
-
 pub fn to_vec0_round(t: &Tensor, digits: i32) -> Result<f32> {
     let b = 10f32.powi(digits);
     let t = t.to_vec0::<f32>()?;
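For readers unfamiliar with the macro, the two-argument form on the right-hand side expands to roughly the following pair of tests. This is a hand expansion with stub types so it stands alone; in the real files, `conv1d` and `Device` come from the surrounding test module and candle_core:

// Rough hand-expansion of `test_device!(conv1d, conv1d_cpu, conv1d_gpu)`,
// with stand-in types so the sketch compiles on its own.
type Result<T> = std::result::Result<T, String>;

enum Device {
    Cpu,
    Cuda(usize),
}

impl Device {
    fn new_cuda(ordinal: usize) -> Result<Self> {
        Ok(Device::Cuda(ordinal))
    }
}

fn conv1d(_dev: &Device) -> Result<()> {
    Ok(())
}

#[test]
fn conv1d_cpu() -> Result<()> {
    conv1d(&Device::Cpu)
}

#[cfg(feature = "cuda")]
#[test]
fn conv1d_gpu() -> Result<()> {
    conv1d(&Device::new_cuda(0)?)
}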
@@ -1,4 +1,3 @@
-//! Useful functions for checking features.
 use std::str::FromStr;

 pub fn get_num_threads() -> usize {
@@ -24,10 +23,6 @@ pub fn cuda_is_available() -> bool {
     cfg!(feature = "cuda")
 }

-pub fn metal_is_available() -> bool {
-    cfg!(feature = "metal")
-}
-
 pub fn with_avx() -> bool {
     cfg!(target_feature = "avx")
 }
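`metal_is_available`, like `cuda_is_available` and `with_avx` around it, is a compile-time probe: `cfg!` expands to a boolean constant from the enabled cargo features or target flags, not a runtime hardware query. A minimal sketch mirroring those functions:

// cfg! is resolved at build time from the enabled features; sketch only.
pub fn metal_is_available() -> bool {
    cfg!(feature = "metal")
}

fn main() {
    // Without building with `--features metal`, this prints false.
    println!("metal compiled in: {}", metal_is_available());
}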
@@ -34,14 +34,9 @@ impl Var {
         Ok(Self(inner))
     }

-    // Convert a tensor to a variable, if the tensor is already a variable then it is returned as is.
     pub fn from_tensor(t: &Tensor) -> Result<Self> {
-        if t.is_variable() {
-            Ok(Self(t.clone()))
-        } else {
-            let inner = t.make_var()?;
-            Ok(Self(inner))
-        }
+        let inner = t.make_var()?;
+        Ok(Self(inner))
     }

     pub fn rand_f64<S: Into<Shape>>(
@@ -112,10 +107,6 @@ impl Var {
         Ok(Self(inner))
     }

-    pub fn as_detached_tensor(&self) -> Tensor {
-        self.0.detach()
-    }
-
     pub fn as_tensor(&self) -> &Tensor {
         &self.0
     }
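The behavioral change in `from_tensor` is subtle: the newer code (left side) short-circuits when the tensor is already a variable and returns it as-is, while the older code always allocated a fresh variable. A toy illustration of the two behaviors (hypothetical types, not candle's `Var`):

// Toy illustration of the from_tensor change above: the newer code keeps the
// existing variable's identity, the older code always makes a fresh copy.
#[derive(Clone, Debug)]
struct Tensor {
    id: u32,
    is_variable: bool,
}

impl Tensor {
    fn make_var(&self) -> Tensor {
        // A fresh node with its own identity, standing in for Tensor::make_var.
        Tensor { id: self.id + 1000, is_variable: true }
    }
}

fn from_tensor_new(t: &Tensor) -> Tensor {
    if t.is_variable { t.clone() } else { t.make_var() }
}

fn from_tensor_old(t: &Tensor) -> Tensor {
    t.make_var() // always copies, even for an existing variable
}

fn main() {
    let v = Tensor { id: 1, is_variable: true };
    assert_eq!(from_tensor_new(&v).id, 1); // identity preserved
    assert_ne!(from_tensor_old(&v).id, 1); // fresh copy
}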
@@ -13,14 +13,6 @@ res = torch.nn.functional.conv1d(t, w)
 print(res.flatten())
 res = torch.nn.functional.conv1d(t, w, padding=1)
 print(res.flatten())
-
-w_t = w.transpose(0, 1)
-res = torch.nn.functional.conv_transpose1d(t, w_t)
-print(res.shape)
-print(res)
-res = torch.nn.functional.conv_transpose1d(t, w_t, groups=2)
-print(res.shape)
-print(res)
 */
 fn conv1d(dev: &Device) -> Result<()> {
     let t = Tensor::new(
@@ -53,45 +45,6 @@ fn conv1d(dev: &Device) -> Result<()> {
         test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
         [2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352]
     );
-    let res = {
-        let t = Tensor::cat(&[&t.zeros_like()?, &t, &t.zeros_like()?], 0)?;
-        t.conv1d(&w, /*padding*/ 1, 1, 1, 1)?
-    };
-    assert_eq!(res.dims(), [3, 2, 5]);
-    // Same as pytorch default padding: use zeros.
-    assert_eq!(
-        test_utils::to_vec1_round(&res.i(0)?.flatten_all()?, 4)?,
-        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(&res.i(1)?.flatten_all()?, 4)?,
-        [2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352]
-    );
-
-    let w = w.transpose(0, 1)?;
-    // The CPU kernels applied in the contiguous and non contiguous cases are different.
-    for w in [w.clone(), w.contiguous()?] {
-        let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 1)?;
-        assert_eq!(res.dims(), [1, 2, 7]);
-        assert_eq!(
-            test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-            [
-                0.0699, -1.2899, 8.3018, 5.5873, 2.4572, -2.6143, -0.0706, 1.8765, 4.8318, 1.1538,
-                4.7076, -5.9745, -0.8276, 1.621
-            ],
-        );
-        let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 2)?;
-        assert_eq!(res.dims(), [1, 4, 7]);
-        assert_eq!(
-            test_utils::to_vec2_round(&res.squeeze(0)?, 4)?,
-            [
-                [-1.5596, -1.8099, 2.0407, 4.8764, -0.1743, -0.735, -0.7819],
-                [0.7816, 3.8152, -0.5926, 2.2515, -5.1844, -0.3157, 1.4721],
-                [1.6295, 0.52, 6.2611, 0.7109, 2.6315, -1.8793, 0.7113],
-                [1.0949, 1.0166, 1.7464, 2.4561, -0.79, -0.5119, 0.1488]
-            ]
-        );
-    }
     Ok(())
 }

@@ -149,7 +102,7 @@ fn conv2d(dev: &Device) -> Result<()> {
             0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712,
             0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790,
             -0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006,
-            -0.8, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
+            -0.8000, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
         ],
         dev,
     )?;
@@ -177,25 +130,7 @@ fn conv2d(dev: &Device) -> Result<()> {
             10.389, 3.6023, -4.2808, 0.2672, 5.3646, -5.2023, -2.1955, -9.4075
         ]
     );
-    let res = {
-        let t = Tensor::cat(&[&t.zeros_like()?, &t, &t.zeros_like()?], 0)?;
-        t.conv2d(&w, 0, 1, 1, 1)?
-    };
-    assert_eq!(res.dims(), [3, 2, 3, 3]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.i(0)?.flatten_all()?, 4)?,
-        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(&res.i(1)?.flatten_all()?, 4)?,
-        [
-            -4.2812, 2.0923, 5.2187, 7.5184, 0.752, -14.9426, 10.0087, 4.391, 0.2918, 1.6715,
-            10.389, 3.6023, -4.2808, 0.2672, 5.3646, -5.2023, -2.1955, -9.4075
-        ]
-    );
-
     let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;

     assert_eq!(res.dims(), [1, 2, 7, 7]);
     assert_eq!(
         test_utils::to_vec3_round(&res.i(0)?, 4)?,
@@ -220,7 +155,6 @@ fn conv2d(dev: &Device) -> Result<()> {
             ]
         ]
     );
-
     // Dilations.
     let res = t.conv2d(&w, 0, 1, 2, 1)?;
     assert_eq!(res.dims(), [1, 2, 1, 1]);
@@ -259,7 +193,6 @@ fn conv2d(dev: &Device) -> Result<()> {
             ]
         ]
     );
-
     Ok(())
 }

@@ -306,13 +239,13 @@ fn conv2d_small(dev: &Device) -> Result<()> {
     assert_eq!(
         test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
         [
-            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1640,
-            -0.0111, -0.1742, 0.0, 0.0, 0.0, 0.0, 2.6437, -2.0268, 1.1823, 0.0, 0.0, 0.0, 0.0,
-            3.2855, -1.0324, 0.2539, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-            0.0, 0.0, 0.0, 0.0
+            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
+            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1640, -0.0111, -0.1742, 0.0000, 0.0000,
+            0.0000, 0.0000, 2.6437, -2.0268, 1.1823, 0.0000, 0.0000, 0.0000, 0.0000, 3.2855,
+            -1.0324, 0.2539, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
+            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000
         ]
     );

     let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
     assert_eq!(res.dims(), [1, 1, 3, 3]);
     assert_eq!(
@@ -414,7 +347,6 @@ print(w.grad.shape)
 print(w.grad[0])
 */
 fn conv2d_grad(dev: &Device) -> Result<()> {
-    // conv-transposes are not implemented for metal
     use candle_core::Var;
     let t = Var::from_slice(
         &[
@@ -427,7 +359,7 @@ fn conv2d_grad(dev: &Device) -> Result<()> {
             0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712,
             0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790,
             -0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006,
-            -0.8, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
+            -0.8000, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
         ],
         (1, 4, 5, 5),
         dev,
@@ -547,348 +479,17 @@ fn conv2d_grad(dev: &Device) -> Result<()> {
             ]
         ]
     );
-
-    // Replicate the issue from https://github.com/huggingface/candle/issues/1212
-    let res = t.i((.., .., 0..4, 0..4))?.conv2d(&w, 0, 2, 1, 1)?;
-    let loss = res.sqr()?.sum_all()?;
-    assert_eq!(test_utils::to_vec0_round(&loss, 2)?, 21.12f32);
-    let grads = loss.backward()?;
-    let grad_t = grads.get(&t).unwrap();
-    let grad_w = grads.get(&w).unwrap();
-    assert_eq!(grad_t.dims(), [1, 4, 5, 5]);
-    assert_eq!(grad_w.dims(), [2, 4, 3, 3]);
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_t.i(0)?, 2)?,
-        [
-            [
-                [9.29, -7.03, 7.87, 0.0, 0.0],
-                [-1.8, -7.82, 5.9, 0.0, 0.0],
-                [-3.12, 4.49, 5.52, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ],
-            [
-                [21.73, 3.39, 4.77, 0.0, 0.0],
-                [8.25, 3.73, 27.61, 0.0, 0.0],
-                [-20.55, -5.61, -2.77, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ],
-            [
-                [-8.98, 9.91, -7.15, 0.0, 0.0],
-                [4.93, -0.33, 4.56, 0.0, 0.0],
-                [-6.7, -5.76, -8.05, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ],
-            [
-                [23.54, 6.98, -10.0, 0.0, 0.0],
-                [9.65, 6.18, 18.72, 0.0, 0.0],
-                [3.29, -5.27, 0.79, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0, 0.0]
-            ]
-        ]
-    );
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_w.i(0)?, 2)?,
-        [
-            [
-                [-3.47, 7.44, 0.66],
-                [12.89, -3.4, -9.29],
-                [-14.16, -0.83, 7.14]
-            ],
-            [
-                [-3.23, 5.37, -3.02],
-                [-2.12, -11.24, 1.94],
-                [6.97, 7.2, 2.99]
-            ],
-            [
-                [-4.04, -3.31, 4.87],
-                [-6.68, -5.68, 1.73],
-                [-5.54, 4.32, 0.52]
-            ],
-            [[-4.72, 1.5, 4.72], [3.79, 4.04, 6.76], [-4.6, 5.8, 6.93]]
-        ]
-    );
-
-    // Conv Transpose 2d Test
-    //tested against following python
-
-    // import torch
-    // torch.manual_seed(4242)
-    // padding = 4
-    // outpadding = 2
-    // dilation = 3
-    // stride = 3
-    // input = torch.randn((1, 4, 7, 5), requires_grad=True)
-    // kernel = torch.randn((4, 2, 3, 5), requires_grad=True)
-    // print("input", input.flatten())
-    // print("kernel", kernel.flatten())
-    // res = torch.nn.functional.conv_transpose2d(
-    //     input,
-    //     kernel,
-    //     stride=stride,
-    //     padding=padding,
-    //     dilation=dilation,
-    //     output_padding=outpadding,
-    // )
-    // res.retain_grad()
-    // print(res.shape)
-    // loss = (res**2).sum()
-    // print(loss)
-    // loss.backward()
-    // print(input.grad.shape)
-    // print("input grad", torch.round(input.grad, decimals=1))
-    // print(kernel.grad.shape)
-    // print("kernel grad", torch.round(kernel.grad.flatten(), decimals=1))
-
-    let padding = 4;
-    let outpadding = 2;
-    let dilation = 3;
-    let stride = 3;
-
-    let t = Var::from_slice(
-        &[
-            0.4056_f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997,
-            3.0616, 1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699, 0.0823, 0.3526, 0.6843,
-            0.2395, 1.2279, -0.9287, -1.7030, 0.1370, 0.6047, 0.3770, -0.6266, 0.3529, 2.2013,
-            -0.6836, 0.2477, 1.3127, -0.2260, 0.2622, -1.2974, -0.8140, -0.8404, -0.3490, 0.0130,
-            1.3123, 1.7569, -0.3956, -1.8255, 0.1727, -0.3538, 2.6941, 1.0529, 0.4219, -0.2071,
-            1.1586, 0.4717, 0.3865, -0.5690, -0.5010, -0.1310, 0.7796, 0.6630, -0.2021, 2.6090,
-            0.2049, 0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323,
-            -1.3712, 0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742,
-            0.3790, -0.4431, -0.4720, -0.7890, 0.2620, 0.5411, -1.1715, -2.4997, 2.3249, -0.8912,
-            -0.4733, -0.5701, -2.8888, -1.4112, -0.5471, -0.9234, -1.1660, 0.4189, -0.7465,
-            -0.6473, 0.1402, 0.7875, 0.5377, -0.6779, -0.8088, -0.4864, -0.2312, 0.9279, 0.1264,
-            1.5480, 0.8265, -0.1025, 0.5138, -0.2512, 0.1576, 1.2705, 0.3641, -0.9325, 0.6451,
-            -0.8537, 0.2378, 0.1794, 0.2752, -0.3687, -1.1149, -0.1410, -0.5829, -0.0892, 1.4258,
-            -2.2789, 0.5270, 0.1825, 1.7007, -0.5263, -0.2954, 0.4440, 0.5537, 0.3492, 0.6186,
-            1.6475, 0.2219,
-        ],
-        (1, 4, 7, 5),
-        dev,
-    )?;
-
-    #[rustfmt::skip]
-    let w = Var::from_slice(
-        &[
-            -1.1744_f32, 0.3266, 2.5893, 1.0142, 0.1763, 0.7752, 0.6604, 0.2029, -0.2145, 0.7234,
-            -0.3441, -1.5400, -0.6333, 0.6613, 0.2083, 0.6230, -1.7002, 0.3393, 0.4049, 1.0762,
-            0.2723, 1.4181, 0.0029, -0.2122, 1.7668, 1.4168, 0.3320, -0.2719, 0.7932, -0.7204,
-            0.4447, 0.1211, 0.5908, 1.0089, -0.1646, 1.8033, -0.6286, 0.2016, -0.3370, 1.2555,
-            0.8009, -0.6488, -0.4652, -1.5685, 1.5860, 0.5583, 0.4623, 0.6026, 0.8828, 2.4990,
-            0.6811, -0.3369, 1.3320, 1.7669, -1.1067, 1.2958, -0.9415, -0.9655, -0.4462, 0.7181,
-            0.5181, -1.1658, -1.8467, -0.7763, 1.2769, 0.8651, 0.9890, 1.5092, 0.7207, -0.8481,
-            0.7417, 0.3375, -1.2685, 1.4572, 1.0915, 0.1093, -0.8550, -0.5831, -0.6309, -0.2509,
-            0.5220, -0.0914, 0.7900, 0.1096, 0.3258, 0.2723, -1.0942, -0.3393, -0.1653, 0.5732,
-            -0.8014, 1.8194, -1.9023, 0.2127, 1.8636, -0.8979, 0.1927, -0.2778, 0.3105, 0.0071,
-            -1.1823, 0.2476, -0.7178, -1.3821, 1.0769, -0.4376, -0.9967, -0.1227, 1.6197, -1.0604,
-            0.1372, 0.8141, -0.6163, 0.7304, -0.8285, 2.0636, -0.7176, 0.2495, -0.2581, -0.4478,
-        ],
-        (4, 2, 3, 5),
-        dev,
-    )?;
-    let res = t.conv_transpose2d(&w, padding, outpadding, stride, dilation)?;
-    let loss = res.sqr()?.sum_all()?;
-    assert_eq!(test_utils::to_vec0_round(&loss, 0)?, 2904.0);
-    let grads = loss.backward()?;
-
-    let grad_t = grads.get(&t).unwrap();
-    let grad_w = grads.get(&w).unwrap();
-    assert_eq!(grad_t.dims(), [1, 4, 7, 5]);
-    assert_eq!(grad_w.dims(), [4, 2, 3, 5]);
-
-    assert_eq!(
-        test_utils::to_vec1_round(&grad_w.flatten_all()?, 1)?,
-        [
-            // torch gets 89.1
-            -89.0, -135.3, 136.7, 102.0, -53.4, 117.9, 118.6, -43.9, -218.0, -58.5, -114.3, -150.0,
-            -15.6, 172.1, 66.3, -64.3, -27.9, -19.8, 31.7, 62.1, 5.5, 92.6, 28.2, -29.6, 55.9,
-            52.7, -72.7, -119.8, 53.8, -25.5, 128.8, 19.3, 68.0, 190.9, -64.1, -86.2, -111.2,
-            106.6, -67.7, 37.8, 115.9, 50.4, -77.7, -54.9, 22.3, -4.6, 89.8, 61.7, 122.4, 192.6,
-            -27.8, -104.6, 57.0, 166.4, 27.1, 6.1, 18.7, -93.2, 31.5, 168.2, -3.7, -99.5, -55.5,
-            -10.8, 17.5, 20.8, 16.9, 43.8, 42.0, -89.2, 18.8, -9.6, -84.1, 212.6, 19.7, -50.0,
-            -52.0, -40.0, -166.6, -73.2, -10.8, -73.3, 31.5, -23.4, -79.3, -27.0, -84.4, -42.9,
-            -20.3, 51.8, -16.7, 76.3, -120.5, -65.8, 96.5, -10.7, -45.9, -88.1, 65.4, -7.0, -1.5,
-            92.8, -25.1, -114.2, -5.8, -14.8, -51.2, -20.7, 54.2, -79.8, 47.7, -29.2, -8.8, 53.5,
-            -28.4, 85.0, -18.3, 107.0, 28.3, -71.8
-        ]
-    );
-
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_t.i(0)?, 1)?,
-        [
-            [
-                [32.3, -41.6, -24.0, 14.1, 17.6],
-                [-11.8, 72.5, 87.6, 46.4, 61.5],
-                [115.0, 108.5, -48.6, -63.4, -50.0],
-                [51.3, 5.4, 31.3, 91.1, -30.9],
-                [52.7, 92.8, -68.0, -47.0, 83.0],
-                // pytorch gets -107.1
-                [-10.2, -107.0, -5.4, 213.1, -31.4],
-                [-2.4, 65.1, 9.2, -146.2, -24.2]
-            ],
-            [
-                [-72.6, -63.9, -61.9, 45.3, 33.0],
-                [79.3, -0.5, -26.2, 78.2, 42.7],
-                [90.9, 141.6, 40.1, -62.7, 37.0],
-                [32.8, 198.2, -0.8, -31.1, 27.3],
-                // torch gets 48.0
-                [34.5, 34.9, -47.9, 127.6, -12.3],
-                [-61.4, -3.2, -2.9, -10.9, -16.6],
-                [74.6, 60.1, -68.9, 34.5, -50.4]
-            ],
-            [
-                [37.5, -56.9, -43.6, -13.5, -9.9],
-                [40.0, 97.3, 28.6, 14.2, -30.1],
-                [-22.3, -126.3, -68.8, -8.2, 26.1],
-                [-32.9, 37.3, 108.5, -54.8, 29.6],
-                [34.9, -176.9, -125.0, -28.3, -13.9],
-                [-54.9, 142.6, 62.1, -80.4, -65.6],
-                [7.4, -91.1, -67.6, 35.0, 39.7]
-            ],
-            [
-                [-57.2, -40.9, -10.1, 32.6, 29.4],
-                [18.7, -18.0, 29.5, -1.2, 59.2],
-                [-14.0, -74.4, 19.8, -117.0, 58.2],
-                [-21.8, 163.5, -71.1, -99.0, 80.9],
-                [-58.9, -10.9, 93.8, -139.6, 98.0],
-                // torch gets 54.5
-                [-54.4, 135.3, 6.0, -79.1, 134.6],
-                [27.5, -76.0, 43.4, -2.8, -7.8]
-            ]
-        ]
-    );
-
-    // Test the same, but then with the following properties, t & w are unmodified.
-    let padding = 1;
-    let outpadding = 1;
-    let dilation = 1;
-    let stride = 2;
-
-    let res = t.conv_transpose2d(&w, padding, outpadding, stride, dilation)?;
-    let loss = res.sqr()?.sum_all()?;
-    assert_eq!(test_utils::to_vec0_round(&loss, 0)?, 3627.0); // torch gives 3626.8560
-
-    let grads = loss.backward()?;
-
-    let grad_t = grads.get(&t).unwrap();
-    let grad_w = grads.get(&w).unwrap();
-    assert_eq!(grad_t.dims(), [1, 4, 7, 5]);
-    assert_eq!(grad_w.dims(), [4, 2, 3, 5]);
-
-    #[rustfmt::skip]
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_t.i(0)?, 1)?,
-        [
-            [
-                [ 13.2, -40.7, -9.7, -47.3, -82.7],
-                [ -98.2, 9.7, 57.7, -6.2, 180.7],
-                [ 100.2, 24.1, 3.7, -100.5, -48.1],
-                [ -0.3, 13.5, -2.9, 80.0, -49.8],
-                [ 47.2, -25.6, -74.4, 61.2, -18.4],
-                [ 4.6, -69.5, 27.9, 66.5, -88.1],
-                // 4th column on next row; torch is 4.2
-                [ -12.0, 79.2, -40.0, 4.1, -97.1],
-            ],
-            [
-                [ -42.2, -36.5, -51.1, 7.5, 32.3],
-                [ 74.1, -44.6, -68.8, 19.5, 7.7],
-                [ 137.1, 54.2, 153.8, -58.0, 45.5],
-                [ 24.4, -56.8, 9.7, -41.0, -14.5],
-                [ -3.7, 72.6, 8.3, 134.8, 40.5],
-                [ 43.2, -56.9, -47.5, -89.4, -95.4],
-                [ 68.2, 108.1, -80.0, 57.0, -121.1]
-            ],
-            [
-                [ 31.1, -11.4, -34.8, 33.1, -44.2],
-                [ 29.4, -31.6, -40.2, 13.7, 13.1],
-                [ -0.8, -83.8, -7.8, -17.3, 78.2],
-                [ 12.0, -118.7, 137.5, -76.7, 50.8],
-                [ -28.7, -114.2, -3.7, -96.3, -13.8],
-                [ -31.8, 28.5, -14.3, 4.6, 13.4],
-                [ 28.0, -0.2, -38.9, -29.7, -59.0]
-            ],
-            [
-                [ -16.8, 38.5, 15.5, 26.6, 48.9],
-                [ 14.5, 49.6, -24.8, 65.6, 61.7],
-                [ 22.1, -64.7, -4.3, -51.0, 36.3],
-                [ 31.0, -88.9, 47.1, -123.5, -3.8],
-                [ -14.8, -39.8, 128.2, -110.3, 42.6],
-                // 1st column on next row; torch is -7.2
-                [ -7.1, 95.3, -21.3, -58.7, -13.9],
-                [ 26.9, 21.3, 16.1, 70.3, 32.1]
-            ]
-        ]
-    );
-
-    #[rustfmt::skip]
-    assert_eq!(
-        test_utils::to_vec1_round(&grad_w.flatten_all()?, 1)?,
-        [
-            // 2nd value; torch gets -3.2, 3rd value; torch gets 221.8
-            -2.460e+01, -3.100e+00, 2.219e+02, 7.400e+00, 5.620e+01,
-            7.420e+01, 7.830e+01, 8.900e+00, 1.050e+01, 2.810e+01,
-            5.100e+00, -1.046e+02, -1.572e+02, 8.710e+01, -9.840e+01,
-            -4.230e+01, -1.898e+02, 1.860e+01, -3.570e+01, 9.810e+01,
-            4.680e+01, 1.182e+02, 4.020e+01, -1.900e+00, 1.508e+02,
-            1.094e+02, 1.018e+02, -4.620e+01, 1.591e+02, -2.320e+01,
-            // 5th value; torch gets 7.1
-            -8.450e+01, -4.600e+00, 6.330e+01, 1.123e+02, -7.000e+00,
-            1.101e+02, -6.620e+01, 2.090e+01, -5.120e+01, 8.990e+01,
-            9.050e+01, -6.990e+01, 6.800e+01, -9.250e+01, 1.380e+02,
-            4.720e+01, 4.710e+01, 6.210e+01, 8.870e+01, 2.098e+02,
-            3.870e+01, -1.390e+01, 6.270e+01, 1.484e+02, -9.920e+01,
-            -4.200e+01, -1.505e+02, -1.480e+01, -2.620e+01, 8.220e+01,
-            -3.350e+01, -2.260e+01, -1.198e+02, -5.080e+01, 1.259e+02,
-            5.600e+01, 9.270e+01, 1.209e+02, 6.590e+01, -8.330e+01,
-            7.000e+00, -2.600e+01, -1.133e+02, 3.870e+01, 4.020e+01,
-            -6.300e+00, -8.710e+01, -5.150e+01, -8.510e+01, 2.000e-01,
-            3.640e+01, -6.100e+00, 6.590e+01, -2.700e+00, 6.550e+01,
-            // 4th value; torch gets 3.8
-            5.300e+00, -6.760e+01, -4.270e+01, -3.900e+00, 2.880e+01,
-            5.260e+01, 6.170e+01, -1.203e+02, -1.610e+01, 7.740e+01,
-            -1.008e+02, -1.070e+01, -9.900e+00, 3.300e+00, -2.620e+01,
-            -4.440e+01, 2.580e+01, -6.920e+01, -4.220e+01, 1.108e+02,
-            1.240e+01, -3.440e+01, -2.800e+00, 7.880e+01, -6.690e+01,
-            1.480e+01, 2.310e+01, -4.260e+01, -1.500e+00, -4.760e+01,
-            5.350e+01, -2.260e+01, 8.000e-01, -3.840e+01, -2.500e+00
-        ]
-    );
-
     Ok(())
 }

-test_device!(conv1d, conv1d_cpu, conv1d_gpu, conv1d_metal);
-test_device!(
-    conv1d_small,
-    conv1d_small_cpu,
-    conv1d_small_gpu,
-    conv1d_small_metal
-);
-test_device!(conv2d, conv2d_cpu, conv2d_gpu, conv2d_metal);
+test_device!(conv1d, conv1d_cpu, conv1d_gpu);
+test_device!(conv1d_small, conv1d_small_cpu, conv1d_small_gpu);
+test_device!(conv2d, conv2d_cpu, conv2d_gpu);
 test_device!(
     conv2d_non_square,
     conv2d_non_square_cpu,
-    conv2d_non_square_gpu,
-    conv2d_non_square_metal
+    conv2d_non_square_gpu
 );
-test_device!(
-    conv2d_small,
-    conv2d_small_cpu,
-    conv2d_small_gpu,
-    conv2d_small_metal
-);
-test_device!(
-    conv2d_smaller,
-    conv2d_smaller_cpu,
-    conv2d_smaller_gpu,
-    conv2d_smaller_metal
-);
-test_device!(
-    conv2d_grad,
-    conv2d_grad_cpu,
-    conv2d_grad_gpu,
-    conv2_grad_metal
-);
+test_device!(conv2d_small, conv2d_small_cpu, conv2d_small_gpu);
+test_device!(conv2d_smaller, conv2d_smaller_cpu, conv2d_smaller_gpu);
+test_device!(conv2d_grad, conv2d_grad_cpu, conv2d_grad_gpu);
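The expected dims asserted throughout these tests follow the standard convolution shape formulas. A small checker with the conv1d test's shapes as the worked example; the formulas are standard, and the helper functions are our own rather than candle API:

// Standard output-length formulas behind the dims asserted in the tests above.
fn conv_out_len(len: usize, pad: usize, dilation: usize, k: usize, stride: usize) -> usize {
    (len + 2 * pad - dilation * (k - 1) - 1) / stride + 1
}

fn conv_transpose_out_len(
    len: usize, pad: usize, out_pad: usize, dilation: usize, k: usize, stride: usize,
) -> usize {
    (len - 1) * stride - 2 * pad + dilation * (k - 1) + out_pad + 1
}

fn main() {
    // conv1d test: input length 5, kernel 3, no padding -> 3; padding 1 -> 5.
    assert_eq!(conv_out_len(5, 0, 1, 3, 1), 3);
    assert_eq!(conv_out_len(5, 1, 1, 3, 1), 5);
    // conv_transpose1d test: length 5, kernel 3 -> 7, matching dims [1, 2, 7].
    assert_eq!(conv_transpose_out_len(5, 0, 0, 1, 3, 1), 7);
}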
@@ -112,70 +112,3 @@ fn custom_op1_with_backward() -> Result<()> {

     Ok(())
 }
-
-impl candle_core::InplaceOp1 for Elu {
-    fn name(&self) -> &'static str {
-        "elu"
-    }
-
-    fn cpu_fwd(&self, s: &mut CpuStorage, _l: &Layout) -> Result<()> {
-        let alpha = self.alpha;
-        match s {
-            CpuStorage::BF16(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            CpuStorage::F16(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            CpuStorage::F32(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            CpuStorage::F64(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)),
-            _ => candle_core::bail!("unsupported dtype for inplace elu"),
-        }
-        Ok(())
-    }
-}
-
-#[test]
-fn inplace_op1() -> Result<()> {
-    let cpu = &Device::Cpu;
-    let t = Tensor::arange(0u32, 12u32, cpu)?.to_dtype(DType::F32)?;
-    let t = (t - 5.)?;
-    t.inplace_op1(&Elu { alpha: 1. })?;
-    assert_eq!(
-        to_vec1_round(&t, 4)?,
-        &[-0.9933, -0.9817, -0.9502, -0.8647, -0.6321, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
-    );
-    Ok(())
-}
-
-#[cfg(any(feature = "cuda", feature = "metal"))]
-#[allow(clippy::approx_constant)]
-#[test]
-fn ug_op() -> Result<()> {
-    let kernel = {
-        use ug::lang::op;
-
-        let layout = ug::Layout::from_shape(&[12]);
-        let ptr = op::Arg::ptr(ug::DType::F32);
-        let src = op::load(ptr.id(), layout.clone(), ug::DType::F32)?;
-        let src = op::unary(op::UnaryOp::Exp, src)?;
-        let st = op::store(ptr.id(), layout, src)?;
-        let kernel = op::Kernel::new("exp".to_string(), vec![ptr], vec![st]);
-        let opts: ug::lower_op::Opts = Default::default();
-        kernel.lower(&opts)?
-    };
-    let device = if candle_core::utils::cuda_is_available() {
-        Device::new_cuda(0)?
-    } else if candle_core::utils::metal_is_available() {
-        Device::new_metal(0)?
-    } else {
-        candle_core::bail!("metal/cuda is mandatory for this test")
-    };
-    let op = candle_core::UgIOp1::new("test", kernel, &device)?;
-    let t = Tensor::arange(0u32, 12u32, &device)?.to_dtype(DType::F32)?;
-    t.inplace_op1(&op)?;
-    assert_eq!(
-        to_vec1_round(&t, 2)?,
-        &[
-            1.0, 2.72, 7.39, 20.09, 54.6, 148.41, 403.43, 1096.63, 2980.96, 8103.08, 22026.47,
-            59874.13
-        ]
-    );
-    Ok(())
-}
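The deleted `InplaceOp1` test shows the general shape of an in-place unary op: dispatch on the storage dtype, then mutate the buffer directly. A dependency-free sketch of that shape, with our own small enum standing in for candle's `CpuStorage`:

// Dependency-free sketch of the in-place unary-op pattern from the deleted test.
enum CpuStorage {
    F32(Vec<f32>),
    F64(Vec<f64>),
}

fn elu_f64(v: f64, alpha: f64) -> f64 {
    if v >= 0.0 { v } else { alpha * (v.exp() - 1.0) }
}

fn inplace_elu(s: &mut CpuStorage, alpha: f64) {
    // Dispatch on dtype, mutate in place; a real op would cover more dtypes.
    match s {
        CpuStorage::F32(s) => s.iter_mut().for_each(|v| *v = elu_f64(*v as f64, alpha) as f32),
        CpuStorage::F64(s) => s.iter_mut().for_each(|v| *v = elu_f64(*v, alpha)),
    }
}

fn main() {
    let mut s = CpuStorage::F32(vec![-5.0, 0.0, 1.0]);
    inplace_elu(&mut s, 1.0);
    if let CpuStorage::F32(v) = &s {
        // Matches the deleted test's first expected value, -0.9933.
        assert!((v[0] + 0.9933).abs() < 1e-4);
    }
}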
Binary file not shown.
@ -1,6 +1,5 @@
|
|||||||
#![allow(clippy::approx_constant)]
|
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use candle_core::{test_device, test_utils, DType, Device, Shape, Tensor, Var};
|
use candle_core::{test_device, test_utils, Device, Shape, Tensor, Var};
|
||||||
|
|
||||||
fn simple_grad(device: &Device) -> Result<()> {
|
fn simple_grad(device: &Device) -> Result<()> {
|
||||||
let x = Var::new(&[3f32, 1., 4.], device)?;
|
let x = Var::new(&[3f32, 1., 4.], device)?;
|
||||||
@ -97,24 +96,24 @@ fn unary_grad(device: &Device) -> Result<()> {
|
|||||||
let grads = y.backward()?;
|
let grads = y.backward()?;
|
||||||
let grad_x = grads.get(x).context("no grad for x")?;
|
let grad_x = grads.get(x).context("no grad for x")?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
test_utils::to_vec1_round(&y, 4)?,
|
y.to_vec1::<f32>()?,
|
||||||
[20.0855, 2.7183, 54.5982, 1.1618]
|
[20.085537, 2.7182817, 54.59815, 1.1618342]
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
test_utils::to_vec1_round(grad_x, 4)?,
|
grad_x.to_vec1::<f32>()?,
|
||||||
[20.0855, 2.7183, 54.5982, 1.1618]
|
[20.085537, 2.7182817, 54.59815, 1.1618342]
|
||||||
);
|
);
|
||||||
let y = x.exp()?.sqr()?;
|
let y = x.exp()?.sqr()?;
|
||||||
let grads = y.backward()?;
|
let grads = y.backward()?;
|
||||||
let grad_x = grads.get(x).context("no grad for x")?;
|
let grad_x = grads.get(x).context("no grad for x")?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
test_utils::to_vec1_round(&y, 3)?,
|
y.to_vec1::<f32>()?,
|
||||||
[403.429, 7.389, 2980.958, 1.35]
|
[403.4288, 7.3890557, 2980.9578, 1.3498588]
|
||||||
);
|
);
|
||||||
// exp(x)^2 = exp(2*x)
|
// exp(x)^2 = exp(2*x)
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
test_utils::to_vec1_round(grad_x, 2)?,
|
grad_x.to_vec1::<f32>()?,
|
||||||
[806.86, 14.78, 5961.92, 2.7]
|
[806.8576, 14.778111, 5961.9155, 2.6997175]
|
||||||
);
|
);
|
||||||
let y = x.sin()?;
|
let y = x.sin()?;
|
||||||
let grads = y.backward()?;
|
let grads = y.backward()?;
|
||||||
@ -193,273 +192,6 @@ fn unary_grad(device: &Device) -> Result<()> {
|
|||||||
test_utils::to_vec1_round(grad_x, 2)?,
|
test_utils::to_vec1_round(grad_x, 2)?,
|
||||||
[0.01, 0.42, 0.0, 0.98],
|
[0.01, 0.42, 0.0, 0.98],
|
||||||
);
|
);
|
||||||
|
|
||||||
// testing compared to pytorch nn.GELU(approximate = 'tanh')
|
|
||||||
let y = x.gelu()?;
|
|
||||||
let grads = y.backward()?;
|
|
||||||
let grad_x = grads.get(&x).context("no grad for x")?;
|
|
||||||
assert_eq!(
|
|
||||||
test_utils::to_vec1_round(&y, 4)?,
|
|
||||||
[2.9964, 0.8412, 3.9999, 0.0839]
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
test_utils::to_vec1_round(grad_x, 4)?,
|
|
||||||
[1.0116, 1.0830, 1.0003, 0.6188],
|
|
||||||
);
|
|
||||||
|
|
||||||
// Testing compared to pytorch torch.erf
|
|
||||||
//
|
|
||||||
// import torch
|
|
||||||
// x = torch.tensor([3.0, 1.0, 4.0, 0.15], requires_grad=True)
|
|
||||||
// y = x.erf()
|
|
||||||
// print(y)
|
|
||||||
// loss = y.sum()
|
|
||||||
// loss.backward()
|
|
||||||
// print(x.grad)
|
|
||||||
let y = x.erf()?;
|
|
||||||
let grads = y.backward()?;
|
|
||||||
let grad_x = grads.get(&x).context("no grad for x")?;
|
|
||||||
assert_eq!(test_utils::to_vec1_round(&y, 4)?, [1.0, 0.8427, 1.0, 0.168]);
|
|
||||||
assert_eq!(
|
|
||||||
test_utils::to_vec1_round(grad_x, 4)?,
|
|
||||||
[0.0001, 0.4151, 0.0, 1.1033],
|
|
||||||
);
|
|
||||||
|
|
||||||
// Testing compared to pytorch nn.GELU(approximate = 'none')
|
|
||||||
//
|
|
||||||
// import torch
|
|
||||||
// import torch.nn.functional as F
|
|
||||||
// x = torch.tensor([3.0, 1.0, 4.0, 0.15], requires_grad=True)
|
|
||||||
// y = F.gelu(x, approximate='none')
|
|
||||||
// print(y)
|
|
||||||
// loss = y.sum()
|
|
||||||
// loss.backward()
|
|
||||||
// print(x.grad)
|
|
||||||
let y = x.gelu_erf()?;
|
|
||||||
let grads = y.backward()?;
|
|
||||||
let grad_x = grads.get(&x).context("no grad for x")?;
|
|
||||||
assert_eq!(
|
|
||||||
test_utils::to_vec1_round(&y, 4)?,
|
|
||||||
[2.9960, 0.8413, 3.9999, 0.0839]
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
test_utils::to_vec1_round(grad_x, 4)?,
|
|
||||||
[1.0119, 1.0833, 1.0005, 0.6188],
|
|
||||||
);
|
|
||||||
|
|
||||||
// Testing compared to pytorch elu
|
|
||||||
//
|
|
||||||
// import torch
|
|
||||||
// import torch.nn.functional as F
|
|
||||||
// x = torch.tensor([-1.0, 0.0, -2.0, 3.0], requires_grad=True)
|
|
||||||
// y = F.elu(x, alpha=2.0)
|
|
||||||
// print(y)
|
|
||||||
// loss = y.min
|
|
||||||
// loss = y.sum()
|
|
||||||
// loss.backward()
|
|
||||||
// print(x.grad)
|
|
||||||
let elu_x = Var::new(&[-1.0f32, 0., -2., 3.], device)?;
|
|
||||||
let y = elu_x.elu(2.)?;
|
|
||||||
let grads = y.backward()?;
|
|
||||||
let grad_x = grads.get(&elu_x).context("no grad for x")?;
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
test_utils::to_vec1_round(&y, 4)?,
|
|
||||||
[-1.2642, 0.0000, -1.7293, 3.0000]
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
test_utils::to_vec1_round(grad_x, 4)?,
|
|
||||||
[0.7358, 2.0000, 0.2707, 1.0000]
|
|
||||||
);
|
|
||||||
|
|
||||||
// testing compared to pytorch nn.Silu()
|
|
||||||
let y = x.silu()?;
|
|
||||||
let grads = y.backward()?;
|
|
||||||
let grad_x = grads.get(&x).context("no grad for x")?;
|
|
||||||
assert_eq!(
|
|
||||||
test_utils::to_vec1_round(&y, 4)?,
|
|
||||||
[2.8577, 0.7311, 3.9281, 0.0806]
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
test_utils::to_vec1_round(grad_x, 4)?,
|
|
||||||
[1.0881, 0.9277, 1.0527, 0.5747],
|
|
||||||
);
|
|
||||||
|
|
||||||
if device.is_cpu() {
|
|
||||||
let x = Var::new(&[[[1f32, 2., 3.], [4., 5., 6.], [7., 8., 9.]]], device)?;
|
|
||||||
let y = x.interpolate1d(12)?.reshape(36)?;
|
|
||||||
|
|
||||||
let z = Tensor::new(
|
|
||||||
&[
|
|
||||||
1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16.,
|
|
||||||
17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
|
|
||||||
33., 34., 35., 36.,
|
|
||||||
],
|
|
||||||
device,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
|
|
||||||
let grads = loss.backward()?;
|
|
||||||
let grad_x = grads.get(&x).context("no grad for x")?;
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
test_utils::to_vec3_round(grad_x, 4)?,
|
|
||||||
[[[10_f32, 26., 42.], [58., 74., 90.], [106., 122., 138.]]]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// manually checked: see comments
|
|
||||||
let x = Var::new(&[[[[1f32, 2., 3.], [4., 5., 6.], [7., 8., 9.]]]], device)?;
|
|
||||||
let y = x.interpolate2d(6, 6)?.reshape(36)?;
|
|
||||||
|
|
||||||
let z = Tensor::new(
|
|
||||||
&[
|
|
||||||
1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16., 17.,
|
|
||||||
18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34.,
|
|
||||||
35., 36.,
|
|
||||||
],
|
|
||||||
device,
|
|
||||||
)?;
|
|
||||||
// gradient should be
|
|
||||||
// row 1
|
|
||||||
// 1+2+7+8 = 18
|
|
||||||
// 3+4+9+10 = 26
|
|
||||||
// 5+6+11+12 = 34
|
|
||||||
// row 2
|
|
||||||
// 13+14+19+20 = 66
|
|
||||||
// 15+16+21+22 = 74
|
|
||||||
// 17+18+23+24 = 82
|
|
||||||
// row 3
|
|
||||||
// 25+26+31+32 = 114
|
|
||||||
// 27+28+33+34 = 122
|
|
||||||
// 29+30+35+36 = 130
|
|
||||||
let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;
|
|
||||||
|
|
||||||
let grads = loss.backward()?;
|
|
||||||
|
|
||||||
let grad_x = grads.get(&x).context("no grad for x")?;
|
|
||||||
assert_eq!(
|
|
||||||
test_utils::to_vec2_round(&grad_x.flatten(0, 2)?, 4)?,
|
|
||||||
[[18_f32, 26., 34.], [66., 74., 82.], [114., 122., 130.]]
|
|
||||||
);
    // manually checked: see comments
    let x = Var::new(&[[[[1f32, 2.], [4., 5.]]]], device)?;
    let y = x.interpolate2d(6, 6)?.reshape(36)?;

    let z = Tensor::new(
        &[
            1_f32, 02., 03., 04., 05., 06., 07., 08., 09., 10., 11., 12., 13., 14., 15., 16., 17.,
            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34.,
            35., 36.,
        ],
        device,
    )?;
    // gradient should be
    // row 1
    // 1+2+3+7+8+9+13+14+15 = 72
    // 4+5+6+10+11+12+16+17+18 = 99
    // row 2
    // 19+20+21+25+26+27+31+32+33 = 234
    // 22+23+24+28+29+30+34+35+36 = 261
    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;

    let grads = loss.backward()?;

    let grad_x = grads.get(&x).context("no grad for x")?;
    assert_eq!(
        test_utils::to_vec2_round(&grad_x.flatten(0, 2)?, 4)?,
        [[72_f32, 99.], [234., 261.]]
    );

    // manually checked: see comments
    let x = Var::new(&[[[[1f32, 2.], [4., 5.]], [[6f32, 7.], [8., 9.]]]], device)?;

    let y = x.interpolate2d(4, 4)?.reshape(32)?;

    #[rustfmt::skip]
    let z = Tensor::new(
        &[
            1_f32, 02., 03., 04.,
            05., 06., 07., 08.,
            09., 10., 11., 12.,
            13., 14., 15., 16.,
            17., 18., 19., 20.,
            21., 22., 23., 24.,
            25., 26., 27., 28.,
            29., 30., 31., 32.
        ],
        device,
    )?;
    // gradient should be
    // m1r1
    // 1+2+5+6=14
    // 3+4+7+8=22
    // m1r2
    // 9+10+13+14=46
    // 11+12+15+16=54
    // m2r1
    // 17+18+21+22=78
    // 19+20+23+24=86
    // m2r2
    // 25+26+29+30=110
    // 27+28+31+32=118
    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;

    let grads = loss.backward()?;

    let grad_x = grads.get(&x).context("no grad for x")?;

    assert_eq!(
        test_utils::to_vec3_round(&grad_x.flatten(0, 1)?, 4)?,
        [[[14_f32, 22.], [46., 54.]], [[78., 86.], [110., 118.]]]
    );

    // manually checked: see comments
    let x = Var::new(
        &[[[[1f32, 2.], [4., 5.]]], [[[6f32, 7.], [8., 9.]]]],
        device,
    )?;

    let y = x.interpolate2d(4, 4)?.reshape(32)?;

    #[rustfmt::skip]
    let z = Tensor::new(
        &[
            1_f32, 02., 03., 04.,
            05., 06., 07., 08.,
            09., 10., 11., 12.,
            13., 14., 15., 16.,
            17., 18., 19., 20.,
            21., 22., 23., 24.,
            25., 26., 27., 28.,
            29., 30., 31., 32.
        ],
        device,
    )?;
    // gradient should be
    // m1r1
    // 1+2+5+6=14
    // 3+4+7+8=22
    // m1r2
    // 9+10+13+14=46
    // 11+12+15+16=54
    // m2r1
    // 17+18+21+22=78
    // 19+20+23+24=86
    // m2r2
    // 25+26+29+30=110
    // 27+28+31+32=118
    let loss = y.unsqueeze(1)?.transpose(0, 1)?.matmul(&z.unsqueeze(1)?)?;

    let grads = loss.backward()?;

    let grad_x = grads.get(&x).context("no grad for x")?;

    assert_eq!(
        test_utils::to_vec3_round(&grad_x.flatten(0, 1)?, 4)?,
        [[[14_f32, 22.], [46., 54.]], [[78., 86.], [110., 118.]]]
    );
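
These last two variants exercise the same 2x2-block reduction with a channel dimension and then a batch dimension; each (batch, channel) plane reduces independently, so the 32-element z splits into two 4x4 planes. A sketch:

fn main() {
    let z: Vec<f32> = (1..=32).map(|v| v as f32).collect();
    let mut grads = Vec::new();
    // Two 4x4 planes in one buffer; each plane reduces on its own.
    for plane in z.chunks(16) {
        let mut g = [[0f32; 2]; 2];
        for i in 0..2 {
            for j in 0..2 {
                for di in 0..2 {
                    for dj in 0..2 {
                        g[i][j] += plane[(2 * i + di) * 4 + (2 * j + dj)];
                    }
                }
            }
        }
        grads.push(g);
    }
    assert_eq!(grads, vec![[[14., 22.], [46., 54.]], [[78., 86.], [110., 118.]]]);
}
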
    Ok(())
}

@ -486,78 +218,12 @@ fn binary_grad(device: &Device) -> Result<()> {
    let grad_x = grads.get(x).context("no grad for x")?;
    assert_eq!(y.to_vec1::<f32>()?, [3., 1., -4., -1.]);
    assert_eq!(grad_x.to_vec1::<f32>()?, [1., 1., 1., 1.]);

    let x_var = Var::new(&[3f32, 1., -4., -1., 5., 9.], device)?;
    let x = x_var.as_tensor();
    let y_var = Var::new(&[2f32, 7., 1.], device)?;
    let y = y_var.as_tensor();

    let ss = x
        .reshape((2, 3))?
        .slice_scatter0(&y.reshape((1, 3))?, 1)?
        .sqr()?;
    let grads = ss.backward()?;
    let grad_x = grads.get(x).context("no grad for x")?;
    let grad_y = grads.get(y).context("no grad for y")?;
    assert_eq!(ss.to_vec2::<f32>()?, [[9., 1., 16.], [4., 49., 1.]]);
    assert_eq!(grad_x.to_vec1::<f32>()?, [6.0, 2.0, -8.0, 0.0, 0.0, 0.0]);
    assert_eq!(grad_y.to_vec1::<f32>()?, [4.0, 14.0, 2.0]);
    Ok(())
}
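
The expected gradients in the slice_scatter0 part follow from the chain rule for squaring: d(v^2)/dv = 2v for every element that survives into ss, while the overwritten row contributes nothing to grad_x and shows up in grad_y instead. A standalone sketch:

fn main() {
    let x = [3f32, 1., -4., -1., 5., 9.]; // viewed as two rows of three
    let y = [2f32, 7., 1.]; // scattered over row 1
    // Row 1 of x is replaced before the sqr, so its gradient is zero there.
    let grad_x: Vec<f32> = x
        .iter()
        .enumerate()
        .map(|(i, &v)| if i < 3 { 2.0 * v } else { 0.0 })
        .collect();
    let grad_y: Vec<f32> = y.iter().map(|&v| 2.0 * v).collect();
    assert_eq!(grad_x, vec![6.0, 2.0, -8.0, 0.0, 0.0, 0.0]);
    assert_eq!(grad_y, vec![4.0, 14.0, 2.0]);
}
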

test_device!(simple_grad, simple_grad_cpu, simple_grad_gpu);
test_device!(sum_grad, sum_grad_cpu, sum_grad_gpu);
test_device!(matmul_grad, matmul_grad_cpu, matmul_grad_gpu);
test_device!(grad_descent, grad_descent_cpu, grad_descent_gpu);
test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu);
test_device!(binary_grad, binary_grad_cpu, binary_grad_gpu);

#[test]
fn test_flip_backprop() -> Result<()> {
    let device = &Device::Cpu;

    // Create a tensor (leaf node) that requires gradients
    let x = Var::ones((2, 2), DType::F64, device)?;
    let weights = Tensor::arange(1.0, 5.0, device)?.reshape((2, 2))?;

    let y = x.matmul(&weights)?;
    let expected_y = Tensor::from_vec(vec![4.0, 6.0, 4.0, 6.0], (2, 2), device)?;
    candle_core::test_utils::assert_tensor_eq(&y, &expected_y)?;

    let z = y.flip(&[1])?;
    let expected_z = Tensor::from_vec(vec![6.0, 4.0, 6.0, 4.0], (2, 2), device)?;
    candle_core::test_utils::assert_tensor_eq(&z, &expected_z)?;

    let loss = z.sum_all()?;

    let grad_store = loss.backward()?;
    let grad_x = grad_store.get_id(x.id()).unwrap();

    let flipped_weights = weights.flip(&[1])?;
    let dloss_dy = Tensor::ones((2, 2), DType::F64, device)?;
    // dloss/dx = dloss/dy @ dy/dx = ones @ weight.flip.T
    let expected_grad = dloss_dy.matmul(&flipped_weights.t()?)?;
    candle_core::test_utils::assert_tensor_eq(grad_x, &expected_grad)?;

    Ok(())
}
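
The expected gradient can be derived by hand: a flip is a permutation, so backprop through it just un-flips the upstream gradient, and with loss = sum(z) that upstream gradient is all ones, which is flip-invariant. That leaves grad_x = ones @ W^T; the ones @ flip(W)^T used in the test coincides with it because multiplying by all-ones only sees the row sums of W, which a within-row flip preserves. A sketch:

fn main() {
    // w = arange(1, 5) reshaped to 2x2; upstream gradient is all ones.
    let w = [[1.0f64, 2.0], [3.0, 4.0]];
    let ones = [[1.0f64, 1.0], [1.0, 1.0]];
    let mut grad_x = [[0.0f64; 2]; 2];
    for i in 0..2 {
        for j in 0..2 {
            for k in 0..2 {
                // grad_x = ones @ w^T, i.e. index w as w[j][k].
                grad_x[i][j] += ones[i][k] * w[j][k];
            }
        }
    }
    assert_eq!(grad_x, [[3.0, 7.0], [3.0, 7.0]]);
}
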
test_device!(
    simple_grad,
    simple_grad_cpu,
    simple_grad_gpu,
    simple_grad_metal
);
test_device!(sum_grad, sum_grad_cpu, sum_grad_gpu, sum_grad_metal);
test_device!(
    matmul_grad,
    matmul_grad_cpu,
    matmul_grad_gpu,
    matmul_grad_metal
);
test_device!(
    grad_descent,
    grad_descent_cpu,
    grad_descent_gpu,
    grad_descent_metal
);
test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu, unary_grad_metal);
test_device!(
    binary_grad,
    binary_grad_cpu,
    binary_grad_gpu,
    binary_grad_metal
);

@ -91,32 +91,3 @@ fn index_3d() -> Result<()> {
    assert_eq!(tensor.i((1, .., 3))?.to_vec1::<u32>()?, &[15, 19, 23]);
    Ok(())
}

#[test]
fn slice_assign() -> Result<()> {
    let dev = Device::Cpu;

    let tensor = Tensor::arange(0u32, 4 * 5, &dev)?.reshape((4, 5))?;
    let src = Tensor::arange(0u32, 2 * 3, &dev)?.reshape((3, 2))?;
    let out = tensor.slice_assign(&[1..4, 3..5], &src)?;
    assert_eq!(
        out.to_vec2::<u32>()?,
        &[
            [0, 1, 2, 3, 4],
            [5, 6, 7, 0, 1],
            [10, 11, 12, 2, 3],
            [15, 16, 17, 4, 5]
        ]
    );
    let out = tensor.slice_assign(&[0..3, 0..2], &src)?;
    assert_eq!(
        out.to_vec2::<u32>()?,
        &[
            [0, 1, 2, 3, 4],
            [2, 3, 7, 8, 9],
            [4, 5, 12, 13, 14],
            [15, 16, 17, 18, 19]
        ]
    );
    Ok(())
}
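
slice_assign writes src into the given ranges and returns a new tensor, leaving every other element untouched. A plain sketch of the first assignment (rows 1..4, cols 3..5):

fn main() {
    // tensor = arange(0, 20) as 4x5; src = arange(0, 6) as 3x2.
    let mut out: Vec<Vec<u32>> = (0..4).map(|r| (5 * r..5 * r + 5).collect()).collect();
    let src = vec![vec![0u32, 1], vec![2, 3], vec![4, 5]];
    for (ri, r) in (1..4).enumerate() {
        for (ci, c) in (3..5).enumerate() {
            out[r][c] = src[ri][ci];
        }
    }
    assert_eq!(
        out,
        vec![
            vec![0, 1, 2, 3, 4],
            vec![5, 6, 7, 0, 1],
            vec![10, 11, 12, 2, 3],
            vec![15, 16, 17, 4, 5],
        ]
    );
}
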
@ -49,7 +49,7 @@ fn contiguous(device: &Device) -> Result<()> {
    Ok(())
}

test_device!(contiguous, contiguous_cpu, contiguous_gpu);
test_device!(contiguous, contiguous_cpu, contiguous_gpu, contiguous_metal);

#[test]
fn strided_blocks() -> Result<()> {

@ -88,7 +88,7 @@ fn strided_blocks() -> Result<()> {
        }
    };
    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
    let tensor = tensor.i((.., 1))?;
    let tensor = tensor.i((.., 1))?.contiguous()?;
    match tensor.strided_blocks() {
        candle::StridedBlocks::SingleBlock { start_offset, len } => {
            assert_eq!(start_offset, 0);

@ -100,20 +100,6 @@ fn strided_blocks() -> Result<()> {
        }
    };
    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
    let tensor = tensor.i((.., 1))?;
    match tensor.strided_blocks() {
        candle::StridedBlocks::SingleBlock { .. } => {
            panic!("unexpected block structure")
        }
        candle::StridedBlocks::MultipleBlocks {
            block_len,
            block_start_index,
        } => {
            assert_eq!(block_len, 4);
            assert_eq!(block_start_index.collect::<Vec<_>>(), &[4, 16])
        }
    };
    let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?;
    match tensor.t()?.strided_blocks() {
        candle::StridedBlocks::SingleBlock { .. } => {
            panic!("unexpected block structure")
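
The [4, 16] block starts in the non-contiguous case fall out of the layout arithmetic: a (2, 3, 4) row-major tensor has strides (12, 4, 1), so selecting index 1 on dim 1 leaves two contiguous runs of 4 elements (the innermost dim), one per outer index. A sketch of that offset computation:

fn main() {
    // Offsets: outer_index * 12 + 1 * 4 for outer_index in 0..2.
    let starts: Vec<usize> = (0..2).map(|o| o * 12 + 4).collect();
    assert_eq!(starts, vec![4, 16]);
}
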
Some files were not shown because too many files have changed in this diff.