Adding benchmark.

2025-06-17 19:18:50 +00:00 · 2023-08-29 17:01:40 +02:00
436 changed files with 5720 additions and 64993 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@ -1,8 +1,8 @@
-[build]
+[target.x86_64-unknown-linux-gnu]
+rustflags = ["-C", "target-cpu=native"]
+
+[target.aarch64-apple-darwin]
 rustflags = ["-C", "target-cpu=native"]

 [target.wasm32-unknown-unknown]
 rustflags = ["-C", "target-feature=+simd128"]
-
-[target.x86_64-apple-darwin]
-rustflags = ["-C", "target-feature=-avx,-avx2"]
--- a/.github/workflows/book-cd.yml
+++ b/.github/workflows/book-cd.yml
@ -1,40 +0,0 @@
-name: Deploy Rust book
-on:
-  push:
-    branches:
-      - main
-
-jobs:
-  deploy:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write  # To push a branch 
-      pull-requests: write  # To create a PR from that branch
-    steps:
-    - uses: actions/checkout@v3
-      with:
-        fetch-depth: 0
-    - name: Install latest mdbook
-      run: |
-        tag=$(curl 'https://api.github.com/repos/rust-lang/mdbook/releases/latest' | jq -r '.tag_name')
-        url="https://github.com/rust-lang/mdbook/releases/download/${tag}/mdbook-${tag}-x86_64-unknown-linux-gnu.tar.gz"
-        mkdir mdbook
-        curl -sSL $url | tar -xz --directory=./mdbook
-        echo `pwd`/mdbook >> $GITHUB_PATH
-    - name: Deploy GitHub Pages
-      run: |
-        # This assumes your book is in the root of your repository.
-        # Just add a `cd` here if you need to change to another directory.
-        cd candle-book
-        mdbook build
-        git worktree add gh-pages
-        git config user.name "Deploy from CI"
-        git config user.email ""
-        cd gh-pages
-        # Delete the ref to avoid keeping history.
-        git update-ref -d refs/heads/gh-pages
-        rm -rf *
-        mv ../book/* .
-        git add .
-        git commit -m "Deploy $GITHUB_SHA to gh-pages"
-        git push --force --set-upstream origin gh-pages
--- a/.github/workflows/book.yml
+++ b/.github/workflows/book.yml
@ -1,29 +0,0 @@
-name: CI
-on: 
-  pull_request:
-
-jobs:
-  test:
-    name: Test candle-book
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write  # To push a branch 
-      pull-requests: write  # To create a PR from that branch
-    steps:
-    - uses: actions/checkout@master
-    - name: Install Rust
-      run: |
-        rustup set profile minimal
-        rustup toolchain install stable
-        rustup default stable
-    - name: Install latest mdbook
-      run: |
-        tag=$(curl 'https://api.github.com/repos/rust-lang/mdbook/releases/latest' | jq -r '.tag_name')
-        url="https://github.com/rust-lang/mdbook/releases/download/${tag}/mdbook-${tag}-x86_64-unknown-linux-gnu.tar.gz"
-        mkdir bin
-        curl -sSL $url | tar -xz --directory=bin
-        echo "$(pwd)/bin" >> $GITHUB_PATH
-    - name: Run tests
-      run: cd candle-book && cargo build && mdbook test -L ../target/debug/deps/
-
-
--- a/.github/workflows/ci_cuda.yaml
+++ b/.github/workflows/ci_cuda.yaml
@ -1,87 +0,0 @@
-name: CI / cuda
-
-on:
-  workflow_dispatch:
-  pull_request:
-
-jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: us-east-1
-      EC2_AMI_ID: ami-03cfed9ea28f4b002
-      EC2_INSTANCE_TYPE: g5.xlarge
-      EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc
-      EC2_SECURITY_GROUP: sg-030175c435ac141d6
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ env.EC2_AMI_ID }}
-          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
-          subnet-id: ${{ env.EC2_SUBNET_ID }}
-          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
-
-  test-cuda:
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
-      cancel-in-progress: true
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
-    permissions:
-      contents: write
-      packages: write
-      # This is used to complete the identity challenge
-      # with sigstore/fulcio when running outside of PRs.
-      id-token: write
-      security-events: write
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v3
-      - name: Install Rust Stable
-        run: curl https://sh.rustup.rs -sSf | sh -s -- -y
-      - uses: Swatinem/rust-cache@v2
-      - run: apt-get update -y && apt-get install libssl-dev -y
-      - name: Test (cuda)
-        run: PATH=$PATH:/usr/local/cuda-11.8/bin/ /root/.cargo/bin/cargo test --features cuda
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner
-      - test-cuda
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: us-east-1
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
--- a/.gitignore
+++ b/.gitignore
@ -1,7 +1,6 @@
 # Generated by Cargo
 # will have compiled files and executables
 debug/
-data/
 dist/
 target/

@ -20,19 +19,10 @@ Cargo.lock

 perf.data
 flamegraph.svg
-*.dylib
 *.so
 *.swp
-*.swo
 trace-*.json

-candle-wasm-examples/*/build
-candle-wasm-examples/*/*.bin
-candle-wasm-examples/*/*.jpeg
-candle-wasm-examples/*/audios/*.wav
-candle-wasm-examples/**/*.safetensors
-candle-wasm-examples/**/*.gguf
-candle-wasm-examples/*/package-lock.json
-candle-wasm-examples/**/config*.json
-.DS_Store
-.idea/*
+candle-wasm-example/*.wav
+candle-wasm-example/*.safetensors
+candle-wasm-example/package-lock.json
--- a/.gitmodules
+++ b/.gitmodules
@ -1,3 +0,0 @@
-[submodule "candle-examples/examples/flash-attn/cutlass"]
-	path = candle-flash-attn/cutlass
-	url = https://github.com/NVIDIA/cutlass.git
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -1,11 +0,0 @@
-{
-    "[python]": {
-        "editor.defaultFormatter": "ms-python.black-formatter"
-    },
-    "python.formatting.provider": "none",
-    "python.testing.pytestArgs": [
-        "candle-pyo3"
-    ],
-    "python.testing.unittestEnabled": false,
-    "python.testing.pytestEnabled": true
-}
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,113 +0,0 @@
-# Changelog
-This documents the main changes to the `candle` crate.
-
-## v0.3.1 - Unreleased
-
-### Added
-
-### Modified
-
-## v0.3.0 - 2023-10-01
-
-### Added
-
- Added the Mistral 7b v0.1 model
-  [983](https://github.com/huggingface/candle/pull/983).
- Quantized version of the Mistral model
-  [1009](https://github.com/huggingface/candle/pull/1009).
- Add the gelu-erf op and activation function
-  [969](https://github.com/huggingface/candle/pull/969).
- Add the mixformer/phi-v1.5 model
-  [930](https://github.com/huggingface/candle/pull/930).
- Add the sclice-scatter op
-  [927](https://github.com/huggingface/candle/pull/927).
- Add the Wuerstchen diffusion model
-  [911](https://github.com/huggingface/candle/pull/911).
-
-### Modified
-
- Support for simd128 intrinsics in some quantized vecdots
-  [982](https://github.com/huggingface/candle/pull/982).
- Optimize the index-select cuda kernel
-  [976](https://github.com/huggingface/candle/pull/976).
- Self-contained safetensor wrappers
-  [946](https://github.com/huggingface/candle/pull/946).
-
-## v0.2.2 - 2023-09-18
-
-### Added
- Support for `top_p` sampling
-  [819](https://github.com/huggingface/candle/pull/819).
- T5 model including decoding
-  [864](https://github.com/huggingface/candle/pull/864).
- 1-d upsampling
-  [839](https://github.com/huggingface/candle/pull/839).
-
-### Modified
- Bugfix for conv2d
-  [820](https://github.com/huggingface/candle/pull/820).
- Support tensor based indexing using `.i`
-  [842](https://github.com/huggingface/candle/pull/842).
-
-## v0.2.1 - 2023-09-11
-
-### Added
- Add some RNNs (GRU and LSTM) in `candle-nn`
-  [674](https://github.com/huggingface/candle/pull/674),
-  [688](https://github.com/huggingface/candle/pull/688).
- gguf v2 support
-  [725](https://github.com/huggingface/candle/pull/725).
- Quantized llama example in Python using the pyo3 api
-  [716](https://github.com/huggingface/candle/pull/716).
- `candle-nn` layer for conv2d-transposed
-  [760](https://github.com/huggingface/candle/pull/760).
- Add the Segment-Anything Model (SAM) as an example
-  [773](https://github.com/huggingface/candle/pull/773).
- TinyViT backbone for the segemnt anything example
-  [787](https://github.com/huggingface/candle/pull/787).
- Shape with holes support
-  [770](https://github.com/huggingface/candle/pull/770).
-
-### Modified
- Dilations are now supported in conv-transpose2d.
-  [671](https://github.com/huggingface/candle/pull/671).
- Interactive mode for the quantized model
-  [690](https://github.com/huggingface/candle/pull/690).
- Faster softmax operation
-  [747](https://github.com/huggingface/candle/pull/747).
- Faster convolution operations on CPU and CUDA via im2col
-  [802](https://github.com/huggingface/candle/pull/802).
- Moving some models to a more central location
-  [796](https://github.com/huggingface/candle/pull/796).
-
-## v0.2.0 - 2023-08-30
-
-### Added
- Add the powf op
-  [664](https://github.com/huggingface/candle/pull/664).
- Stable Diffusion XL support
-  [647](https://github.com/huggingface/candle/pull/647).
- Add the conv-transpose2d op
-  [635](https://github.com/huggingface/candle/pull/635).
- Refactor the VarBuilder api
-  [627](https://github.com/huggingface/candle/pull/627).
- Add some quantization command
-  [625](https://github.com/huggingface/candle/pull/625).
- Support more quantized types, e.g. Q2K, Q4K, Q5K...
-  [586](https://github.com/huggingface/candle/pull/586).
- Add pose estimation to the yolo example
-  [589](https://github.com/huggingface/candle/pull/589).
- Api to write GGUF files
-  [585](https://github.com/huggingface/candle/pull/585).
- Support more quantization types
-  [580](https://github.com/huggingface/candle/pull/580).
- Add EfficientNet as an example Computer Vision model
-  [572](https://github.com/huggingface/candle/pull/572).
- Add a group parameter to convolutions
-  [566](https://github.com/huggingface/candle/pull/566).
- New dtype: int64
-  [563](https://github.com/huggingface/candle/pull/563).
- Handling of the GGUF file format.
-  [559](https://github.com/huggingface/candle/pull/559).
-
-## v0.1.2 - 2023-08-21
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,66 +1,51 @@
 [workspace]
 members = [
    "candle-core",
-    "candle-datasets",
    "candle-examples",
-    "candle-book",
+    "candle-hub",
    "candle-nn",
    "candle-pyo3",
    "candle-transformers",
-    "candle-wasm-examples/llama2-c",
-    "candle-wasm-examples/segment-anything",
-    "candle-wasm-examples/whisper",
-    "candle-wasm-examples/yolo",
-    "candle-wasm-examples/bert",
-    "candle-wasm-examples/phi",
-    "candle-wasm-examples/t5",
-    "candle-wasm-tests",
+    "candle-wasm-example",
+]
+exclude = [
+    "candle-kernels",
 ]
-exclude = ["candle-flash-attn", "candle-kernels"]
-resolver = "2"
-
-[workspace.package]
-version = "0.3.0"
-edition = "2021"
-description = "Minimalist ML framework."
-repository = "https://github.com/huggingface/candle"
-keywords = ["blas", "tensor", "machine-learning"]
-categories = ["science"]
-license = "MIT OR Apache-2.0"

 [workspace.dependencies]
-accelerate-src = { version = "0.3.2" }
 anyhow = { version = "1", features = ["backtrace"] }
 byteorder = "1.4.3"
 clap = { version = "4.2.4", features = ["derive"] }
-cudarc = { version = "0.9.14", features = ["f16"] }
-# TODO: Switch back to the official gemm implementation once it has caught up.
-gemm = { version = "0.16.0", package = "candle-gemm" }
-hf-hub = "0.3.0"
-half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
-image = { version = "0.24.7", default-features = false, features = ["jpeg", "png"] }
-imageproc = { version = "0.23.0", default-features = false }
-intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
+# Re-enable this once 0.9.13 as been released as it would include the cublas-f16 changes
+# cudarc = { version = "0.9.13", optional = true, features = ["f16"] }
+cudarc = { git = "https://github.com/LaurentMazare/cudarc.git", branch = "cublas-bf16", features = ["f16"] }
+futures = "0.3.28"
+# TODO: Switch back to the official gemm implementation once the following are available.
+# https://github.com/sarah-ek/gemm/pull/8.
+# https://github.com/sarah-ek/gemm/pull/9.
+gemm = { git = "https://github.com/LaurentMazare/gemm.git", branch = "f16-vec-plus-wasm-simd" }
+half = { version = "2.3.1", features = ["num-traits"] }
+indicatif = "0.17.5"
+intel-mkl-src = { version = "0.8.1", features = ["mkl-dynamic-lp64-iomp"] }
 libc = { version = "0.2.147" }
 log = "0.4"
-memmap2 = { version = "0.7.1", features = ["stable_deref_trait"] }
+memmap2 = "0.7.1"
 num_cpus = "1.15.0"
 num-traits = "0.2.15"
-parquet = { version = "45.0.0" }
 rand = "0.8.5"
-rand_distr = "0.4.3"
-rayon = "1.7.0"
-rusttype = { version = "0.9", default-features = false }
+reqwest = "0.11.18"
 safetensors = "0.3.1"
-serde = { version = "1.0.171", features = ["derive"] }
+serde = { version = "1.0.166", features = ["derive"] }
 serde_json = "1.0.99"
+sha256 = "=1.1.4"
 thiserror = "1"
-tokenizers = { version = "0.13.4", default-features = false }
+tokenizers = { version = "0.13.3", default-features = false }
+tokio = "1.28.2"
+tokio-test = "0.4.2"
 tracing = "0.1.37"
 tracing-chrome = "0.7.1"
 tracing-subscriber = "0.3.7"
 wav = "1.0.0"
-yoke = { version = "0.7.2", features = ["derive"] }
 zip = { version = "0.6.6", default-features = false }

 [profile.release-with-debug]
--- a/201
+++ b/201
@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--- a/23
+++ b/23
@ -1,23 +0,0 @@
-Permission is hereby granted, free of charge, to any
-person obtaining a copy of this software and associated
-documentation files (the "Software"), to deal in the
-Software without restriction, including without
-limitation the rights to use, copy, modify, merge,
-publish, distribute, sublicense, and/or sell copies of
-the Software, and to permit persons to whom the Software
-is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice
-shall be included in all copies or substantial portions
-of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
-ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
-TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
-PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
-SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
--- a/4
+++ b/4
@ -1,11 +1,7 @@
-.PHONY: clean-ptx clean test
-
 clean-ptx:
 	find target -name "*.ptx" -type f -delete
 	echo "" > candle-kernels/src/lib.rs
 	touch candle-kernels/build.rs
-	touch candle-examples/build.rs
-	touch candle-flash-attn/build.rs

 clean:
 	cargo clean
--- a/README.md
+++ b/README.md
@ -1,335 +1,120 @@
 # candle
-[![discord server](https://dcbadge.vercel.app/api/server/hugging-face-879548962464493619)](https://discord.gg/hugging-face-879548962464493619)
-[![Latest version](https://img.shields.io/crates/v/candle-core.svg)](https://crates.io/crates/candle-core)
-[![Documentation](https://docs.rs/candle-core/badge.svg)](https://docs.rs/candle-core)
-![License](https://img.shields.io/crates/l/candle-core.svg)
+ML framework for Rust

-Candle is a minimalist ML framework for Rust with a focus on performance (including GPU support) 
-and ease of use. Try our online demos: 
-[whisper](https://huggingface.co/spaces/lmz/candle-whisper),
-[LLaMA2](https://huggingface.co/spaces/lmz/candle-llama2),
-[T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm),
-[yolo](https://huggingface.co/spaces/lmz/candle-yolo),
-[Segment
-Anything](https://huggingface.co/spaces/radames/candle-segment-anything-wasm).
-
-## Get started
-
-Make sure that you have [`candle-core`](https://github.com/huggingface/candle/tree/main/candle-core) correctly installed as described in [**Installation**](https://huggingface.github.io/candle/guide/installation.html).
-
-Let's see how to run a simple matrix multiplication.
-Write the following to your `myapp/src/main.rs` file:
 ```rust
-use candle_core::{Device, Tensor};
+let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
+let b = Tensor::zeros((3, 4), DType::F32, &Device::Cpu)?;

-fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let device = Device::Cpu;
-
-    let a = Tensor::randn(0f32, 1., (2, 3), &device)?;
-    let b = Tensor::randn(0f32, 1., (3, 4), &device)?;
-
-    let c = a.matmul(&b)?;
-    println!("{c}");
-    Ok(())
-}
+let c = a.matmul(&b)?;
 ```

-`cargo run` should display a tensor of shape `Tensor[[2, 4], f32]`.
-
-
-Having installed `candle` with Cuda support, simply define the `device` to be on GPU:
-
-```diff
- let device = Device::Cpu;
-+ let device = Device::new_cuda(0)?;
-```
-
-For more advanced examples, please have a look at the following section.
-
 ## Check out our examples

-These online demos run entirely in your browser:
- [yolo](https://huggingface.co/spaces/lmz/candle-yolo): pose estimation and
-  object recognition.
- [whisper](https://huggingface.co/spaces/lmz/candle-whisper): text to speech.
- [LLaMA2](https://huggingface.co/spaces/lmz/candle-llama2): text generation.
- [T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm): text generation.
- [Phi-v1.5](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm): text generation.
- [Segment Anything Model](https://huggingface.co/spaces/radames/candle-segment-anything-wasm): Image segmentation.
+Check out our [examples](./candle-examples/examples/):

-We also provide a some command line based examples using state of the art models:
+- [Whisper](./candle-examples/examples/whisper/)
+- [Llama](./candle-examples/examples/llama/)
+- [Bert](./candle-examples/examples/bert/) (Useful for sentence embeddings)
+- [Falcon](./candle-examples/examples/falcon/)

- [LLaMA and LLaMA-v2](./candle-examples/examples/llama/): general LLM.
- [Falcon](./candle-examples/examples/falcon/): general LLM.
- [Phi-v1.5](./candle-examples/examples/phi/): a 1.3b general LLM with performance on par with LLaMA-v2 7b.
- [StableLM-3B-4E1T](./candle-examples/examples/stable-lm/): a 3b general LLM
-  pre-trained on 1T tokens of English and code datasets.
- [Mistral7b-v0.1](./candle-examples/examples/mistral/): a 7b general LLM with
-  performance larger than all publicly available 13b models as of 2023-09-28.
- [StarCoder](./candle-examples/examples/bigcode/): LLM specialized to code generation.
- [Quantized LLaMA](./candle-examples/examples/quantized/): quantized version of
-  the LLaMA model using the same quantization techniques as
-  [llama.cpp](https://github.com/ggerganov/llama.cpp).
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/quantized/assets/aoc.gif" width="600">
-  
- [Stable Diffusion](./candle-examples/examples/stable-diffusion/): text to
-  image generative model, support for the 1.5, 2.1, and SDXL 1.0 versions.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg" width="200">
-
- [Wuerstchen](./candle-examples/examples/wuerstchen/): another text to
-  image generative model.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/wuerstchen/assets/cat.jpg" width="200">
-
- [yolo-v3](./candle-examples/examples/yolo-v3/) and
-  [yolo-v8](./candle-examples/examples/yolo-v8/): object detection and pose
-  estimation models.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/yolo-v8/assets/bike.od.jpg" width="200"><img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/yolo-v8/assets/bike.pose.jpg" width="200">
- [segment-anything](./candle-examples/examples/segment-anything/): image
-  segmentation model with prompt.
-
-<img src="https://github.com/huggingface/candle/raw/main/candle-examples/examples/segment-anything/assets/sam_merged.jpg" width="200">
-
- [Whisper](./candle-examples/examples/whisper/): speech recognition model.
- [T5](./candle-examples/examples/t5), [Bert](./candle-examples/examples/bert/): useful for sentence embeddings.
- [DINOv2](./candle-examples/examples/dinov2/): computer vision model trained
-  using self-supervision (can be used for imagenet classification, depth
-  evaluation, segmentation).
-
-Run them using commands like:
 ```
-cargo run --example quantized --release
+cargo run --example bert --release
+cargo run --example whisper --release
+cargo run --example llama --release
+cargo run --example falcon --release
 ```

-In order to use **CUDA** add `--features cuda` to the example command line. If
-you have cuDNN installed, use `--features cudnn` for even more speedups.
+In order to use **CUDA** add `--features cuda` to the example command line.

-There are also some wasm examples for whisper and
-[llama2.c](https://github.com/karpathy/llama2.c). You can either build them with
-`trunk` or try them online:
-[whisper](https://huggingface.co/spaces/lmz/candle-whisper),
-[llama2](https://huggingface.co/spaces/lmz/candle-llama2),
-[T5](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm),
-[Phi-v1.5](https://huggingface.co/spaces/radames/Candle-Phi-1.5-Wasm),
-[Segment Anything Model](https://huggingface.co/spaces/radames/candle-segment-anything-wasm).
-
-For LLaMA2, run the following command to retrieve the weight files and start a
-test server:
-```bash
-cd candle-wasm-examples/llama2-c
-wget https://huggingface.co/spaces/lmz/candle-llama2/resolve/main/model.bin
-wget https://huggingface.co/spaces/lmz/candle-llama2/resolve/main/tokenizer.json
-trunk serve --release --port 8081
-```
-And then head over to
-[http://localhost:8081/](http://localhost:8081/).
-
-<!--- ANCHOR: useful_libraries --->
-
-## Useful Libraries
- [`candle-lora`](https://github.com/EricLBuehler/candle-lora) provides a LoRA implementation that conforms to the official `peft` implementation.
-
-If you have an addition to this list, please submit a pull request.
-
-<!--- ANCHOR_END: useful_libraries --->
-
-<!--- ANCHOR: features --->

 ## Features

- Simple syntax, looks and feels like PyTorch.
-    - Model training.
-    - Embed user-defined ops/kernels, such as [flash-attention v2](https://github.com/huggingface/candle/blob/89ba005962495f2bfbda286e185e9c3c7f5300a3/candle-flash-attn/src/lib.rs#L152).
- Backends.
-    - Optimized CPU backend with optional MKL support for x86 and Accelerate for macs.
-    - CUDA backend for efficiently running on GPUs, multiple GPU distribution via NCCL.
-    - WASM support, run your models in a browser.
- Included models.
-    - Language Models.
-        - LLaMA v1 and v2.
-        - Falcon.
-        - StarCoder.
-        - Phi v1.5.
-        - Mistral 7b v0.1.
-        - StableLM-3B-4E1T.
-        - T5.
-        - Bert.
-    - Whisper (multi-lingual support).
-    - Stable Diffusion v1.5, v2.1, XL v1.0.
-    - Wurstchen v2.
-    - Computer Vision Models.
-        - DINOv2.
-        - EfficientNet.
-        - yolo-v3.
-        - yolo-v8.
-        - Segment-Anything Model (SAM).
- File formats: load models from safetensors, npz, ggml, or PyTorch files.
- Serverless (on CPU), small and fast deployments.
- Quantization support using the llama.cpp quantized types.
+- Simple syntax (looks and like PyTorch)
+- CPU and Cuda backends, m1, f16, bf16 (and tentatively wasm)
+- Enable serverless (CPU), small and fast deployments
+- Model training
+- Distributed computing (NCCL).
+- Models out of the box (Llama, Whisper, Falcon, ...)
+- Emphasis on enabling users to use custom ops/kernels

-<!--- ANCHOR_END: features --->
+## How to use ?

-## How to use
-
-<!--- ANCHOR: cheatsheet --->
 Cheatsheet:

 |            | Using PyTorch                            | Using Candle                                                     |
 |------------|------------------------------------------|------------------------------------------------------------------|
-| Creation   | `torch.Tensor([[1, 2], [3, 4]])`         | `Tensor::new(&[[1f32, 2.], [3., 4.]], &Device::Cpu)?`           |
-| Creation   | `torch.zeros((2, 2))`                    | `Tensor::zeros((2, 2), DType::F32, &Device::Cpu)?`               |
+| Creation   | `torch.Tensor([[1, 2], [3, 4]])`         | `Tensor::new(`                                                   |
+|            |                                          | `  &[[1f32, 2.]], [3., 4.]],`                                    |
+|            |                                          | `  &Device::Cpu)?`                                               |
 | Indexing   | `tensor[:, :4]`                          | `tensor.i((.., ..4))?`                                           |
 | Operations | `tensor.view((2, 2))`                    | `tensor.reshape((2, 2))?`                                        |
 | Operations | `a.matmul(b)`                            | `a.matmul(&b)?`                                                  |
 | Arithmetic | `a + b`                                  | `&a + &b`                                                        |
-| Device     | `tensor.to(device="cuda")`               | `tensor.to_device(&Device::new_cuda(0)?)?`                            |
+| Device     | `tensor.to(device="cuda")`               | `tensor.to_device(&Device::Cuda(0))?`                            |
 | Dtype      | `tensor.to(dtype=torch.float16)`         | `tensor.to_dtype(&DType::F16)?`                                  |
-| Saving     | `torch.save({"A": A}, "model.bin")`      | `candle::safetensors::save(&HashMap::from([("A", A)]), "model.safetensors")?` |
-| Loading    | `weights = torch.load("model.bin")`      | `candle::safetensors::load("model.safetensors", &device)`        |
-
-<!--- ANCHOR_END: cheatsheet --->
+| Saving     | `torch.save({"A": A}, "model.bin")`      | `tensor.save_safetensors("A", "model.safetensors")?`             |
+| Loading    | `weights = torch.load("model.bin")`      | TODO (see the examples for now)                                  |


 ## Structure

 - [candle-core](./candle-core): Core ops, devices, and `Tensor` struct definition
- [candle-nn](./candle-nn/): Tools to build real models
- [candle-examples](./candle-examples/): Examples of using the library in realistic settings
+- [candle-nn](./candle-nn/): Facilities to build real models
+- [candle-examples](./candle-examples/): Real-world like examples on how to use the library in real settings
 - [candle-kernels](./candle-kernels/): CUDA custom kernels
- [candle-datasets](./candle-datasets/): Datasets and data loaders.
- [candle-transformers](./candle-transformers): transformers-related utilities.
- [candle-flash-attn](./candle-flash-attn): Flash attention v2 layer.
+
+

 ## FAQ

-### Why should I use Candle?
+### Why Candle?

-Candle's core goal is to *make serverless inference possible*. Full machine learning frameworks like PyTorch
-are very large, which makes creating instances on a cluster slow. Candle allows deployment of lightweight
-binaries.
+Candle stems from the need to reduce binary size in order to *enable serverless*
+possible by making the whole engine smaller than PyTorch very large library volume.
+This enables creating runtimes on a cluster much faster.

-Secondly, Candle lets you *remove Python* from production workloads. Python overhead can seriously hurt performance,
-and the [GIL](https://www.backblaze.com/blog/the-python-gil-past-present-and-future/) is a notorious source of headaches.
+And simply *removing Python* from production workloads.
+Python can really add overhead in more complex workflows and the [GIL](https://www.backblaze.com/blog/the-python-gil-past-present-and-future/) is a notorious source of headaches.

-Finally, Rust is cool! A lot of the HF ecosystem already has Rust crates, like [safetensors](https://github.com/huggingface/safetensors) and [tokenizers](https://github.com/huggingface/tokenizers).
+Rust is cool, and a lot of the HF ecosystem already has Rust crates [safetensors](https://github.com/huggingface/safetensors) and [tokenizers](https://github.com/huggingface/tokenizers).


 ### Other ML frameworks

 - [dfdx](https://github.com/coreylowman/dfdx) is a formidable crate, with shapes being included
-  in types. This prevents a lot of headaches by getting the compiler to complain about shape mismatches right off the bat.
-  However, we found that some features still require nightly, and writing code can be a bit daunting for non rust experts.
+  in types preventing a lot of headaches by getting compiler to complain about shape mismatch right off the bat
+  However we found that some features still require nightly and writing code can be a bit dauting for non rust experts.

  We're leveraging and contributing to other core crates for the runtime so hopefully both crates can benefit from each
-  other.
+  other

 - [burn](https://github.com/burn-rs/burn) is a general crate that can leverage multiple backends so you can choose the best
-  engine for your workload.
+  engine for your workload

 - [tch-rs](https://github.com/LaurentMazare/tch-rs.git) Bindings to the torch library in Rust. Extremely versatile, but they 
-  bring in the entire torch library into the runtime. The main contributor of `tch-rs` is also involved in the development
+  do bring in the entire torch library into the runtime. The main contributor of `tch-rs` is also involved in the development
  of `candle`.

-### Common Errors
-
-#### Missing symbols when compiling with the mkl feature.
+### Missing symbols when compiling with the mkl feature.

 If you get some missing symbols when compiling binaries/tests using the mkl
-or accelerate features, e.g. for mkl you get:
+features, e.g.:
 ```
  = note: /usr/bin/ld: (....o): in function `blas::sgemm':
          .../blas-0.22.0/src/lib.rs:1944: undefined reference to `sgemm_' collect2: error: ld returned 1 exit status

  = note: some `extern` functions couldn't be found; some native libraries may need to be installed or have their path specified
  = note: use the `-l` flag to specify native libraries to link
-  = note: use the `cargo:rustc-link-lib` directive to specify the native libraries to link with Cargo
-```
-or for accelerate:
-```
-Undefined symbols for architecture arm64:
-            "_dgemm_", referenced from:
-                candle_core::accelerate::dgemm::h1b71a038552bcabe in libcandle_core...
-            "_sgemm_", referenced from:
-                candle_core::accelerate::sgemm::h2cf21c592cba3c47 in libcandle_core...
-          ld: symbol(s) not found for architecture arm64
+  = note: use the `cargo:rustc-link-lib` directive to specify the native libraries to link with Cargo (see https://doc.rust-lang.org/cargo/reference/build-scripts.html#cargorustc-link-libkindname)
 ```

-This is likely due to a missing linker flag that was needed to enable the mkl library. You
-can try adding the following for mkl at the top of your binary:
-```rust
+This is likely due to some missing linker flag that enable the mkl library. You
+can try adding the following at the top of your binary:
+```
 extern crate intel_mkl_src;
 ```
-or for accelerate:
-```rust
-extern crate accelerate_src;
-```

-#### Cannot run the LLaMA examples: access to source requires login credentials
-
-```
-Error: request error: https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/tokenizer.json: status code 401
-```
-
-This is likely because you're not permissioned for the LLaMA-v2 model. To fix
-this, you have to register on the huggingface-hub, accept the [LLaMA-v2 model
-conditions](https://huggingface.co/meta-llama/Llama-2-7b-hf), and set up your
-authentication token. See issue
-[#350](https://github.com/huggingface/candle/issues/350) for more details.
-
-#### Missing cute/cutlass headers when compiling flash-attn
-
-```
-  In file included from kernels/flash_fwd_launch_template.h:11:0,
-                   from kernels/flash_fwd_hdim224_fp16_sm80.cu:5:
-  kernels/flash_fwd_kernel.h:8:10: fatal error: cute/algorithm/copy.hpp: No such file or directory
-   #include <cute/algorithm/copy.hpp>
-            ^~~~~~~~~~~~~~~~~~~~~~~~~
-  compilation terminated.
-  Error: nvcc error while compiling:
-```
-[cutlass](https://github.com/NVIDIA/cutlass) is provided as a git submodule so you may want to run the following command to check it in properly.
-```bash
-git submodule update --init
-```
-
-#### Compiling with flash-attention fails
-
-```
-/usr/include/c++/11/bits/std_function.h:530:146: error: parameter packs not expanded with ‘...’:
-```
-
-This is a bug in gcc-11 triggered by the Cuda compiler. To fix this, install a different, supported gcc version - for example gcc-10, and specify the path to the compiler in the CANDLE_NVCC_CCBIN environment variable.
-```
-env CANDLE_NVCC_CCBIN=/usr/lib/gcc/x86_64-linux-gnu/10 cargo ...
-```
-
-#### Linking error on windows when running rustdoc or mdbook tests
-
-```
-Couldn't compile the test.
---- .\candle-book\src\inference\hub.md - Using_the_hub::Using_in_a_real_model_ (line 50) stdout ----
-error: linking with `link.exe` failed: exit code: 1181
-//very long chain of linking
- = note: LINK : fatal error LNK1181: cannot open input file 'windows.0.48.5.lib'
-```
-
-Make sure you link all native libraries that might be located outside a project target, e.g., to run mdbook tests, you should run:
-
-```
-mdbook test candle-book -L .\target\debug\deps\ `
-L native=$env:USERPROFILE\.cargo\registry\src\index.crates.io-6f17d22bba15001f\windows_x86_64_msvc-0.42.2\lib `
-L native=$env:USERPROFILE\.cargo\registry\src\index.crates.io-6f17d22bba15001f\windows_x86_64_msvc-0.48.5\lib
-```
-
-#### Extremely slow model load time with WSL
-
-This may be caused by the models being loaded from `/mnt/c`, more details on
-[stackoverflow](https://stackoverflow.com/questions/68972448/why-is-wsl-extremely-slow-when-compared-with-native-windows-npm-yarn-processing).
-
-#### Tracking down errors
+### How to know where an error comes from.

 You can set `RUST_BACKTRACE=1` to be provided with backtraces when a candle
 error is generated.
--- a/candle-book/.gitignore
+++ b/candle-book/.gitignore
@ -1 +0,0 @@
-book
--- a/candle-book/Cargo.toml
+++ b/candle-book/Cargo.toml
@ -1,49 +0,0 @@
-[package]
-name = "candle-book"
-version.workspace = true
-edition.workspace = true
-description.workspace = true
-repository.workspace = true
-keywords.workspace = true
-categories.workspace = true
-license.workspace = true
-readme = "README.md"
-
-[dependencies]
-accelerate-src = { workspace = true, optional = true }
-candle = { path = "../candle-core", version = "0.3.0", package = "candle-core" }
-candle-datasets = { path = "../candle-datasets", version = "0.3.0" }
-candle-nn = { path = "../candle-nn", version = "0.3.0" }
-candle-transformers = { path = "../candle-transformers", version = "0.3.0" }
-candle-flash-attn = { path = "../candle-flash-attn", version = "0.3.0", optional = true }
-safetensors = { workspace = true }
-serde = { workspace = true }
-serde_json = { workspace = true }
-num-traits = { workspace = true }
-intel-mkl-src = { workspace = true, optional = true }
-cudarc = { workspace = true, optional = true }
-half = { workspace = true, optional = true }
-image = { workspace = true, optional = true }
-anyhow = { workspace = true }
-tokio = "1.29.1"
-
-[dev-dependencies]
-byteorder = { workspace = true }
-hf-hub = { workspace = true, features=["tokio"]}
-clap = { workspace = true }
-memmap2 = { workspace = true }
-rand = { workspace = true }
-tokenizers = { workspace = true, features = ["onig"] }
-tracing = { workspace = true }
-tracing-chrome = { workspace = true }
-tracing-subscriber = { workspace = true }
-wav = { workspace = true }
-# Necessary to disambiguate with tokio in wasm examples which are 1.28.1
-parquet = { workspace = true }
-image = { workspace = true }
-
-[build-dependencies]
-anyhow = { workspace = true }
-
-[features]
-default = []
--- a/candle-book/book.toml
+++ b/candle-book/book.toml
@ -1,6 +0,0 @@
-[book]
-authors = ["Nicolas Patry"]
-language = "en"
-multilingual = false
-src = "src"
-title = "Candle Documentation"
--- a/candle-book/src/README.md
+++ b/candle-book/src/README.md
@ -1,6 +0,0 @@
-# Introduction
-
-{{#include ../../README.md:features}}
-
-
-This book will introduce step by step how to use `candle`.
--- a/candle-book/src/SUMMARY.md
+++ b/candle-book/src/SUMMARY.md
@ -1,28 +0,0 @@
-# Summary
-
-[Introduction](README.md)
-
-# User Guide
-
- [Installation](guide/installation.md)
- [Hello World - MNIST](guide/hello_world.md)
- [PyTorch cheatsheet](guide/cheatsheet.md)
-
-# Reference Guide
-
- [Running a model](inference/inference.md)
-    - [Using the hub](inference/hub.md)
- [Error management](error_manage.md)
- [Training](training/training.md)
-    - [Simplified](training/simplified.md)
-    - [MNIST](training/mnist.md)
-    - [Fine-tuning]()
-    - [Serialization]()
- [Advanced Cuda usage]()
-    - [Writing a custom kernel]()
-    - [Porting a custom kernel]()
- [Using MKL]()
- [Creating apps]()
-    - [Creating a WASM app]()
-    - [Creating a REST api webserver]()
-    - [Creating a desktop Tauri app]()
--- a/candle-book/src/advanced/mkl.md
+++ b/candle-book/src/advanced/mkl.md
@ -1 +0,0 @@
-# Using MKL
--- a/candle-book/src/apps/README.md
+++ b/candle-book/src/apps/README.md
@ -1 +0,0 @@
-# Creating apps
--- a/candle-book/src/apps/dekstop.md
+++ b/candle-book/src/apps/dekstop.md
@ -1 +0,0 @@
-# Creating a desktop Tauri app
--- a/candle-book/src/apps/rest.md
+++ b/candle-book/src/apps/rest.md
@ -1 +0,0 @@
-# Creating a REST api webserver
--- a/candle-book/src/apps/wasm.md
+++ b/candle-book/src/apps/wasm.md
@ -1 +0,0 @@
-# Creating a WASM app
--- a/candle-book/src/chapter_1.md
+++ b/candle-book/src/chapter_1.md
@ -1 +0,0 @@
-# Chapter 1
--- a/candle-book/src/cuda/README.md
+++ b/candle-book/src/cuda/README.md
@ -1 +0,0 @@
-# Advanced Cuda usage
--- a/candle-book/src/cuda/porting.md
+++ b/candle-book/src/cuda/porting.md
@ -1 +0,0 @@
-# Porting a custom kernel
--- a/candle-book/src/cuda/writing.md
+++ b/candle-book/src/cuda/writing.md
@ -1 +0,0 @@
-# Writing a custom kernel
--- a/candle-book/src/error_manage.md
+++ b/candle-book/src/error_manage.md
@ -1,51 +0,0 @@
-# Error management
-
-You might have seen in the code base a lot of `.unwrap()` or `?`.
-If you're unfamiliar with Rust check out the [Rust book](https://doc.rust-lang.org/book/ch09-02-recoverable-errors-with-result.html)
-for more information.
-
-What's important to know though, is that if you want to know *where* a particular operation failed
-You can simply use `RUST_BACKTRACE=1` to get the location of where the model actually failed.
-
-Let's see on failing code:
-
-```rust,ignore
-let x = Tensor::zeros((1, 784), DType::F32, &device)?;
-let y = Tensor::zeros((1, 784), DType::F32, &device)?;
-let z = x.matmul(&y)?;
-```
-
-Will print at runtime:
-
-```bash
-Error: ShapeMismatchBinaryOp { lhs: [1, 784], rhs: [1, 784], op: "matmul" }
-``` 
-
-
-After adding `RUST_BACKTRACE=1`:
-
-
-```bash
-Error: WithBacktrace { inner: ShapeMismatchBinaryOp { lhs: [1, 784], rhs: [1, 784], op: "matmul" }, backtrace: Backtrace [{ fn: "candle::error::Error::bt", file: "/home/nicolas/.cargo/git/checkouts/candle-5bb8ef7e0626d693/f291065/candle-core/src/error.rs", line: 200 }, { fn: "candle::tensor::Tensor::matmul", file: "/home/nicolas/.cargo/git/checkouts/candle-5bb8ef7e0626d693/f291065/candle-core/src/tensor.rs", line: 816 }, { fn: "myapp::main", file: "./src/main.rs", line: 29 }, { fn: "core::ops::function::FnOnce::call_once", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs", line: 250 }, { fn: "std::sys_common::backtrace::__rust_begin_short_backtrace", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/sys_common/backtrace.rs", line: 135 }, { fn: "std::rt::lang_start::{{closure}}", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 166 }, { fn: "core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs", line: 284 }, { fn: "std::panicking::try::do_call", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 500 }, { fn: "std::panicking::try", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 464 }, { fn: "std::panic::catch_unwind", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panic.rs", line: 142 }, { fn: "std::rt::lang_start_internal::{{closure}}", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 148 }, { fn: "std::panicking::try::do_call", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 500 }, { fn: "std::panicking::try", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 464 }, { fn: "std::panic::catch_unwind", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panic.rs", line: 142 }, { fn: "std::rt::lang_start_internal", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 148 }, { fn: "std::rt::lang_start", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 165 }, { fn: "main" }, { fn: "__libc_start_main" }, { fn: "_start" }] }
-```
-
-Not super pretty at the moment, but we can see error occurred on `{ fn: "myapp::main", file: "./src/main.rs", line: 29 }`
-
-
-Another thing to note, is that since Rust is compiled it is not necessarily as easy to recover proper stacktraces
-especially in release builds. We're using [`anyhow`](https://docs.rs/anyhow/latest/anyhow/) for that.
-The library is still young, please [report](https://github.com/LaurentMazare/candle/issues) any issues detecting where an error is coming from.
-
-## Cuda error management
-
-When running a model on Cuda, you might get a stacktrace not really representing the error.
-The reason is that CUDA is async by nature, and therefore the error might be caught while you were sending totally different kernels.
-
-One way to avoid this is to use `CUDA_LAUNCH_BLOCKING=1` as an environment variable. This will force every kernel to be launched sequentially.
-You might still however see the error happening on other kernels as the faulty kernel might exit without an error but spoiling some pointer for which the error will happen when dropping the `CudaSlice` only.
-
-
-If this occurs, you can use [`compute-sanitizer`](https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html)
-This tool is like `valgrind` but for cuda. It will help locate the errors in the kernels.
-
-
--- a/candle-book/src/guide/cheatsheet.md
+++ b/candle-book/src/guide/cheatsheet.md
@ -1,3 +0,0 @@
-# Pytorch cheatsheet
-
-{{#include ../../../README.md:cheatsheet}}
--- a/candle-book/src/guide/hello_world.md
+++ b/candle-book/src/guide/hello_world.md
@ -1,195 +0,0 @@
-# Hello world!
-
-We will now create the hello world of the ML world, building a model capable of solving MNIST dataset.
-
-Open `src/main.rs` and fill in this content:
-
-```rust
-# extern crate candle_core;
-use candle_core::{Device, Result, Tensor};
-
-struct Model {
-    first: Tensor,
-    second: Tensor,
-}
-
-impl Model {
-    fn forward(&self, image: &Tensor) -> Result<Tensor> {
-        let x = image.matmul(&self.first)?;
-        let x = x.relu()?;
-        x.matmul(&self.second)
-    }
-}
-
-fn main() -> Result<()> {
-    // Use Device::new_cuda(0)?; to use the GPU.
-    let device = Device::Cpu;
-
-    let first = Tensor::randn(0f32, 1.0, (784, 100), &device)?;
-    let second = Tensor::randn(0f32, 1.0, (100, 10), &device)?;
-    let model = Model { first, second };
-
-    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
-
-    let digit = model.forward(&dummy_image)?;
-    println!("Digit {digit:?} digit");
-    Ok(())
-}
-```
-
-Everything should now run with:
-
-```bash
-cargo run --release
-```
-
-## Using a `Linear` layer.
-
-Now that we have this, we might want to complexify things a bit, for instance by adding `bias` and creating
-the classical `Linear` layer. We can do as such
-
-```rust
-# extern crate candle_core;
-# use candle_core::{Device, Result, Tensor};
-struct Linear{
-    weight: Tensor,
-    bias: Tensor,
-}
-impl Linear{
-    fn forward(&self, x: &Tensor) -> Result<Tensor> {
-        let x = x.matmul(&self.weight)?;
-        x.broadcast_add(&self.bias)
-    }
-}
-
-struct Model {
-    first: Linear,
-    second: Linear,
-}
-
-impl Model {
-    fn forward(&self, image: &Tensor) -> Result<Tensor> {
-        let x = self.first.forward(image)?;
-        let x = x.relu()?;
-        self.second.forward(&x)
-    }
-}
-```
-
-This will change the model running code into a new function
-
-```rust
-# extern crate candle_core;
-# use candle_core::{Device, Result, Tensor};
-# struct Linear{
-#     weight: Tensor,
-#     bias: Tensor,
-# }
-# impl Linear{
-#     fn forward(&self, x: &Tensor) -> Result<Tensor> {
-#         let x = x.matmul(&self.weight)?;
-#         x.broadcast_add(&self.bias)
-#     }
-# }
-# 
-# struct Model {
-#     first: Linear,
-#     second: Linear,
-# }
-# 
-# impl Model {
-#     fn forward(&self, image: &Tensor) -> Result<Tensor> {
-#         let x = self.first.forward(image)?;
-#         let x = x.relu()?;
-#         self.second.forward(&x)
-#     }
-# }
-fn main() -> Result<()> {
-    // Use Device::new_cuda(0)?; to use the GPU.
-    // Use Device::Cpu; to use the CPU.
-    let device = Device::cuda_if_available(0)?;
-
-    // Creating a dummy model
-    let weight = Tensor::randn(0f32, 1.0, (784, 100), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (100, ), &device)?;
-    let first = Linear{weight, bias};
-    let weight = Tensor::randn(0f32, 1.0, (100, 10), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (10, ), &device)?;
-    let second = Linear{weight, bias};
-    let model = Model { first, second };
-
-    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
-
-    // Inference on the model
-    let digit = model.forward(&dummy_image)?;
-    println!("Digit {digit:?} digit");
-    Ok(())
-}
-```
-
-Now it works, it is a great way to create your own layers.
-But most of the classical layers are already implemented in [candle-nn](https://github.com/huggingface/candle/tree/main/candle-nn).
-
-## Using `candle_nn`.
-
-For instance [Linear](https://github.com/huggingface/candle/blob/main/candle-nn/src/linear.rs) is already there.
-This Linear is coded with PyTorch layout in mind, to reuse better existing models out there, so it uses the transpose of the weights and not the weights directly.
-
-So instead we can simplify our example:
-
-```bash
-cargo add --git https://github.com/huggingface/candle.git candle-nn
-```
-
-And rewrite our examples using it
-
-```rust
-# extern crate candle_core;
-# extern crate candle_nn;
-use candle_core::{Device, Result, Tensor};
-use candle_nn::{Linear, Module};
-
-struct Model {
-    first: Linear,
-    second: Linear,
-}
-
-impl Model {
-    fn forward(&self, image: &Tensor) -> Result<Tensor> {
-        let x = self.first.forward(image)?;
-        let x = x.relu()?;
-        self.second.forward(&x)
-    }
-}
-
-fn main() -> Result<()> {
-    // Use Device::new_cuda(0)?; to use the GPU.
-    let device = Device::Cpu;
-
-    // This has changed (784, 100) -> (100, 784) !
-    let weight = Tensor::randn(0f32, 1.0, (100, 784), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (100, ), &device)?;
-    let first = Linear::new(weight, Some(bias));
-    let weight = Tensor::randn(0f32, 1.0, (10, 100), &device)?;
-    let bias = Tensor::randn(0f32, 1.0, (10, ), &device)?;
-    let second = Linear::new(weight, Some(bias));
-    let model = Model { first, second };
-
-    let dummy_image = Tensor::randn(0f32, 1.0, (1, 784), &device)?;
-
-    let digit = model.forward(&dummy_image)?;
-    println!("Digit {digit:?} digit");
-    Ok(())
-}
-```
-
-Feel free to modify this example to use `Conv2d` to create a classical convnet instead.
-
-
-Now that we have the running dummy code we can get to more advanced topics:
-
- [For PyTorch users](../guide/cheatsheet.md)
- [Running existing models](../inference/inference.md)
- [Training models](../training/training.md)
-
-
--- a/candle-book/src/guide/installation.md
+++ b/candle-book/src/guide/installation.md
@ -1,56 +0,0 @@
-# Installation
-
-**With Cuda support**:
-
-1. First, make sure that Cuda is correctly installed.
- `nvcc --version` should print information about your Cuda compiler driver.
- `nvidia-smi --query-gpu=compute_cap --format=csv` should print your GPUs compute capability, e.g. something
-like:
-
-```bash
-compute_cap
-8.9
-```
-
-If any of the above commands errors out, please make sure to update your Cuda version.
-
-2. Create a new app and add [`candle-core`](https://github.com/huggingface/candle/tree/main/candle-core) with Cuda support.
-
-Start by creating a new cargo:
-
-```bash
-cargo new myapp
-cd myapp
-```
-
-Make sure to add the `candle-core` crate with the cuda feature:
-
-```bash
-cargo add --git https://github.com/huggingface/candle.git candle-core --features "cuda"
-```
-
-Run `cargo build` to make sure everything can be correctly built.
-
-```bash
-cargo build
-```
-
-**Without Cuda support**:
-
-Create a new app and add [`candle-core`](https://github.com/huggingface/candle/tree/main/candle-core) as follows:
-
-```bash
-cargo new myapp
-cd myapp
-cargo add --git https://github.com/huggingface/candle.git candle-core
-```
-
-Finally, run `cargo build` to make sure everything can be correctly built.
-
-```bash
-cargo build
-```
-
-**With mkl support**
-
-You can also see the `mkl` feature which could be interesting to get faster inference on CPU. [Using mkl](./advanced/mkl.md)
--- a/candle-book/src/inference/cuda/README.md
+++ b/candle-book/src/inference/cuda/README.md
@ -1 +0,0 @@
-# Advanced Cuda usage
--- a/candle-book/src/inference/cuda/porting.md
+++ b/candle-book/src/inference/cuda/porting.md
@ -1 +0,0 @@
-# Porting a custom kernel
--- a/candle-book/src/inference/cuda/writing.md
+++ b/candle-book/src/inference/cuda/writing.md
@ -1 +0,0 @@
-# Writing a custom kernel
--- a/candle-book/src/inference/hub.md
+++ b/candle-book/src/inference/hub.md
@ -1,104 +0,0 @@
-# Using the hub
-
-Install the [`hf-hub`](https://github.com/huggingface/hf-hub) crate:
-
-```bash
-cargo add hf-hub
-```
-
-Then let's start by downloading the [model file](https://huggingface.co/bert-base-uncased/tree/main).
-
-
-```rust
-# extern crate candle_core;
-# extern crate hf_hub;
-use hf_hub::api::sync::Api;
-use candle_core::Device;
-
-let api = Api::new().unwrap();
-let repo = api.model("bert-base-uncased".to_string());
-
-let weights = repo.get("model.safetensors").unwrap();
-
-let weights = candle_core::safetensors::load(weights, &Device::Cpu);
-```
-
-We now have access to all the [tensors](https://huggingface.co/bert-base-uncased?show_tensors=true) within the file.
-
-You can check all the names of the tensors [here](https://huggingface.co/bert-base-uncased?show_tensors=true)
-
-
-## Using async 
-
-`hf-hub` comes with an async API.
-
-```bash
-cargo add hf-hub --features tokio
-```
-
-```rust,ignore
-# This is tested directly in examples crate because it needs external dependencies unfortunately:
-# See [this](https://github.com/rust-lang/mdBook/issues/706)
-{{#include ../lib.rs:book_hub_1}}
-```
-
-
-## Using in a real model.
-
-Now that we have our weights, we can use them in our bert architecture:
-
-```rust
-# extern crate candle_core;
-# extern crate candle_nn;
-# extern crate hf_hub;
-# use hf_hub::api::sync::Api;
-# 
-# let api = Api::new().unwrap();
-# let repo = api.model("bert-base-uncased".to_string());
-# 
-# let weights = repo.get("model.safetensors").unwrap();
-use candle_core::{Device, Tensor, DType};
-use candle_nn::{Linear, Module};
-
-let weights = candle_core::safetensors::load(weights, &Device::Cpu).unwrap();
-
-let weight = weights.get("bert.encoder.layer.0.attention.self.query.weight").unwrap();
-let bias = weights.get("bert.encoder.layer.0.attention.self.query.bias").unwrap();
-
-let linear = Linear::new(weight.clone(), Some(bias.clone()));
-
-let input_ids = Tensor::zeros((3, 768), DType::F32, &Device::Cpu).unwrap();
-let output = linear.forward(&input_ids).unwrap();
-```
-
-For a full reference, you can check out the full [bert](https://github.com/LaurentMazare/candle/tree/main/candle-examples/examples/bert) example.
-
-## Memory mapping
-
-For more efficient loading, instead of reading the file, you could use [`memmap2`](https://docs.rs/memmap2/latest/memmap2/)
-
-**Note**: Be careful about memory mapping it seems to cause issues on [Windows, WSL](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/5893)
-and will definitely be slower on network mounted disk, because it will issue more read calls.
-
-```rust,ignore
-{{#include ../lib.rs:book_hub_2}}
-```
-
-**Note**: This operation is **unsafe**. [See the safety notice](https://docs.rs/memmap2/latest/memmap2/struct.Mmap.html#safety).
-In practice model files should never be modified, and the mmaps should be mostly READONLY anyway, so the caveat most likely does not apply, but always keep it in mind.
-
-
-## Tensor Parallel Sharding
-
-When using multiple GPUs to use in Tensor Parallel in order to get good latency, you can load only the part of the Tensor you need.
-
-For that you need to use [`safetensors`](https://crates.io/crates/safetensors) directly.
-
-```bash
-cargo add safetensors
-```
-
-
-```rust,ignore
-{{#include ../lib.rs:book_hub_3}}
-```
--- a/candle-book/src/inference/inference.md
+++ b/candle-book/src/inference/inference.md
@ -1,7 +0,0 @@
-# Running a model
-
-
-In order to run an existing model, you will need to download and use existing weights.
-Most models are already available on https://huggingface.co/ in [`safetensors`](https://github.com/huggingface/safetensors) format.
-
-Let's get started by running an old model : `bert-base-uncased`.
--- a/candle-book/src/lib.rs
+++ b/candle-book/src/lib.rs
@ -1,196 +0,0 @@
-#[cfg(test)]
-pub mod simplified;
-
-#[cfg(test)]
-mod tests {
-    use anyhow::Result;
-    use candle::{DType, Device, Tensor};
-    use parquet::file::reader::SerializedFileReader;
-
-    // NOTE: Waiting on https://github.com/rust-lang/mdBook/pull/1856
-    #[rustfmt::skip]
-    #[tokio::test]
-    async fn book_hub_1() {
-// ANCHOR: book_hub_1
-use candle::Device;
-use hf_hub::api::tokio::Api;
-
-let api = Api::new().unwrap();
-let repo = api.model("bert-base-uncased".to_string());
-
-let weights_filename = repo.get("model.safetensors").await.unwrap();
-
-let weights = candle::safetensors::load(weights_filename, &Device::Cpu).unwrap();
-// ANCHOR_END: book_hub_1
-        assert_eq!(weights.len(), 206);
-    }
-
-    #[rustfmt::skip]
-    #[test]
-    fn book_hub_2() {
-// ANCHOR: book_hub_2
-use candle::Device;
-use hf_hub::api::sync::Api;
-use memmap2::Mmap;
-use std::fs;
-
-let api = Api::new().unwrap();
-let repo = api.model("bert-base-uncased".to_string());
-let weights_filename = repo.get("model.safetensors").unwrap();
-
-let file = fs::File::open(weights_filename).unwrap();
-let mmap = unsafe { Mmap::map(&file).unwrap() };
-let weights = candle::safetensors::load_buffer(&mmap[..], &Device::Cpu).unwrap();
-// ANCHOR_END: book_hub_2
-        assert_eq!(weights.len(), 206);
-    }
-
-    #[rustfmt::skip]
-    #[test]
-    fn book_hub_3() {
-// ANCHOR: book_hub_3
-use candle::{DType, Device, Tensor};
-use hf_hub::api::sync::Api;
-use memmap2::Mmap;
-use safetensors::slice::IndexOp;
-use safetensors::SafeTensors;
-use std::fs;
-
-let api = Api::new().unwrap();
-let repo = api.model("bert-base-uncased".to_string());
-let weights_filename = repo.get("model.safetensors").unwrap();
-
-let file = fs::File::open(weights_filename).unwrap();
-let mmap = unsafe { Mmap::map(&file).unwrap() };
-
-// Use safetensors directly
-let tensors = SafeTensors::deserialize(&mmap[..]).unwrap();
-let view = tensors
-    .tensor("bert.encoder.layer.0.attention.self.query.weight")
-    .unwrap();
-
-// We're going to load shard with rank 1, within a world_size of 4
-// We're going to split along dimension 0 doing VIEW[start..stop, :]
-let rank = 1;
-let world_size = 4;
-let dim = 0;
-let dtype = view.dtype();
-let mut tp_shape = view.shape().to_vec();
-let size = tp_shape[0];
-
-if size % world_size != 0 {
-    panic!("The dimension is not divisble by `world_size`");
-}
-let block_size = size / world_size;
-let start = rank * block_size;
-let stop = (rank + 1) * block_size;
-
-// Everything is expressed in tensor dimension
-// bytes offsets is handled automatically for safetensors.
-
-let iterator = view.slice(start..stop).unwrap();
-
-tp_shape[dim] = block_size;
-
-// Convert safetensors Dtype to candle DType
-let dtype: DType = dtype.try_into().unwrap();
-
-// TODO: Implement from_buffer_iterator so we can skip the extra CPU alloc.
-let raw: Vec<u8> = iterator.into_iter().flatten().cloned().collect();
-let tp_tensor = Tensor::from_raw_buffer(&raw, dtype, &tp_shape, &Device::Cpu).unwrap();
-// ANCHOR_END: book_hub_3
-        assert_eq!(view.shape(), &[768, 768]);
-        assert_eq!(tp_tensor.dims(), &[192, 768]);
-    }
-
-    #[rustfmt::skip]
-    #[test]
-    fn book_training_1() -> Result<()>{
-// ANCHOR: book_training_1
-use hf_hub::{api::sync::Api, Repo, RepoType};
-
-let dataset_id = "mnist".to_string();
-
-let api = Api::new()?;
-let repo = Repo::with_revision(
-    dataset_id,
-    RepoType::Dataset,
-    "refs/convert/parquet".to_string(),
-);
-let repo = api.repo(repo);
-let test_parquet_filename = repo.get("mnist/test/0000.parquet")?;
-let train_parquet_filename = repo.get("mnist/train/0000.parquet")?;
-let test_parquet = SerializedFileReader::new(std::fs::File::open(test_parquet_filename)?)?;
-let train_parquet = SerializedFileReader::new(std::fs::File::open(train_parquet_filename)?)?;
-// ANCHOR_END: book_training_1
-// Ignore unused
-let _train = train_parquet;
-// ANCHOR: book_training_2
-for row in test_parquet {
-    for (idx, (name, field)) in row?.get_column_iter().enumerate() {
-        println!("Column id {idx}, name {name}, value {field}");
-    }
-}
-// ANCHOR_END: book_training_2
-let test_parquet_filename = repo.get("mnist/test/0000.parquet")?;
-let train_parquet_filename = repo.get("mnist/train/0000.parquet")?;
-let test_parquet = SerializedFileReader::new(std::fs::File::open(test_parquet_filename)?)?;
-let train_parquet = SerializedFileReader::new(std::fs::File::open(train_parquet_filename)?)?;
-// ANCHOR: book_training_3
-
-let test_samples = 10_000;
-let mut test_buffer_images: Vec<u8> = Vec::with_capacity(test_samples * 784);
-let mut test_buffer_labels: Vec<u8> = Vec::with_capacity(test_samples);
-for row in test_parquet{
-    for (_name, field) in row?.get_column_iter() {
-        if let parquet::record::Field::Group(subrow) = field {
-            for (_name, field) in subrow.get_column_iter() {
-                if let parquet::record::Field::Bytes(value) = field {
-                    let image = image::load_from_memory(value.data()).unwrap();
-                    test_buffer_images.extend(image.to_luma8().as_raw());
-                }
-            }
-        }else if let parquet::record::Field::Long(label) = field {
-            test_buffer_labels.push(*label as u8);
-        }
-    }
-}
-let test_images = (Tensor::from_vec(test_buffer_images, (test_samples, 784), &Device::Cpu)?.to_dtype(DType::F32)? / 255.)?;
-let test_labels = Tensor::from_vec(test_buffer_labels, (test_samples, ), &Device::Cpu)?;
-
-let train_samples = 60_000;
-let mut train_buffer_images: Vec<u8> = Vec::with_capacity(train_samples * 784);
-let mut train_buffer_labels: Vec<u8> = Vec::with_capacity(train_samples);
-for row in train_parquet{
-    for (_name, field) in row?.get_column_iter() {
-        if let parquet::record::Field::Group(subrow) = field {
-            for (_name, field) in subrow.get_column_iter() {
-                if let parquet::record::Field::Bytes(value) = field {
-                    let image = image::load_from_memory(value.data()).unwrap();
-                    train_buffer_images.extend(image.to_luma8().as_raw());
-                }
-            }
-        }else if let parquet::record::Field::Long(label) = field {
-            train_buffer_labels.push(*label as u8);
-        }
-    }
-}
-let train_images = (Tensor::from_vec(train_buffer_images, (train_samples, 784), &Device::Cpu)?.to_dtype(DType::F32)? / 255.)?;
-let train_labels = Tensor::from_vec(train_buffer_labels, (train_samples, ), &Device::Cpu)?;
-
-let mnist = candle_datasets::vision::Dataset {
-    train_images,
-    train_labels,
-    test_images,
-    test_labels,
-    labels: 10,
-};
-
-// ANCHOR_END: book_training_3
-assert_eq!(mnist.test_images.dims(), &[10_000, 784]);
-assert_eq!(mnist.test_labels.dims(), &[10_000]);
-assert_eq!(mnist.train_images.dims(), &[60_000, 784]);
-assert_eq!(mnist.train_labels.dims(), &[60_000]);
-Ok(())
-    }
-}
--- a/candle-book/src/simplified.rs
+++ b/candle-book/src/simplified.rs
@ -1,196 +0,0 @@
-//! #A simplified example in Rust of training a neural network and then using it based on the Candle Framework by Hugging Face.
-//! Author: Evgeny Igumnov 2023 igumnovnsk@gmail.com
-//! This program implements a neural network to predict the winner of the second round of elections based on the results of the first round.
-//!
-//! ##Basic moments:
-//!
-//! A multilayer perceptron with two hidden layers is used. The first hidden layer has 4 neurons, the second has 2 neurons.
-//! The input is a vector of 2 numbers - the percentage of votes for the first and second candidates in the first stage.
-//! The output is the number 0 or 1, where 1 means that the first candidate will win in the second stage, 0 means that he will lose.
-//! For training, samples with real data on the results of the first and second stages of different elections are used.
-//! The model is trained by backpropagation using gradient descent and the cross-entropy loss function.
-//! Model parameters (weights of neurons) are initialized randomly, then optimized during training.
-//! After training, the model is tested on a deferred sample to evaluate the accuracy.
-//! If the accuracy on the test set is below 100%, the model is considered underfit and the learning process is repeated.
-//! Thus, this neural network learns to find hidden relationships between the results of the first and second rounds of voting in order to make predictions for new data.
-
-#[rustfmt::skip]
-mod tests {
-
-use candle::{DType, Result, Tensor, D, Device};
-use candle_nn::{loss, ops, Linear, Module, VarBuilder, VarMap, Optimizer};
-
-// ANCHOR: book_training_simplified1
-const VOTE_DIM: usize = 2;
-const RESULTS: usize = 1;
-const EPOCHS: usize = 10;
-const LAYER1_OUT_SIZE: usize = 4;
-const LAYER2_OUT_SIZE: usize = 2;
-const LEARNING_RATE: f64 = 0.05;
-
-#[derive(Clone)]
-pub struct Dataset {
-    pub train_votes: Tensor,
-    pub train_results: Tensor,
-    pub test_votes: Tensor,
-    pub test_results: Tensor,
-}
-
-struct MultiLevelPerceptron {
-    ln1: Linear,
-    ln2: Linear,
-    ln3: Linear,
-}
-
-impl MultiLevelPerceptron {
-    fn new(vs: VarBuilder) -> Result<Self> {
-        let ln1 = candle_nn::linear(VOTE_DIM, LAYER1_OUT_SIZE, vs.pp("ln1"))?;
-        let ln2 = candle_nn::linear(LAYER1_OUT_SIZE, LAYER2_OUT_SIZE, vs.pp("ln2"))?;
-        let ln3 = candle_nn::linear(LAYER2_OUT_SIZE, RESULTS + 1, vs.pp("ln3"))?;
-        Ok(Self { ln1, ln2, ln3 })
-    }
-
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        let xs = self.ln1.forward(xs)?;
-        let xs = xs.relu()?;
-        let xs = self.ln2.forward(&xs)?;
-        let xs = xs.relu()?;
-        self.ln3.forward(&xs)
-    }
-}
-
-// ANCHOR_END: book_training_simplified1
-
-
-
-// ANCHOR: book_training_simplified3
-#[tokio::test]
-async fn simplified() -> anyhow::Result<()> {
-
-    let dev = Device::cuda_if_available(0)?;
-
-    let train_votes_vec: Vec<u32> = vec![
-        15, 10,
-        10, 15,
-        5, 12,
-        30, 20,
-        16, 12,
-        13, 25,
-        6, 14,
-        31, 21,
-    ];
-    let train_votes_tensor = Tensor::from_vec(train_votes_vec.clone(), (train_votes_vec.len() / VOTE_DIM, VOTE_DIM), &dev)?.to_dtype(DType::F32)?;
-
-    let train_results_vec: Vec<u32> = vec![
-        1,
-        0,
-        0,
-        1,
-        1,
-        0,
-        0,
-        1,
-    ];
-    let train_results_tensor = Tensor::from_vec(train_results_vec, train_votes_vec.len() / VOTE_DIM, &dev)?;
-
-    let test_votes_vec: Vec<u32> = vec![
-        13, 9,
-        8, 14,
-        3, 10,
-    ];
-    let test_votes_tensor = Tensor::from_vec(test_votes_vec.clone(), (test_votes_vec.len() / VOTE_DIM, VOTE_DIM), &dev)?.to_dtype(DType::F32)?;
-
-    let test_results_vec: Vec<u32> = vec![
-        1,
-        0,
-        0,
-    ];
-    let test_results_tensor = Tensor::from_vec(test_results_vec.clone(), test_results_vec.len(), &dev)?;
-
-    let m = Dataset {
-        train_votes: train_votes_tensor,
-        train_results: train_results_tensor,
-        test_votes: test_votes_tensor,
-        test_results: test_results_tensor,
-    };
-
-    let trained_model: MultiLevelPerceptron;
-    loop {
-        println!("Trying to train neural network.");
-        match train(m.clone(), &dev) {
-            Ok(model) => {
-                trained_model = model;
-                break;
-            },
-            Err(e) => {
-                println!("Error: {}", e);
-                continue;
-            }
-        }
-
-    }
-
-    let real_world_votes: Vec<u32> = vec![
-        13, 22,
-    ];
-
-    let tensor_test_votes = Tensor::from_vec(real_world_votes.clone(), (1, VOTE_DIM), &dev)?.to_dtype(DType::F32)?;
-
-    let final_result = trained_model.forward(&tensor_test_votes)?;
-
-    let result = final_result
-        .argmax(D::Minus1)?
-        .to_dtype(DType::F32)?
-        .get(0).map(|x| x.to_scalar::<f32>())??;
-    println!("real_life_votes: {:?}", real_world_votes);
-    println!("neural_network_prediction_result: {:?}", result);
-
-    Ok(())
-
-}
-// ANCHOR_END: book_training_simplified3
-
-// ANCHOR: book_training_simplified2
-fn train(m: Dataset, dev: &Device) -> anyhow::Result<MultiLevelPerceptron> {
-    let train_results = m.train_results.to_device(dev)?;
-    let train_votes = m.train_votes.to_device(dev)?;
-    let varmap = VarMap::new();
-    let vs = VarBuilder::from_varmap(&varmap, DType::F32, dev);
-    let model = MultiLevelPerceptron::new(vs.clone())?;
-    let mut sgd = candle_nn::SGD::new(varmap.all_vars(), LEARNING_RATE)?;
-    let test_votes = m.test_votes.to_device(dev)?;
-    let test_results = m.test_results.to_device(dev)?;
-    let mut final_accuracy: f32 = 0.0;
-    for epoch in 1..EPOCHS + 1 {
-        let logits = model.forward(&train_votes)?;
-        let log_sm = ops::log_softmax(&logits, D::Minus1)?;
-        let loss = loss::nll(&log_sm, &train_results)?;
-        sgd.backward_step(&loss)?;
-
-        let test_logits = model.forward(&test_votes)?;
-        let sum_ok = test_logits
-            .argmax(D::Minus1)?
-            .eq(&test_results)?
-            .to_dtype(DType::F32)?
-            .sum_all()?
-            .to_scalar::<f32>()?;
-        let test_accuracy = sum_ok / test_results.dims1()? as f32;
-        final_accuracy = 100. * test_accuracy;
-        println!("Epoch: {epoch:3} Train loss: {:8.5} Test accuracy: {:5.2}%",
-                 loss.to_scalar::<f32>()?,
-                 final_accuracy
-        );
-        if final_accuracy == 100.0 {
-            break;
-        }
-    }
-    if final_accuracy < 100.0 {
-        Err(anyhow::Error::msg("The model is not trained well enough."))
-    } else {
-        Ok(model)
-    }
-}
-// ANCHOR_END: book_training_simplified2
-
-
-}
--- a/candle-book/src/training/finetuning.md
+++ b/candle-book/src/training/finetuning.md
@ -1 +0,0 @@
-# Fine-tuning
--- a/candle-book/src/training/mnist.md
+++ b/candle-book/src/training/mnist.md
@ -1,10 +0,0 @@
-# MNIST
-
-So we now have downloaded the MNIST parquet files, let's put them in a simple struct.
-
-```rust,ignore
-{{#include ../lib.rs:book_training_3}}
-```
-
-The parsing of the file and putting it into single tensors requires the dataset to fit the entire memory.
-It is quite rudimentary, but simple enough for a small dataset like MNIST.
--- a/candle-book/src/training/serialization.md
+++ b/candle-book/src/training/serialization.md
@ -1 +0,0 @@
-# Serialization
--- a/candle-book/src/training/simplified.md
+++ b/candle-book/src/training/simplified.md
@ -1,45 +0,0 @@
-# Simplified
-
-## How its works
-
-This program implements a neural network to predict the winner of the second round of elections based on the results of the first round.
-
-Basic moments:
-
-1. A multilayer perceptron with two hidden layers is used. The first hidden layer has 4 neurons, the second has 2 neurons.
-2. The input is a vector of 2 numbers - the percentage of votes for the first and second candidates in the first stage.
-3. The output is the number 0 or 1, where 1 means that the first candidate will win in the second stage, 0 means that he will lose.
-4. For training, samples with real data on the results of the first and second stages of different elections are used.
-5. The model is trained by backpropagation using gradient descent and the cross-entropy loss function.
-6. Model parameters (weights of neurons) are initialized randomly, then optimized during training.
-7. After training, the model is tested on a deferred sample to evaluate the accuracy.
-8. If the accuracy on the test set is below 100%, the model is considered underfit and the learning process is repeated.
-
-Thus, this neural network learns to find hidden relationships between the results of the first and second rounds of voting in order to make predictions for new data.
-
-
-```rust,ignore
-{{#include ../simplified.rs:book_training_simplified1}}
-```
-
-```rust,ignore
-{{#include ../simplified.rs:book_training_simplified2}}
-```
-
-```rust,ignore
-{{#include ../simplified.rs:book_training_simplified3}}
-```
-
-
-## Example output
-
-```bash
-Trying to train neural network.
-Epoch:   1 Train loss:  4.42555 Test accuracy:  0.00%
-Epoch:   2 Train loss:  0.84677 Test accuracy: 33.33%
-Epoch:   3 Train loss:  2.54335 Test accuracy: 33.33%
-Epoch:   4 Train loss:  0.37806 Test accuracy: 33.33%
-Epoch:   5 Train loss:  0.36647 Test accuracy: 100.00%
-real_life_votes: [13, 22]
-neural_network_prediction_result: 0.0
-```
--- a/candle-book/src/training/training.md
+++ b/candle-book/src/training/training.md
@ -1,39 +0,0 @@
-# Training
-
-
-Training starts with data. We're going to use the huggingface hub and 
-start with the Hello world dataset of machine learning, MNIST.
-
-Let's start with downloading `MNIST` from [huggingface](https://huggingface.co/datasets/mnist).
-
-This requires [`hf-hub`](https://github.com/huggingface/hf-hub).
-```bash
-cargo add hf-hub
-```
-
-This is going to be very hands-on for now.
-
-```rust,ignore
-{{#include ../../../candle-examples/src/lib.rs:book_training_1}}
-```
-
-This uses the standardized `parquet` files from the `refs/convert/parquet` branch on every dataset.
-Our handles are now [`parquet::file::serialized_reader::SerializedFileReader`].
-
-We can inspect the content of the files with:
-
-```rust,ignore
-{{#include ../../../candle-examples/src/lib.rs:book_training_2}}
-```
-
-You should see something like:
-
-```bash
-Column id 1, name label, value 6
-Column id 0, name image, value {bytes: [137, ....]
-Column id 1, name label, value 8
-Column id 0, name image, value {bytes: [137, ....]
-```
-
-So each row contains 2 columns (image, label) with image being saved as bytes.
-Let's put them into a useful struct.
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@ -1,18 +1,18 @@
 [package]
-name = "candle-core"
-version.workspace = true
-edition.workspace = true
-description.workspace = true
-repository.workspace = true
-keywords.workspace = true
-categories.workspace = true
-license.workspace = true
+name = "candle"
+version = "0.1.0"
+edition = "2021"
+
+description = "Minimalist ML framework."
+repository = "https://github.com/LaurentMazare/candle"
+keywords = ["blas", "tensor", "machine-learning"]
+categories = ["science"]
+license = "MIT/Apache-2.0"
 readme = "README.md"

 [dependencies]
-accelerate-src = { workspace = true, optional = true }
 byteorder = { workspace = true }
-candle-kernels = { path = "../candle-kernels", version = "0.3.0", optional = true }
+candle-kernels = { path = "../candle-kernels", optional = true }
 cudarc = { workspace = true, optional = true }
 gemm = { workspace = true }
 half = { workspace = true }
@ -22,20 +22,14 @@ memmap2 = { workspace = true }
 num-traits = { workspace = true }
 num_cpus = { workspace = true }
 rand = { workspace = true }
-rand_distr = { workspace = true }
-rayon = { workspace = true }
 safetensors = { workspace = true }
 thiserror = { workspace = true }
-yoke = { workspace = true }
 zip = { workspace = true }

 [dev-dependencies]
 anyhow = { workspace = true }
-clap = { workspace = true }

 [features]
 default = []
-cuda = ["cudarc", "dep:candle-kernels"]
-cudnn = ["cuda", "cudarc/cudnn"]
+cuda = ["dep:cudarc", "dep:candle-kernels"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
-accelerate = ["dep:libc", "dep:accelerate-src"]
--- a/candle-core/examples/basics.rs
+++ b/candle-core/examples/basics.rs
@ -1,18 +1,24 @@
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;

-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
 use anyhow::Result;
-use candle_core::{Device, Tensor};
+use candle::{Device, Tensor};

 fn main() -> Result<()> {
-    let inp = Tensor::randn(0f32, 1., (2, 320, 96, 96), &Device::Cpu)?;
-    let w = Tensor::randn(0f32, 1., (320, 320, 3, 3), &Device::Cpu)?;
-    let start = std::time::Instant::now();
-    let res = inp.conv2d(&w, 0, 1, 1, 1)?;
-    println!("{:?}", start.elapsed());
-    println!("{res:?}");
+    let data = &[[3f32, 1., 4., 1., 5.], [2., 7., 1., 8., 2.]];
+    let t1 = Tensor::new(data, &Device::Cpu)?;
+    let data2 = &[[5f32, 5., 5., 5., 5.], [2., 7., 1., 8., 2.]];
+    let t2 = Tensor::new(data2, &Device::Cpu)?;
+    assert_eq!(
+        Tensor::cat(&[&t1.t()?, &t2.t()?], 1)?
+            .t()?
+            .to_vec2::<f32>()?,
+        [
+            [3.0, 1.0, 4.0, 1.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0],
+            [5.0, 5.0, 5.0, 5.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0]
+        ]
+    );
    Ok(())
 }
--- a/candle-core/examples/cpu_benchmarks.rs
+++ b/candle-core/examples/cpu_benchmarks.rs
@ -0,0 +1,81 @@
+/// This example contains some simple benchmarks so that it's easy to run them in perf etc.
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+
+// use candle::quantized::GgmlType;
+use candle::{DType, Device, Result, Tensor};
+// use clap::{Parser, Subcommand};
+
+// fn softmax<D: candle::shape::Dim>(xs: &Tensor, dim: D) -> Result<Tensor> {
+//     let dim = dim.to_index(xs.shape(), "softmax")?;
+//     let max = xs.max_keepdim(dim)?;
+//     let diff = xs.broadcast_sub(&max)?;
+//     let num = diff.exp()?;
+//     let den = num.sum_keepdim(dim)?;
+//     num.broadcast_div(&den)
+// }
+
+trait Benchmark {
+    type PreProcessData;
+    type RunResult;
+
+    fn preprocess() -> Result<Self::PreProcessData>;
+    fn run_one(_: &Self::PreProcessData) -> Result<Self::RunResult>;
+
+    const ITERS: usize;
+}
+
+struct Matmul;
+impl Benchmark for Matmul {
+    type PreProcessData = (Tensor, Tensor);
+    type RunResult = Tensor;
+    fn preprocess() -> Result<Self::PreProcessData> {
+        let lhs = Tensor::randn((1024, 1024), DType::F32, &Device::Cpu, 1.0, 0.0)?;
+        let rhs = Tensor::randn((1024, 1024), DType::F32, &Device::Cpu, 1.0, 0.0)?;
+        Ok((lhs, rhs))
+    }
+
+    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+        d.0.matmul(&d.1)
+    }
+
+    const ITERS: usize = 100;
+}
+
+// struct Softmax;
+// impl Benchmark for Softmax {
+//     type PreProcessData = Tensor;
+//     type RunResult = Tensor;
+//     fn preprocess() -> Result<Self::PreProcessData> {
+//         // Typical whisper tiny size.
+//         let x = Tensor::randn(0f32, 1., (1, 6, 200, 1500), &Device::Cpu)?;
+//         Ok(x)
+//     }
+//
+//     fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+//         softmax(d, D::Minus1)
+//     }
+//
+//     const ITERS: usize = 100;
+// }
+
+fn run<B: Benchmark>(iters: Option<usize>) -> Result<()> {
+    use std::hint::black_box;
+
+    let iters = iters.unwrap_or(B::ITERS);
+    let d = B::preprocess()?;
+    let start = std::time::Instant::now();
+    for _iter in 0..iters {
+        let _res = black_box(B::run_one(black_box(&d))?);
+    }
+    println!("{:?}", start.elapsed() / iters as u32);
+    Ok(())
+}
+
+fn main() -> Result<()> {
+    run::<Matmul>(None)?;
+    Ok(())
+}
--- a/candle-core/examples/cuda_basics.rs
+++ b/candle-core/examples/cuda_basics.rs
@ -1,29 +1,15 @@
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;

 use anyhow::Result;
-use candle_core::{Device, Tensor};
+use candle::{Device, Tensor};

 fn main() -> Result<()> {
    let device = Device::new_cuda(0)?;
-    let in_t = Tensor::rand(-1f32, 1f32, (1, 3, 12, 7), &device)?;
-    let k_t = Tensor::rand(-1f32, 1f32, (6, 3, 1, 1), &device)?;
-    let out_t = in_t.conv2d(&k_t, 0, 1, 1, 1)?;
-    println!("{out_t}");
-    let in_t = in_t.to_device(&Device::Cpu)?;
-    let k_t = k_t.to_device(&Device::Cpu)?;
-    let out_t2 = in_t.conv2d(&k_t, 0, 1, 1, 1)?;
-    let diff = (out_t.to_device(&Device::Cpu)? - out_t2)?
-        .sqr()?
-        .sum_all()?;
-    println!("{diff}");
-
-    let t = Tensor::randn(0f32, 1f32, (2, 4, 96, 96), &device)?;
-    let w = Tensor::randn(0f32, 1f32, (320, 4, 3, 3), &device)?;
-    let res = t.conv2d(&w, 1, 1, 1, 1)?;
-    println!("{res:?}");
+    let t = Tensor::new(&[[1f32, 2., 3., 4.2]], &device)?;
+    let sum = t.sum_keepdim(0)?;
+    println!("{sum}");
+    let sum = t.sum_keepdim(1)?;
+    println!("{sum}");
    Ok(())
 }
--- a/candle-core/examples/cuda_sum_benchmark.rs
+++ b/candle-core/examples/cuda_sum_benchmark.rs
@ -1,13 +1,10 @@
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;

-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
 use std::str::FromStr;

 use anyhow::Result;
-use candle_core::{Device, Tensor};
+use candle::{Device, Tensor};

 fn cos_sin(n: usize, device: &Device) -> Result<Tensor> {
    let thetas: Vec<_> = (0..n).map(|i| (i as f32 / n as f32)).collect();
--- a/candle-core/examples/tensor-tools.rs
+++ b/candle-core/examples/tensor-tools.rs
@ -1,384 +0,0 @@
-use candle_core::quantized::{gguf_file, k_quants, QTensor};
-use candle_core::{Device, Result, Tensor};
-use clap::{Parser, Subcommand, ValueEnum};
-use rayon::prelude::*;
-
-#[derive(ValueEnum, Debug, Clone)]
-enum QuantizationMode {
-    /// The default quantization includes all 2d tensors, except the output tensor which always
-    /// uses Q6_K.
-    Llama,
-}
-
-impl QuantizationMode {
-    fn quantize(
-        &self,
-        name: &str,
-        tensor: QTensor,
-        default: fn(&Tensor) -> Result<QTensor>,
-    ) -> Result<QTensor> {
-        match self {
-            Self::Llama => {
-                // Same behavior as the llama.cpp quantization.
-                let should_quantize = name.ends_with(".weight") && tensor.rank() == 2;
-                if should_quantize {
-                    let tensor = tensor.dequantize(&Device::Cpu)?;
-                    if name == "output.weight" {
-                        QTensor::quantize::<k_quants::BlockQ6K>(&tensor)
-                    } else {
-                        default(&tensor)
-                    }
-                } else {
-                    Ok(tensor)
-                }
-            }
-        }
-    }
-}
-
-#[derive(ValueEnum, Debug, Clone)]
-enum Quantization {
-    #[value(name = "q4_0")]
-    Q4_0,
-    #[value(name = "q4_1")]
-    Q4_1,
-    #[value(name = "q5_0")]
-    Q5_0,
-    #[value(name = "q5_1")]
-    Q5_1,
-    #[value(name = "q8_0")]
-    Q8_0,
-    #[value(name = "q8_1")]
-    Q8_1,
-    Q2k,
-    Q3k,
-    Q4k,
-    Q5k,
-    Q6k,
-    Q8k,
-    F16,
-    F32,
-}
-
-#[derive(ValueEnum, Debug, Clone)]
-enum Format {
-    Safetensors,
-    Npz,
-    Ggml,
-    Gguf,
-    Pth,
-    Pickle,
-}
-
-impl Format {
-    fn infer<P: AsRef<std::path::Path>>(p: P) -> Option<Self> {
-        p.as_ref()
-            .extension()
-            .and_then(|e| e.to_str())
-            .and_then(|e| match e {
-                // We don't infer any format for .bin as it can be used for ggml/gguf or pytorch.
-                "safetensors" | "safetensor" => Some(Self::Safetensors),
-                "npz" => Some(Self::Npz),
-                "pth" | "pt" => Some(Self::Pth),
-                "ggml" => Some(Self::Ggml),
-                "gguf" => Some(Self::Gguf),
-                _ => None,
-            })
-    }
-}
-
-#[derive(Subcommand, Debug, Clone)]
-enum Command {
-    Ls {
-        files: Vec<std::path::PathBuf>,
-
-        /// The file format to use, if unspecified infer from the file extension.
-        #[arg(long, value_enum)]
-        format: Option<Format>,
-
-        /// Enable verbose mode.
-        #[arg(short, long)]
-        verbose: bool,
-    },
-
-    Quantize {
-        /// The input file, in gguf format.
-        in_file: Vec<std::path::PathBuf>,
-
-        /// The output file, in gguf format.
-        #[arg(long)]
-        out_file: std::path::PathBuf,
-
-        /// The quantization schema to apply.
-        #[arg(long, value_enum)]
-        quantization: Quantization,
-
-        /// Which tensor to quantize.
-        #[arg(long, value_enum, default_value_t = QuantizationMode::Llama)]
-        mode: QuantizationMode,
-    },
-}
-
-#[derive(Parser, Debug, Clone)]
-struct Args {
-    #[command(subcommand)]
-    command: Command,
-}
-
-fn run_ls(file: &std::path::PathBuf, format: Option<Format>, verbose: bool) -> Result<()> {
-    let format = match format {
-        Some(format) => format,
-        None => match Format::infer(file) {
-            Some(format) => format,
-            None => {
-                println!(
-                    "{file:?}: cannot infer format from file extension, use the --format flag"
-                );
-                return Ok(());
-            }
-        },
-    };
-    match format {
-        Format::Npz => {
-            let tensors = candle_core::npy::NpzTensors::new(file)?;
-            let mut names = tensors.names();
-            names.sort();
-            for name in names {
-                let shape_dtype = match tensors.get_shape_and_dtype(name) {
-                    Ok((shape, dtype)) => format!("[{shape:?}; {dtype:?}]"),
-                    Err(err) => err.to_string(),
-                };
-                println!("{name}: {shape_dtype}")
-            }
-        }
-        Format::Safetensors => {
-            let tensors = unsafe { candle_core::safetensors::MmapedSafetensors::new(file)? };
-            let mut tensors = tensors.tensors();
-            tensors.sort_by(|a, b| a.0.cmp(&b.0));
-            for (name, view) in tensors.iter() {
-                let dtype = view.dtype();
-                let dtype = match candle_core::DType::try_from(dtype) {
-                    Ok(dtype) => format!("{dtype:?}"),
-                    Err(_) => format!("{dtype:?}"),
-                };
-                let shape = view.shape();
-                println!("{name}: [{shape:?}; {dtype}]")
-            }
-        }
-        Format::Pth => {
-            let mut tensors = candle_core::pickle::read_pth_tensor_info(file, verbose)?;
-            tensors.sort_by(|a, b| a.name.cmp(&b.name));
-            for tensor_info in tensors.iter() {
-                println!(
-                    "{}: [{:?}; {:?}]",
-                    tensor_info.name,
-                    tensor_info.layout.shape(),
-                    tensor_info.dtype,
-                );
-                if verbose {
-                    println!("    {:?}", tensor_info);
-                }
-            }
-        }
-        Format::Pickle => {
-            let file = std::fs::File::open(file)?;
-            let mut reader = std::io::BufReader::new(file);
-            let mut stack = candle_core::pickle::Stack::empty();
-            stack.read_loop(&mut reader)?;
-            for (i, obj) in stack.stack().iter().enumerate() {
-                println!("{i} {obj:?}");
-            }
-        }
-        Format::Ggml => {
-            let mut file = std::fs::File::open(file)?;
-            let content = candle_core::quantized::ggml_file::Content::read(&mut file)?;
-            let mut tensors = content.tensors.into_iter().collect::<Vec<_>>();
-            tensors.sort_by(|a, b| a.0.cmp(&b.0));
-            for (name, qtensor) in tensors.iter() {
-                println!("{name}: [{:?}; {:?}]", qtensor.shape(), qtensor.dtype());
-            }
-        }
-        Format::Gguf => {
-            let mut file = std::fs::File::open(file)?;
-            let content = gguf_file::Content::read(&mut file)?;
-            if verbose {
-                let mut metadata = content.metadata.into_iter().collect::<Vec<_>>();
-                metadata.sort_by(|a, b| a.0.cmp(&b.0));
-                println!("metadata entries ({})", metadata.len());
-                for (key, value) in metadata.iter() {
-                    println!("  {key}: {value:?}");
-                }
-            }
-            let mut tensors = content.tensor_infos.into_iter().collect::<Vec<_>>();
-            tensors.sort_by(|a, b| a.0.cmp(&b.0));
-            for (name, info) in tensors.iter() {
-                println!("{name}: [{:?}; {:?}]", info.shape, info.ggml_dtype);
-            }
-        }
-    }
-    Ok(())
-}
-
-fn run_quantize_safetensors(
-    in_files: &[std::path::PathBuf],
-    out_file: std::path::PathBuf,
-    q: Quantization,
-) -> Result<()> {
-    let mut out_file = std::fs::File::create(out_file)?;
-    let mut tensors = std::collections::HashMap::new();
-    for in_file in in_files.iter() {
-        let in_tensors = candle_core::safetensors::load(in_file, &Device::Cpu)?;
-        tensors.extend(in_tensors)
-    }
-    println!("tensors: {}", tensors.len());
-
-    let quantize_fn = match q {
-        Quantization::Q4_0 => QTensor::quantize::<k_quants::BlockQ4_0>,
-        Quantization::Q4_1 => QTensor::quantize::<k_quants::BlockQ4_1>,
-        Quantization::Q5_0 => QTensor::quantize::<k_quants::BlockQ5_0>,
-        Quantization::Q5_1 => QTensor::quantize::<k_quants::BlockQ5_1>,
-        Quantization::Q8_0 => QTensor::quantize::<k_quants::BlockQ8_0>,
-        Quantization::Q8_1 => QTensor::quantize::<k_quants::BlockQ8_1>,
-        Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>,
-        Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>,
-        Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>,
-        Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>,
-        Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>,
-        Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>,
-        Quantization::F16 => QTensor::quantize::<half::f16>,
-        Quantization::F32 => QTensor::quantize::<f32>,
-    };
-    let block_size = match q {
-        Quantization::Q4_0 => k_quants::QK4_0,
-        Quantization::Q4_1 => k_quants::QK4_1,
-        Quantization::Q5_0 => k_quants::QK5_0,
-        Quantization::Q5_1 => k_quants::QK5_1,
-        Quantization::Q8_0 => k_quants::QK8_0,
-        Quantization::Q8_1 => k_quants::QK8_1,
-        Quantization::Q2k
-        | Quantization::Q3k
-        | Quantization::Q4k
-        | Quantization::Q5k
-        | Quantization::Q6k
-        | Quantization::Q8k => k_quants::QK_K,
-        Quantization::F16 | Quantization::F32 => 1,
-    };
-
-    let qtensors = tensors
-        .into_par_iter()
-        .map(|(name, tensor)| {
-            let should_quantize = tensor.rank() == 2 && tensor.dim(1)? % block_size == 0;
-            println!("  quantizing {name} {tensor:?} {should_quantize}");
-            let tensor = if should_quantize {
-                quantize_fn(&tensor)?
-            } else {
-                QTensor::quantize::<f32>(&tensor)?
-            };
-            Ok((name, tensor))
-        })
-        .collect::<Result<Vec<_>>>()?;
-    let qtensors = qtensors
-        .iter()
-        .map(|(k, v)| (k.as_str(), v))
-        .collect::<Vec<_>>();
-    gguf_file::write(&mut out_file, &[], &qtensors)?;
-    Ok(())
-}
-
-fn run_quantize(
-    in_files: &[std::path::PathBuf],
-    out_file: std::path::PathBuf,
-    q: Quantization,
-    qmode: QuantizationMode,
-) -> Result<()> {
-    if in_files.is_empty() {
-        candle_core::bail!("no specified input files")
-    }
-    if let Some(extension) = out_file.extension() {
-        if extension == "safetensors" {
-            candle_core::bail!("the generated file cannot use the safetensors extension")
-        }
-    }
-    if let Some(extension) = in_files[0].extension() {
-        if extension == "safetensors" {
-            return run_quantize_safetensors(in_files, out_file, q);
-        }
-    }
-
-    if in_files.len() != 1 {
-        candle_core::bail!("only a single in-file can be used when quantizing gguf files")
-    }
-
-    // Open the out file early so as to fail directly on missing directories etc.
-    let mut out_file = std::fs::File::create(out_file)?;
-    let mut in_ = std::fs::File::open(&in_files[0])?;
-    let content = gguf_file::Content::read(&mut in_)?;
-    println!("tensors: {}", content.tensor_infos.len());
-
-    let quantize_fn = match q {
-        Quantization::Q4_0 => QTensor::quantize::<k_quants::BlockQ4_0>,
-        Quantization::Q4_1 => QTensor::quantize::<k_quants::BlockQ4_1>,
-        Quantization::Q5_0 => QTensor::quantize::<k_quants::BlockQ5_0>,
-        Quantization::Q5_1 => QTensor::quantize::<k_quants::BlockQ5_1>,
-        Quantization::Q8_0 => QTensor::quantize::<k_quants::BlockQ8_0>,
-        Quantization::Q8_1 => QTensor::quantize::<k_quants::BlockQ8_1>,
-        Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>,
-        Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>,
-        Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>,
-        Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>,
-        Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>,
-        Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>,
-        Quantization::F16 => QTensor::quantize::<half::f16>,
-        Quantization::F32 => QTensor::quantize::<f32>,
-    };
-
-    let qtensors = content
-        .tensor_infos
-        .par_iter()
-        .map(|(name, _)| {
-            println!("  quantizing {name}");
-            let mut in_file = std::fs::File::open(&in_files[0])?;
-            let tensor = content.tensor(&mut in_file, name)?;
-            let tensor = qmode.quantize(name, tensor, quantize_fn)?;
-            Ok((name, tensor))
-        })
-        .collect::<Result<Vec<_>>>()?;
-    let qtensors = qtensors
-        .iter()
-        .map(|(k, v)| (k.as_str(), v))
-        .collect::<Vec<_>>();
-
-    let metadata = content
-        .metadata
-        .iter()
-        .map(|(k, v)| (k.as_str(), v))
-        .collect::<Vec<_>>();
-    gguf_file::write(&mut out_file, metadata.as_slice(), &qtensors)?;
-    Ok(())
-}
-
-fn main() -> anyhow::Result<()> {
-    let args = Args::parse();
-    match args.command {
-        Command::Ls {
-            files,
-            format,
-            verbose,
-        } => {
-            let multiple_files = files.len() > 1;
-            for file in files.iter() {
-                if multiple_files {
-                    println!("--- {file:?} ---");
-                }
-                run_ls(file, format.clone(), verbose)?
-            }
-        }
-        Command::Quantize {
-            in_file,
-            out_file,
-            quantization,
-            mode,
-        } => run_quantize(&in_file, out_file, quantization, mode)?,
-    }
-    Ok(())
-}
--- a/candle-core/src/accelerate.rs
+++ b/candle-core/src/accelerate.rs
@ -1,444 +0,0 @@
-#![allow(dead_code)]
-use libc::{c_char, c_double, c_float, c_int, c_long, c_ulong};
-
-mod ffi {
-    use super::*;
-    extern "C" {
-        // It would be nice to be able to switch to the NEWLAPACK version of the function but this
-        // seems to trigger some link error. Available function names can be seen here:
-        // /Library/Developer/CommandLineTools/SDKs/MacOSX13.3.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Accelerate.tbd
-        #[link_name = "sgemm_"]
-        pub fn sgemm_ffi(
-            transa: *const c_char,
-            transb: *const c_char,
-            m: *const c_int,
-            n: *const c_int,
-            k: *const c_int,
-            alpha: *const c_float,
-            a: *const c_float,
-            lda: *const c_int,
-            b: *const c_float,
-            ldb: *const c_int,
-            beta: *const c_float,
-            c: *mut c_float,
-            ldc: *const c_int,
-        );
-        #[link_name = "dgemm_"]
-        pub fn dgemm_ffi(
-            transa: *const c_char,
-            transb: *const c_char,
-            m: *const c_int,
-            n: *const c_int,
-            k: *const c_int,
-            alpha: *const c_double,
-            a: *const c_double,
-            lda: *const c_int,
-            b: *const c_double,
-            ldb: *const c_int,
-            beta: *const c_double,
-            c: *mut c_double,
-            ldc: *const c_int,
-        );
-
-        pub fn vvexpf(dst: *mut c_float, src: *const c_float, len: *const c_int);
-        pub fn vvexp(dst: *mut c_double, src: *const c_double, len: *const c_int);
-        pub fn vvsqrtf(dst: *mut c_float, src: *const c_float, len: *const c_int);
-        pub fn vvsqrt(dst: *mut c_double, src: *const c_double, len: *const c_int);
-        pub fn vvsinf(dst: *mut c_float, src: *const c_float, len: *const c_int);
-        pub fn vvsin(dst: *mut c_double, src: *const c_double, len: *const c_int);
-        pub fn vvcosf(dst: *mut c_float, src: *const c_float, len: *const c_int);
-        pub fn vvcos(dst: *mut c_double, src: *const c_double, len: *const c_int);
-        pub fn vvlogf(dst: *mut c_float, src: *const c_float, len: *const c_int);
-        pub fn vvlog(dst: *mut c_double, src: *const c_double, len: *const c_int);
-        pub fn vvtanhf(dst: *mut c_float, src: *const c_float, len: *const c_int);
-        pub fn vvtanh(dst: *mut c_double, src: *const c_double, len: *const c_int);
-
-        pub fn vDSP_vaddD(
-            _: *const c_double,
-            _: c_long,
-            _: *const c_double,
-            _: c_long,
-            _: *mut c_double,
-            _: c_long,
-            _: c_ulong,
-        );
-        pub fn vDSP_vadd(
-            _: *const c_float,
-            _: c_long,
-            _: *const c_float,
-            _: c_long,
-            _: *mut c_float,
-            _: c_long,
-            _: c_ulong,
-        );
-        pub fn vDSP_vsubD(
-            _: *const c_double,
-            _: c_long,
-            _: *const c_double,
-            _: c_long,
-            _: *mut c_double,
-            _: c_long,
-            _: c_ulong,
-        );
-        pub fn vDSP_vsub(
-            _: *const c_float,
-            _: c_long,
-            _: *const c_float,
-            _: c_long,
-            _: *mut c_float,
-            _: c_long,
-            _: c_ulong,
-        );
-        pub fn vDSP_vmulD(
-            _: *const c_double,
-            _: c_long,
-            _: *const c_double,
-            _: c_long,
-            _: *mut c_double,
-            _: c_long,
-            _: c_ulong,
-        );
-        pub fn vDSP_vmul(
-            _: *const c_float,
-            _: c_long,
-            _: *const c_float,
-            _: c_long,
-            _: *mut c_float,
-            _: c_long,
-            _: c_ulong,
-        );
-        pub fn vDSP_vdivD(
-            _: *const c_double,
-            _: c_long,
-            _: *const c_double,
-            _: c_long,
-            _: *mut c_double,
-            _: c_long,
-            _: c_ulong,
-        );
-        pub fn vDSP_vdiv(
-            _: *const c_float,
-            _: c_long,
-            _: *const c_float,
-            _: c_long,
-            _: *mut c_float,
-            _: c_long,
-            _: c_ulong,
-        );
-        pub fn vDSP_vminD(
-            _: *const c_double,
-            _: c_long,
-            _: *const c_double,
-            _: c_long,
-            _: *mut c_double,
-            _: c_long,
-            _: c_ulong,
-        );
-        pub fn vDSP_vmin(
-            _: *const c_float,
-            _: c_long,
-            _: *const c_float,
-            _: c_long,
-            _: *mut c_float,
-            _: c_long,
-            _: c_ulong,
-        );
-        pub fn vDSP_vmaxD(
-            _: *const c_double,
-            _: c_long,
-            _: *const c_double,
-            _: c_long,
-            _: *mut c_double,
-            _: c_long,
-            _: c_ulong,
-        );
-        pub fn vDSP_vmax(
-            _: *const c_float,
-            _: c_long,
-            _: *const c_float,
-            _: c_long,
-            _: *mut c_float,
-            _: c_long,
-            _: c_ulong,
-        );
-    }
-}
-
-#[allow(clippy::too_many_arguments)]
-#[inline]
-pub unsafe fn sgemm(
-    transa: u8,
-    transb: u8,
-    m: i32,
-    n: i32,
-    k: i32,
-    alpha: f32,
-    a: &[f32],
-    lda: i32,
-    b: &[f32],
-    ldb: i32,
-    beta: f32,
-    c: &mut [f32],
-    ldc: i32,
-) {
-    ffi::sgemm_ffi(
-        &(transa as c_char),
-        &(transb as c_char),
-        &m,
-        &n,
-        &k,
-        &alpha,
-        a.as_ptr(),
-        &lda,
-        b.as_ptr(),
-        &ldb,
-        &beta,
-        c.as_mut_ptr(),
-        &ldc,
-    )
-}
-
-#[allow(clippy::too_many_arguments)]
-#[inline]
-pub unsafe fn dgemm(
-    transa: u8,
-    transb: u8,
-    m: i32,
-    n: i32,
-    k: i32,
-    alpha: f64,
-    a: &[f64],
-    lda: i32,
-    b: &[f64],
-    ldb: i32,
-    beta: f64,
-    c: &mut [f64],
-    ldc: i32,
-) {
-    ffi::dgemm_ffi(
-        &(transa as c_char),
-        &(transb as c_char),
-        &m,
-        &n,
-        &k,
-        &alpha,
-        a.as_ptr(),
-        &lda,
-        b.as_ptr(),
-        &ldb,
-        &beta,
-        c.as_mut_ptr(),
-        &ldc,
-    )
-}
-
-#[inline]
-pub fn vs_exp(a: &[f32], y: &mut [f32]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvexpf(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-
-#[inline]
-pub fn vd_exp(a: &[f64], y: &mut [f64]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvexp(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-
-#[inline]
-pub fn vs_sqrt(a: &[f32], y: &mut [f32]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvsqrtf(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-
-#[inline]
-pub fn vd_sqrt(a: &[f64], y: &mut [f64]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvsqrt(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-
-#[inline]
-pub fn vs_sin(a: &[f32], y: &mut [f32]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvsinf(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-
-#[inline]
-pub fn vd_sin(a: &[f64], y: &mut [f64]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvsin(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-#[inline]
-pub fn vs_cos(a: &[f32], y: &mut [f32]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvcosf(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-
-#[inline]
-pub fn vd_cos(a: &[f64], y: &mut [f64]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvcos(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-#[inline]
-pub fn vs_tanh(a: &[f32], y: &mut [f32]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvtanhf(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-
-#[inline]
-pub fn vd_tanh(a: &[f64], y: &mut [f64]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvtanh(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-
-#[inline]
-pub fn vs_ln(a: &[f32], y: &mut [f32]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvlogf(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-
-#[inline]
-pub fn vd_ln(a: &[f64], y: &mut [f64]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    unsafe { ffi::vvlog(y.as_mut_ptr(), a.as_ptr(), &(a_len as i32)) }
-}
-
-#[inline]
-pub fn vs_sqr(a: &[f32], y: &mut [f32]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    y.iter_mut().zip(a.iter()).for_each(|(y, a)| *y = *a * *a)
-}
-
-#[inline]
-pub fn vd_sqr(a: &[f64], y: &mut [f64]) {
-    let a_len = a.len();
-    let y_len = y.len();
-    if a_len != y_len {
-        panic!("a and y have different lengths {a_len} <> {y_len}")
-    }
-    y.iter_mut().zip(a.iter()).for_each(|(y, a)| *y = *a * *a)
-}
-
-#[inline]
-pub fn vs_tanh_inplace(y: &mut [f32]) {
-    unsafe { ffi::vvtanhf(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
-#[inline]
-pub fn vd_tanh_inplace(y: &mut [f64]) {
-    unsafe { ffi::vvtanh(y.as_mut_ptr(), y.as_ptr(), &(y.len() as i32)) }
-}
-
-#[inline]
-pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = (2.0f32 / std::f32::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)
-    }
-    vs_tanh_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = 0.5 * v * (1.0 + *y)
-    }
-}
-
-#[inline]
-pub fn vd_gelu(vs: &[f64], ys: &mut [f64]) {
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = (2.0f64 / std::f64::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)
-    }
-    vd_tanh_inplace(ys);
-    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
-        *y = 0.5 * v * (1.0 + *y)
-    }
-}
-
-macro_rules! binary_op {
-    ($fn_name:ident, $ty:ty, $accelerate_name:ident) => {
-        #[inline]
-        pub fn $fn_name(a: &[$ty], b: &[$ty], y: &mut [$ty]) {
-            let a_len = a.len();
-            let b_len = b.len();
-            let y_len = y.len();
-            if a_len != y_len || b_len != y_len {
-                panic!(
-                    "{} a,b,y len mismatch {a_len} {b_len} {y_len}",
-                    stringify!($fn_name)
-                );
-            }
-            unsafe {
-                // Weird quirk of accelerate, the rhs comes before the lhs.
-                ffi::$accelerate_name(
-                    b.as_ptr(),
-                    1,
-                    a.as_ptr(),
-                    1,
-                    y.as_mut_ptr(),
-                    1,
-                    a_len as u64,
-                )
-            }
-        }
-    };
-}
-binary_op!(vs_add, f32, vDSP_vadd);
-binary_op!(vd_add, f64, vDSP_vaddD);
-binary_op!(vs_sub, f32, vDSP_vsub);
-binary_op!(vd_sub, f64, vDSP_vsubD);
-binary_op!(vs_mul, f32, vDSP_vmul);
-binary_op!(vd_mul, f64, vDSP_vmulD);
-binary_op!(vs_div, f32, vDSP_vdiv);
-binary_op!(vd_div, f64, vDSP_vdivD);
-binary_op!(vs_max, f32, vDSP_vmax);
-binary_op!(vd_max, f64, vDSP_vmaxD);
-binary_op!(vs_min, f32, vDSP_vmin);
-binary_op!(vd_min, f64, vDSP_vminD);
--- a/candle-core/src/backend.rs
+++ b/candle-core/src/backend.rs
@ -1,7 +1,6 @@
-use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
 use crate::{CpuStorage, DType, Layout, Result, Shape};

-pub trait BackendStorage: Sized {
+pub(crate) trait BackendStorage: Sized {
    type Device: BackendDevice;

    fn try_clone(&self, _: &Layout) -> Result<Self>;
@ -15,19 +14,18 @@ pub trait BackendStorage: Sized {

    fn affine(&self, _: &Layout, _: f64, _: f64) -> Result<Self>;

-    fn powf(&self, _: &Layout, _: f64) -> Result<Self>;
-
    fn elu(&self, _: &Layout, _: f64) -> Result<Self>;

-    fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Self>;
+    fn sum(&self, _: &Layout, _: &[usize]) -> Result<Self>;

-    fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Self>;
+    fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()>;

    fn to_dtype(&self, _: &Layout, _: DType) -> Result<Self>;

-    fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self>;
+    fn unary_impl<B: crate::op::UnaryOp>(&self, _: &Layout) -> Result<Self>;

-    fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) -> Result<Self>;
+    fn binary_impl<B: crate::op::BinaryOp>(&self, _: &Self, _: &Layout, _: &Layout)
+        -> Result<Self>;

    fn where_cond(&self, _: &Layout, _: &Self, _: &Layout, _: &Self, _: &Layout) -> Result<Self>;

@ -39,47 +37,7 @@ pub trait BackendStorage: Sized {
        _params: &crate::conv::ParamsConv1D,
    ) -> Result<Self>;

-    fn conv2d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &crate::conv::ParamsConv2D,
-    ) -> Result<Self>;
-
-    fn conv_transpose2d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &crate::conv::ParamsConvTranspose2D,
-    ) -> Result<Self>;
-
-    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self>;
-    fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self>;
-    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self>;
-    fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self>;
-
-    fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self>;
-    fn scatter_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self>;
-    fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> Result<Self>;
-    fn index_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self>;
+    fn embedding(&self, _: &Layout, _: &Self, _: &Layout) -> Result<Self>;

    fn matmul(
        &self,
@ -92,7 +50,7 @@ pub trait BackendStorage: Sized {
    fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()>;
 }

-pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
+pub(crate) trait BackendDevice: Sized + std::fmt::Debug + Clone {
    type Storage: BackendStorage;

    // TODO: Make the usize generic and part of a generic DeviceLocation.
@ -111,6 +69,4 @@ pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
    fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;

    fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;
-
-    fn set_seed(&self, _: u64) -> Result<()>;
 }
--- a/candle-core/src/backprop.rs
+++ b/candle-core/src/backprop.rs
@ -1,20 +1,6 @@
-use crate::op::{BinaryOp, Op, ReduceOp, UnaryOp};
-use crate::{Error, Result, Tensor, TensorId};
+use crate::{op::Op, Error, Result, Tensor, TensorId};
 use std::collections::HashMap;

-// arg has been reduced to node via reduce_dims, expand it back to arg.
-// This has to handle keepdims.
-fn broadcast_back(arg: &Tensor, node: &Tensor, reduced_dims: &[usize]) -> Result<Tensor> {
-    if arg.rank() == node.rank() {
-        // keepdim = true
-        node.broadcast_as(arg.shape())
-    } else {
-        // keepdim = false
-        // first expand the reduced dims.
-        node.reshape(reduced_dims)?.broadcast_as(arg.shape())
-    }
-}
-
 impl Tensor {
    /// Return all the nodes that lead to this value in a topologically sorted vec, the first
    /// elements having dependencies on the latter ones, e.g. the first element if any is the
@ -38,10 +24,7 @@ impl Tensor {
                nodes
            } else if let Some(op) = node.op() {
                match op {
-                    Op::IndexAdd(t1, t2, t3, _)
-                    | Op::ScatterAdd(t1, t2, t3, _)
-                    | Op::CustomOp3(t1, t2, t3, _)
-                    | Op::WhereCond(t1, t2, t3) => {
+                    Op::WhereCond(t1, t2, t3) => {
                        let (tg, nodes) = walk(t1, nodes, already_seen);
                        track_grad |= tg;
                        let (tg, nodes) = walk(t2, nodes, already_seen);
@ -55,22 +38,12 @@ impl Tensor {
                        kernel: rhs,
                        ..
                    }
-                    | Op::Conv2D {
-                        arg: lhs,
-                        kernel: rhs,
-                        ..
-                    }
-                    | Op::ConvTranspose2D {
-                        arg: lhs,
-                        kernel: rhs,
-                        ..
-                    }
-                    | Op::CustomOp2(lhs, rhs, _)
-                    | Op::Binary(lhs, rhs, _)
-                    | Op::Gather(lhs, rhs, _)
-                    | Op::IndexSelect(lhs, rhs, _)
-                    | Op::Matmul(lhs, rhs)
-                    | Op::SliceScatter0(lhs, rhs, _) => {
+                    | Op::Add(lhs, rhs)
+                    | Op::Mul(lhs, rhs)
+                    | Op::Sub(lhs, rhs)
+                    | Op::Div(lhs, rhs)
+                    | Op::Embedding(lhs, rhs)
+                    | Op::Matmul(lhs, rhs) => {
                        let (tg, nodes) = walk(lhs, nodes, already_seen);
                        track_grad |= tg;
                        let (tg, nodes) = walk(rhs, nodes, already_seen);
@ -91,32 +64,29 @@ impl Tensor {
                            nodes
                        }
                    }
-                    Op::Unary(_node, UnaryOp::Ceil)
-                    | Op::Unary(_node, UnaryOp::Floor)
-                    | Op::Unary(_node, UnaryOp::Round) => nodes,
                    Op::Reshape(node)
-                    | Op::UpsampleNearest1D(node)
-                    | Op::UpsampleNearest2D(node)
-                    | Op::AvgPool2D { arg: node, .. }
-                    | Op::MaxPool2D { arg: node, .. }
-                    | Op::Copy(node)
                    | Op::Broadcast(node)
-                    | Op::Cmp(node, _)
-                    | Op::Reduce(node, ReduceOp::Min | ReduceOp::Sum | ReduceOp::Max, _)
+                    | Op::Sum(node, _)
                    | Op::ToDType(node)
                    | Op::ToDevice(node)
                    | Op::Transpose(node, _, _)
-                    | Op::Permute(node, _)
                    | Op::Narrow(node, _, _, _)
-                    | Op::Unary(node, _)
+                    | Op::Softmax(node, _)
+                    | Op::Sqr(node)
+                    | Op::Sqrt(node)
+                    | Op::Gelu(node)
+                    | Op::Relu(node)
                    | Op::Elu(node, _)
-                    | Op::Powf(node, _)
-                    | Op::CustomOp1(node, _) => {
+                    | Op::Exp(node)
+                    | Op::Log(node)
+                    | Op::Sin(node)
+                    | Op::Cos(node)
+                    | Op::Abs(node)
+                    | Op::Neg(node) => {
                        let (tg, nodes) = walk(node, nodes, already_seen);
                        track_grad |= tg;
                        nodes
                    }
-                    Op::Reduce(_, ReduceOp::ArgMin | ReduceOp::ArgMax, _) => nodes,
                }
            } else {
                nodes
@ -146,19 +116,19 @@ impl Tensor {
            // this is out of scope.
            if let Some(op) = node.op() {
                match op {
-                    Op::Binary(lhs, rhs, BinaryOp::Add) => {
+                    Op::Add(lhs, rhs) => {
                        let lhs_sum_grad = grads.or_insert(lhs)?;
                        *lhs_sum_grad = lhs_sum_grad.add(&grad)?;
                        let rhs_sum_grad = grads.or_insert(rhs)?;
                        *rhs_sum_grad = rhs_sum_grad.add(&grad)?;
                    }
-                    Op::Binary(lhs, rhs, BinaryOp::Sub) => {
+                    Op::Sub(lhs, rhs) => {
                        let lhs_sum_grad = grads.or_insert(lhs)?;
                        *lhs_sum_grad = lhs_sum_grad.add(&grad)?;
                        let rhs_sum_grad = grads.or_insert(rhs)?;
                        *rhs_sum_grad = rhs_sum_grad.sub(&grad)?;
                    }
-                    Op::Binary(lhs, rhs, BinaryOp::Mul) => {
+                    Op::Mul(lhs, rhs) => {
                        let lhs_grad = grad.mul(rhs)?;
                        let lhs_sum_grad = grads.or_insert(lhs)?;
                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
@ -166,27 +136,12 @@ impl Tensor {
                        let rhs_sum_grad = grads.or_insert(rhs)?;
                        *rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
                    }
-                    Op::Binary(lhs, rhs, BinaryOp::Div) => {
+                    Op::Div(lhs, rhs) => {
                        let lhs_grad = grad.div(rhs)?;
                        let lhs_sum_grad = grads.or_insert(lhs)?;
                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
                        let rhs_grad = grad.mul(lhs)?.div(&rhs.sqr()?)?;
                        let rhs_sum_grad = grads.or_insert(rhs)?;
-                        *rhs_sum_grad = rhs_sum_grad.sub(&rhs_grad)?;
-                    }
-                    Op::Binary(lhs, rhs, BinaryOp::Minimum)
-                    | Op::Binary(lhs, rhs, BinaryOp::Maximum) => {
-                        let mask_lhs = node.eq(lhs)?.to_dtype(grad.dtype())?;
-                        let mask_rhs = node.eq(rhs)?.to_dtype(grad.dtype())?;
-
-                        // If both masks are 1 one the same point, we want to scale the
-                        // gradient by 0.5 rather than 1.
-                        let lhs_grad = mask_lhs.mul(&grad)?.div(&(&mask_rhs + 1.)?)?;
-                        let lhs_sum_grad = grads.or_insert(lhs)?;
-                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
-
-                        let rhs_grad = mask_rhs.mul(&grad)?.div(&(&mask_lhs + 1.)?)?;
-                        let rhs_sum_grad = grads.or_insert(rhs)?;
                        *rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
                    }
                    Op::WhereCond(pred, t, f) => {
@ -198,114 +153,9 @@ impl Tensor {
                        let f_grad = pred.where_cond(&zeros, &grad)?;
                        *f_sum_grad = f_sum_grad.add(&f_grad)?;
                    }
-                    Op::Conv1D { .. } => Err(Error::BackwardNotSupported { op: "conv1d" })?,
-                    Op::Conv2D {
-                        arg,
-                        kernel,
-                        padding,
-                        stride,
-                        dilation,
-                    } => {
-                        // The output height for conv_transpose2d is:
-                        // (i_h - 1) * stride - 2 * padding + dilation * (k_h - 1) + out_padding + 1
-                        let grad_h = grad.dim(2)?;
-                        let k_h = kernel.dim(2)?;
-                        let out_size =
-                            (grad_h - 1) * stride + dilation * (k_h - 1) + 1 - 2 * padding;
-                        let out_padding = arg.dim(2)? - out_size;
-                        let grad_arg = grad.conv_transpose2d(
-                            kernel,
-                            *padding,
-                            out_padding,
-                            *stride,
-                            *dilation,
-                        )?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad_arg)?;
-
-                        let grad_kernel = arg
-                            .transpose(0, 1)?
-                            .conv2d(&grad.transpose(0, 1)?, *padding, *dilation, *stride, 1)?
-                            .transpose(0, 1)?;
-                        let sum_grad = grads.or_insert(kernel)?;
-                        *sum_grad = sum_grad.add(&grad_kernel)?;
-                    }
-                    Op::ConvTranspose2D { .. } => Err(Error::BackwardNotSupported {
-                        op: "conv-transpose2d",
-                    })?,
-                    Op::AvgPool2D {
-                        arg,
-                        kernel_size,
-                        stride,
-                    } => {
-                        if kernel_size != stride {
-                            crate::bail!("backward not supported for avgpool2d if ksize {kernel_size:?} != stride {stride:?}")
-                        }
-                        let (_n, _c, h, w) = arg.dims4()?;
-                        let grad_arg = grad.upsample_nearest2d(h, w)?;
-                        let grad_arg =
-                            (grad_arg * (1f64 / (kernel_size.0 * kernel_size.1) as f64))?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad_arg)?;
-                    }
-                    Op::MaxPool2D {
-                        arg,
-                        kernel_size,
-                        stride,
-                    } => {
-                        if kernel_size != stride {
-                            crate::bail!("backward not supported for maxpool2d if ksize {kernel_size:?} != stride {stride:?}")
-                        }
-                        let (_n, _c, h, w) = arg.dims4()?;
-                        // For computing the max-pool gradient, we compute a mask where a 1 means
-                        // that the element is the maximum, then we apply this mask to the
-                        // upsampled gradient (taking into account that multiple max may exist so
-                        // we scale the gradient for this case).
-                        let node_upsampled = node.upsample_nearest2d(h, w)?;
-                        let mask = arg.eq(&node_upsampled)?.to_dtype(arg.dtype())?;
-                        let avg = mask.avg_pool2d_with_stride(*kernel_size, *stride)?;
-                        let grad_arg = ((grad * avg)?.upsample_nearest2d(h, w)? * mask)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad_arg)?;
-                    }
-                    Op::UpsampleNearest1D { .. } => Err(Error::BackwardNotSupported {
-                        op: "upsample-nearest1d",
-                    })?,
-                    Op::UpsampleNearest2D { .. } => Err(Error::BackwardNotSupported {
-                        op: "upsample-nearest2d",
-                    })?,
-                    Op::SliceScatter0(lhs, rhs, start_rhs) => {
-                        let rhs_sum_grad = grads.or_insert(rhs)?;
-                        let rhs_grad = grad.narrow(0, *start_rhs, rhs.dim(0)?)?;
-                        *rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
-
-                        let lhs_sum_grad = grads.or_insert(lhs)?;
-                        let lhs_grad = grad.slice_scatter0(&rhs.zeros_like()?, *start_rhs)?;
-                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?
-                    }
-                    Op::Gather(arg, indexes, dim) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.scatter_add(indexes, &grad, *dim)?;
-                    }
-                    Op::ScatterAdd(init, indexes, src, dim) => {
-                        let init_sum_grad = grads.or_insert(init)?;
-                        *init_sum_grad = init_sum_grad.add(&grad)?;
-
-                        let src_grad = grad.gather(indexes, *dim)?;
-                        let src_sum_grad = grads.or_insert(src)?;
-                        *src_sum_grad = src_sum_grad.add(&src_grad)?;
-                    }
-                    Op::IndexAdd(init, indexes, src, dim) => {
-                        let init_sum_grad = grads.or_insert(init)?;
-                        *init_sum_grad = init_sum_grad.add(&grad)?;
-
-                        let src_grad = grad.index_select(indexes, *dim)?;
-                        let src_sum_grad = grads.or_insert(src)?;
-                        *src_sum_grad = src_sum_grad.add(&src_grad)?;
-                    }
-                    Op::IndexSelect(arg, indexes, dim) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.index_add(indexes, &grad, *dim)?;
+                    Op::Conv1D { .. } => return Err(Error::BackwardNotSupported { op: "conv1d" }),
+                    Op::Embedding(_lhs, _rhs) => {
+                        return Err(Error::BackwardNotSupported { op: "embedding" })
                    }
                    Op::Matmul(lhs, rhs) => {
                        // Skipping checks, the op went ok, we can skip
@ -345,80 +195,42 @@ impl Tensor {
                            }
                        }

-                        let mut arg_grad = grad.sum_keepdim(sum_dims.as_slice())?;
-                        for _i in 0..left_dims {
-                            arg_grad = arg_grad.squeeze(0)?
-                        }
+                        let arg_grad = grad.sum(sum_dims.as_slice())?;
                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&arg_grad.broadcast_as(sum_grad.dims())?)?;
+                        *sum_grad = sum_grad.broadcast_add(&arg_grad)?
                    }
-                    Op::Reduce(arg, ReduceOp::Sum, reduced_dims) => {
-                        let grad = broadcast_back(arg, &grad, reduced_dims)?;
+                    Op::Sum(arg, _sum_dims) => {
                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad)?;
-                    }
-                    Op::Cmp(_args, _) => {}
-                    Op::Reduce(arg, ReduceOp::Max, reduced_dims) => {
-                        let node = broadcast_back(arg, node, reduced_dims)?;
-                        let grad = broadcast_back(arg, &grad, reduced_dims)?;
-                        let grad = node.eq(arg)?.to_dtype(grad.dtype())?.mul(&grad)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad.broadcast_as(sum_grad.dims())?)?;
-                    }
-                    Op::Reduce(arg, ReduceOp::Min, reduced_dims) => {
-                        let node = broadcast_back(arg, node, reduced_dims)?;
-                        let grad = broadcast_back(arg, &grad, reduced_dims)?;
-                        let grad = node.eq(arg)?.to_dtype(grad.dtype())?.mul(&grad)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad.broadcast_as(sum_grad.dims())?)?;
+                        *sum_grad = sum_grad.broadcast_add(&grad)?
                    }
                    Op::ToDType(arg) => {
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&grad.to_dtype(node.dtype())?)?
                    }
-                    Op::Copy(arg) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&grad)?
-                    }
                    Op::Affine { arg, mul, .. } => {
                        let arg_grad = grad.affine(*mul, 0.)?;
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&arg_grad)?
                    }
-                    Op::Unary(arg, UnaryOp::Log) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&(grad / arg)?)?
-                    }
-                    Op::Unary(arg, UnaryOp::Sin) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&(&grad * arg.cos())?)?
-                    }
-                    Op::Unary(arg, UnaryOp::Cos) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.sub(&(&grad * arg.sin())?)?
-                    }
-                    Op::Unary(arg, UnaryOp::Tanh) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        let minus_dtanh = (node.sqr()? - 1.)?;
-                        *sum_grad = sum_grad.sub(&(&grad * &minus_dtanh)?)?
-                    }
-                    Op::Unary(arg, UnaryOp::Abs) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        let ones = arg.ones_like()?;
-                        let abs_grad = arg.ge(&arg.zeros_like()?)?.where_cond(&ones, &ones.neg()?);
-                        *sum_grad = sum_grad.add(&(&grad * abs_grad)?)?
-                    }
-                    Op::Unary(arg, UnaryOp::Exp) => {
+                    Op::Log(arg) => {
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&(&grad * *node)?)?
                    }
-                    Op::Unary(arg, UnaryOp::Neg) => {
+                    Op::Sin(arg) => {
                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.sub(&grad)?
+                        *sum_grad = sum_grad.add(&(&grad * arg.cos())?)?
                    }
-                    Op::Unary(arg, UnaryOp::Recip) => {
+                    Op::Cos(arg) => {
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.sub(&(&grad * arg.sin())?)?
+                    }
+                    Op::Abs(_args) => return Err(Error::BackwardNotSupported { op: "abs" }),
+                    Op::Exp(arg) => {
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.add(&(&grad / arg)?)?
+                    }
+                    Op::Neg(arg) => {
                        let sum_grad = grads.or_insert(arg)?;
-                        let grad = (grad / arg.sqr()?)?;
                        *sum_grad = sum_grad.sub(&grad)?
                    }
                    &Op::Narrow(ref arg, dim, start_idx, len) => {
@ -447,76 +259,24 @@ impl Tensor {
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&arg_grad)?
                    }
-                    Op::Reduce(_, ReduceOp::ArgMin, _) => {}
-                    Op::Reduce(_, ReduceOp::ArgMax, _) => {}
+                    Op::Softmax(_arg, _) => {
+                        return Err(Error::BackwardNotSupported { op: "softmax" })
+                    }
                    Op::Reshape(arg) => {
                        let arg_grad = grad.reshape(arg.dims())?;
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&arg_grad)?
                    }
-                    Op::Unary(_, UnaryOp::Ceil) => Err(Error::BackwardNotSupported { op: "ceil" })?,
-                    Op::Unary(_, UnaryOp::Floor) => {
-                        Err(Error::BackwardNotSupported { op: "floor" })?
-                    }
-                    Op::Unary(_, UnaryOp::Round) => {
-                        Err(Error::BackwardNotSupported { op: "round" })?
-                    }
-                    Op::Unary(_, UnaryOp::Gelu) => Err(Error::BackwardNotSupported { op: "gelu" })?,
-                    Op::Unary(_, UnaryOp::Erf) => Err(Error::BackwardNotSupported { op: "erf" })?,
-                    Op::Unary(_, UnaryOp::GeluErf) => {
-                        Err(Error::BackwardNotSupported { op: "gelu-erf" })?
-                    }
-                    Op::Unary(arg, UnaryOp::Relu) => {
-                        let sum_grad = grads.or_insert(arg)?;
-                        let relu_grad = arg.ge(&arg.zeros_like()?)?.to_dtype(arg.dtype())?;
-                        *sum_grad = sum_grad.add(&(&grad * relu_grad)?)?
-                    }
-                    Op::Elu(..) => Err(Error::BackwardNotSupported { op: "elu" })?,
-                    Op::Powf(arg, e) => {
-                        let arg_grad = (&(grad * arg.powf(e - 1.)?)? * *e)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&arg_grad)?
-                    }
-                    Op::CustomOp1(arg, c) => {
-                        if let Some(arg_grad) = c.bwd(arg, node, &grad)? {
-                            let sum_grad = grads.or_insert(arg)?;
-                            *sum_grad = sum_grad.add(&arg_grad)?
-                        }
-                    }
-                    Op::CustomOp2(arg1, arg2, c) => {
-                        let (arg_grad1, arg_grad2) = c.bwd(arg1, arg2, node, &grad)?;
-                        if let Some(arg_grad1) = arg_grad1 {
-                            let sum_grad = grads.or_insert(arg1)?;
-                            *sum_grad = sum_grad.add(&arg_grad1)?
-                        }
-                        if let Some(arg_grad2) = arg_grad2 {
-                            let sum_grad = grads.or_insert(arg2)?;
-                            *sum_grad = sum_grad.add(&arg_grad2)?
-                        }
-                    }
-                    Op::CustomOp3(arg1, arg2, arg3, c) => {
-                        let (arg_grad1, arg_grad2, arg_grad3) =
-                            c.bwd(arg1, arg2, arg3, node, &grad)?;
-                        if let Some(arg_grad1) = arg_grad1 {
-                            let sum_grad = grads.or_insert(arg1)?;
-                            *sum_grad = sum_grad.add(&arg_grad1)?
-                        }
-                        if let Some(arg_grad2) = arg_grad2 {
-                            let sum_grad = grads.or_insert(arg2)?;
-                            *sum_grad = sum_grad.add(&arg_grad2)?
-                        }
-                        if let Some(arg_grad3) = arg_grad3 {
-                            let sum_grad = grads.or_insert(arg3)?;
-                            *sum_grad = sum_grad.add(&arg_grad3)?
-                        }
-                    }
-                    Op::Unary(arg, UnaryOp::Sqr) => {
+                    Op::Gelu(_) => return Err(Error::BackwardNotSupported { op: "gelu" }),
+                    Op::Relu(_) => return Err(Error::BackwardNotSupported { op: "relu" }),
+                    Op::Elu(..) => return Err(Error::BackwardNotSupported { op: "elu" }),
+                    Op::Sqr(arg) => {
                        let arg_grad = arg.mul(&grad)?.affine(2., 0.)?;
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&arg_grad)?
                    }
-                    Op::Unary(arg, UnaryOp::Sqrt) => {
-                        let arg_grad = grad.div(node)?.affine(0.5, 0.)?;
+                    Op::Sqrt(arg) => {
+                        let arg_grad = grad.div(arg)?.affine(0.5, 0.)?;
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&arg_grad)?
                    }
@ -530,15 +290,6 @@ impl Tensor {
                        let sum_grad = grads.or_insert(arg)?;
                        *sum_grad = sum_grad.add(&arg_grad)?
                    }
-                    Op::Permute(arg, dims) => {
-                        let mut inv_dims = vec![0; dims.len()];
-                        for (i, &dim_idx) in dims.iter().enumerate() {
-                            inv_dims[dim_idx] = i
-                        }
-                        let arg_grad = grad.permute(inv_dims)?;
-                        let sum_grad = grads.or_insert(arg)?;
-                        *sum_grad = sum_grad.add(&arg_grad)?
-                    }
                };
            }
        }
@ -546,7 +297,6 @@ impl Tensor {
    }
 }

-#[derive(Debug)]
 pub struct GradStore(HashMap<TensorId, Tensor>);

 impl GradStore {
--- a/candle-core/src/conv.rs
+++ b/candle-core/src/conv.rs
@ -1,8 +1,6 @@
-use crate::{op::BackpropOp, op::Op, Error, Result, Tensor};
-
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ParamsConv1D {
-    pub(crate) b_size: usize,
+    pub(crate) b_size: Option<usize>,
    // Maybe we should have a version without l_in as this bit depends on the input and not only on
    // the weights.
    pub(crate) l_in: usize,
@ -11,255 +9,19 @@ pub struct ParamsConv1D {
    pub(crate) k_size: usize,
    pub(crate) padding: usize,
    pub(crate) stride: usize,
-    pub(crate) dilation: usize,
 }

 impl ParamsConv1D {
    pub(crate) fn l_out(&self) -> usize {
-        (self.l_in + 2 * self.padding - self.dilation * (self.k_size - 1) - 1) / self.stride + 1
+        let dilation = 1;
+        (self.l_in + 2 * self.padding - dilation * (self.k_size - 1) - 1) / self.stride + 1
    }

    pub(crate) fn out_dims(&self) -> Vec<usize> {
        let l_out = self.l_out();
-        vec![self.b_size, self.c_out, l_out]
-    }
-}
-
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum CudnnFwdAlgo {
-    ImplicitGemm,
-    ImplicitPrecompGemm,
-    Gemm,
-    Direct,
-    Fft,
-    FftTiling,
-    Winograd,
-    WinogradNonFused,
-    Count,
-}
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct ParamsConv2D {
-    pub(crate) b_size: usize,
-    pub(crate) i_h: usize,
-    pub(crate) i_w: usize,
-    pub(crate) k_h: usize,
-    pub(crate) k_w: usize,
-    pub(crate) c_out: usize,
-    pub(crate) c_in: usize,
-    pub(crate) padding: usize,
-    pub(crate) stride: usize,
-    pub(crate) dilation: usize,
-    pub cudnn_fwd_algo: Option<CudnnFwdAlgo>,
-}
-
-impl ParamsConv2D {
-    pub(crate) fn out_h(&self) -> usize {
-        (self.i_h + 2 * self.padding - self.dilation * (self.k_h - 1) - 1) / self.stride + 1
-    }
-
-    pub(crate) fn out_w(&self) -> usize {
-        (self.i_w + 2 * self.padding - self.dilation * (self.k_w - 1) - 1) / self.stride + 1
-    }
-
-    pub(crate) fn out_dims(&self) -> Vec<usize> {
-        vec![self.b_size, self.c_out, self.out_h(), self.out_w()]
-    }
-}
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct ParamsConvTranspose2D {
-    pub(crate) b_size: usize,
-    pub(crate) i_h: usize,
-    pub(crate) i_w: usize,
-    pub(crate) k_h: usize,
-    pub(crate) k_w: usize,
-    pub(crate) c_out: usize,
-    pub(crate) c_in: usize,
-    pub(crate) padding: usize,
-    pub(crate) output_padding: usize,
-    pub(crate) stride: usize,
-    pub(crate) dilation: usize,
-}
-
-impl ParamsConvTranspose2D {
-    pub(crate) fn out_h(&self) -> usize {
-        (self.i_h - 1) * self.stride + self.dilation * (self.k_h - 1) + self.output_padding + 1
-            - 2 * self.padding
-    }
-
-    pub(crate) fn out_w(&self) -> usize {
-        (self.i_w - 1) * self.stride + self.dilation * (self.k_w - 1) + self.output_padding + 1
-            - 2 * self.padding
-    }
-
-    pub(crate) fn out_dims(&self) -> Vec<usize> {
-        vec![self.b_size, self.c_out, self.out_h(), self.out_w()]
-    }
-}
-
-impl Tensor {
-    fn conv1d_single_group(&self, kernel: &Self, params: &ParamsConv1D) -> Result<Self> {
-        let storage =
-            self.storage()
-                .conv1d(self.layout(), &kernel.storage(), kernel.layout(), params)?;
-        let op = BackpropOp::new2(self, kernel, |arg, kernel| Op::Conv1D {
-            arg,
-            kernel,
-            padding: params.padding,
-            stride: params.stride,
-            dilation: params.dilation,
-        });
-        let out_dims = params.out_dims();
-        Ok(crate::tensor::from_storage(storage, out_dims, op, false))
-    }
-
-    /// Applies a 1D convolution over the input tensor.
-    pub fn conv1d(
-        &self,
-        kernel: &Self,
-        padding: usize,
-        stride: usize,
-        dilation: usize,
-        groups: usize,
-    ) -> Result<Self> {
-        let (c_out, c_in_k, k_size) = kernel.dims3()?;
-        let (b_size, c_in, l_in) = self.dims3()?;
-        if c_in != c_in_k * groups {
-            Err(Error::Conv1dInvalidArgs {
-                inp_shape: self.shape().clone(),
-                k_shape: kernel.shape().clone(),
-                padding,
-                stride,
-                msg: "the number of in-channels on the input doesn't match the kernel size",
-            }
-            .bt())?
-        }
-
-        let params = ParamsConv1D {
-            b_size,
-            l_in,
-            c_out: c_out / groups,
-            c_in: c_in / groups,
-            k_size,
-            padding,
-            stride,
-            dilation,
-        };
-        if groups == 1 {
-            self.conv1d_single_group(kernel, &params)
-        } else {
-            let blocks = self.chunk(groups, 1)?;
-            let kernel = kernel.chunk(groups, 0)?;
-            let blocks = blocks
-                .iter()
-                .zip(&kernel)
-                .map(|(block, kernel)| block.conv1d_single_group(kernel, &params))
-                .collect::<Result<Vec<_>>>()?;
-            Tensor::cat(&blocks, 1)
+        match self.b_size {
+            None => vec![self.c_out, l_out],
+            Some(n) => vec![n, self.c_out, l_out],
        }
    }
-
-    fn conv2d_single_group(&self, kernel: &Self, params: &ParamsConv2D) -> Result<Self> {
-        let storage =
-            self.storage()
-                .conv2d(self.layout(), &kernel.storage(), kernel.layout(), params)?;
-        let op = BackpropOp::new2(self, kernel, |arg, kernel| Op::Conv2D {
-            arg,
-            kernel,
-            padding: params.padding,
-            stride: params.stride,
-            dilation: params.dilation,
-        });
-        let out_dims = params.out_dims();
-        Ok(crate::tensor::from_storage(storage, out_dims, op, false))
-    }
-
-    /// Applies a 2D convolution over the input tensor.
-    pub fn conv2d(
-        &self,
-        kernel: &Self,
-        padding: usize,
-        stride: usize,
-        dilation: usize,
-        groups: usize,
-    ) -> Result<Self> {
-        let (b_size, c_in, i_h, i_w) = self.dims4()?;
-        let (c_out, c_in_k, k_h, k_w) = kernel.dims4()?;
-        if c_in != c_in_k * groups {
-            crate::bail!(
-                "in_channel mismatch between input ({c_in}, groups {groups}) and kernel ({c_in_k})"
-            )
-        }
-        let params = ParamsConv2D {
-            b_size,
-            i_h,
-            i_w,
-            k_h,
-            k_w,
-            c_out: c_out / groups,
-            c_in: c_in / groups,
-            padding,
-            stride,
-            dilation,
-            cudnn_fwd_algo: None,
-        };
-        if groups == 1 {
-            self.conv2d_single_group(kernel, &params)
-        } else {
-            let blocks = self.chunk(groups, 1)?;
-            let kernel = kernel.chunk(groups, 0)?;
-            let blocks = blocks
-                .iter()
-                .zip(&kernel)
-                .map(|(block, kernel)| block.conv2d_single_group(kernel, &params))
-                .collect::<Result<Vec<_>>>()?;
-            Tensor::cat(&blocks, 1)
-        }
-    }
-
-    /// Applies a 2D transposed convolution over the input tensor.
-    pub fn conv_transpose2d(
-        &self,
-        kernel: &Self,
-        padding: usize,
-        output_padding: usize,
-        stride: usize,
-        dilation: usize,
-    ) -> Result<Self> {
-        let (b_size, c_in, i_h, i_w) = self.dims4()?;
-        let (c_in_k, c_out, k_h, k_w) = kernel.dims4()?;
-        if c_in != c_in_k {
-            crate::bail!("in_channel mismatch between input ({c_in}) and kernel ({c_in_k})")
-        }
-        let params = ParamsConvTranspose2D {
-            b_size,
-            i_h,
-            i_w,
-            k_h,
-            k_w,
-            c_out,
-            c_in,
-            padding,
-            output_padding,
-            stride,
-            dilation,
-        };
-        let storage = self.storage().conv_transpose2d(
-            self.layout(),
-            &kernel.storage(),
-            kernel.layout(),
-            &params,
-        )?;
-        let op = BackpropOp::new2(self, kernel, |arg, kernel| Op::ConvTranspose2D {
-            arg,
-            kernel,
-            padding: params.padding,
-            output_padding: params.output_padding,
-            stride: params.stride,
-            dilation: params.dilation,
-        });
-        let out_dims = params.out_dims();
-        Ok(crate::tensor::from_storage(storage, out_dims, op, false))
-    }
 }
--- a/candle-core/src/convert.rs
+++ b/candle-core/src/convert.rs
@ -1,6 +1,6 @@
 //! Implement conversion traits for tensors
-use crate::{DType, Device, Error, Tensor, WithDType};
-use half::{bf16, f16, slice::HalfFloatSliceExt};
+use crate::{Device, Error, Tensor, WithDType};
+use half::{bf16, f16};
 use std::convert::TryFrom;

 impl<T: WithDType> TryFrom<&Tensor> for Vec<T> {
@ -92,54 +92,5 @@ from_tensor!(f64);
 from_tensor!(f32);
 from_tensor!(f16);
 from_tensor!(bf16);
-from_tensor!(i64);
 from_tensor!(u32);
 from_tensor!(u8);
-
-impl Tensor {
-    pub fn write_bytes<W: std::io::Write>(&self, f: &mut W) -> crate::Result<()> {
-        use byteorder::{LittleEndian, WriteBytesExt};
-
-        let vs = self.flatten_all()?;
-        match self.dtype() {
-            DType::BF16 => {
-                let vs = vs.to_vec1::<bf16>()?;
-                for &v in vs.reinterpret_cast() {
-                    f.write_u16::<LittleEndian>(v)?
-                }
-            }
-            DType::F16 => {
-                let vs = vs.to_vec1::<f16>()?;
-                for &v in vs.reinterpret_cast() {
-                    f.write_u16::<LittleEndian>(v)?
-                }
-            }
-            DType::F32 => {
-                // TODO: Avoid using a buffer when data is already on the CPU.
-                for v in vs.to_vec1::<f32>()? {
-                    f.write_f32::<LittleEndian>(v)?
-                }
-            }
-            DType::F64 => {
-                for v in vs.to_vec1::<f64>()? {
-                    f.write_f64::<LittleEndian>(v)?
-                }
-            }
-            DType::U32 => {
-                for v in vs.to_vec1::<u32>()? {
-                    f.write_u32::<LittleEndian>(v)?
-                }
-            }
-            DType::I64 => {
-                for v in vs.to_vec1::<i64>()? {
-                    f.write_i64::<LittleEndian>(v)?
-                }
-            }
-            DType::U8 => {
-                let vs = vs.to_vec1::<u8>()?;
-                f.write_all(&vs)?;
-            }
-        }
-        Ok(())
-    }
-}
--- a/candle-core/src/cpu/avx.rs
+++ b/candle-core/src/cpu/avx.rs
@ -1,148 +0,0 @@
-use super::{Cpu, CpuF16};
-#[cfg(target_arch = "x86")]
-use core::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use core::arch::x86_64::*;
-
-use half::f16;
-
-pub struct CurrentCpu {}
-
-const STEP: usize = 32;
-const EPR: usize = 8;
-const ARR: usize = STEP / EPR;
-
-impl Cpu<ARR> for CurrentCpu {
-    type Unit = __m256;
-    type Array = [__m256; ARR];
-
-    const STEP: usize = STEP;
-    const EPR: usize = EPR;
-
-    fn n() -> usize {
-        ARR
-    }
-
-    unsafe fn zero() -> Self::Unit {
-        _mm256_setzero_ps()
-    }
-
-    unsafe fn zero_array() -> Self::Array {
-        [Self::zero(); ARR]
-    }
-
-    unsafe fn from_f32(v: f32) -> Self::Unit {
-        _mm256_set1_ps(v)
-    }
-
-    unsafe fn load(mem_addr: *const f32) -> Self::Unit {
-        _mm256_loadu_ps(mem_addr)
-    }
-
-    unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
-        _mm256_add_ps(a, b)
-    }
-
-    unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self::Unit {
-        _mm256_add_ps(_mm256_mul_ps(b, c), a)
-    }
-
-    unsafe fn vec_store(mem_addr: *mut f32, a: Self::Unit) {
-        _mm256_storeu_ps(mem_addr, a);
-    }
-
-    unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
-        for i in 0..ARR / 2 {
-            x[2 * i] = _mm256_add_ps(x[2 * i], x[2 * i + 1]);
-        }
-        for i in 0..ARR / 4 {
-            x[4 * i] = _mm256_add_ps(x[4 * i], x[4 * i + 2]);
-        }
-        #[allow(clippy::reversed_empty_ranges)]
-        for i in 0..ARR / 8 {
-            x[8 * i] = _mm256_add_ps(x[8 * i], x[8 * i + 4]);
-        }
-        let t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), _mm256_extractf128_ps(x[0], 1));
-        let t1 = _mm_hadd_ps(t0, t0);
-        *y = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));
-    }
-}
-
-pub struct CurrentCpuF16 {}
-impl CpuF16<ARR> for CurrentCpuF16 {
-    type Unit = __m256;
-    type Array = [__m256; ARR];
-
-    const STEP: usize = STEP;
-    const EPR: usize = EPR;
-
-    fn n() -> usize {
-        ARR
-    }
-
-    unsafe fn zero() -> Self::Unit {
-        _mm256_setzero_ps()
-    }
-
-    unsafe fn zero_array() -> Self::Array {
-        [Self::zero(); ARR]
-    }
-
-    unsafe fn from_f32(v: f32) -> Self::Unit {
-        _mm256_set1_ps(v)
-    }
-
-    #[cfg(target_feature = "f16c")]
-    unsafe fn load(mem_addr: *const f16) -> Self::Unit {
-        _mm256_cvtph_ps(_mm_loadu_si128(mem_addr as *const __m128i))
-    }
-
-    #[cfg(not(target_feature = "f16c"))]
-    unsafe fn load(mem_addr: *const f16) -> Self::Unit {
-        let mut tmp = [0.0f32; 8];
-        for i in 0..8 {
-            tmp[i] = (*mem_addr.add(i)).to_f32();
-        }
-        _mm256_loadu_ps(tmp.as_ptr())
-    }
-
-    unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
-        _mm256_add_ps(a, b)
-    }
-
-    unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self::Unit {
-        _mm256_add_ps(_mm256_mul_ps(b, c), a)
-    }
-
-    #[cfg(target_feature = "f16c")]
-    unsafe fn vec_store(mem_addr: *mut f16, a: Self::Unit) {
-        _mm_storeu_si128(mem_addr as *mut __m128i, _mm256_cvtps_ph(a, 0))
-    }
-
-    #[cfg(not(target_feature = "f16c"))]
-    unsafe fn vec_store(mem_addr: *mut f16, a: Self::Unit) {
-        let mut tmp = [0.0f32; 8];
-        _mm256_storeu_ps(tmp.as_mut_ptr(), a);
-        for i in 0..8 {
-            *mem_addr.add(i) = f16::from_f32(tmp[i]);
-        }
-    }
-
-    unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
-        let mut offset = ARR >> 1;
-        for i in 0..offset {
-            x[i] = _mm256_add_ps(x[i], x[offset + i]);
-        }
-        offset >>= 1;
-        for i in 0..offset {
-            x[i] = _mm256_add_ps(x[i], x[offset + i]);
-        }
-        offset >>= 1;
-        for i in 0..offset {
-            x[i] = _mm256_add_ps(x[i], x[offset + i]);
-        }
-        let t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), _mm256_extractf128_ps(x[0], 1));
-        let t1 = _mm_hadd_ps(t0, t0);
-        *y = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));
-    }
-}
--- a/candle-core/src/cpu/erf.rs
+++ b/candle-core/src/cpu/erf.rs
@ -1,763 +0,0 @@
-#![allow(clippy::excessive_precision)]
-// Code taken from https://github.com/statrs-dev/statrs
-//! Provides the [error](https://en.wikipedia.org/wiki/Error_function) and
-//! related functions
-
-mod evaluate {
-    //! Provides functions that don't have a numerical solution and must
-    //! be solved computationally (e.g. evaluation of a polynomial)
-
-    /// evaluates a polynomial at `z` where `coeff` are the coeffecients
-    /// to a polynomial of order `k` where `k` is the length of `coeff` and the
-    /// coeffecient
-    /// to the `k`th power is the `k`th element in coeff. E.g. [3,-1,2] equates to
-    /// `2z^2 - z + 3`
-    ///
-    /// # Remarks
-    ///
-    /// Returns 0 for a 0 length coefficient slice
-    pub fn polynomial(z: f64, coeff: &[f64]) -> f64 {
-        let n = coeff.len();
-        if n == 0 {
-            return 0.0;
-        }
-
-        let mut sum = *coeff.last().unwrap();
-        for c in coeff[0..n - 1].iter().rev() {
-            sum = *c + z * sum;
-        }
-        sum
-    }
-}
-use std::f64;
-
-/// `erf` calculates the error function at `x`.
-pub fn erf(x: f64) -> f64 {
-    if x.is_nan() {
-        f64::NAN
-    } else if x >= 0.0 && x.is_infinite() {
-        1.0
-    } else if x <= 0.0 && x.is_infinite() {
-        -1.0
-    } else if x == 0. {
-        0.0
-    } else {
-        erf_impl(x, false)
-    }
-}
-
-/// `erf_inv` calculates the inverse error function
-/// at `x`.
-pub fn erf_inv(x: f64) -> f64 {
-    if x == 0.0 {
-        0.0
-    } else if x >= 1.0 {
-        f64::INFINITY
-    } else if x <= -1.0 {
-        f64::NEG_INFINITY
-    } else if x < 0.0 {
-        erf_inv_impl(-x, 1.0 + x, -1.0)
-    } else {
-        erf_inv_impl(x, 1.0 - x, 1.0)
-    }
-}
-
-/// `erfc` calculates the complementary error function
-/// at `x`.
-pub fn erfc(x: f64) -> f64 {
-    if x.is_nan() {
-        f64::NAN
-    } else if x == f64::INFINITY {
-        0.0
-    } else if x == f64::NEG_INFINITY {
-        2.0
-    } else {
-        erf_impl(x, true)
-    }
-}
-
-/// `erfc_inv` calculates the complementary inverse
-/// error function at `x`.
-pub fn erfc_inv(x: f64) -> f64 {
-    if x <= 0.0 {
-        f64::INFINITY
-    } else if x >= 2.0 {
-        f64::NEG_INFINITY
-    } else if x > 1.0 {
-        erf_inv_impl(-1.0 + x, 2.0 - x, -1.0)
-    } else {
-        erf_inv_impl(1.0 - x, x, 1.0)
-    }
-}
-
-// **********************************************************
-// ********** Coefficients for erf_impl polynomial **********
-// **********************************************************
-
-/// Polynomial coefficients for a numerator of `erf_impl`
-/// in the interval [1e-10, 0.5].
-const ERF_IMPL_AN: &[f64] = &[
-    0.00337916709551257388990745,
-    -0.00073695653048167948530905,
-    -0.374732337392919607868241,
-    0.0817442448733587196071743,
-    -0.0421089319936548595203468,
-    0.0070165709512095756344528,
-    -0.00495091255982435110337458,
-    0.000871646599037922480317225,
-];
-
-/// Polynomial coefficients for a denominator of `erf_impl`
-/// in the interval [1e-10, 0.5]
-const ERF_IMPL_AD: &[f64] = &[
-    1.0,
-    -0.218088218087924645390535,
-    0.412542972725442099083918,
-    -0.0841891147873106755410271,
-    0.0655338856400241519690695,
-    -0.0120019604454941768171266,
-    0.00408165558926174048329689,
-    -0.000615900721557769691924509,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [0.5, 0.75].
-const ERF_IMPL_BN: &[f64] = &[
-    -0.0361790390718262471360258,
-    0.292251883444882683221149,
-    0.281447041797604512774415,
-    0.125610208862766947294894,
-    0.0274135028268930549240776,
-    0.00250839672168065762786937,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [0.5, 0.75].
-const ERF_IMPL_BD: &[f64] = &[
-    1.0,
-    1.8545005897903486499845,
-    1.43575803037831418074962,
-    0.582827658753036572454135,
-    0.124810476932949746447682,
-    0.0113724176546353285778481,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [0.75, 1.25].
-const ERF_IMPL_CN: &[f64] = &[
-    -0.0397876892611136856954425,
-    0.153165212467878293257683,
-    0.191260295600936245503129,
-    0.10276327061989304213645,
-    0.029637090615738836726027,
-    0.0046093486780275489468812,
-    0.000307607820348680180548455,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [0.75, 1.25].
-const ERF_IMPL_CD: &[f64] = &[
-    1.0,
-    1.95520072987627704987886,
-    1.64762317199384860109595,
-    0.768238607022126250082483,
-    0.209793185936509782784315,
-    0.0319569316899913392596356,
-    0.00213363160895785378615014,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [1.25, 2.25].
-const ERF_IMPL_DN: &[f64] = &[
-    -0.0300838560557949717328341,
-    0.0538578829844454508530552,
-    0.0726211541651914182692959,
-    0.0367628469888049348429018,
-    0.00964629015572527529605267,
-    0.00133453480075291076745275,
-    0.778087599782504251917881e-4,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [1.25, 2.25].
-const ERF_IMPL_DD: &[f64] = &[
-    1.0,
-    1.75967098147167528287343,
-    1.32883571437961120556307,
-    0.552528596508757581287907,
-    0.133793056941332861912279,
-    0.0179509645176280768640766,
-    0.00104712440019937356634038,
-    -0.106640381820357337177643e-7,
-];
-
-///  Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [2.25, 3.5].
-const ERF_IMPL_EN: &[f64] = &[
-    -0.0117907570137227847827732,
-    0.014262132090538809896674,
-    0.0202234435902960820020765,
-    0.00930668299990432009042239,
-    0.00213357802422065994322516,
-    0.00025022987386460102395382,
-    0.120534912219588189822126e-4,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [2.25, 3.5].
-const ERF_IMPL_ED: &[f64] = &[
-    1.0,
-    1.50376225203620482047419,
-    0.965397786204462896346934,
-    0.339265230476796681555511,
-    0.0689740649541569716897427,
-    0.00771060262491768307365526,
-    0.000371421101531069302990367,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [3.5, 5.25].
-const ERF_IMPL_FN: &[f64] = &[
-    -0.00546954795538729307482955,
-    0.00404190278731707110245394,
-    0.0054963369553161170521356,
-    0.00212616472603945399437862,
-    0.000394984014495083900689956,
-    0.365565477064442377259271e-4,
-    0.135485897109932323253786e-5,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [3.5, 5.25].
-const ERF_IMPL_FD: &[f64] = &[
-    1.0,
-    1.21019697773630784832251,
-    0.620914668221143886601045,
-    0.173038430661142762569515,
-    0.0276550813773432047594539,
-    0.00240625974424309709745382,
-    0.891811817251336577241006e-4,
-    -0.465528836283382684461025e-11,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [5.25, 8].
-const ERF_IMPL_GN: &[f64] = &[
-    -0.00270722535905778347999196,
-    0.0013187563425029400461378,
-    0.00119925933261002333923989,
-    0.00027849619811344664248235,
-    0.267822988218331849989363e-4,
-    0.923043672315028197865066e-6,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [5.25, 8].
-const ERF_IMPL_GD: &[f64] = &[
-    1.0,
-    0.814632808543141591118279,
-    0.268901665856299542168425,
-    0.0449877216103041118694989,
-    0.00381759663320248459168994,
-    0.000131571897888596914350697,
-    0.404815359675764138445257e-11,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [8, 11.5].
-const ERF_IMPL_HN: &[f64] = &[
-    -0.00109946720691742196814323,
-    0.000406425442750422675169153,
-    0.000274499489416900707787024,
-    0.465293770646659383436343e-4,
-    0.320955425395767463401993e-5,
-    0.778286018145020892261936e-7,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [8, 11.5].
-const ERF_IMPL_HD: &[f64] = &[
-    1.0,
-    0.588173710611846046373373,
-    0.139363331289409746077541,
-    0.0166329340417083678763028,
-    0.00100023921310234908642639,
-    0.24254837521587225125068e-4,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [11.5, 17].
-const ERF_IMPL_IN: &[f64] = &[
-    -0.00056907993601094962855594,
-    0.000169498540373762264416984,
-    0.518472354581100890120501e-4,
-    0.382819312231928859704678e-5,
-    0.824989931281894431781794e-7,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [11.5, 17].
-const ERF_IMPL_ID: &[f64] = &[
-    1.0,
-    0.339637250051139347430323,
-    0.043472647870310663055044,
-    0.00248549335224637114641629,
-    0.535633305337152900549536e-4,
-    -0.117490944405459578783846e-12,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [17, 24].
-const ERF_IMPL_JN: &[f64] = &[
-    -0.000241313599483991337479091,
-    0.574224975202501512365975e-4,
-    0.115998962927383778460557e-4,
-    0.581762134402593739370875e-6,
-    0.853971555085673614607418e-8,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [17, 24].
-const ERF_IMPL_JD: &[f64] = &[
-    1.0,
-    0.233044138299687841018015,
-    0.0204186940546440312625597,
-    0.000797185647564398289151125,
-    0.117019281670172327758019e-4,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [24, 38].
-const ERF_IMPL_KN: &[f64] = &[
-    -0.000146674699277760365803642,
-    0.162666552112280519955647e-4,
-    0.269116248509165239294897e-5,
-    0.979584479468091935086972e-7,
-    0.101994647625723465722285e-8,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [24, 38].
-const ERF_IMPL_KD: &[f64] = &[
-    1.0,
-    0.165907812944847226546036,
-    0.0103361716191505884359634,
-    0.000286593026373868366935721,
-    0.298401570840900340874568e-5,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [38, 60].
-const ERF_IMPL_LN: &[f64] = &[
-    -0.583905797629771786720406e-4,
-    0.412510325105496173512992e-5,
-    0.431790922420250949096906e-6,
-    0.993365155590013193345569e-8,
-    0.653480510020104699270084e-10,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [38, 60].
-const ERF_IMPL_LD: &[f64] = &[
-    1.0,
-    0.105077086072039915406159,
-    0.00414278428675475620830226,
-    0.726338754644523769144108e-4,
-    0.477818471047398785369849e-6,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [60, 85].
-const ERF_IMPL_MN: &[f64] = &[
-    -0.196457797609229579459841e-4,
-    0.157243887666800692441195e-5,
-    0.543902511192700878690335e-7,
-    0.317472492369117710852685e-9,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [60, 85].
-const ERF_IMPL_MD: &[f64] = &[
-    1.0,
-    0.052803989240957632204885,
-    0.000926876069151753290378112,
-    0.541011723226630257077328e-5,
-    0.535093845803642394908747e-15,
-];
-
-/// Polynomial coefficients for a numerator in `erf_impl`
-/// in the interval [85, 110].
-const ERF_IMPL_NN: &[f64] = &[
-    -0.789224703978722689089794e-5,
-    0.622088451660986955124162e-6,
-    0.145728445676882396797184e-7,
-    0.603715505542715364529243e-10,
-];
-
-/// Polynomial coefficients for a denominator in `erf_impl`
-/// in the interval [85, 110].
-const ERF_IMPL_ND: &[f64] = &[
-    1.0,
-    0.0375328846356293715248719,
-    0.000467919535974625308126054,
-    0.193847039275845656900547e-5,
-];
-
-// **********************************************************
-// ********** Coefficients for erf_inv_impl polynomial ******
-// **********************************************************
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0, 0.5].
-const ERF_INV_IMPL_AN: &[f64] = &[
-    -0.000508781949658280665617,
-    -0.00836874819741736770379,
-    0.0334806625409744615033,
-    -0.0126926147662974029034,
-    -0.0365637971411762664006,
-    0.0219878681111168899165,
-    0.00822687874676915743155,
-    -0.00538772965071242932965,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0, 0.5].
-const ERF_INV_IMPL_AD: &[f64] = &[
-    1.0,
-    -0.970005043303290640362,
-    -1.56574558234175846809,
-    1.56221558398423026363,
-    0.662328840472002992063,
-    -0.71228902341542847553,
-    -0.0527396382340099713954,
-    0.0795283687341571680018,
-    -0.00233393759374190016776,
-    0.000886216390456424707504,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.5, 0.75].
-const ERF_INV_IMPL_BN: &[f64] = &[
-    -0.202433508355938759655,
-    0.105264680699391713268,
-    8.37050328343119927838,
-    17.6447298408374015486,
-    -18.8510648058714251895,
-    -44.6382324441786960818,
-    17.445385985570866523,
-    21.1294655448340526258,
-    -3.67192254707729348546,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.5, 0.75].
-const ERF_INV_IMPL_BD: &[f64] = &[
-    1.0,
-    6.24264124854247537712,
-    3.9713437953343869095,
-    -28.6608180499800029974,
-    -20.1432634680485188801,
-    48.5609213108739935468,
-    10.8268667355460159008,
-    -22.6436933413139721736,
-    1.72114765761200282724,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x less than 3.
-const ERF_INV_IMPL_CN: &[f64] = &[
-    -0.131102781679951906451,
-    -0.163794047193317060787,
-    0.117030156341995252019,
-    0.387079738972604337464,
-    0.337785538912035898924,
-    0.142869534408157156766,
-    0.0290157910005329060432,
-    0.00214558995388805277169,
-    -0.679465575181126350155e-6,
-    0.285225331782217055858e-7,
-    -0.681149956853776992068e-9,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x less than 3.
-const ERF_INV_IMPL_CD: &[f64] = &[
-    1.0,
-    3.46625407242567245975,
-    5.38168345707006855425,
-    4.77846592945843778382,
-    2.59301921623620271374,
-    0.848854343457902036425,
-    0.152264338295331783612,
-    0.01105924229346489121,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 3 and 6.
-const ERF_INV_IMPL_DN: &[f64] = &[
-    -0.0350353787183177984712,
-    -0.00222426529213447927281,
-    0.0185573306514231072324,
-    0.00950804701325919603619,
-    0.00187123492819559223345,
-    0.000157544617424960554631,
-    0.460469890584317994083e-5,
-    -0.230404776911882601748e-9,
-    0.266339227425782031962e-11,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 3 and 6.
-const ERF_INV_IMPL_DD: &[f64] = &[
-    1.0,
-    1.3653349817554063097,
-    0.762059164553623404043,
-    0.220091105764131249824,
-    0.0341589143670947727934,
-    0.00263861676657015992959,
-    0.764675292302794483503e-4,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 6 and 18.
-const ERF_INV_IMPL_EN: &[f64] = &[
-    -0.0167431005076633737133,
-    -0.00112951438745580278863,
-    0.00105628862152492910091,
-    0.000209386317487588078668,
-    0.149624783758342370182e-4,
-    0.449696789927706453732e-6,
-    0.462596163522878599135e-8,
-    -0.281128735628831791805e-13,
-    0.99055709973310326855e-16,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 6 and 18.
-const ERF_INV_IMPL_ED: &[f64] = &[
-    1.0,
-    0.591429344886417493481,
-    0.138151865749083321638,
-    0.0160746087093676504695,
-    0.000964011807005165528527,
-    0.275335474764726041141e-4,
-    0.282243172016108031869e-6,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 18 and 44.
-const ERF_INV_IMPL_FN: &[f64] = &[
-    -0.0024978212791898131227,
-    -0.779190719229053954292e-5,
-    0.254723037413027451751e-4,
-    0.162397777342510920873e-5,
-    0.396341011304801168516e-7,
-    0.411632831190944208473e-9,
-    0.145596286718675035587e-11,
-    -0.116765012397184275695e-17,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x between 18 and 44.
-const ERF_INV_IMPL_FD: &[f64] = &[
-    1.0,
-    0.207123112214422517181,
-    0.0169410838120975906478,
-    0.000690538265622684595676,
-    0.145007359818232637924e-4,
-    0.144437756628144157666e-6,
-    0.509761276599778486139e-9,
-];
-
-/// Polynomial coefficients for a numerator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x greater than 44.
-const ERF_INV_IMPL_GN: &[f64] = &[
-    -0.000539042911019078575891,
-    -0.28398759004727721098e-6,
-    0.899465114892291446442e-6,
-    0.229345859265920864296e-7,
-    0.225561444863500149219e-9,
-    0.947846627503022684216e-12,
-    0.135880130108924861008e-14,
-    -0.348890393399948882918e-21,
-];
-
-/// Polynomial coefficients for a denominator of `erf_inv_impl`
-/// in the interval [0.75, 1] with x greater than 44.
-const ERF_INV_IMPL_GD: &[f64] = &[
-    1.0,
-    0.0845746234001899436914,
-    0.00282092984726264681981,
-    0.468292921940894236786e-4,
-    0.399968812193862100054e-6,
-    0.161809290887904476097e-8,
-    0.231558608310259605225e-11,
-];
-
-/// `erf_impl` computes the error function at `z`.
-/// If `inv` is true, `1 - erf` is calculated as opposed to `erf`
-fn erf_impl(z: f64, inv: bool) -> f64 {
-    if z < 0.0 {
-        if !inv {
-            return -erf_impl(-z, false);
-        }
-        if z < -0.5 {
-            return 2.0 - erf_impl(-z, true);
-        }
-        return 1.0 + erf_impl(-z, false);
-    }
-
-    let result = if z < 0.5 {
-        if z < 1e-10 {
-            z * 1.125 + z * 0.003379167095512573896158903121545171688
-        } else {
-            z * 1.125
-                + z * evaluate::polynomial(z, ERF_IMPL_AN) / evaluate::polynomial(z, ERF_IMPL_AD)
-        }
-    } else if z < 110.0 {
-        let (r, b) = if z < 0.75 {
-            (
-                evaluate::polynomial(z - 0.5, ERF_IMPL_BN)
-                    / evaluate::polynomial(z - 0.5, ERF_IMPL_BD),
-                0.3440242112,
-            )
-        } else if z < 1.25 {
-            (
-                evaluate::polynomial(z - 0.75, ERF_IMPL_CN)
-                    / evaluate::polynomial(z - 0.75, ERF_IMPL_CD),
-                0.419990927,
-            )
-        } else if z < 2.25 {
-            (
-                evaluate::polynomial(z - 1.25, ERF_IMPL_DN)
-                    / evaluate::polynomial(z - 1.25, ERF_IMPL_DD),
-                0.4898625016,
-            )
-        } else if z < 3.5 {
-            (
-                evaluate::polynomial(z - 2.25, ERF_IMPL_EN)
-                    / evaluate::polynomial(z - 2.25, ERF_IMPL_ED),
-                0.5317370892,
-            )
-        } else if z < 5.25 {
-            (
-                evaluate::polynomial(z - 3.5, ERF_IMPL_FN)
-                    / evaluate::polynomial(z - 3.5, ERF_IMPL_FD),
-                0.5489973426,
-            )
-        } else if z < 8.0 {
-            (
-                evaluate::polynomial(z - 5.25, ERF_IMPL_GN)
-                    / evaluate::polynomial(z - 5.25, ERF_IMPL_GD),
-                0.5571740866,
-            )
-        } else if z < 11.5 {
-            (
-                evaluate::polynomial(z - 8.0, ERF_IMPL_HN)
-                    / evaluate::polynomial(z - 8.0, ERF_IMPL_HD),
-                0.5609807968,
-            )
-        } else if z < 17.0 {
-            (
-                evaluate::polynomial(z - 11.5, ERF_IMPL_IN)
-                    / evaluate::polynomial(z - 11.5, ERF_IMPL_ID),
-                0.5626493692,
-            )
-        } else if z < 24.0 {
-            (
-                evaluate::polynomial(z - 17.0, ERF_IMPL_JN)
-                    / evaluate::polynomial(z - 17.0, ERF_IMPL_JD),
-                0.5634598136,
-            )
-        } else if z < 38.0 {
-            (
-                evaluate::polynomial(z - 24.0, ERF_IMPL_KN)
-                    / evaluate::polynomial(z - 24.0, ERF_IMPL_KD),
-                0.5638477802,
-            )
-        } else if z < 60.0 {
-            (
-                evaluate::polynomial(z - 38.0, ERF_IMPL_LN)
-                    / evaluate::polynomial(z - 38.0, ERF_IMPL_LD),
-                0.5640528202,
-            )
-        } else if z < 85.0 {
-            (
-                evaluate::polynomial(z - 60.0, ERF_IMPL_MN)
-                    / evaluate::polynomial(z - 60.0, ERF_IMPL_MD),
-                0.5641309023,
-            )
-        } else {
-            (
-                evaluate::polynomial(z - 85.0, ERF_IMPL_NN)
-                    / evaluate::polynomial(z - 85.0, ERF_IMPL_ND),
-                0.5641584396,
-            )
-        };
-        let g = (-z * z).exp() / z;
-        g * b + g * r
-    } else {
-        0.0
-    };
-
-    if inv && z >= 0.5 {
-        result
-    } else if z >= 0.5 || inv {
-        1.0 - result
-    } else {
-        result
-    }
-}
-
-// `erf_inv_impl` computes the inverse error function where
-// `p`,`q`, and `s` are the first, second, and third intermediate
-// parameters respectively
-fn erf_inv_impl(p: f64, q: f64, s: f64) -> f64 {
-    let result = if p <= 0.5 {
-        let y = 0.0891314744949340820313;
-        let g = p * (p + 10.0);
-        let r = evaluate::polynomial(p, ERF_INV_IMPL_AN) / evaluate::polynomial(p, ERF_INV_IMPL_AD);
-        g * y + g * r
-    } else if q >= 0.25 {
-        let y = 2.249481201171875;
-        let g = (-2.0 * q.ln()).sqrt();
-        let xs = q - 0.25;
-        let r =
-            evaluate::polynomial(xs, ERF_INV_IMPL_BN) / evaluate::polynomial(xs, ERF_INV_IMPL_BD);
-        g / (y + r)
-    } else {
-        let x = (-q.ln()).sqrt();
-        if x < 3.0 {
-            let y = 0.807220458984375;
-            let xs = x - 1.125;
-            let r = evaluate::polynomial(xs, ERF_INV_IMPL_CN)
-                / evaluate::polynomial(xs, ERF_INV_IMPL_CD);
-            y * x + r * x
-        } else if x < 6.0 {
-            let y = 0.93995571136474609375;
-            let xs = x - 3.0;
-            let r = evaluate::polynomial(xs, ERF_INV_IMPL_DN)
-                / evaluate::polynomial(xs, ERF_INV_IMPL_DD);
-            y * x + r * x
-        } else if x < 18.0 {
-            let y = 0.98362827301025390625;
-            let xs = x - 6.0;
-            let r = evaluate::polynomial(xs, ERF_INV_IMPL_EN)
-                / evaluate::polynomial(xs, ERF_INV_IMPL_ED);
-            y * x + r * x
-        } else if x < 44.0 {
-            let y = 0.99714565277099609375;
-            let xs = x - 18.0;
-            let r = evaluate::polynomial(xs, ERF_INV_IMPL_FN)
-                / evaluate::polynomial(xs, ERF_INV_IMPL_FD);
-            y * x + r * x
-        } else {
-            let y = 0.99941349029541015625;
-            let xs = x - 44.0;
-            let r = evaluate::polynomial(xs, ERF_INV_IMPL_GN)
-                / evaluate::polynomial(xs, ERF_INV_IMPL_GD);
-            y * x + r * x
-        }
-    };
-    s * result
-}
--- a/candle-core/src/cpu/kernels.rs
+++ b/candle-core/src/cpu/kernels.rs
@ -1,191 +0,0 @@
-pub trait VecOps: num_traits::NumAssign + Copy {
-    fn min(self, rhs: Self) -> Self;
-    fn max(self, rhs: Self) -> Self;
-
-    /// Dot-product of two vectors.
-    ///
-    /// # Safety
-    ///
-    /// The length of `lhs` and `rhs` have to be at least `len`. `res` has to point to a valid
-    /// element.
-    #[inline(always)]
-    unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, len: usize) {
-        *res = Self::zero();
-        for i in 0..len {
-            *res += *lhs.add(i) * *rhs.add(i)
-        }
-    }
-
-    /// Sum of all elements in a vector.
-    ///
-    /// # Safety
-    ///
-    /// The length of `xs` must be at least `len`. `res` has to point to a valid
-    /// element.
-    #[inline(always)]
-    unsafe fn vec_reduce_sum(xs: *const Self, res: *mut Self, len: usize) {
-        *res = Self::zero();
-        for i in 0..len {
-            *res += *xs.add(i)
-        }
-    }
-
-    /// Maximum element in a non-empty vector.
-    ///
-    /// # Safety
-    ///
-    /// The length of `xs` must be at least `len` and positive. `res` has to point to a valid
-    /// element.
-    #[inline(always)]
-    unsafe fn vec_reduce_max(xs: *const Self, res: *mut Self, len: usize) {
-        *res = *xs;
-        for i in 1..len {
-            *res = (*res).max(*xs.add(i))
-        }
-    }
-
-    /// Minimum element in a non-empty vector.
-    ///
-    /// # Safety
-    ///
-    /// The length of `xs` must be at least `len` and positive. `res` has to point to a valid
-    /// element.
-    #[inline(always)]
-    unsafe fn vec_reduce_min(xs: *const Self, res: *mut Self, len: usize) {
-        *res = *xs;
-        for i in 1..len {
-            *res = (*res).min(*xs.add(i))
-        }
-    }
-}
-
-impl VecOps for f32 {
-    #[inline(always)]
-    fn min(self, other: Self) -> Self {
-        Self::min(self, other)
-    }
-
-    #[inline(always)]
-    fn max(self, other: Self) -> Self {
-        Self::max(self, other)
-    }
-
-    #[inline(always)]
-    unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, len: usize) {
-        super::vec_dot_f32(lhs, rhs, res, len)
-    }
-
-    #[inline(always)]
-    unsafe fn vec_reduce_sum(xs: *const Self, res: *mut Self, len: usize) {
-        super::vec_sum(xs, res, len)
-    }
-}
-
-impl VecOps for half::f16 {
-    #[inline(always)]
-    fn min(self, other: Self) -> Self {
-        Self::min(self, other)
-    }
-
-    #[inline(always)]
-    fn max(self, other: Self) -> Self {
-        Self::max(self, other)
-    }
-
-    #[inline(always)]
-    unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, len: usize) {
-        let mut res_f32 = 0f32;
-        super::vec_dot_f16(lhs, rhs, &mut res_f32, len);
-        *res = half::f16::from_f32(res_f32);
-    }
-}
-
-impl VecOps for f64 {
-    #[inline(always)]
-    fn min(self, other: Self) -> Self {
-        Self::min(self, other)
-    }
-
-    #[inline(always)]
-    fn max(self, other: Self) -> Self {
-        Self::max(self, other)
-    }
-}
-impl VecOps for half::bf16 {
-    #[inline(always)]
-    fn min(self, other: Self) -> Self {
-        Self::min(self, other)
-    }
-
-    #[inline(always)]
-    fn max(self, other: Self) -> Self {
-        Self::max(self, other)
-    }
-}
-impl VecOps for u8 {
-    #[inline(always)]
-    fn min(self, other: Self) -> Self {
-        <Self as Ord>::min(self, other)
-    }
-
-    #[inline(always)]
-    fn max(self, other: Self) -> Self {
-        <Self as Ord>::max(self, other)
-    }
-}
-impl VecOps for u32 {
-    #[inline(always)]
-    fn min(self, other: Self) -> Self {
-        <Self as Ord>::min(self, other)
-    }
-
-    #[inline(always)]
-    fn max(self, other: Self) -> Self {
-        <Self as Ord>::max(self, other)
-    }
-}
-impl VecOps for i64 {
-    #[inline(always)]
-    fn min(self, other: Self) -> Self {
-        <Self as Ord>::min(self, other)
-    }
-
-    #[inline(always)]
-    fn max(self, other: Self) -> Self {
-        <Self as Ord>::max(self, other)
-    }
-}
-
-#[inline(always)]
-pub fn par_for_each(n_threads: usize, func: impl Fn(usize) + Send + Sync) {
-    if n_threads == 1 {
-        func(0)
-    } else {
-        rayon::scope(|s| {
-            for thread_idx in 0..n_threads {
-                let func = &func;
-                s.spawn(move |_| func(thread_idx));
-            }
-        })
-    }
-}
-
-#[inline(always)]
-pub fn par_range(lo: usize, up: usize, n_threads: usize, func: impl Fn(usize) + Send + Sync) {
-    if n_threads == 1 {
-        for i in lo..up {
-            func(i)
-        }
-    } else {
-        rayon::scope(|s| {
-            for thread_idx in 0..n_threads {
-                let func = &func;
-                s.spawn(move |_| {
-                    for i in (thread_idx..up).step_by(n_threads) {
-                        func(i)
-                    }
-                });
-            }
-        })
-    }
-}
--- a/candle-core/src/cpu/mod.rs
+++ b/candle-core/src/cpu/mod.rs
@ -1,180 +0,0 @@
-pub mod erf;
-pub mod kernels;
-
-trait Cpu<const ARR: usize> {
-    type Unit;
-    type Array;
-    const STEP: usize;
-    const EPR: usize;
-
-    fn n() -> usize;
-    unsafe fn zero() -> Self::Unit;
-    unsafe fn zero_array() -> Self::Array;
-    unsafe fn load(mem_addr: *const f32) -> Self::Unit;
-    unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit;
-    unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self::Unit;
-    unsafe fn vec_reduce(x: Self::Array, y: *mut f32);
-    unsafe fn from_f32(v: f32) -> Self::Unit;
-    unsafe fn vec_store(mem_addr: *mut f32, a: Self::Unit);
-}
-
-trait CpuF16<const ARR: usize> {
-    type Unit;
-    type Array;
-    const STEP: usize;
-    const EPR: usize;
-
-    fn n() -> usize;
-    unsafe fn zero() -> Self::Unit;
-    unsafe fn zero_array() -> Self::Array;
-    unsafe fn load(mem_addr: *const f16) -> Self::Unit;
-    unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit;
-    unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self::Unit;
-    unsafe fn vec_reduce(x: Self::Array, y: *mut f32);
-    unsafe fn from_f32(v: f32) -> Self::Unit;
-    unsafe fn vec_store(mem_addr: *mut f16, a: Self::Unit);
-}
-use half::f16;
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[cfg(target_feature = "avx")]
-pub mod avx;
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[cfg(target_feature = "avx")]
-pub use avx::{CurrentCpu, CurrentCpuF16};
-
-#[cfg(target_arch = "wasm32")]
-#[cfg(target_feature = "simd128")]
-pub mod simd128;
-#[cfg(target_arch = "wasm32")]
-#[cfg(target_feature = "simd128")]
-pub use simd128::CurrentCpu;
-
-#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
-#[cfg(target_feature = "neon")]
-pub mod neon;
-#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
-#[cfg(target_feature = "neon")]
-pub use neon::CurrentCpu;
-
-#[cfg(any(
-    target_feature = "neon",
-    target_feature = "avx",
-    target_feature = "simd128"
-))]
-#[inline(always)]
-pub(crate) unsafe fn vec_dot_f32(a_row: *const f32, b_row: *const f32, c: *mut f32, k: usize) {
-    let np = k & !(CurrentCpu::STEP - 1);
-
-    let mut sum = CurrentCpu::zero_array();
-    let mut ax = CurrentCpu::zero_array();
-    let mut ay = CurrentCpu::zero_array();
-
-    for i in (0..np).step_by(CurrentCpu::STEP) {
-        for j in 0..CurrentCpu::n() {
-            ax[j] = CurrentCpu::load(a_row.add(i + j * CurrentCpu::EPR));
-            ay[j] = CurrentCpu::load(b_row.add(i + j * CurrentCpu::EPR));
-
-            sum[j] = CurrentCpu::vec_fma(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    CurrentCpu::vec_reduce(sum, c);
-
-    // leftovers
-    for i in np..k {
-        *c += *a_row.add(i) * (*b_row.add(i));
-    }
-}
-
-#[cfg(not(any(
-    target_feature = "neon",
-    target_feature = "avx",
-    target_feature = "simd128"
-)))]
-#[inline(always)]
-pub(crate) unsafe fn vec_dot_f32(a_row: *const f32, b_row: *const f32, c: *mut f32, k: usize) {
-    // leftovers
-    for i in 0..k {
-        *c += *a_row.add(i) * (*b_row.add(i));
-    }
-}
-
-#[cfg(any(
-    target_feature = "neon",
-    target_feature = "avx",
-    target_feature = "simd128"
-))]
-#[inline(always)]
-pub(crate) unsafe fn vec_sum(row: *const f32, b: *mut f32, k: usize) {
-    let np = k & !(CurrentCpu::STEP - 1);
-
-    let mut sum = CurrentCpu::zero_array();
-    let mut x = CurrentCpu::zero_array();
-
-    for i in (0..np).step_by(CurrentCpu::STEP) {
-        for j in 0..CurrentCpu::n() {
-            x[j] = CurrentCpu::load(row.add(i + j * CurrentCpu::EPR));
-            sum[j] = CurrentCpu::vec_add(sum[j], x[j]);
-        }
-    }
-
-    CurrentCpu::vec_reduce(sum, b);
-
-    // leftovers
-    for i in np..k {
-        *b += *row.add(i)
-    }
-}
-
-#[cfg(not(any(
-    target_feature = "neon",
-    target_feature = "avx",
-    target_feature = "simd128"
-)))]
-#[inline(always)]
-pub(crate) unsafe fn vec_sum(row: *const f32, b: *mut f32, k: usize) {
-    *b = 0f32;
-    for i in 0..k {
-        *b += *row.add(i)
-    }
-}
-
-#[cfg(target_feature = "avx")]
-#[inline(always)]
-pub(crate) unsafe fn vec_dot_f16(a_row: *const f16, b_row: *const f16, c: *mut f32, k: usize) {
-    let mut sumf = 0.0f32;
-    let np = k & !(CurrentCpuF16::STEP - 1);
-
-    let mut sum = CurrentCpuF16::zero_array();
-    let mut ax = CurrentCpuF16::zero_array();
-    let mut ay = CurrentCpuF16::zero_array();
-
-    for i in (0..np).step_by(CurrentCpuF16::STEP) {
-        for j in 0..CurrentCpuF16::n() {
-            ax[j] = CurrentCpuF16::load(a_row.add(i + j * CurrentCpuF16::EPR));
-            ay[j] = CurrentCpuF16::load(b_row.add(i + j * CurrentCpuF16::EPR));
-
-            sum[j] = CurrentCpuF16::vec_fma(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    CurrentCpuF16::vec_reduce(sum, &mut sumf);
-
-    // leftovers
-    for i in np..k {
-        sumf += (*a_row.add(i)).to_f32() * (*b_row.add(i)).to_f32();
-    }
-    *c = sumf;
-}
-
-#[cfg(not(target_feature = "avx"))]
-#[inline(always)]
-pub(crate) unsafe fn vec_dot_f16(a_row: *const f16, b_row: *const f16, c: *mut f32, k: usize) {
-    // leftovers
-    let mut sum = 0.0;
-    for i in 0..k {
-        sum += (*a_row.add(i)).to_f32() * (*b_row.add(i)).to_f32();
-    }
-    *c = sum;
-}
--- a/candle-core/src/cpu/neon.rs
+++ b/candle-core/src/cpu/neon.rs
@ -1,74 +0,0 @@
-use super::Cpu;
-#[cfg(target_arch = "arm")]
-use core::arch::arm::*;
-
-#[cfg(target_arch = "aarch64")]
-use core::arch::aarch64::*;
-
-pub struct CurrentCpu {}
-
-const STEP: usize = 16;
-const EPR: usize = 4;
-const ARR: usize = STEP / EPR;
-
-impl CurrentCpu {
-    #[cfg(target_arch = "aarch64")]
-    unsafe fn reduce_one(x: float32x4_t) -> f32 {
-        vaddvq_f32(x)
-    }
-
-    #[cfg(target_arch = "arm")]
-    unsafe fn reduce_one(x: float32x4_t) -> f32 {
-        vgetq_lane_f32(x, 0) + vgetq_lane_f32(x, 1) + vgetq_lane_f32(x, 2) + vgetq_lane_f32(x, 3)
-    }
-}
-
-impl Cpu<ARR> for CurrentCpu {
-    type Unit = float32x4_t;
-    type Array = [float32x4_t; ARR];
-
-    const STEP: usize = STEP;
-    const EPR: usize = EPR;
-
-    fn n() -> usize {
-        ARR
-    }
-
-    unsafe fn zero() -> Self::Unit {
-        vdupq_n_f32(0.0)
-    }
-
-    unsafe fn from_f32(x: f32) -> Self::Unit {
-        vdupq_n_f32(x)
-    }
-
-    unsafe fn zero_array() -> Self::Array {
-        [Self::zero(); ARR]
-    }
-
-    unsafe fn load(mem_addr: *const f32) -> Self::Unit {
-        vld1q_f32(mem_addr)
-    }
-
-    unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
-        vaddq_f32(a, b)
-    }
-
-    unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self::Unit {
-        vfmaq_f32(a, b, c)
-    }
-
-    unsafe fn vec_store(mem_addr: *mut f32, a: Self::Unit) {
-        vst1q_f32(mem_addr, a);
-    }
-
-    unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
-        for i in 0..ARR / 2 {
-            x[2 * i] = vaddq_f32(x[2 * i], x[2 * i + 1]);
-        }
-        for i in 0..ARR / 4 {
-            x[4 * i] = vaddq_f32(x[4 * i], x[4 * i + 2]);
-        }
-        *y = Self::reduce_one(x[0]);
-    }
-}
--- a/candle-core/src/cpu/simd128.rs
+++ b/candle-core/src/cpu/simd128.rs
@ -1,64 +0,0 @@
-use super::Cpu;
-use core::arch::wasm32::*;
-
-pub struct CurrentCpu {}
-
-const STEP: usize = 16;
-const EPR: usize = 4;
-const ARR: usize = STEP / EPR;
-
-impl Cpu<ARR> for CurrentCpu {
-    type Unit = v128;
-    type Array = [v128; ARR];
-
-    const STEP: usize = STEP;
-    const EPR: usize = EPR;
-
-    fn n() -> usize {
-        ARR
-    }
-
-    unsafe fn zero() -> Self::Unit {
-        f32x4_splat(0.0)
-    }
-
-    unsafe fn zero_array() -> Self::Array {
-        [Self::zero(); ARR]
-    }
-
-    unsafe fn from_f32(v: f32) -> Self::Unit {
-        f32x4_splat(v)
-    }
-
-    unsafe fn load(mem_addr: *const f32) -> Self::Unit {
-        v128_load(mem_addr as *mut v128)
-    }
-
-    unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
-        f32x4_add(a, b)
-    }
-
-    unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self::Unit {
-        f32x4_add(f32x4_mul(b, c), a)
-    }
-
-    unsafe fn vec_store(mem_addr: *mut f32, a: Self::Unit) {
-        v128_store(mem_addr as *mut v128, a);
-    }
-
-    unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
-        for i in 0..ARR / 2 {
-            x[2 * i] = f32x4_add(x[2 * i], x[2 * i + 1]);
-        }
-        for i in 0..ARR / 4 {
-            x[4 * i] = f32x4_add(x[4 * i], x[4 * i + 2]);
-        }
-        for i in 0..ARR / 8 {
-            x[8 * i] = f32x4_add(x[8 * i], x[8 * i + 4]);
-        }
-        *y = f32x4_extract_lane::<0>(x[0])
-            + f32x4_extract_lane::<1>(x[0])
-            + f32x4_extract_lane::<2>(x[0])
-            + f32x4_extract_lane::<3>(x[0]);
-    }
-}
--- a/candle-core/src/cpu_backend.rs
+++ b/candle-core/src/cpu_backend.rs
--- a/candle-core/src/cuda_backend.rs
+++ b/candle-core/src/cuda_backend.rs
--- a/candle-core/src/cudnn.rs
+++ b/candle-core/src/cudnn.rs
@ -1,123 +0,0 @@
-use crate::WithDType;
-use cudarc;
-use cudarc::cudnn::safe::{Conv2dForward, Cudnn};
-use cudarc::driver::{CudaSlice, CudaView, DeviceRepr, ValidAsZeroBits};
-use std::cell::RefCell;
-use std::collections::HashMap;
-use std::sync::Arc;
-
-// The cudnn handles are stored per thread here rather than on the CudaDevice as they are neither
-// send nor sync.
-thread_local! {
-    static CUDNN: RefCell<HashMap<crate::cuda_backend::DeviceId, Arc<Cudnn>>> = HashMap::new().into();
-}
-
-impl From<cudarc::cudnn::CudnnError> for crate::Error {
-    fn from(err: cudarc::cudnn::CudnnError) -> Self {
-        crate::Error::wrap(err)
-    }
-}
-
-impl From<cudarc::driver::DriverError> for crate::Error {
-    fn from(err: cudarc::driver::DriverError) -> Self {
-        crate::Error::wrap(err)
-    }
-}
-
-pub(crate) fn launch_conv2d<
-    T: DeviceRepr + WithDType + ValidAsZeroBits + cudarc::cudnn::CudnnDataType,
->(
-    src: &CudaView<T>,
-    src_l: &crate::Layout,
-    filter: &CudaView<T>,
-    dst: &mut CudaSlice<T>,
-    params: &crate::conv::ParamsConv2D,
-    dev: &crate::cuda_backend::CudaDevice,
-) -> crate::Result<()> {
-    use crate::conv::CudnnFwdAlgo as CandleAlgo;
-    use cudarc::cudnn::sys::cudnnConvolutionFwdAlgo_t as A;
-
-    let device_id = dev.id();
-    let cudnn = CUDNN.with(|cudnn| {
-        if let Some(cudnn) = cudnn.borrow().get(&device_id) {
-            return Ok(cudnn.clone());
-        }
-        let c = Cudnn::new(dev.cuda_device());
-        if let Ok(c) = &c {
-            cudnn.borrow_mut().insert(device_id, c.clone());
-        }
-        c
-    })?;
-    let conv = cudnn.create_conv2d::<T>(
-        /* pad */ [params.padding as i32, params.padding as i32],
-        /* stride */ [params.stride as i32, params.stride as i32],
-        /* dilation */ [params.dilation as i32, params.dilation as i32],
-        cudarc::cudnn::sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
-    )?;
-    let x_shape = [
-        params.b_size as i32,
-        params.c_in as i32,
-        params.i_h as i32,
-        params.i_w as i32,
-    ];
-    // Note that `src` already starts at the proper offset.
-    let x = if src_l.is_contiguous() {
-        cudnn.create_4d_tensor(
-            cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
-            x_shape,
-        )?
-    } else {
-        let s = src_l.stride();
-        cudnn.create_4d_tensor_ex(
-            x_shape,
-            [s[0] as i32, s[1] as i32, s[2] as i32, s[3] as i32],
-        )?
-    };
-    let w = cudnn.create_4d_filter(
-        cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
-        [
-            params.c_out as i32,
-            params.c_in as i32,
-            params.k_h as i32,
-            params.k_w as i32,
-        ],
-    )?;
-    let (w_out, h_out) = (params.out_w() as i32, params.out_h() as i32);
-    let y = cudnn.create_4d_tensor(
-        cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
-        [params.b_size as i32, params.c_out as i32, h_out, w_out],
-    )?;
-    let conv2d = Conv2dForward {
-        conv: &conv,
-        x: &x,
-        w: &w,
-        y: &y,
-    };
-    let alg = match params.cudnn_fwd_algo {
-        None => conv2d.pick_algorithm()?,
-        Some(CandleAlgo::ImplicitGemm) => A::CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
-        Some(CandleAlgo::ImplicitPrecompGemm) => {
-            A::CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
-        }
-        Some(CandleAlgo::Gemm) => A::CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-        Some(CandleAlgo::Direct) => A::CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
-        Some(CandleAlgo::Fft) => A::CUDNN_CONVOLUTION_FWD_ALGO_FFT,
-        Some(CandleAlgo::FftTiling) => A::CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
-        Some(CandleAlgo::Winograd) => A::CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
-        Some(CandleAlgo::WinogradNonFused) => A::CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,
-        Some(CandleAlgo::Count) => A::CUDNN_CONVOLUTION_FWD_ALGO_COUNT,
-    };
-    let workspace_size = conv2d.get_workspace_size(alg)?;
-    let mut workspace = dev.cuda_device().alloc_zeros::<u8>(workspace_size)?;
-    unsafe {
-        conv2d.launch::<CudaSlice<u8>, _, _, _>(
-            alg,
-            Some(&mut workspace),
-            (T::one(), T::zero()),
-            src,
-            filter,
-            dst,
-        )?;
-    }
-    Ok(())
-}
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@ -16,6 +16,7 @@ pub enum Device {
    Cuda(crate::CudaDevice),
 }

+// TODO: Should we back the cpu implementation using the NdArray crate or similar?
 pub trait NdArray {
    fn shape(&self) -> Result<Shape>;

@ -70,7 +71,8 @@ impl<S: WithDType, const N1: usize, const N2: usize, const N3: usize> NdArray
    }

    fn to_cpu_storage(&self) -> CpuStorage {
-        let mut vec = Vec::with_capacity(N1 * N2 * N3);
+        let mut vec = Vec::new();
+        vec.reserve(N1 * N2 * N3);
        for i1 in 0..N1 {
            for i2 in 0..N2 {
                vec.extend(self[i1][i2])
@ -80,49 +82,6 @@ impl<S: WithDType, const N1: usize, const N2: usize, const N3: usize> NdArray
    }
 }

-impl<S: WithDType, const N1: usize, const N2: usize, const N3: usize, const N4: usize> NdArray
-    for &[[[[S; N4]; N3]; N2]; N1]
-{
-    fn shape(&self) -> Result<Shape> {
-        Ok(Shape::from((N1, N2, N3, N4)))
-    }
-
-    fn to_cpu_storage(&self) -> CpuStorage {
-        let mut vec = Vec::with_capacity(N1 * N2 * N3 * N4);
-        for i1 in 0..N1 {
-            for i2 in 0..N2 {
-                for i3 in 0..N3 {
-                    vec.extend(self[i1][i2][i3])
-                }
-            }
-        }
-        S::to_cpu_storage_owned(vec)
-    }
-}
-
-impl<S: NdArray> NdArray for Vec<S> {
-    fn shape(&self) -> Result<Shape> {
-        if self.is_empty() {
-            crate::bail!("empty array")
-        }
-        let shape0 = self[0].shape()?;
-        let n = self.len();
-        for v in self.iter() {
-            let shape = v.shape()?;
-            if shape != shape0 {
-                crate::bail!("two elements have different shapes {shape:?} {shape0:?}")
-            }
-        }
-        Ok(Shape::from([[n].as_slice(), shape0.dims()].concat()))
-    }
-
-    fn to_cpu_storage(&self) -> CpuStorage {
-        // This allocates intermediary memory and shouldn't be necessary.
-        let storages = self.iter().map(|v| v.to_cpu_storage()).collect::<Vec<_>>();
-        CpuStorage::concat(storages.as_slice()).unwrap()
-    }
-}
-
 impl Device {
    pub fn new_cuda(ordinal: usize) -> Result<Self> {
        Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?))
@ -143,13 +102,6 @@ impl Device {
        }
    }

-    pub fn is_cpu(&self) -> bool {
-        match self {
-            Self::Cpu => true,
-            Self::Cuda(_) => false,
-        }
-    }
-
    pub fn is_cuda(&self) -> bool {
        match self {
            Self::Cpu => false,
@ -165,12 +117,12 @@ impl Device {
        }
    }

-    pub(crate) fn rand_uniform_f64(
+    pub(crate) fn rand_uniform(
        &self,
-        lo: f64,
-        up: f64,
        shape: &Shape,
        dtype: DType,
+        lo: f64,
+        up: f64,
    ) -> Result<Storage> {
        match self {
            Device::Cpu => {
@ -184,21 +136,12 @@ impl Device {
        }
    }

-    pub(crate) fn rand_uniform<T: crate::FloatDType>(
+    pub(crate) fn rand_normal(
        &self,
-        lo: T,
-        up: T,
-        shape: &Shape,
-    ) -> Result<Storage> {
-        self.rand_uniform_f64(lo.to_f64(), up.to_f64(), shape, T::DTYPE)
-    }
-
-    pub(crate) fn rand_normal_f64(
-        &self,
-        mean: f64,
-        std: f64,
        shape: &Shape,
        dtype: DType,
+        mean: f64,
+        std: f64,
    ) -> Result<Storage> {
        match self {
            Device::Cpu => {
@ -212,15 +155,6 @@ impl Device {
        }
    }

-    pub(crate) fn rand_normal<T: crate::FloatDType>(
-        &self,
-        mean: T,
-        std: T,
-        shape: &Shape,
-    ) -> Result<Storage> {
-        self.rand_normal_f64(mean.to_f64(), std.to_f64(), shape, T::DTYPE)
-    }
-
    pub(crate) fn ones(&self, shape: &Shape, dtype: DType) -> Result<Storage> {
        match self {
            Device::Cpu => {
--- a/candle-core/src/display.rs
+++ b/candle-core/src/display.rs
@ -9,14 +9,11 @@ impl Tensor {
        &self,
        f: &mut std::fmt::Formatter,
    ) -> std::fmt::Result {
-        let device_str = match self.device().location() {
-            crate::DeviceLocation::Cpu => "".to_owned(),
-            crate::DeviceLocation::Cuda { gpu_id } => {
-                format!(", cuda:{}", gpu_id)
-            }
+        let prefix = match self.device() {
+            crate::Device::Cpu => "Cpu",
+            crate::Device::Cuda(_) => "Cuda",
        };
-
-        write!(f, "Tensor[")?;
+        write!(f, "{prefix}Tensor[")?;
        match self.dims() {
            [] => {
                if let Ok(v) = self.to_scalar::<T>() {
@ -43,7 +40,7 @@ impl Tensor {
                }
            }
        }
-        write!(f, "; {}{}]", self.dtype().as_str(), device_str)
+        write!(f, "; {}]", self.dtype().as_str())
    }
 }

@ -52,7 +49,6 @@ impl std::fmt::Debug for Tensor {
        match self.dtype() {
            DType::U8 => self.fmt_dt::<u8>(f),
            DType::U32 => self.fmt_dt::<u32>(f),
-            DType::I64 => self.fmt_dt::<i64>(f),
            DType::BF16 => self.fmt_dt::<bf16>(f),
            DType::F16 => self.fmt_dt::<f16>(f),
            DType::F32 => self.fmt_dt::<f32>(f),
@ -435,12 +431,6 @@ impl std::fmt::Display for Tensor {
                tf.fmt_tensor(self, 1, max_w, summarize, &po, f)?;
                writeln!(f)?;
            }
-            DType::I64 => {
-                let tf: IntFormatter<i64> = IntFormatter::new();
-                let max_w = tf.max_width(&to_display);
-                tf.fmt_tensor(self, 1, max_w, summarize, &po, f)?;
-                writeln!(f)?;
-            }
            DType::BF16 => {
                if let Ok(tf) = FloatFormatter::<bf16>::new(&to_display, &po) {
                    let max_w = tf.max_width(&to_display);
@ -470,20 +460,6 @@ impl std::fmt::Display for Tensor {
                }
            }
        };
-
-        let device_str = match self.device().location() {
-            crate::DeviceLocation::Cpu => "".to_owned(),
-            crate::DeviceLocation::Cuda { gpu_id } => {
-                format!(", cuda:{}", gpu_id)
-            }
-        };
-
-        write!(
-            f,
-            "Tensor[{:?}, {}{}]",
-            self.dims(),
-            self.dtype().as_str(),
-            device_str
-        )
+        write!(f, "Tensor[{:?}, {}]", self.dims(), self.dtype().as_str())
    }
 }
--- a/candle-core/src/dtype.rs
+++ b/candle-core/src/dtype.rs
@ -1,24 +1,13 @@
-//! Types for elements that can be stored and manipulated using tensors.
-#![allow(clippy::redundant_closure_call)]
 use crate::backend::BackendStorage;
 use crate::{CpuStorage, Error, Result};

-/// The different types of elements allowed in tensors.
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
 pub enum DType {
-    // Unsigned 8 bits integer.
    U8,
-    // Unsigned 32 bits integer.
    U32,
-    // Signed 64 bits integer.
-    I64,
-    // Brain floating-point using half precision (16 bits).
    BF16,
-    // Floating-point using half precision (16 bits).
    F16,
-    // Floating-point using single precision (32 bits).
    F32,
-    // Floating-point using double precision (64 bits).
    F64,
 }

@ -31,7 +20,6 @@ impl std::str::FromStr for DType {
        match s {
            "u8" => Ok(Self::U8),
            "u32" => Ok(Self::U32),
-            "i64" => Ok(Self::I64),
            "bf16" => Ok(Self::BF16),
            "f16" => Ok(Self::F16),
            "f32" => Ok(Self::F32),
@ -42,12 +30,10 @@ impl std::str::FromStr for DType {
 }

 impl DType {
-    /// String representation for dtypes.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::U8 => "u8",
            Self::U32 => "u32",
-            Self::I64 => "i64",
            Self::BF16 => "bf16",
            Self::F16 => "f16",
            Self::F32 => "f32",
@ -55,45 +41,19 @@ impl DType {
        }
    }

-    /// The size used by each element in bytes, i.e. 1 for `U8`, 4 for `F32`.
    pub fn size_in_bytes(&self) -> usize {
        match self {
-            Self::U8 => 1,
+            Self::U8 => 4,
            Self::U32 => 4,
-            Self::I64 => 8,
            Self::BF16 => 2,
            Self::F16 => 2,
            Self::F32 => 4,
            Self::F64 => 8,
        }
    }
-
-    pub fn is_int(&self) -> bool {
-        match self {
-            Self::U8 | Self::U32 | Self::I64 => true,
-            Self::BF16 | Self::F16 | Self::F32 | Self::F64 => false,
-        }
-    }
-
-    pub fn is_float(&self) -> bool {
-        match self {
-            Self::U8 | Self::U32 | Self::I64 => false,
-            Self::BF16 | Self::F16 | Self::F32 | Self::F64 => true,
-        }
-    }
 }

-pub trait WithDType:
-    Sized
-    + Copy
-    + num_traits::NumAssign
-    + std::cmp::PartialOrd
-    + std::fmt::Display
-    + 'static
-    + Send
-    + Sync
-    + crate::cpu::kernels::VecOps
-{
+pub trait WithDType: Sized + Copy + num_traits::NumAssign + std::cmp::PartialOrd + 'static {
    const DTYPE: DType;

    fn from_f64(v: f64) -> Self;
@ -155,47 +115,7 @@ use half::{bf16, f16};

 with_dtype!(u8, U8, |v: f64| v as u8, |v: u8| v as f64);
 with_dtype!(u32, U32, |v: f64| v as u32, |v: u32| v as f64);
-with_dtype!(i64, I64, |v: f64| v as i64, |v: i64| v as f64);
 with_dtype!(f16, F16, f16::from_f64, f16::to_f64);
 with_dtype!(bf16, BF16, bf16::from_f64, bf16::to_f64);
 with_dtype!(f32, F32, |v: f64| v as f32, |v: f32| v as f64);
 with_dtype!(f64, F64, |v: f64| v, |v: f64| v);
-
-pub trait IntDType: WithDType {
-    fn is_true(&self) -> bool;
-    fn as_usize(&self) -> usize;
-}
-
-impl IntDType for i64 {
-    fn is_true(&self) -> bool {
-        *self != 0
-    }
-    fn as_usize(&self) -> usize {
-        *self as usize
-    }
-}
-
-impl IntDType for u32 {
-    fn is_true(&self) -> bool {
-        *self != 0
-    }
-    fn as_usize(&self) -> usize {
-        *self as usize
-    }
-}
-
-impl IntDType for u8 {
-    fn is_true(&self) -> bool {
-        *self != 0
-    }
-    fn as_usize(&self) -> usize {
-        *self as usize
-    }
-}
-
-pub trait FloatDType: WithDType {}
-
-impl FloatDType for f16 {}
-impl FloatDType for bf16 {}
-impl FloatDType for f32 {}
-impl FloatDType for f64 {}
--- a/candle-core/src/dummy_cuda_backend.rs
+++ b/candle-core/src/dummy_cuda_backend.rs
@ -1,5 +1,4 @@
 #![allow(dead_code)]
-use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
 use crate::{CpuStorage, DType, Error, Layout, Result, Shape};

 #[derive(Debug, Clone)]
@ -37,19 +36,15 @@ impl crate::backend::BackendStorage for CudaStorage {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn powf(&self, _: &Layout, _: f64) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn elu(&self, _: &Layout, _: f64) -> Result<Self> {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn reduce_op(&self, _: ReduceOp, _: &Layout, _: &[usize]) -> Result<Self> {
+    fn sum(&self, _: &Layout, _: &[usize]) -> Result<Self> {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn cmp(&self, _: CmpOp, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
+    fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()> {
        Err(Error::NotCompiledWithCudaSupport)
    }

@ -57,11 +52,16 @@ impl crate::backend::BackendStorage for CudaStorage {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn unary_impl<B: UnaryOpT>(&self, _: &Layout) -> Result<Self> {
+    fn unary_impl<B: crate::op::UnaryOp>(&self, _: &Layout) -> Result<Self> {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn binary_impl<B: BinaryOpT>(&self, _: &Self, _: &Layout, _: &Layout) -> Result<Self> {
+    fn binary_impl<B: crate::op::BinaryOp>(
+        &self,
+        _: &Self,
+        _: &Layout,
+        _: &Layout,
+    ) -> Result<Self> {
        Err(Error::NotCompiledWithCudaSupport)
    }

@ -79,54 +79,7 @@ impl crate::backend::BackendStorage for CudaStorage {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn conv2d(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &crate::conv::ParamsConv2D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    fn conv_transpose2d(
-        &self,
-        _l: &Layout,
-        _kernel: &Self,
-        _kernel_l: &Layout,
-        _params: &crate::conv::ParamsConvTranspose2D,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    fn index_select(&self, _: &Self, _: &Layout, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-    fn gather(&self, _: &Layout, _: &Self, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    fn scatter_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    fn index_add(
-        &self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: &Self,
-        _: &Layout,
-        _: usize,
-    ) -> Result<Self> {
+    fn embedding(&self, _: &Layout, _: &Self, _: &Layout) -> Result<Self> {
        Err(Error::NotCompiledWithCudaSupport)
    }

@ -143,22 +96,6 @@ impl crate::backend::BackendStorage for CudaStorage {
    fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()> {
        Err(Error::NotCompiledWithCudaSupport)
    }
-
-    fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
-    fn upsample_nearest2d(&self, _: &Layout, _: usize, _: usize) -> Result<Self> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
 }

 impl crate::backend::BackendDevice for CudaDevice {
@ -167,10 +104,6 @@ impl crate::backend::BackendDevice for CudaDevice {
        Err(Error::NotCompiledWithCudaSupport)
    }

-    fn set_seed(&self, _: u64) -> Result<()> {
-        Err(Error::NotCompiledWithCudaSupport)
-    }
-
    fn location(&self) -> crate::DeviceLocation {
        fail!()
    }
--- a/candle-core/src/error.rs
+++ b/candle-core/src/error.rs
@ -30,7 +30,7 @@ pub enum Error {
    UnsupportedDTypeForOp(DType, &'static str),

    // === Dimension Index Errors ===
-    #[error("{op}: dimension index {dim} out of range for shape {shape:?}")]
+    #[error("{op}: dimension index {dim} out of range for {shape:?}")]
    DimOutOfRange {
        shape: Shape,
        dim: i32,
@ -79,19 +79,6 @@ pub enum Error {
        nth_shape: Shape,
    },

-    #[error("Cannot divide tensor of shape {shape:?} equally along dim {dim} into {n_parts}")]
-    ShapeMismatchSplit {
-        shape: Shape,
-        dim: usize,
-        n_parts: usize,
-    },
-
-    #[error("{op} can only be performed on a single dimension")]
-    OnlySingleDimension { op: &'static str, dims: Vec<usize> },
-
-    #[error("empty tensor for {op}")]
-    EmptyTensor { op: &'static str },
-
    // === Device Errors ===
    #[error("device mismatch in {op}, lhs: {lhs:?}, rhs: {rhs:?}")]
    DeviceMismatchBinaryOp {
@ -119,11 +106,11 @@ pub enum Error {
        msg: &'static str,
    },

-    #[error("{op} invalid index {index} with dim size {size}")]
+    #[error("{op} invalid index {index} with vocab {vocab_size}")]
    InvalidIndex {
        op: &'static str,
        index: usize,
-        size: usize,
+        vocab_size: usize,
    },

    #[error("cannot broadcast {src_shape:?} to {dst_shape:?}")]
@ -181,37 +168,21 @@ pub enum Error {
    #[error("unsupported safetensor dtype {0:?}")]
    UnsupportedSafeTensorDtype(safetensors::Dtype),

-    /// Arbitrary errors wrapping.
    #[error(transparent)]
    Wrapped(Box<dyn std::error::Error + Send + Sync>),

-    /// Adding path information to an error.
-    #[error("path: {path:?} {inner}")]
-    WithPath {
-        inner: Box<Self>,
-        path: std::path::PathBuf,
-    },
-
    #[error("{inner}\n{backtrace}")]
    WithBacktrace {
        inner: Box<Self>,
        backtrace: Box<std::backtrace::Backtrace>,
    },
-
-    /// User generated error message, typically created via `bail!`.
-    #[error("{0}")]
-    Msg(String),
 }

 pub type Result<T> = std::result::Result<T, Error>;

 impl Error {
    pub fn wrap(err: impl std::error::Error + Send + Sync + 'static) -> Self {
-        Self::Wrapped(Box::new(err)).bt()
-    }
-
-    pub fn msg(err: impl std::error::Error + Send + Sync + 'static) -> Self {
-        Self::Msg(err.to_string()).bt()
+        Self::Wrapped(Box::new(err))
    }

    pub fn bt(self) -> Self {
@ -225,32 +196,4 @@ impl Error {
            },
        }
    }
-
-    pub fn with_path<P: AsRef<std::path::Path>>(self, p: P) -> Self {
-        Self::WithPath {
-            inner: Box::new(self),
-            path: p.as_ref().to_path_buf(),
-        }
-    }
-}
-
-#[macro_export]
-macro_rules! bail {
-    ($msg:literal $(,)?) => {
-        return Err($crate::Error::Msg(format!($msg).into()).bt())
-    };
-    ($err:expr $(,)?) => {
-        return Err($crate::Error::Msg(format!($err).into()).bt())
-    };
-    ($fmt:expr, $($arg:tt)*) => {
-        return Err($crate::Error::Msg(format!($fmt, $($arg)*).into()).bt())
-    };
-}
-
-pub fn zip<T, U>(r1: Result<T>, r2: Result<U>) -> Result<(T, U)> {
-    match (r1, r2) {
-        (Ok(r1), Ok(r2)) => Ok((r1, r2)),
-        (Err(e), _) => Err(e),
-        (_, Err(e)) => Err(e),
-    }
 }
--- a/candle-core/src/indexer.rs
+++ b/candle-core/src/indexer.rs
@ -7,7 +7,7 @@ impl Tensor {
    /// Intended to be use by the trait `.i()`
    ///
    /// ```
-    /// # use candle_core::{Tensor, DType, Device, IndexOp};
+    /// # use candle::{Tensor, DType, Device, IndexOp};
    /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
    ///
    /// let c = a.i(0..1)?;
@ -22,7 +22,7 @@ impl Tensor {
    /// let c = a.i((.., ..=2))?;
    /// assert_eq!(c.shape().dims(), &[2, 3]);
    ///
-    /// # Ok::<(), candle_core::Error>(())
+    /// # Ok::<(), candle::Error>(())
    /// ```
    fn index(&self, indexers: &[TensorIndexer]) -> Result<Self, Error> {
        let mut x = self.clone();
@ -42,35 +42,23 @@ impl Tensor {
                        Bound::Excluded(n) => *n,
                        Bound::Unbounded => dims[i],
                    };
-                    let out = x.narrow(current_dim, start, stop.saturating_sub(start))?;
+                    let out = x.narrow(current_dim, start, stop - start)?;
                    current_dim += 1;
                    out
                }
-                TensorIndexer::IndexSelect(indexes) => {
-                    if indexes.rank() != 1 {
-                        crate::bail!("multi-dimensional tensor indexing is not supported")
-                    }
-                    let out = x.index_select(&indexes.to_device(x.device())?, current_dim)?;
-                    current_dim += 1;
-                    out
-                }
-                TensorIndexer::Err(e) => crate::bail!("indexing error {e:?}"),
            };
        }
        Ok(x)
    }
 }

-#[derive(Debug)]
+#[derive(Debug, Clone)]
 /// Generic structure used to index a slice of the tensor
 pub enum TensorIndexer {
    /// This selects the elemnts for which an index has some specific value.
    Select(usize),
    /// This is a regular slice, purely indexing a chunk of the tensor
    Narrow(Bound<usize>, Bound<usize>),
-    /// Indexing via a 1d tensor
-    IndexSelect(Tensor),
-    Err(Error),
 }

 impl From<usize> for TensorIndexer {
@ -79,31 +67,6 @@ impl From<usize> for TensorIndexer {
    }
 }

-impl From<&[u32]> for TensorIndexer {
-    fn from(index: &[u32]) -> Self {
-        match Tensor::new(index, &crate::Device::Cpu) {
-            Ok(tensor) => TensorIndexer::IndexSelect(tensor),
-            Err(e) => TensorIndexer::Err(e),
-        }
-    }
-}
-
-impl From<Vec<u32>> for TensorIndexer {
-    fn from(index: Vec<u32>) -> Self {
-        let len = index.len();
-        match Tensor::from_vec(index, len, &crate::Device::Cpu) {
-            Ok(tensor) => TensorIndexer::IndexSelect(tensor),
-            Err(e) => TensorIndexer::Err(e),
-        }
-    }
-}
-
-impl From<&Tensor> for TensorIndexer {
-    fn from(tensor: &Tensor) -> Self {
-        TensorIndexer::IndexSelect(tensor.clone())
-    }
-}
-
 macro_rules! impl_from_range {
    ($range_type:ty) => {
        impl From<$range_type> for TensorIndexer {
--- a/candle-core/src/layout.rs
+++ b/candle-core/src/layout.rs
@ -9,14 +9,6 @@ pub struct Layout {
 }

 impl Layout {
-    pub fn new(shape: Shape, stride: Vec<usize>, start_offset: usize) -> Self {
-        Self {
-            shape,
-            stride,
-            start_offset,
-        }
-    }
-
    pub fn contiguous_with_offset<S: Into<Shape>>(shape: S, start_offset: usize) -> Self {
        let shape = shape.into();
        let stride = shape.stride_contiguous();
@ -120,31 +112,6 @@ impl Layout {
        })
    }

-    pub(crate) fn permute(&self, idxs: &[usize]) -> Result<Self> {
-        let is_permutation =
-            idxs.len() == self.shape.rank() && (0..idxs.len()).all(|i| idxs.contains(&i));
-        if !is_permutation {
-            crate::bail!(
-                "dimension mismatch in permute, tensor {:?}, dims: {:?}",
-                self.dims(),
-                idxs
-            )
-        }
-        let stride = self.stride();
-        let dims = self.shape().dims();
-        let mut perm_stride = stride.to_vec();
-        let mut perm_dims = dims.to_vec();
-        for (i, &idx) in idxs.iter().enumerate() {
-            perm_stride[i] = stride[idx];
-            perm_dims[i] = dims[idx];
-        }
-        Ok(Self {
-            shape: Shape::from(perm_dims),
-            stride: perm_stride,
-            start_offset: self.start_offset,
-        })
-    }
-
    pub fn broadcast_as<S: Into<Shape>>(&self, shape: S) -> Result<Self> {
        let shape = shape.into();
        if shape.rank() < self.shape().rank() {
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@ -1,8 +1,8 @@
 //! ML framework for Rust
 //!
 //! ```rust
-//! use candle_core::{Tensor, DType, Device};
-//! # use candle_core::Error;
+//! use candle::{Tensor, DType, Device};
+//! # use candle::Error;
 //! # fn main() -> Result<(), Error>{
 //!
 //! let a = Tensor::arange(0f32, 6f32, &Device::Cpu)?.reshape((2, 3))?;
@ -33,48 +33,38 @@
 //!
 //! Rust is cool, and a lot of the HF ecosystem already has Rust crates [safetensors](https://github.com/huggingface/safetensors) and [tokenizers](https://github.com/huggingface/tokenizers)

-#[cfg(feature = "accelerate")]
-mod accelerate;
-pub mod backend;
-pub mod backprop;
+mod backend;
+mod backprop;
 mod conv;
 mod convert;
-pub mod cpu;
-pub mod cpu_backend;
+mod cpu_backend;
 #[cfg(feature = "cuda")]
-pub mod cuda_backend;
-#[cfg(feature = "cudnn")]
-pub mod cudnn;
+mod cuda_backend;
 mod device;
 pub mod display;
 mod dtype;
 mod dummy_cuda_backend;
-pub mod error;
+mod error;
 mod indexer;
 pub mod layout;
 #[cfg(feature = "mkl")]
 mod mkl;
 pub mod npy;
 mod op;
-pub mod pickle;
-pub mod quantized;
 pub mod safetensors;
-pub mod scalar;
-pub mod shape;
+mod shape;
 mod storage;
 mod strided_index;
 mod tensor;
-pub mod test_utils;
 pub mod utils;
 mod variable;

 pub use cpu_backend::CpuStorage;
 pub use device::{Device, DeviceLocation};
-pub use dtype::{DType, FloatDType, IntDType, WithDType};
+pub use dtype::{DType, WithDType};
 pub use error::{Error, Result};
 pub use indexer::IndexOp;
 pub use layout::Layout;
-pub use op::{CustomOp1, CustomOp2, CustomOp3};
 pub use shape::{Shape, D};
 pub use storage::Storage;
 pub use strided_index::{StridedBlocks, StridedIndex};
@ -89,39 +79,3 @@ pub use dummy_cuda_backend::{CudaDevice, CudaStorage};

 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;
-
-#[cfg(feature = "accelerate")]
-extern crate accelerate_src;
-
-pub trait ToUsize2 {
-    fn to_usize2(self) -> (usize, usize);
-}
-
-impl ToUsize2 for usize {
-    fn to_usize2(self) -> (usize, usize) {
-        (self, self)
-    }
-}
-
-impl ToUsize2 for (usize, usize) {
-    fn to_usize2(self) -> (usize, usize) {
-        self
-    }
-}
-
-// A simple trait defining a module with forward method using a single argument.
-pub trait Module {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor>;
-}
-
-impl Module for quantized::QMatMul {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        self.forward(xs)
-    }
-}
-
-impl<T: Fn(&Tensor) -> Result<Tensor>> Module for T {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        self(xs)
-    }
-}
--- a/candle-core/src/mkl.rs
+++ b/candle-core/src/mkl.rs
@ -25,10 +25,6 @@ mod ffi {
        pub fn vdMul(n: c_int, a: *const c_double, b: *const c_double, y: *mut c_double);
        pub fn vsDiv(n: c_int, a: *const c_float, b: *const c_float, y: *mut c_float);
        pub fn vdDiv(n: c_int, a: *const c_double, b: *const c_double, y: *mut c_double);
-        pub fn vsFmax(n: c_int, a: *const c_float, b: *const c_float, y: *mut c_float);
-        pub fn vdFmax(n: c_int, a: *const c_double, b: *const c_double, y: *mut c_double);
-        pub fn vsFmin(n: c_int, a: *const c_float, b: *const c_float, y: *mut c_float);
-        pub fn vdFmin(n: c_int, a: *const c_double, b: *const c_double, y: *mut c_double);

        pub fn sgemm_(
            transa: *const c_char,
@ -301,7 +297,7 @@ pub fn vd_sqr(a: &[f64], y: &mut [f64]) {
 }

 #[inline]
-pub fn vs_tanh(a: &[f32], y: &mut [f32]) {
+fn vs_tanh(a: &[f32], y: &mut [f32]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
@ -311,7 +307,7 @@ pub fn vs_tanh(a: &[f32], y: &mut [f32]) {
 }

 #[inline]
-pub fn vd_tanh(a: &[f64], y: &mut [f64]) {
+fn vd_tanh(a: &[f64], y: &mut [f64]) {
    let a_len = a.len();
    let y_len = y.len();
    if a_len != y_len {
@ -380,7 +376,3 @@ binary_op!(vs_mul, f32, vsMul);
 binary_op!(vd_mul, f64, vdMul);
 binary_op!(vs_div, f32, vsDiv);
 binary_op!(vd_div, f64, vdDiv);
-binary_op!(vs_max, f32, vsFmax);
-binary_op!(vd_max, f64, vdFmax);
-binary_op!(vs_min, f32, vsFmin);
-binary_op!(vd_min, f64, vdFmin);
--- a/candle-core/src/npy.rs
+++ b/candle-core/src/npy.rs
@ -26,7 +26,7 @@
 //! values = np.loadz("test.npz")
 //! ```
 use crate::{DType, Device, Error, Result, Shape, Tensor};
-use byteorder::{LittleEndian, ReadBytesExt};
+use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use half::{bf16, f16, slice::HalfFloatSliceExt};
 use std::collections::HashMap;
 use std::fs::File;
@ -85,7 +85,6 @@ impl Header {
            DType::F16 => "f2",
            DType::F32 => "f4",
            DType::F64 => "f8",
-            DType::I64 => "i8",
            DType::U32 => "u4",
            DType::U8 => "u1",
        };
@ -161,7 +160,7 @@ impl Header {
                    "f" | "f4" => DType::F32,
                    "d" | "f8" => DType::F64,
                    // "i" | "i4" => DType::S32,
-                    "q" | "i8" => DType::I64,
+                    // "q" | "i8" => DType::S64,
                    // "h" | "i2" => DType::S16,
                    // "b" | "i1" => DType::S8,
                    "B" | "u1" => DType::U8,
@ -197,11 +196,7 @@ impl Header {

 impl Tensor {
    // TODO: Add the possibility to read directly to a device?
-    pub(crate) fn from_reader<R: std::io::Read>(
-        shape: Shape,
-        dtype: DType,
-        reader: &mut R,
-    ) -> Result<Self> {
+    fn from_reader<R: std::io::Read>(shape: Shape, dtype: DType, reader: &mut R) -> Result<Self> {
        let elem_count = shape.elem_count();
        match dtype {
            DType::BF16 => {
@ -234,11 +229,6 @@ impl Tensor {
                reader.read_u32_into::<LittleEndian>(&mut data_t)?;
                Tensor::from_vec(data_t, shape, &Device::Cpu)
            }
-            DType::I64 => {
-                let mut data_t = vec![0i64; elem_count];
-                reader.read_i64_into::<LittleEndian>(&mut data_t)?;
-                Tensor::from_vec(data_t, shape, &Device::Cpu)
-            }
        }
    }

@ -317,7 +307,42 @@ impl Tensor {
        header.push('\n');
        f.write_all(&[(header.len() % 256) as u8, (header.len() / 256) as u8])?;
        f.write_all(header.as_bytes())?;
-        self.write_bytes(f)
+        let elem_count = self.elem_count();
+        match self.dtype() {
+            DType::BF16 => {
+                let vs = self.reshape(elem_count)?.to_vec1::<bf16>()?;
+                for &v in vs.reinterpret_cast() {
+                    f.write_u16::<LittleEndian>(v)?
+                }
+            }
+            DType::F16 => {
+                let vs = self.reshape(elem_count)?.to_vec1::<f16>()?;
+                for &v in vs.reinterpret_cast() {
+                    f.write_u16::<LittleEndian>(v)?
+                }
+            }
+            DType::F32 => {
+                // TODO: Avoid using a buffer when data is already on the CPU.
+                for v in self.reshape(elem_count)?.to_vec1::<f32>()? {
+                    f.write_f32::<LittleEndian>(v)?
+                }
+            }
+            DType::F64 => {
+                for v in self.reshape(elem_count)?.to_vec1::<f64>()? {
+                    f.write_f64::<LittleEndian>(v)?
+                }
+            }
+            DType::U32 => {
+                for v in self.reshape(elem_count)?.to_vec1::<u32>()? {
+                    f.write_u32::<LittleEndian>(v)?
+                }
+            }
+            DType::U8 => {
+                let data = self.reshape(elem_count)?.to_vec1::<u8>()?;
+                f.write_all(&data)?;
+            }
+        }
+        Ok(())
    }

    /// Writes a multi-dimensional array in the npy format.
@ -348,7 +373,7 @@ pub struct NpzTensors {
    index_per_name: HashMap<String, usize>,
    path: std::path::PathBuf,
    // We do not store a zip reader as it needs mutable access to extract data. Instead we
-    // re-create a zip reader for each tensor.
+    // re-create a zip reader each time.
 }

 impl NpzTensors {
@ -371,25 +396,6 @@ impl NpzTensors {
        })
    }

-    pub fn names(&self) -> Vec<&String> {
-        self.index_per_name.keys().collect()
-    }
-
-    /// This only returns the shape and dtype for a named tensor. Compared to `get`, this avoids
-    /// reading the whole tensor data.
-    pub fn get_shape_and_dtype(&self, name: &str) -> Result<(Shape, DType)> {
-        let index = match self.index_per_name.get(name) {
-            None => crate::bail!("cannot find tensor {name}"),
-            Some(index) => *index,
-        };
-        let zip_reader = BufReader::new(File::open(&self.path)?);
-        let mut zip = zip::ZipArchive::new(zip_reader)?;
-        let mut reader = zip.by_index(index)?;
-        let header = read_header(&mut reader)?;
-        let header = Header::parse(&header)?;
-        Ok((header.shape(), header.descr))
-    }
-
    pub fn get(&self, name: &str) -> Result<Option<Tensor>> {
        let index = match self.index_per_name.get(name) {
            None => return Ok(None),
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@ -1,84 +1,15 @@
-#![allow(clippy::redundant_closure_call)]
-use crate::{CpuStorage, CudaStorage, Layout, Result, Shape, Tensor};
+use crate::Tensor;
 use half::{bf16, f16};
 use num_traits::float::Float;

-#[derive(Clone, Copy, PartialEq, Eq)]
-pub enum CmpOp {
-    Eq,
-    Ne,
-    Le,
-    Ge,
-    Lt,
-    Gt,
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum ReduceOp {
-    Sum,
-    Min,
-    Max,
-    ArgMin,
-    ArgMax,
-}
-
-impl ReduceOp {
-    pub(crate) fn name(&self) -> &'static str {
-        match self {
-            Self::ArgMax => "argmax",
-            Self::ArgMin => "argmin",
-            Self::Min => "min",
-            Self::Max => "max",
-            Self::Sum => "sum",
-        }
-    }
-}
-
-// These ops return the same type as their input type.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum BinaryOp {
-    Add,
-    Mul,
-    Sub,
-    Div,
-    Maximum,
-    Minimum,
-}
-
-// Unary ops with no argument
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum UnaryOp {
-    Exp,
-    Log,
-    Sin,
-    Cos,
-    Abs,
-    Neg,
-    Recip,
-    Sqr,
-    Sqrt,
-    Gelu,
-    GeluErf,
-    Erf,
-    Relu,
-    Tanh,
-    Floor,
-    Ceil,
-    Round,
-}
-
 #[derive(Clone)]
-pub enum Op {
-    Binary(Tensor, Tensor, BinaryOp),
-    Unary(Tensor, UnaryOp),
-    Cmp(Tensor, CmpOp),
-    // The third argument is the reduced shape with `keepdim=true`.
-    Reduce(Tensor, ReduceOp, Vec<usize>),
+pub(crate) enum Op {
+    Add(Tensor, Tensor),
+    Mul(Tensor, Tensor),
+    Sub(Tensor, Tensor),
+    Div(Tensor, Tensor),
    Matmul(Tensor, Tensor),
-    Gather(Tensor, Tensor, usize),
-    ScatterAdd(Tensor, Tensor, Tensor, usize),
-    IndexSelect(Tensor, Tensor, usize),
-    IndexAdd(Tensor, Tensor, Tensor, usize),
+    Embedding(Tensor, Tensor),
    WhereCond(Tensor, Tensor, Tensor),

    #[allow(dead_code)]
@ -87,43 +18,8 @@ pub enum Op {
        kernel: Tensor,
        padding: usize,
        stride: usize,
-        dilation: usize,
    },

-    #[allow(dead_code)]
-    Conv2D {
-        arg: Tensor,
-        kernel: Tensor,
-        padding: usize,
-        stride: usize,
-        dilation: usize,
-    },
-
-    #[allow(dead_code)]
-    ConvTranspose2D {
-        arg: Tensor,
-        kernel: Tensor,
-        padding: usize,
-        output_padding: usize,
-        stride: usize,
-        dilation: usize,
-    },
-
-    AvgPool2D {
-        arg: Tensor,
-        kernel_size: (usize, usize),
-        stride: (usize, usize),
-    },
-
-    MaxPool2D {
-        arg: Tensor,
-        kernel_size: (usize, usize),
-        stride: (usize, usize),
-    },
-
-    UpsampleNearest1D(Tensor),
-    UpsampleNearest2D(Tensor),
-
    Cat(Vec<Tensor>, usize),

    #[allow(dead_code)] // add is currently unused.
@ -132,138 +28,29 @@ pub enum Op {
        mul: f64,
        add: f64,
    },
+    Sum(Tensor, Vec<usize>),
    ToDType(Tensor),
-    Copy(Tensor),
    Broadcast(Tensor),
+    Exp(Tensor),
+    Log(Tensor),
+    Sin(Tensor),
+    Cos(Tensor),
+    Abs(Tensor),
    Narrow(Tensor, usize, usize, usize),
-    SliceScatter0(Tensor, Tensor, usize),
+    Neg(Tensor),
    Reshape(Tensor),
+    Softmax(Tensor, usize),
+    Sqr(Tensor),
+    Sqrt(Tensor),
    ToDevice(Tensor),
    Transpose(Tensor, usize, usize),
-    Permute(Tensor, Vec<usize>),
+    Gelu(Tensor),
+    Relu(Tensor),
    Elu(Tensor, f64),
-    Powf(Tensor, f64),
-    CustomOp1(Tensor, std::sync::Arc<Box<dyn CustomOp1 + Send + Sync>>),
-    CustomOp2(
-        Tensor,
-        Tensor,
-        std::sync::Arc<Box<dyn CustomOp2 + Send + Sync>>,
-    ),
-    CustomOp3(
-        Tensor,
-        Tensor,
-        Tensor,
-        std::sync::Arc<Box<dyn CustomOp3 + Send + Sync>>,
-    ),
+    // TODO: Support for custom ops.
 }

-/// Unary ops that can be defined in user-land.
-pub trait CustomOp1 {
-    // Box<dyn> does not support const yet, so use a function to get the name.
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result<(CudaStorage, Shape)> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    /// This function takes as argument the argument `arg` used in the forward pass, the result
-    /// produced by the forward operation `res` and the gradient of the result `grad_res`.
-    /// The function should return the gradient of the argument.
-    fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Result<Option<Tensor>> {
-        Err(crate::Error::BackwardNotSupported { op: self.name() })
-    }
-}
-
-pub trait CustomOp2 {
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(
-        &self,
-        s1: &CpuStorage,
-        l1: &Layout,
-        s2: &CpuStorage,
-        l2: &Layout,
-    ) -> Result<(CpuStorage, Shape)>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(
-        &self,
-        _: &CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-    ) -> Result<(CudaStorage, Shape)> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    fn bwd(
-        &self,
-        _arg1: &Tensor,
-        _arg2: &Tensor,
-        _res: &Tensor,
-        _grad_res: &Tensor,
-    ) -> Result<(Option<Tensor>, Option<Tensor>)> {
-        Err(crate::Error::BackwardNotSupported { op: self.name() })
-    }
-}
-
-pub trait CustomOp3 {
-    fn name(&self) -> &'static str;
-
-    /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cpu_fwd(
-        &self,
-        s1: &CpuStorage,
-        l1: &Layout,
-        s2: &CpuStorage,
-        l2: &Layout,
-        s3: &CpuStorage,
-        l3: &Layout,
-    ) -> Result<(CpuStorage, Shape)>;
-
-    /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
-    /// offsets etc so the associated layout should be used to access it.
-    fn cuda_fwd(
-        &self,
-        _: &CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-        _: &CudaStorage,
-        _: &Layout,
-    ) -> Result<(CudaStorage, Shape)> {
-        Err(crate::Error::Cuda(
-            format!("no cuda implementation for {}", self.name()).into(),
-        ))
-    }
-
-    fn bwd(
-        &self,
-        _arg1: &Tensor,
-        _arg2: &Tensor,
-        _arg3: &Tensor,
-        _res: &Tensor,
-        _grad_res: &Tensor,
-    ) -> Result<(Option<Tensor>, Option<Tensor>, Option<Tensor>)> {
-        Err(crate::Error::BackwardNotSupported { op: self.name() })
-    }
-}
-
-pub trait UnaryOpT {
+pub(crate) trait UnaryOp {
    const NAME: &'static str;
    const KERNEL: &'static str;
    const V: Self;
@ -273,7 +60,6 @@ pub trait UnaryOpT {
    fn f64(v1: f64) -> f64;
    fn u8(v1: u8) -> u8;
    fn u32(v1: u32) -> u32;
-    fn i64(v1: i64) -> i64;

    // There is no very good way to represent optional function in traits so we go for an explicit
    // boolean flag to mark the function as existing.
@ -287,7 +73,7 @@ pub trait UnaryOpT {
    fn f64_vec(_xs: &[f64], _ys: &mut [f64]) {}
 }

-pub trait BinaryOpT {
+pub(crate) trait BinaryOp {
    const NAME: &'static str;
    const KERNEL: &'static str;
    const V: Self;
@ -297,7 +83,6 @@ pub trait BinaryOpT {
    fn f64(v1: f64, v2: f64) -> f64;
    fn u8(v1: u8, v2: u8) -> u8;
    fn u32(v1: u32, v2: u32) -> u32;
-    fn i64(v1: i64, v2: i64) -> i64;

    const BF16_VEC: bool = false;
    fn bf16_vec(_xs1: &[bf16], _xs2: &[bf16], _ys: &mut [bf16]) {}
@ -311,37 +96,26 @@ pub trait BinaryOpT {
    fn u8_vec(_xs1: &[u8], _xs2: &[u8], _ys: &mut [u8]) {}
    const U32_VEC: bool = false;
    fn u32_vec(_xs1: &[u32], _xs2: &[u32], _ys: &mut [u32]) {}
-    const I64_VEC: bool = false;
-    fn i64_vec(_xs1: &[i64], _xs2: &[i64], _ys: &mut [i64]) {}
 }

 pub(crate) struct Add;
 pub(crate) struct Div;
 pub(crate) struct Mul;
 pub(crate) struct Sub;
-pub(crate) struct Maximum;
-pub(crate) struct Minimum;
 pub(crate) struct Exp;
 pub(crate) struct Log;
 pub(crate) struct Sin;
 pub(crate) struct Cos;
 pub(crate) struct Abs;
 pub(crate) struct Neg;
-pub(crate) struct Recip;
 pub(crate) struct Sqr;
 pub(crate) struct Sqrt;
 pub(crate) struct Gelu;
-pub(crate) struct GeluErf;
-pub(crate) struct Erf;
 pub(crate) struct Relu;
-pub(crate) struct Tanh;
-pub(crate) struct Floor;
-pub(crate) struct Ceil;
-pub(crate) struct Round;

 macro_rules! bin_op {
    ($op:ident, $name: literal, $e: expr, $f32_vec: ident, $f64_vec: ident) => {
-        impl BinaryOpT for $op {
+        impl BinaryOp for $op {
            const NAME: &'static str = $name;
            const KERNEL: &'static str = concat!("b", $name);
            const V: Self = $op;
@ -369,10 +143,6 @@ macro_rules! bin_op {
            fn u32(v1: u32, v2: u32) -> u32 {
                $e(v1, v2)
            }
-            #[inline(always)]
-            fn i64(v1: i64, v2: i64) -> i64 {
-                $e(v1, v2)
-            }

            #[cfg(feature = "mkl")]
            const F32_VEC: bool = true;
@ -388,21 +158,6 @@ macro_rules! bin_op {
            fn f64_vec(xs1: &[f64], xs2: &[f64], ys: &mut [f64]) {
                crate::mkl::$f64_vec(xs1, xs2, ys)
            }
-
-            #[cfg(feature = "accelerate")]
-            const F32_VEC: bool = true;
-            #[cfg(feature = "accelerate")]
-            const F64_VEC: bool = true;
-            #[cfg(feature = "accelerate")]
-            #[inline(always)]
-            fn f32_vec(xs1: &[f32], xs2: &[f32], ys: &mut [f32]) {
-                crate::accelerate::$f32_vec(xs1, xs2, ys)
-            }
-            #[cfg(feature = "accelerate")]
-            #[inline(always)]
-            fn f64_vec(xs1: &[f64], xs2: &[f64], ys: &mut [f64]) {
-                crate::accelerate::$f64_vec(xs1, xs2, ys)
-            }
        }
    };
 }
@ -411,25 +166,10 @@ bin_op!(Add, "add", |v1, v2| v1 + v2, vs_add, vd_add);
 bin_op!(Sub, "sub", |v1, v2| v1 - v2, vs_sub, vd_sub);
 bin_op!(Mul, "mul", |v1, v2| v1 * v2, vs_mul, vd_mul);
 bin_op!(Div, "div", |v1, v2| v1 / v2, vs_div, vd_div);
-bin_op!(
-    Minimum,
-    "minimum",
-    |v1, v2| if v1 > v2 { v2 } else { v1 },
-    vs_min,
-    vd_min
-);
-bin_op!(
-    Maximum,
-    "maximum",
-    |v1, v2| if v1 < v2 { v2 } else { v1 },
-    vs_max,
-    vd_max
-);

-#[allow(clippy::redundant_closure_call)]
 macro_rules! unary_op {
    ($op: ident, $name: literal, $a: ident, $e: expr) => {
-        impl UnaryOpT for $op {
+        impl UnaryOp for $op {
            const NAME: &'static str = $name;
            const KERNEL: &'static str = concat!("u", $name);
            const V: Self = $op;
@ -457,15 +197,11 @@ macro_rules! unary_op {
            fn u32(_: u32) -> u32 {
                todo!("no unary function for u32")
            }
-            #[inline(always)]
-            fn i64(_: i64) -> i64 {
-                todo!("no unary function for i64")
-            }
        }
    };

    ($op: ident, $name: literal, $a: ident, $e: expr, $f32_vec:ident, $f64_vec:ident) => {
-        impl UnaryOpT for $op {
+        impl UnaryOp for $op {
            const NAME: &'static str = $name;
            const KERNEL: &'static str = concat!("u", $name);
            const V: Self = $op;
@ -493,10 +229,6 @@ macro_rules! unary_op {
            fn u32(_: u32) -> u32 {
                todo!("no unary function for u32")
            }
-            #[inline(always)]
-            fn i64(_: i64) -> i64 {
-                todo!("no unary function for i64")
-            }

            #[cfg(feature = "mkl")]
            const F32_VEC: bool = true;
@ -512,21 +244,6 @@ macro_rules! unary_op {
            fn f64_vec(xs: &[f64], ys: &mut [f64]) {
                crate::mkl::$f64_vec(xs, ys)
            }
-
-            #[cfg(feature = "accelerate")]
-            const F32_VEC: bool = true;
-            #[cfg(feature = "accelerate")]
-            const F64_VEC: bool = true;
-            #[cfg(feature = "accelerate")]
-            #[inline(always)]
-            fn f32_vec(xs: &[f32], ys: &mut [f32]) {
-                crate::accelerate::$f32_vec(xs, ys)
-            }
-            #[cfg(feature = "accelerate")]
-            #[inline(always)]
-            fn f64_vec(xs: &[f64], ys: &mut [f64]) {
-                crate::accelerate::$f64_vec(xs, ys)
-            }
        }
    };
 }
@ -535,16 +252,14 @@ unary_op!(Exp, "exp", v, v.exp(), vs_exp, vd_exp);
 unary_op!(Log, "log", v, v.ln(), vs_ln, vd_ln);
 unary_op!(Sin, "sin", v, v.sin(), vs_sin, vd_sin);
 unary_op!(Cos, "cos", v, v.cos(), vs_cos, vd_cos);
-unary_op!(Tanh, "tanh", v, v.tanh(), vs_tanh, vd_tanh);
 unary_op!(Abs, "abs", v, v.abs());
 unary_op!(Neg, "neg", v, -v);
-unary_op!(Recip, "recip", v, v.recip());
 unary_op!(Sqr, "sqr", v, v * v, vs_sqr, vd_sqr);
 unary_op!(Sqrt, "sqrt", v, v.sqrt(), vs_sqrt, vd_sqrt);

 /// `gelu` operation
 /// <https://en.wikipedia.org/wiki/Activation_function#Comparison_of_activation_functions>
-impl UnaryOpT for Gelu {
+impl UnaryOp for Gelu {
    const NAME: &'static str = "gelu";
    const V: Self = Gelu;
    #[inline(always)]
@ -589,10 +304,6 @@ impl UnaryOpT for Gelu {
    fn u32(_: u32) -> u32 {
        0
    }
-    #[inline(always)]
-    fn i64(_: i64) -> i64 {
-        0
-    }
    const KERNEL: &'static str = "ugelu";

    #[cfg(feature = "mkl")]
@ -612,197 +323,9 @@ impl UnaryOpT for Gelu {
    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
        crate::mkl::vd_gelu(xs, ys)
    }
-
-    #[cfg(feature = "accelerate")]
-    const F32_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
-        crate::accelerate::vs_gelu(xs, ys)
-    }
-
-    #[cfg(feature = "accelerate")]
-    const F64_VEC: bool = true;
-
-    #[cfg(feature = "accelerate")]
-    #[inline(always)]
-    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
-        crate::accelerate::vd_gelu(xs, ys)
-    }
 }

-impl UnaryOpT for Erf {
-    const NAME: &'static str = "erf";
-    const KERNEL: &'static str = "uerf";
-    const V: Self = Erf;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        bf16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        f16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        Self::f64(v as f64) as f32
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        crate::cpu::erf::erf(v)
-    }
-    #[inline(always)]
-    fn u8(_: u8) -> u8 {
-        0
-    }
-    #[inline(always)]
-    fn u32(_: u32) -> u32 {
-        0
-    }
-    #[inline(always)]
-    fn i64(_: i64) -> i64 {
-        0
-    }
-}
-
-impl UnaryOpT for Ceil {
-    const NAME: &'static str = "ceil";
-    const KERNEL: &'static str = "uceil";
-    const V: Self = Ceil;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.ceil()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v
-    }
-}
-
-impl UnaryOpT for Floor {
-    const NAME: &'static str = "floor";
-    const KERNEL: &'static str = "ufloor";
-    const V: Self = Floor;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.floor()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v
-    }
-}
-
-impl UnaryOpT for Round {
-    const NAME: &'static str = "round";
-    const KERNEL: &'static str = "uround";
-    const V: Self = Round;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        v.round()
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        v.round()
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        v.round()
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        v.round()
-    }
-    #[inline(always)]
-    fn u8(v: u8) -> u8 {
-        v
-    }
-    #[inline(always)]
-    fn u32(v: u32) -> u32 {
-        v
-    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v
-    }
-}
-
-impl UnaryOpT for GeluErf {
-    const NAME: &'static str = "gelu_erf";
-    const KERNEL: &'static str = "ugelu_erf";
-    const V: Self = GeluErf;
-    #[inline(always)]
-    fn bf16(v: bf16) -> bf16 {
-        bf16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f16(v: f16) -> f16 {
-        f16::from_f64(Self::f64(v.to_f64()))
-    }
-    #[inline(always)]
-    fn f32(v: f32) -> f32 {
-        Self::f64(v as f64) as f32
-    }
-    #[inline(always)]
-    fn f64(v: f64) -> f64 {
-        (crate::cpu::erf::erf(v / 2f64.sqrt()) + 1.) * 0.5 * v
-    }
-    #[inline(always)]
-    fn u8(_: u8) -> u8 {
-        0
-    }
-    #[inline(always)]
-    fn u32(_: u32) -> u32 {
-        0
-    }
-    #[inline(always)]
-    fn i64(_: i64) -> i64 {
-        0
-    }
-}
-
-impl UnaryOpT for Relu {
+impl UnaryOp for Relu {
    const NAME: &'static str = "relu";
    const KERNEL: &'static str = "urelu";
    const V: Self = Relu;
@ -830,68 +353,4 @@ impl UnaryOpT for Relu {
    fn u32(v: u32) -> u32 {
        v
    }
-    #[inline(always)]
-    fn i64(v: i64) -> i64 {
-        v
-    }
-}
-
-/// `BackpropOp` is a wrapper around `Option<Op>`. The main goal is to ensure that dependencies are
-/// properly checked when creating a new value
-#[derive(Clone)]
-pub struct BackpropOp(Option<Op>);
-
-impl BackpropOp {
-    pub(crate) fn none() -> Self {
-        BackpropOp(None)
-    }
-
-    pub(crate) fn new1(arg: &Tensor, f: impl Fn(Tensor) -> Op) -> Self {
-        let op = if arg.track_op() {
-            Some(f(arg.clone()))
-        } else {
-            None
-        };
-        Self(op)
-    }
-
-    pub(crate) fn new2(arg1: &Tensor, arg2: &Tensor, f: impl Fn(Tensor, Tensor) -> Op) -> Self {
-        let op = if arg1.track_op() || arg2.track_op() {
-            Some(f(arg1.clone(), arg2.clone()))
-        } else {
-            None
-        };
-        Self(op)
-    }
-
-    pub(crate) fn new3(
-        arg1: &Tensor,
-        arg2: &Tensor,
-        arg3: &Tensor,
-        f: impl Fn(Tensor, Tensor, Tensor) -> Op,
-    ) -> Self {
-        let op = if arg1.track_op() || arg2.track_op() || arg3.track_op() {
-            Some(f(arg1.clone(), arg2.clone(), arg3.clone()))
-        } else {
-            None
-        };
-        Self(op)
-    }
-
-    pub(crate) fn new<A: AsRef<Tensor>>(args: &[A], f: impl Fn(Vec<Tensor>) -> Op) -> Self {
-        let op = if args.iter().any(|arg| arg.as_ref().track_op()) {
-            let args: Vec<Tensor> = args.iter().map(|arg| arg.as_ref().clone()).collect();
-            Some(f(args))
-        } else {
-            None
-        };
-        Self(op)
-    }
-}
-
-impl std::ops::Deref for BackpropOp {
-    type Target = Option<Op>;
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
 }
--- a/candle-core/src/pickle.rs
+++ b/candle-core/src/pickle.rs
@ -1,725 +0,0 @@
-// Just enough pickle support to be able to read PyTorch checkpoints.
-// This hardcodes objects that are required for tensor reading, we may want to make this a bit more
-// composable/tensor agnostic at some point.
-use crate::{DType, Error as E, Layout, Result, Tensor};
-use byteorder::{LittleEndian, ReadBytesExt};
-use std::collections::HashMap;
-use std::io::BufRead;
-
-const VERBOSE: bool = false;
-
-// https://docs.juliahub.com/Pickle/LAUNc/0.1.0/opcode/
-#[repr(u8)]
-#[derive(Debug, Eq, PartialEq, Clone)]
-pub enum OpCode {
-    // https://github.com/python/cpython/blob/ed25f097160b5cbb0c9a1f9a746d2f1bbc96515a/Lib/pickletools.py#L2123
-    Proto = 0x80,
-    Global = b'c',
-    BinPut = b'q',
-    LongBinPut = b'r',
-    EmptyTuple = b')',
-    Reduce = b'R',
-    Mark = b'(',
-    BinUnicode = b'X',
-    BinInt = b'J',
-    Tuple = b't',
-    BinPersId = b'Q',
-    BinInt1 = b'K',
-    BinInt2 = b'M',
-    Tuple1 = 0x85,
-    Tuple2 = 0x86,
-    Tuple3 = 0x87,
-    NewTrue = 0x88,
-    NewFalse = 0x89,
-    None = b'N',
-    BinGet = b'h',
-    LongBinGet = b'j',
-    SetItem = b's',
-    SetItems = b'u',
-    EmptyDict = b'}',
-    Dict = b'd',
-    Build = b'b',
-    Stop = b'.',
-    NewObj = 0x81,
-    EmptyList = b']',
-    BinFloat = b'g',
-    Append = b'a',
-    Appends = b'e',
-}
-
-// Avoid using FromPrimitive so as not to drag another dependency.
-impl TryFrom<u8> for OpCode {
-    type Error = u8;
-    fn try_from(value: u8) -> std::result::Result<Self, Self::Error> {
-        match value {
-            0x80 => Ok(Self::Proto),
-            b'c' => Ok(Self::Global),
-            b'q' => Ok(Self::BinPut),
-            b'r' => Ok(Self::LongBinPut),
-            b')' => Ok(Self::EmptyTuple),
-            b'R' => Ok(Self::Reduce),
-            b'(' => Ok(Self::Mark),
-            b'X' => Ok(Self::BinUnicode),
-            b'J' => Ok(Self::BinInt),
-            b't' => Ok(Self::Tuple),
-            b'Q' => Ok(Self::BinPersId),
-            b'K' => Ok(Self::BinInt1),
-            b'M' => Ok(Self::BinInt2),
-            b'N' => Ok(Self::None),
-            0x85 => Ok(Self::Tuple1),
-            0x86 => Ok(Self::Tuple2),
-            0x87 => Ok(Self::Tuple3),
-            0x88 => Ok(Self::NewTrue),
-            0x89 => Ok(Self::NewFalse),
-            b'h' => Ok(Self::BinGet),
-            b'j' => Ok(Self::LongBinGet),
-            b's' => Ok(Self::SetItem),
-            b'u' => Ok(Self::SetItems),
-            b'}' => Ok(Self::EmptyDict),
-            b'd' => Ok(Self::EmptyDict),
-            b'b' => Ok(Self::Build),
-            b'.' => Ok(Self::Stop),
-            0x81 => Ok(Self::NewObj),
-            b']' => Ok(Self::EmptyList),
-            b'G' => Ok(Self::BinFloat),
-            b'a' => Ok(Self::Append),
-            b'e' => Ok(Self::Appends),
-            value => Err(value),
-        }
-    }
-}
-
-fn read_to_newline<R: BufRead>(r: &mut R) -> Result<Vec<u8>> {
-    let mut data: Vec<u8> = Vec::with_capacity(32);
-    r.read_until(b'\n', &mut data)?;
-    data.pop();
-    if data.last() == Some(&b'\r') {
-        data.pop();
-    }
-    Ok(data)
-}
-
-#[derive(Debug, Clone, PartialEq)]
-pub enum Object {
-    Class {
-        module_name: String,
-        class_name: String,
-    },
-    Int(i32),
-    Float(f64),
-    Unicode(String),
-    Bool(bool),
-    None,
-    Tuple(Vec<Object>),
-    List(Vec<Object>),
-    Mark,
-    Dict(Vec<(Object, Object)>),
-    Reduce {
-        callable: Box<Object>,
-        args: Box<Object>,
-    },
-    Build {
-        callable: Box<Object>,
-        args: Box<Object>,
-    },
-    PersistentLoad(Box<Object>),
-}
-
-type OResult<T> = std::result::Result<T, Object>;
-
-impl Object {
-    pub fn unicode(self) -> OResult<String> {
-        match self {
-            Self::Unicode(t) => Ok(t),
-            _ => Err(self),
-        }
-    }
-
-    pub fn reduce(self) -> OResult<(Self, Self)> {
-        match self {
-            Self::Reduce { callable, args } => Ok((*callable, *args)),
-            _ => Err(self),
-        }
-    }
-
-    pub fn none(self) -> OResult<()> {
-        match self {
-            Self::None => Ok(()),
-            _ => Err(self),
-        }
-    }
-
-    pub fn persistent_load(self) -> OResult<Self> {
-        match self {
-            Self::PersistentLoad(t) => Ok(*t),
-            _ => Err(self),
-        }
-    }
-
-    pub fn bool(self) -> OResult<bool> {
-        match self {
-            Self::Bool(t) => Ok(t),
-            _ => Err(self),
-        }
-    }
-
-    pub fn int(self) -> OResult<i32> {
-        match self {
-            Self::Int(t) => Ok(t),
-            _ => Err(self),
-        }
-    }
-
-    pub fn tuple(self) -> OResult<Vec<Self>> {
-        match self {
-            Self::Tuple(t) => Ok(t),
-            _ => Err(self),
-        }
-    }
-
-    pub fn dict(self) -> OResult<Vec<(Self, Self)>> {
-        match self {
-            Self::Dict(t) => Ok(t),
-            _ => Err(self),
-        }
-    }
-
-    pub fn class(self) -> OResult<(String, String)> {
-        match self {
-            Self::Class {
-                module_name,
-                class_name,
-            } => Ok((module_name, class_name)),
-            _ => Err(self),
-        }
-    }
-}
-
-impl TryFrom<Object> for String {
-    type Error = Object;
-    fn try_from(value: Object) -> std::result::Result<Self, Self::Error> {
-        match value {
-            Object::Unicode(s) => Ok(s),
-            other => Err(other),
-        }
-    }
-}
-
-impl TryFrom<Object> for usize {
-    type Error = Object;
-    fn try_from(value: Object) -> std::result::Result<Self, Self::Error> {
-        match value {
-            Object::Int(s) if s >= 0 => Ok(s as usize),
-            other => Err(other),
-        }
-    }
-}
-
-impl<T: TryFrom<Object, Error = Object>> TryFrom<Object> for Vec<T> {
-    type Error = Object;
-    fn try_from(value: Object) -> std::result::Result<Self, Self::Error> {
-        match value {
-            Object::Tuple(values) => {
-                // This does not return the appropriate value in the error case but instead return
-                // the object related to the first error.
-                values
-                    .into_iter()
-                    .map(|v| T::try_from(v))
-                    .collect::<std::result::Result<Vec<T>, Self::Error>>()
-            }
-            other => Err(other),
-        }
-    }
-}
-
-#[derive(Debug)]
-pub struct Stack {
-    stack: Vec<Object>,
-    memo: HashMap<u32, Object>,
-}
-
-impl Stack {
-    pub fn empty() -> Self {
-        Self {
-            stack: Vec::with_capacity(512),
-            memo: HashMap::new(),
-        }
-    }
-
-    pub fn stack(&self) -> &[Object] {
-        self.stack.as_slice()
-    }
-
-    pub fn read_loop<R: BufRead>(&mut self, r: &mut R) -> Result<()> {
-        loop {
-            if self.read(r)? {
-                break;
-            }
-        }
-        Ok(())
-    }
-
-    pub fn finalize(mut self) -> Result<Object> {
-        self.pop()
-    }
-
-    fn push(&mut self, obj: Object) {
-        self.stack.push(obj)
-    }
-
-    fn pop(&mut self) -> Result<Object> {
-        match self.stack.pop() {
-            None => crate::bail!("unexpected empty stack"),
-            Some(obj) => Ok(obj),
-        }
-    }
-
-    // https://docs.juliahub.com/Pickle/LAUNc/0.1.0/opcode/#Pickle.OpCodes.BUILD
-    fn build(&mut self) -> Result<()> {
-        let args = self.pop()?;
-        let obj = self.pop()?;
-        let obj = match (obj, args) {
-            (Object::Dict(mut obj), Object::Dict(mut args)) => {
-                obj.append(&mut args);
-                Object::Dict(obj)
-            }
-            (obj, args) => Object::Build {
-                callable: Box::new(obj),
-                args: Box::new(args),
-            },
-        };
-        self.push(obj);
-        Ok(())
-    }
-
-    fn reduce(&mut self) -> Result<()> {
-        let args = self.pop()?;
-        let callable = self.pop()?;
-        #[allow(clippy::single_match)]
-        let reduced = match &callable {
-            Object::Class {
-                module_name,
-                class_name,
-            } => {
-                if module_name == "collections" && class_name == "OrderedDict" {
-                    // TODO: have a separate ordered dict.
-                    Some(Object::Dict(vec![]))
-                } else {
-                    None
-                }
-            }
-            _ => None,
-        };
-        let reduced = reduced.unwrap_or_else(|| Object::Reduce {
-            callable: Box::new(callable),
-            args: Box::new(args),
-        });
-        self.push(reduced);
-        Ok(())
-    }
-
-    fn last(&mut self) -> Result<&mut Object> {
-        match self.stack.last_mut() {
-            None => crate::bail!("unexpected empty stack"),
-            Some(obj) => Ok(obj),
-        }
-    }
-
-    fn memo_get(&self, id: u32) -> Result<Object> {
-        match self.memo.get(&id) {
-            None => crate::bail!("missing object in memo {id}"),
-            Some(obj) => {
-                // Maybe we should use refcounting rather than doing potential large clones here.
-                Ok(obj.clone())
-            }
-        }
-    }
-
-    fn memo_put(&mut self, id: u32) -> Result<()> {
-        let obj = self.last()?.clone();
-        self.memo.insert(id, obj);
-        Ok(())
-    }
-
-    fn persistent_load(&self, id: Object) -> Result<Object> {
-        Ok(Object::PersistentLoad(Box::new(id)))
-    }
-
-    fn new_obj(&self, class: Object, args: Object) -> Result<Object> {
-        Ok(Object::Reduce {
-            callable: Box::new(class),
-            args: Box::new(args),
-        })
-    }
-
-    fn pop_to_marker(&mut self) -> Result<Vec<Object>> {
-        let mut mark_idx = None;
-        for (idx, obj) in self.stack.iter().enumerate().rev() {
-            if obj == &Object::Mark {
-                mark_idx = Some(idx);
-                break;
-            }
-        }
-        match mark_idx {
-            Some(mark_idx) => {
-                let objs = self.stack.split_off(mark_idx + 1);
-                self.stack.pop();
-                Ok(objs)
-            }
-            None => {
-                crate::bail!("marker object not found")
-            }
-        }
-    }
-
-    pub fn read<R: BufRead>(&mut self, r: &mut R) -> Result<bool> {
-        let op_code = match OpCode::try_from(r.read_u8()?) {
-            Ok(op_code) => op_code,
-            Err(op_code) => {
-                crate::bail!("unknown op-code {op_code}")
-            }
-        };
-        // println!("op: {op_code:?}");
-        // println!("{:?}", self.stack);
-        match op_code {
-            OpCode::Proto => {
-                let version = r.read_u8()?;
-                if VERBOSE {
-                    println!("proto {version}");
-                }
-            }
-            OpCode::Global => {
-                let module_name = read_to_newline(r)?;
-                let class_name = read_to_newline(r)?;
-                let module_name = String::from_utf8_lossy(&module_name).to_string();
-                let class_name = String::from_utf8_lossy(&class_name).to_string();
-                self.push(Object::Class {
-                    module_name,
-                    class_name,
-                })
-            }
-            OpCode::BinInt1 => {
-                let arg = r.read_u8()?;
-                self.push(Object::Int(arg as i32))
-            }
-            OpCode::BinInt2 => {
-                let arg = r.read_u16::<LittleEndian>()?;
-                self.push(Object::Int(arg as i32))
-            }
-            OpCode::BinInt => {
-                let arg = r.read_i32::<LittleEndian>()?;
-                self.push(Object::Int(arg))
-            }
-            OpCode::BinFloat => {
-                let arg = r.read_f64::<LittleEndian>()?;
-                self.push(Object::Float(arg))
-            }
-            OpCode::BinUnicode => {
-                let len = r.read_u32::<LittleEndian>()?;
-                let mut data = vec![0u8; len as usize];
-                r.read_exact(&mut data)?;
-                let data = String::from_utf8(data).map_err(E::wrap)?;
-                self.push(Object::Unicode(data))
-            }
-            OpCode::BinPersId => {
-                let id = self.pop()?;
-                let obj = self.persistent_load(id)?;
-                self.push(obj)
-            }
-            OpCode::Tuple => {
-                let objs = self.pop_to_marker()?;
-                self.push(Object::Tuple(objs))
-            }
-            OpCode::Tuple1 => {
-                let obj = self.pop()?;
-                self.push(Object::Tuple(vec![obj]))
-            }
-            OpCode::Tuple2 => {
-                let obj2 = self.pop()?;
-                let obj1 = self.pop()?;
-                self.push(Object::Tuple(vec![obj1, obj2]))
-            }
-            OpCode::Tuple3 => {
-                let obj3 = self.pop()?;
-                let obj2 = self.pop()?;
-                let obj1 = self.pop()?;
-                self.push(Object::Tuple(vec![obj1, obj2, obj3]))
-            }
-            OpCode::NewTrue => self.push(Object::Bool(true)),
-            OpCode::NewFalse => self.push(Object::Bool(false)),
-            OpCode::Append => {
-                let value = self.pop()?;
-                let pylist = self.last()?;
-                if let Object::List(d) = pylist {
-                    d.push(value)
-                } else {
-                    crate::bail!("expected a list, got {pylist:?}")
-                }
-            }
-            OpCode::Appends => {
-                let objs = self.pop_to_marker()?;
-                let pylist = self.last()?;
-                if let Object::List(d) = pylist {
-                    d.extend(objs)
-                } else {
-                    crate::bail!("expected a list, got {pylist:?}")
-                }
-            }
-            OpCode::SetItem => {
-                let value = self.pop()?;
-                let key = self.pop()?;
-                let pydict = self.last()?;
-                if let Object::Dict(d) = pydict {
-                    d.push((key, value))
-                } else {
-                    crate::bail!("expected a dict, got {pydict:?}")
-                }
-            }
-            OpCode::SetItems => {
-                let mut objs = self.pop_to_marker()?;
-                let pydict = self.last()?;
-                if let Object::Dict(d) = pydict {
-                    if objs.len() % 2 != 0 {
-                        crate::bail!("setitems: not an even number of objects")
-                    }
-                    while let Some(value) = objs.pop() {
-                        let key = objs.pop().unwrap();
-                        d.push((key, value))
-                    }
-                } else {
-                    crate::bail!("expected a dict, got {pydict:?}")
-                }
-            }
-            OpCode::None => self.push(Object::None),
-            OpCode::Stop => {
-                return Ok(true);
-            }
-            OpCode::Build => self.build()?,
-            OpCode::EmptyDict => self.push(Object::Dict(vec![])),
-            OpCode::Dict => {
-                let mut objs = self.pop_to_marker()?;
-                let mut pydict = vec![];
-                if objs.len() % 2 != 0 {
-                    crate::bail!("setitems: not an even number of objects")
-                }
-                while let Some(value) = objs.pop() {
-                    let key = objs.pop().unwrap();
-                    pydict.push((key, value))
-                }
-                self.push(Object::Dict(pydict))
-            }
-            OpCode::Mark => self.push(Object::Mark),
-            OpCode::Reduce => self.reduce()?,
-            OpCode::EmptyTuple => self.push(Object::Tuple(vec![])),
-            OpCode::EmptyList => self.push(Object::List(vec![])),
-            OpCode::BinGet => {
-                let arg = r.read_u8()?;
-                let obj = self.memo_get(arg as u32)?;
-                self.push(obj)
-            }
-            OpCode::LongBinGet => {
-                let arg = r.read_u32::<LittleEndian>()?;
-                let obj = self.memo_get(arg)?;
-                self.push(obj)
-            }
-            OpCode::BinPut => {
-                let arg = r.read_u8()?;
-                self.memo_put(arg as u32)?
-            }
-            OpCode::LongBinPut => {
-                let arg = r.read_u32::<LittleEndian>()?;
-                self.memo_put(arg)?
-            }
-            OpCode::NewObj => {
-                let args = self.pop()?;
-                let class = self.pop()?;
-                let obj = self.new_obj(class, args)?;
-                self.push(obj)
-            }
-        }
-        Ok(false)
-    }
-}
-
-impl From<Object> for E {
-    fn from(value: Object) -> Self {
-        E::Msg(format!("conversion error on {value:?}"))
-    }
-}
-
-// https://github.com/pytorch/pytorch/blob/4eac43d046ded0f0a5a5fa8db03eb40f45bf656e/torch/_utils.py#L198
-// Arguments: storage, storage_offset, size, stride, requires_grad, backward_hooks
-fn rebuild_args(args: Object) -> Result<(Layout, DType, String, usize)> {
-    let mut args = args.tuple()?;
-    let stride = Vec::<usize>::try_from(args.remove(3))?;
-    let size = Vec::<usize>::try_from(args.remove(2))?;
-    let offset = args.remove(1).int()? as usize;
-    let storage = args.remove(0).persistent_load()?;
-    let mut storage = storage.tuple()?;
-    let storage_size = storage.remove(4).int()? as usize;
-    let path = storage.remove(2).unicode()?;
-    let (_module_name, class_name) = storage.remove(1).class()?;
-    let dtype = match class_name.as_str() {
-        "FloatStorage" => DType::F32,
-        "DoubleStorage" => DType::F64,
-        "HalfStorage" => DType::F16,
-        "BFloat16Storage" => DType::BF16,
-        "ByteStorage" => DType::U8,
-        other => {
-            crate::bail!("unsupported storage type {other}")
-        }
-    };
-    let layout = Layout::new(crate::Shape::from(size), stride, offset);
-    Ok((layout, dtype, path, storage_size))
-}
-
-#[derive(Debug, Clone)]
-pub struct TensorInfo {
-    pub name: String,
-    pub dtype: DType,
-    pub layout: Layout,
-    pub path: String,
-    pub storage_size: usize,
-}
-
-pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
-    file: P,
-    verbose: bool,
-) -> Result<Vec<TensorInfo>> {
-    let file = std::fs::File::open(file)?;
-    let zip_reader = std::io::BufReader::new(file);
-    let mut zip = zip::ZipArchive::new(zip_reader)?;
-    let zip_file_names = zip
-        .file_names()
-        .map(|f| f.to_string())
-        .collect::<Vec<String>>();
-
-    let mut tensor_infos = vec![];
-    for file_name in zip_file_names.iter() {
-        if !file_name.ends_with("data.pkl") {
-            continue;
-        }
-        let dir_name = std::path::PathBuf::from(file_name.strip_suffix(".pkl").unwrap());
-        let reader = zip.by_name(file_name)?;
-        let mut reader = std::io::BufReader::new(reader);
-        let mut stack = Stack::empty();
-        stack.read_loop(&mut reader)?;
-        let obj = stack.finalize()?;
-        if VERBOSE || verbose {
-            println!("{obj:?}");
-        }
-        let obj = match obj {
-            Object::Build { callable, args } => match *callable {
-                Object::Reduce { callable, args: _ } => match *callable {
-                    Object::Class {
-                        module_name,
-                        class_name,
-                    } if module_name == "__torch__" && class_name == "Module" => *args,
-                    _ => continue,
-                },
-                _ => continue,
-            },
-            obj => obj,
-        };
-        if let Object::Dict(key_values) = obj {
-            for (name, value) in key_values.into_iter() {
-                let name = match name.unicode() {
-                    Ok(name) => name,
-                    Err(_) => continue,
-                };
-                let (callable, args) = match value.reduce() {
-                    Ok(callable_args) => callable_args,
-                    _ => continue,
-                };
-                let (callable, args) = match callable {
-                    Object::Class {
-                        module_name,
-                        class_name,
-                    } if module_name == "torch._tensor"
-                        && class_name == "_rebuild_from_type_v2" =>
-                    {
-                        let mut args = args.tuple()?;
-                        let callable = args.remove(0);
-                        let args = args.remove(1);
-                        (callable, args)
-                    }
-                    _ => (callable, args),
-                };
-                match callable {
-                    Object::Class {
-                        module_name,
-                        class_name,
-                    } if module_name == "torch._utils" && class_name == "_rebuild_tensor_v2" => {}
-                    _ => continue,
-                };
-                match rebuild_args(args) {
-                    Ok((layout, dtype, file_path, storage_size)) => {
-                        let mut path = dir_name.clone();
-                        path.push(file_path);
-                        tensor_infos.push(TensorInfo {
-                            name,
-                            dtype,
-                            layout,
-                            path: path.to_string_lossy().into_owned(),
-                            storage_size,
-                        })
-                    }
-                    Err(err) => {
-                        eprintln!("skipping {name}: {err:?}")
-                    }
-                }
-            }
-        }
-    }
-    Ok(tensor_infos)
-}
-
-/// Lazy tensor loader.
-pub struct PthTensors {
-    tensor_infos: HashMap<String, TensorInfo>,
-    path: std::path::PathBuf,
-    // We do not store a zip reader as it needs mutable access to extract data. Instead we
-    // re-create a zip reader for each tensor.
-}
-
-impl PthTensors {
-    pub fn new<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
-        let tensor_infos = read_pth_tensor_info(path.as_ref(), false)?;
-        let tensor_infos = tensor_infos
-            .into_iter()
-            .map(|ti| (ti.name.to_string(), ti))
-            .collect();
-        let path = path.as_ref().to_owned();
-        Ok(Self { tensor_infos, path })
-    }
-
-    pub fn tensor_infos(&self) -> &HashMap<String, TensorInfo> {
-        &self.tensor_infos
-    }
-
-    pub fn get(&self, name: &str) -> Result<Option<Tensor>> {
-        let tensor_info = match self.tensor_infos.get(name) {
-            None => return Ok(None),
-            Some(tensor_info) => tensor_info,
-        };
-        // We hope that the file has not changed since first reading it.
-        let zip_reader = std::io::BufReader::new(std::fs::File::open(&self.path)?);
-        let mut zip = zip::ZipArchive::new(zip_reader)?;
-        let mut reader = zip.by_name(&tensor_info.path)?;
-
-        // Reading the data is a bit tricky as it can be strided, use an offset, etc.
-        // For now only support the basic case.
-        if tensor_info.layout.start_offset() != 0 || !tensor_info.layout.is_contiguous() {
-            crate::bail!(
-                "cannot retrieve non-contiguous tensors {:?}",
-                tensor_info.layout
-            )
-        }
-        let tensor = Tensor::from_reader(
-            tensor_info.layout.shape().clone(),
-            tensor_info.dtype,
-            &mut reader,
-        )?;
-        Ok(Some(tensor))
-    }
-}
--- a/candle-core/src/quantized/avx.rs
+++ b/candle-core/src/quantized/avx.rs
@ -1,672 +0,0 @@
-use super::k_quants::{
-    BlockQ2K, BlockQ3K, BlockQ4K, BlockQ4_0, BlockQ5K, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K,
-};
-use crate::Result;
-use byteorder::{ByteOrder, LittleEndian};
-use half::f16;
-
-#[cfg(target_arch = "x86")]
-use core::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use core::arch::x86_64::*;
-
-#[inline(always)]
-pub(crate) unsafe fn sum_i16_pairs_float(x: __m256i) -> __m256 {
-    let ones = _mm256_set1_epi16(1);
-    let summed_pairs = _mm256_madd_epi16(ones, x);
-    _mm256_cvtepi32_ps(summed_pairs)
-}
-
-#[inline(always)]
-pub(crate) unsafe fn mul_sum_us8_pairs_float(ax: __m256i, sy: __m256i) -> __m256 {
-    let dot = _mm256_maddubs_epi16(ax, sy);
-    sum_i16_pairs_float(dot)
-}
-
-#[inline(always)]
-pub(crate) unsafe fn hsum_float_8(x: __m256) -> f32 {
-    let res = _mm256_extractf128_ps(x, 1);
-    let res = _mm_add_ps(res, _mm256_castps256_ps128(x));
-    let res = _mm_add_ps(res, _mm_movehl_ps(res, res));
-    let res = _mm_add_ss(res, _mm_movehdup_ps(res));
-    _mm_cvtss_f32(res)
-}
-
-#[inline(always)]
-pub(crate) unsafe fn bytes_from_nibbles_32(rsi: *const u8) -> __m256i {
-    let tmp = _mm_loadu_si128(rsi as *const __m128i);
-    let bytes = _mm256_insertf128_si256::<1>(_mm256_castsi128_si256(tmp), _mm_srli_epi16(tmp, 4));
-    let low_mask = _mm256_set1_epi8(0xF);
-    _mm256_and_si256(low_mask, bytes)
-}
-
-#[inline(always)]
-pub(crate) unsafe fn mul_sum_i8_pairs_float(x: __m256i, y: __m256i) -> __m256 {
-    let ax = _mm256_sign_epi8(x, x);
-    let sy = _mm256_sign_epi8(y, x);
-    mul_sum_us8_pairs_float(ax, sy)
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
-    let qk = QK8_0;
-    let nb = n / qk;
-    if n % QK8_0 != 0 {
-        crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
-    }
-    if nb % 2 != 0 {
-        crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
-    }
-
-    unsafe {
-        let mut acc = _mm256_setzero_ps();
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d = _mm256_set1_ps(f16::to_f32(x.d) * f16::to_f32(y.d));
-            let bx = bytes_from_nibbles_32(x.qs.as_ptr());
-            let off = _mm256_set1_epi8(8);
-            let bx = _mm256_sub_epi8(bx, off);
-            let by = _mm256_loadu_si256(y.qs.as_ptr() as *const __m256i);
-            let q = mul_sum_i8_pairs_float(bx, by);
-            acc = _mm256_fmadd_ps(d, q, acc);
-        }
-        Ok(hsum_float_8(acc))
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) -> Result<f32> {
-    let qk = QK8_0;
-    if n % QK8_0 != 0 {
-        crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
-    }
-    unsafe {
-        let mut acc = _mm256_setzero_ps();
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d = _mm256_set1_ps(f16::to_f32(x.d) * f16::to_f32(y.d));
-            let bx = _mm256_loadu_si256(x.qs.as_ptr() as *const __m256i);
-            let by = _mm256_loadu_si256(y.qs.as_ptr() as *const __m256i);
-            let q = mul_sum_i8_pairs_float(bx, by);
-            acc = _mm256_fmadd_ps(d, q, acc);
-        }
-        Ok(hsum_float_8(acc))
-    }
-}
-
-#[inline(always)]
-unsafe fn get_scale_shuffle(i: usize) -> __m128i {
-    const K_SHUFFLE: [u8; 128] = [
-        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
-        3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7,
-        7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10,
-        11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13,
-        13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15,
-    ];
-    _mm_loadu_si128((K_SHUFFLE.as_ptr() as *const __m128i).add(i))
-}
-
-#[inline(always)]
-unsafe fn get_scale_shuffle_k4(i: usize) -> __m256i {
-    const K_SHUFFLE: [u8; 256] = [
-        0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
-        0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
-        2, 3, 2, 3, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
-        4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
-        6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
-        8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10,
-        11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 12, 13, 12, 13, 12, 13,
-        12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12,
-        13, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
-        14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
-    ];
-    _mm256_loadu_si256((K_SHUFFLE.as_ptr() as *const __m256i).add(i))
-}
-
-#[inline(always)]
-unsafe fn get_scale_shuffle_q3k(i: usize) -> __m256i {
-    const K_SHUFFLE: [u8; 128] = [
-        0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
-        2, 3, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
-        6, 7, 6, 7, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 10, 11, 10, 11, 10, 11,
-        10, 11, 10, 11, 10, 11, 10, 11, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12,
-        13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
-    ];
-    _mm256_loadu_si256((K_SHUFFLE.as_ptr() as *const __m256i).add(i))
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Result<f32> {
-    let qk = QK_K;
-    if n % qk != 0 {
-        crate::bail!("vec_dot_q6k_8k: {n} is not divisible by {qk}")
-    }
-
-    unsafe {
-        let m4 = _mm256_set1_epi8(0xF);
-        let m2 = _mm256_set1_epi8(3);
-        let m32s = _mm256_set1_epi8(32);
-        let mut acc = _mm256_setzero_ps();
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d = y.d * x.d.to_f32();
-            let mut q4 = x.ql.as_ptr();
-            let mut qh = x.qh.as_ptr();
-            let mut q8 = y.qs.as_ptr();
-
-            let scales = _mm_loadu_si128(x.scales.as_ptr() as *const __m128i);
-            let mut sumi = _mm256_setzero_si256();
-
-            for j in 0..QK_K / 128 {
-                let is = j * 4;
-                let scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is));
-                let scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
-                let scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
-                let scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
-
-                let q4bits1 = _mm256_loadu_si256(q4 as *const __m256i);
-                q4 = q4.add(32);
-                let q4bits2 = _mm256_loadu_si256(q4 as *const __m256i);
-                q4 = q4.add(32);
-                let q4bits_h = _mm256_loadu_si256(qh as *const __m256i);
-                qh = qh.add(32);
-
-                let q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bits_h, m2), 4);
-                let q4h_1 =
-                    _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bits_h, 2), m2), 4);
-                let q4h_2 =
-                    _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bits_h, 4), m2), 4);
-                let q4h_3 =
-                    _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bits_h, 6), m2), 4);
-
-                let q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
-                let q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
-                let q4_2 =
-                    _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
-                let q4_3 =
-                    _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
-
-                let q8_0 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let q8_1 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let q8_2 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let q8_3 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-
-                let q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
-                let q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
-                let q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
-                let q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
-
-                let p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
-                let p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
-                let p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
-                let p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
-
-                let p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-                let p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-                let p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
-                let p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
-
-                let p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
-                let p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
-                let p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2);
-                let p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3);
-
-                sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-                sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3));
-            }
-            acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-        }
-        Ok(hsum_float_8(acc))
-    }
-}
-
-#[inline(always)]
-unsafe fn mm256_set_m128i(a: __m128i, b: __m128i) -> __m256i {
-    _mm256_insertf128_si256(_mm256_castsi128_si256(b), a, 1)
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q2k_q8k: {n} is not divisible by {QK_K}")
-    }
-
-    unsafe {
-        let m3 = _mm256_set1_epi8(3);
-        let m4 = _mm_set1_epi8(0xF);
-
-        let mut acc = _mm256_setzero_ps();
-
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d = y.d * x.d.to_f32();
-            let dmin = -y.d * x.dmin.to_f32();
-
-            let mut q2 = x.qs.as_ptr();
-            let mut q8 = y.qs.as_ptr();
-
-            let mins_and_scales = _mm_loadu_si128(x.scales.as_ptr() as *const __m128i);
-            let scales8 = _mm_and_si128(mins_and_scales, m4);
-            let mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
-            let mins = _mm256_cvtepi8_epi16(mins8);
-            let prod =
-                _mm256_madd_epi16(mins, _mm256_loadu_si256(y.bsums.as_ptr() as *const __m256i));
-
-            acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc);
-
-            let all_scales = _mm256_cvtepi8_epi16(scales8);
-            let l_scales = _mm256_extracti128_si256(all_scales, 0);
-            let h_scales = _mm256_extracti128_si256(all_scales, 1);
-            let scales = [
-                mm256_set_m128i(l_scales, l_scales),
-                mm256_set_m128i(h_scales, h_scales),
-            ];
-
-            let mut sumi = _mm256_setzero_si256();
-
-            for scale in scales {
-                let q2bits = _mm256_loadu_si256(q2 as *const __m256i);
-                q2 = q2.add(32);
-
-                let q8_0 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let q8_1 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let q8_2 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let q8_3 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-
-                let q2_0 = _mm256_and_si256(q2bits, m3);
-                let q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3);
-                let q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3);
-                let q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3);
-
-                let p0 = _mm256_maddubs_epi16(q2_0, q8_0);
-                let p1 = _mm256_maddubs_epi16(q2_1, q8_1);
-                let p2 = _mm256_maddubs_epi16(q2_2, q8_2);
-                let p3 = _mm256_maddubs_epi16(q2_3, q8_3);
-
-                let p0 =
-                    _mm256_madd_epi16(_mm256_shuffle_epi8(scale, get_scale_shuffle_q3k(0)), p0);
-                let p1 =
-                    _mm256_madd_epi16(_mm256_shuffle_epi8(scale, get_scale_shuffle_q3k(1)), p1);
-                let p2 =
-                    _mm256_madd_epi16(_mm256_shuffle_epi8(scale, get_scale_shuffle_q3k(2)), p2);
-                let p3 =
-                    _mm256_madd_epi16(_mm256_shuffle_epi8(scale, get_scale_shuffle_q3k(3)), p3);
-
-                let p0 = _mm256_add_epi32(p0, p1);
-                let p2 = _mm256_add_epi32(p2, p3);
-
-                sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2));
-            }
-            acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-        }
-
-        Ok(hsum_float_8(acc))
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q3k_q8k: {n} is not divisible by {QK_K}")
-    }
-
-    const KMASK1: u32 = 0x03030303;
-    const KMASK2: u32 = 0x0f0f0f0f;
-
-    let mut aux = [0u32; 3];
-
-    unsafe {
-        let m3 = _mm256_set1_epi8(3);
-        let mone = _mm256_set1_epi8(1);
-        let m32 = _mm_set1_epi8(32);
-
-        let mut acc = _mm256_setzero_ps();
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d = y.d * x.d.to_f32();
-
-            let mut q3 = x.qs.as_ptr();
-            let mut q8 = y.qs.as_ptr();
-
-            LittleEndian::read_u32_into(&x.scales, &mut aux);
-            let scales128 = _mm_set_epi32(
-                (((aux[1] >> 4) & KMASK2) | (((aux[2] >> 6) & KMASK1) << 4)) as i32,
-                (((aux[0] >> 4) & KMASK2) | (((aux[2] >> 4) & KMASK1) << 4)) as i32,
-                ((aux[1] & KMASK2) | (((aux[2] >> 2) & KMASK1) << 4)) as i32,
-                ((aux[0] & KMASK2) | (((aux[2]) & KMASK1) << 4)) as i32,
-            );
-            let scales128 = _mm_sub_epi8(scales128, m32);
-            let all_scales = _mm256_cvtepi8_epi16(scales128);
-            let l_scales = _mm256_extracti128_si256(all_scales, 0);
-            let h_scales = _mm256_extracti128_si256(all_scales, 1);
-            let scales = [
-                mm256_set_m128i(l_scales, l_scales),
-                mm256_set_m128i(h_scales, h_scales),
-            ];
-
-            // high bit
-            let hbits = _mm256_loadu_si256(x.hmask.as_ptr() as *const __m256i);
-
-            let mut sumi = _mm256_setzero_si256();
-
-            for (j, scale) in scales.iter().enumerate() {
-                // load low 2 bits
-                let q3bits = _mm256_loadu_si256(q3 as *const __m256i);
-                q3 = q3.add(32);
-
-                // Prepare low and high bits
-                // We hardcode the shifts here to avoid loading them into a seperate register
-                let q3l_0 = _mm256_and_si256(q3bits, m3);
-                let q3h_0 = if j == 0 {
-                    _mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, 0)), 0)
-                } else {
-                    _mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, 4)), 4)
-                };
-                let q3h_0 = _mm256_slli_epi16(q3h_0, 2);
-
-                let q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3);
-                let q3h_1 = if j == 0 {
-                    _mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, 1)), 1)
-                } else {
-                    _mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, 5)), 5)
-                };
-                let q3h_1 = _mm256_slli_epi16(q3h_1, 2);
-
-                let q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3);
-                let q3h_2 = if j == 0 {
-                    _mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, 2)), 2)
-                } else {
-                    _mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, 6)), 6)
-                };
-                let q3h_2 = _mm256_slli_epi16(q3h_2, 2);
-
-                let q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3);
-                let q3h_3 = if j == 0 {
-                    _mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, 3)), 3)
-                } else {
-                    _mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, 7)), 7)
-                };
-                let q3h_3 = _mm256_slli_epi16(q3h_3, 2);
-
-                // load Q8 quants
-                let q8_0 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let q8_1 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let q8_2 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let q8_3 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-
-                // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we
-                // can use _mm256_maddubs_epi16, and then subtract. The high bit part has the 2
-                // already subtracted (and so, it is zero if the high bit was not set, and 2 if the
-                // high bit was set)
-                let q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
-                let q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
-                let q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2);
-                let q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3);
-
-                let p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
-                let p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
-                let p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2);
-                let p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3);
-
-                let p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-                let p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-                let p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
-                let p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
-
-                // multiply with scales
-                let p16_0 =
-                    _mm256_madd_epi16(_mm256_shuffle_epi8(*scale, get_scale_shuffle_q3k(0)), p16_0);
-                let p16_1 =
-                    _mm256_madd_epi16(_mm256_shuffle_epi8(*scale, get_scale_shuffle_q3k(1)), p16_1);
-                let p16_2 =
-                    _mm256_madd_epi16(_mm256_shuffle_epi8(*scale, get_scale_shuffle_q3k(2)), p16_2);
-                let p16_3 =
-                    _mm256_madd_epi16(_mm256_shuffle_epi8(*scale, get_scale_shuffle_q3k(3)), p16_3);
-
-                // accumulate
-                let p16_0 = _mm256_add_epi32(p16_0, p16_1);
-                let p16_2 = _mm256_add_epi32(p16_2, p16_3);
-                sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2));
-            }
-
-            // multiply with block scale and accumulate
-            acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-        }
-        Ok(hsum_float_8(acc))
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q4k_q8k: {n} is not divisible by {QK_K}")
-    }
-    let mut utmp = [0u32; 4];
-    const KMASK1: u32 = 0x3f3f3f3f;
-    const KMASK2: u32 = 0x0f0f0f0f;
-    const KMASK3: u32 = 0x03030303;
-
-    unsafe {
-        let m4 = _mm256_set1_epi8(0xF);
-
-        let mut acc = _mm256_setzero_ps();
-        let mut acc_m = _mm_setzero_ps();
-
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d = y.d * x.d.to_f32();
-            let dmin = -y.d * x.dmin.to_f32();
-
-            LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
-
-            utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4);
-            let uaux = utmp[1] & KMASK1;
-            utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
-            utmp[2] = uaux;
-            utmp[0] &= KMASK1;
-
-            let mut q4 = x.qs.as_ptr();
-            let mut q8 = y.qs.as_ptr();
-
-            let mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(
-                utmp[3] as i32,
-                utmp[2] as i32,
-                utmp[1] as i32,
-                utmp[0] as i32,
-            ));
-
-            let q8sums = _mm256_loadu_si256(y.bsums.as_ptr() as *const __m256i);
-            let q8s = _mm_hadd_epi16(
-                _mm256_extracti128_si256(q8sums, 0),
-                _mm256_extracti128_si256(q8sums, 1),
-            );
-            let prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
-            acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
-
-            let sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-            let scales = mm256_set_m128i(sc128, sc128);
-
-            let mut sumi = _mm256_setzero_si256();
-
-            for j in 0..QK_K / 64 {
-                let scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2 * j));
-                let scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2 * j + 1));
-
-                let q4bits = _mm256_loadu_si256(q4 as *const __m256i);
-                q4 = q4.add(32);
-                let q4l = _mm256_and_si256(q4bits, m4);
-                let q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
-
-                let q8l = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let p16l = _mm256_maddubs_epi16(q4l, q8l);
-                let p16l = _mm256_madd_epi16(scale_l, p16l);
-                sumi = _mm256_add_epi32(sumi, p16l);
-
-                let q8h = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let p16h = _mm256_maddubs_epi16(q4h, q8h);
-                let p16h = _mm256_madd_epi16(scale_h, p16h);
-                sumi = _mm256_add_epi32(sumi, p16h);
-            }
-
-            let vd = _mm256_set1_ps(d);
-            acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
-        }
-
-        let acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
-        let acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
-
-        Ok(hsum_float_8(acc) + _mm_cvtss_f32(acc_m))
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q5k_q8k: {n} is not divisible by {QK_K}")
-    }
-    let mut utmp = [0u32; 4];
-    const KMASK1: u32 = 0x3f3f3f3f;
-    const KMASK2: u32 = 0x0f0f0f0f;
-    const KMASK3: u32 = 0x03030303;
-
-    unsafe {
-        let m4 = _mm256_set1_epi8(0xF);
-        let mzero = _mm_setzero_si128();
-        let mone = _mm256_set1_epi8(1);
-
-        let mut acc = _mm256_setzero_ps();
-        let mut summs = 0.0;
-
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d = y.d * x.d.to_f32();
-            let dmin = -y.d * x.dmin.to_f32();
-
-            LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
-
-            utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4);
-            let uaux = utmp[1] & KMASK1;
-            utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
-            utmp[2] = uaux;
-            utmp[0] &= KMASK1;
-
-            let mut q5 = x.qs.as_ptr();
-            let mut q8 = y.qs.as_ptr();
-
-            let mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(
-                utmp[3] as i32,
-                utmp[2] as i32,
-                utmp[1] as i32,
-                utmp[0] as i32,
-            ));
-
-            let q8sums = _mm256_loadu_si256(y.bsums.as_ptr() as *const __m256i);
-            let q8s = _mm_hadd_epi16(
-                _mm256_extracti128_si256(q8sums, 0),
-                _mm256_extracti128_si256(q8sums, 1),
-            );
-            let prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
-            let hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
-            summs += dmin * _mm_extract_epi32(hsum, 0) as f32;
-
-            let sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-            let scales = mm256_set_m128i(sc128, sc128);
-
-            let hbits = _mm256_loadu_si256(x.qh.as_ptr() as *const __m256i);
-            let mut hmask = mone;
-
-            let mut sumi = _mm256_setzero_si256();
-
-            for j in 0..QK_K / 64 {
-                let scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2 * j));
-                let scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2 * j + 1));
-
-                let q5bits = _mm256_loadu_si256(q5 as *const __m256i);
-                q5 = q5.add(32);
-
-                //Similar to q3k we hardcode the shifts here to avoid loading them into a seperate register
-                let q5l_0 = _mm256_and_si256(q5bits, m4);
-                let q5l_0_shift_input = _mm256_and_si256(hbits, hmask);
-                let q5l_0_right_shift = match j {
-                    0 => _mm256_srli_epi16(q5l_0_shift_input, 0),
-                    1 => _mm256_srli_epi16(q5l_0_shift_input, 2),
-                    2 => _mm256_srli_epi16(q5l_0_shift_input, 4),
-                    3 => _mm256_srli_epi16(q5l_0_shift_input, 6),
-                    _ => unreachable!(),
-                };
-                let q5h_0 = _mm256_slli_epi16(q5l_0_right_shift, 4);
-                let q5_0 = _mm256_add_epi8(q5l_0, q5h_0);
-                hmask = _mm256_slli_epi16(hmask, 1);
-
-                let q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
-                let q5l_1_shift_input = _mm256_and_si256(hbits, hmask);
-                let q5l_1_right_shift = match j {
-                    0 => _mm256_srli_epi16(q5l_1_shift_input, 1),
-                    1 => _mm256_srli_epi16(q5l_1_shift_input, 3),
-                    2 => _mm256_srli_epi16(q5l_1_shift_input, 5),
-                    3 => _mm256_srli_epi16(q5l_1_shift_input, 7),
-                    _ => unreachable!(),
-                };
-
-                let q5h_1 = _mm256_slli_epi16(q5l_1_right_shift, 4);
-                let q5_1 = _mm256_add_epi8(q5l_1, q5h_1);
-                hmask = _mm256_slli_epi16(hmask, 1);
-
-                let q8_0 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-                let q8_1 = _mm256_loadu_si256(q8 as *const __m256i);
-                q8 = q8.add(32);
-
-                let p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
-                let p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
-
-                let p16_0 = _mm256_madd_epi16(scale_0, p16_0);
-                let p16_1 = _mm256_madd_epi16(scale_1, p16_1);
-
-                sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-            }
-            let vd = _mm256_set1_ps(d);
-            acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
-        }
-        Ok(hsum_float_8(acc) + summs)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Result<f32> {
-    let qk = QK_K;
-    if n % qk != 0 {
-        crate::bail!("vec_dot_q8k_8k: {n} is not divisible by {qk}")
-    }
-
-    unsafe {
-        let mut acc = _mm256_setzero_ps();
-        for (xs, ys) in xs.iter().zip(ys.iter()) {
-            let mut sumi = _mm256_setzero_si256();
-            let x_qs = xs.qs.as_ptr();
-            let y_qs = ys.qs.as_ptr();
-            for j in (0..QK_K).step_by(32) {
-                let xs = _mm256_loadu_si256(x_qs.add(j) as *const __m256i);
-                let ys = _mm256_loadu_si256(y_qs.add(j) as *const __m256i);
-
-                let xs0 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(xs, 0));
-                let ys0 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(ys, 0));
-                sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(xs0, ys0));
-
-                let xs1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(xs, 1));
-                let ys1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(ys, 1));
-                sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(xs1, ys1));
-            }
-            let d = _mm256_set1_ps(xs.d * ys.d);
-            acc = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi), acc);
-        }
-        Ok(hsum_float_8(acc))
-    }
-}
--- a/candle-core/src/quantized/ggml_file.rs
+++ b/candle-core/src/quantized/ggml_file.rs
@ -1,231 +0,0 @@
-//! Support for the GGML file format.
-
-use super::{k_quants, GgmlDType};
-use crate::Result;
-use byteorder::{LittleEndian, ReadBytesExt};
-use std::collections::HashMap;
-
-// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.h#L37
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-enum Magic {
-    Ggjt,
-    Ggla,
-    Ggmf,
-    Ggml,
-    Ggsn,
-}
-
-impl TryFrom<u32> for Magic {
-    type Error = crate::Error;
-    fn try_from(value: u32) -> Result<Self> {
-        let magic = match value {
-            0x67676a74 => Self::Ggjt,
-            0x67676c61 => Self::Ggla,
-            0x67676d66 => Self::Ggmf,
-            0x67676d6c => Self::Ggml,
-            0x6767736e => Self::Ggsn,
-            _ => crate::bail!("unknown magic {value:08x}"),
-        };
-        Ok(magic)
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum VersionedMagic {
-    GgmlUnversioned,
-    GgmfV1,
-    GgjtV1,
-    GgjtV2,
-    GgjtV3,
-}
-
-impl VersionedMagic {
-    fn read<R: std::io::Read>(reader: &mut R) -> Result<Self> {
-        let magic = reader.read_u32::<LittleEndian>()?;
-        let magic = Magic::try_from(magic)?;
-        if magic == Magic::Ggml {
-            return Ok(Self::GgmlUnversioned);
-        }
-        let version = reader.read_u32::<LittleEndian>()?;
-        let versioned_magic = match (magic, version) {
-            (Magic::Ggmf, 1) => Self::GgmfV1,
-            (Magic::Ggjt, 1) => Self::GgjtV1,
-            (Magic::Ggjt, 2) => Self::GgjtV2,
-            (Magic::Ggjt, 3) => Self::GgjtV3,
-            _ => crate::bail!("ggml: unsupported magic/version {magic:?}/{version}"),
-        };
-        Ok(versioned_magic)
-    }
-
-    fn align32(&self) -> bool {
-        match self {
-            Self::GgmlUnversioned | Self::GgmfV1 => false,
-            Self::GgjtV1 | Self::GgjtV2 | Self::GgjtV3 => true,
-        }
-    }
-}
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct HParams {
-    pub n_vocab: u32,
-    pub n_embd: u32,
-    pub n_mult: u32,
-    pub n_head: u32,
-    pub n_layer: u32,
-    pub n_rot: u32,
-    pub ftype: u32,
-}
-
-impl HParams {
-    fn read<R: std::io::Read>(reader: &mut R) -> Result<Self> {
-        let n_vocab = reader.read_u32::<LittleEndian>()?;
-        let n_embd = reader.read_u32::<LittleEndian>()?;
-        let n_mult = reader.read_u32::<LittleEndian>()?;
-        let n_head = reader.read_u32::<LittleEndian>()?;
-        let n_layer = reader.read_u32::<LittleEndian>()?;
-        let n_rot = reader.read_u32::<LittleEndian>()?;
-        let ftype = reader.read_u32::<LittleEndian>()?;
-        Ok(Self {
-            n_vocab,
-            n_embd,
-            n_mult,
-            n_head,
-            n_layer,
-            n_rot,
-            ftype,
-        })
-    }
-}
-
-#[derive(Debug, Clone, PartialEq)]
-pub struct Vocab {
-    pub token_score_pairs: Vec<(Vec<u8>, f32)>,
-}
-
-impl Vocab {
-    fn read<R: std::io::Read>(reader: &mut R, n_vocab: usize) -> Result<Self> {
-        // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.cpp#L556
-        let mut token_score_pairs = Vec::with_capacity(n_vocab);
-        for _index in 0..n_vocab {
-            let len = reader.read_u32::<LittleEndian>()? as usize;
-            let mut word = vec![0u8; len];
-            reader.read_exact(&mut word)?;
-            let score = reader.read_f32::<LittleEndian>()?;
-            token_score_pairs.push((word, score))
-        }
-        Ok(Self { token_score_pairs })
-    }
-}
-
-fn from_raw_data<T: super::GgmlType + Send + Sync + 'static>(
-    raw_data: &[u8],
-    size_in_bytes: usize,
-    dims: Vec<usize>,
-) -> Result<super::QTensor> {
-    let raw_data_ptr = raw_data.as_ptr();
-    let n_blocks = size_in_bytes / std::mem::size_of::<T>();
-    let data = unsafe { std::slice::from_raw_parts(raw_data_ptr as *const T, n_blocks) };
-    super::QTensor::new(data.to_vec(), dims)
-}
-
-/// Creates a [Tensor] from a raw GGML tensor.
-pub fn qtensor_from_ggml(
-    ggml_dtype: GgmlDType,
-    raw_data: &[u8],
-    dims: Vec<usize>,
-) -> Result<super::QTensor> {
-    let tensor_elems = dims.iter().product::<usize>();
-    let blck_size = ggml_dtype.blck_size();
-    if tensor_elems % blck_size != 0 {
-        crate::bail!(
-            "the number of elements {tensor_elems} is not divisible by the block size {blck_size}"
-        )
-    }
-    let size_in_bytes = tensor_elems / blck_size * ggml_dtype.type_size();
-
-    match ggml_dtype {
-        GgmlDType::F32 => from_raw_data::<f32>(raw_data, size_in_bytes, dims),
-        GgmlDType::F16 => from_raw_data::<half::f16>(raw_data, size_in_bytes, dims),
-        GgmlDType::Q4_0 => from_raw_data::<k_quants::BlockQ4_0>(raw_data, size_in_bytes, dims),
-        GgmlDType::Q4_1 => from_raw_data::<k_quants::BlockQ4_1>(raw_data, size_in_bytes, dims),
-        GgmlDType::Q5_0 => from_raw_data::<k_quants::BlockQ5_0>(raw_data, size_in_bytes, dims),
-        GgmlDType::Q5_1 => from_raw_data::<k_quants::BlockQ5_1>(raw_data, size_in_bytes, dims),
-        GgmlDType::Q8_0 => from_raw_data::<k_quants::BlockQ8_0>(raw_data, size_in_bytes, dims),
-        GgmlDType::Q2K => from_raw_data::<k_quants::BlockQ2K>(raw_data, size_in_bytes, dims),
-        GgmlDType::Q3K => from_raw_data::<k_quants::BlockQ3K>(raw_data, size_in_bytes, dims),
-        GgmlDType::Q4K => from_raw_data::<k_quants::BlockQ4K>(raw_data, size_in_bytes, dims),
-        GgmlDType::Q5K => from_raw_data::<k_quants::BlockQ5K>(raw_data, size_in_bytes, dims),
-        GgmlDType::Q6K => from_raw_data::<k_quants::BlockQ6K>(raw_data, size_in_bytes, dims),
-        _ => crate::bail!("quantized type {ggml_dtype:?} is not supported yet"),
-    }
-}
-
-fn read_one_tensor<R: std::io::Seek + std::io::Read>(
-    reader: &mut R,
-    magic: VersionedMagic,
-) -> Result<(String, super::QTensor)> {
-    let n_dims = reader.read_u32::<LittleEndian>()?;
-    let name_len = reader.read_u32::<LittleEndian>()?;
-    let ggml_dtype = reader.read_u32::<LittleEndian>()?;
-    let ggml_dtype = GgmlDType::from_u32(ggml_dtype)?;
-    let mut dims = vec![0u32; n_dims as usize];
-    reader.read_u32_into::<LittleEndian>(&mut dims)?;
-    // The dimensions are stored in reverse order, see for example:
-    // https://github.com/ggerganov/llama.cpp/blob/b5ffb2849d23afe73647f68eec7b68187af09be6/convert.py#L969
-    dims.reverse();
-    let mut name = vec![0u8; name_len as usize];
-    reader.read_exact(&mut name)?;
-    let name = String::from_utf8_lossy(&name).into_owned();
-
-    if magic.align32() {
-        let pos = reader.stream_position()?;
-        reader.seek(std::io::SeekFrom::Current(((32 - pos % 32) % 32) as i64))?;
-    }
-    let dims = dims.iter().map(|&u| u as usize).collect::<Vec<_>>();
-    let tensor_elems = dims.iter().product::<usize>();
-    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();
-    // TODO: Mmap version to avoid copying the data around?
-    let mut raw_data = vec![0u8; size_in_bytes];
-    reader.read_exact(&mut raw_data)?;
-    match qtensor_from_ggml(ggml_dtype, &raw_data, dims) {
-        Ok(tensor) => Ok((name, tensor)),
-        Err(e) => crate::bail!("Error creating tensor {name}: {e}"),
-    }
-}
-
-pub struct Content {
-    pub magic: VersionedMagic,
-    pub hparams: HParams,
-    pub vocab: Vocab,
-    pub tensors: HashMap<String, super::QTensor>,
-}
-
-impl Content {
-    pub fn read<R: std::io::Seek + std::io::Read>(reader: &mut R) -> Result<Content> {
-        // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.cpp#L505
-        let last_position = reader.seek(std::io::SeekFrom::End(0))?;
-        reader.seek(std::io::SeekFrom::Start(0))?;
-        let magic = VersionedMagic::read(reader)?;
-        let hparams = HParams::read(reader)?;
-        let vocab = Vocab::read(reader, hparams.n_vocab as usize)?;
-        let mut tensors = HashMap::new();
-
-        while reader.stream_position()? != last_position {
-            let (name, tensor) = read_one_tensor(reader, magic)?;
-            tensors.insert(name, tensor);
-        }
-        Ok(Self {
-            magic,
-            hparams,
-            vocab,
-            tensors,
-        })
-    }
-
-    pub fn remove(&mut self, name: &str) -> Result<super::QTensor> {
-        match self.tensors.remove(name) {
-            None => crate::bail!("cannot find tensor with name '{name}'"),
-            Some(tensor) => Ok(tensor),
-        }
-    }
-}
--- a/candle-core/src/quantized/gguf_file.rs
+++ b/candle-core/src/quantized/gguf_file.rs
@ -1,518 +0,0 @@
-//! Support for the GGUF file format.
-//!
-//! Spec: https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md
-
-use super::{GgmlDType, QTensor};
-use crate::Result;
-use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use std::collections::HashMap;
-
-pub const DEFAULT_ALIGNMENT: u64 = 32;
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-enum Magic {
-    Gguf,
-}
-
-impl TryFrom<u32> for Magic {
-    type Error = crate::Error;
-    fn try_from(value: u32) -> Result<Self> {
-        let magic = match value {
-            0x46554747 | 0x47475546 => Self::Gguf,
-            _ => crate::bail!("unknown magic 0x{value:08x}"),
-        };
-        Ok(magic)
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum VersionedMagic {
-    GgufV1,
-    GgufV2,
-}
-
-impl VersionedMagic {
-    fn read<R: std::io::Read>(reader: &mut R) -> Result<Self> {
-        let magic = reader.read_u32::<LittleEndian>()?;
-        let magic = Magic::try_from(magic)?;
-        let version = reader.read_u32::<LittleEndian>()?;
-        let versioned_magic = match (magic, version) {
-            (Magic::Gguf, 1) => Self::GgufV1,
-            (Magic::Gguf, 2) => Self::GgufV2,
-            _ => crate::bail!("ggml: unsupported magic/version {magic:?}/{version}"),
-        };
-        Ok(versioned_magic)
-    }
-}
-
-#[derive(Debug)]
-pub struct TensorInfo {
-    pub ggml_dtype: GgmlDType,
-    pub shape: crate::Shape,
-    pub offset: u64,
-}
-
-impl TensorInfo {
-    pub fn read<R: std::io::Seek + std::io::Read>(
-        &self,
-        reader: &mut R,
-        tensor_data_offset: u64,
-    ) -> Result<QTensor> {
-        let tensor_elems = self.shape.elem_count();
-        let blck_size = self.ggml_dtype.blck_size();
-        if tensor_elems % blck_size != 0 {
-            crate::bail!(
-            "the number of elements {tensor_elems} is not divisible by the block size {blck_size}"
-        )
-        }
-        let size_in_bytes = tensor_elems / blck_size * self.ggml_dtype.type_size();
-        let mut raw_data = vec![0u8; size_in_bytes];
-        reader.seek(std::io::SeekFrom::Start(tensor_data_offset + self.offset))?;
-        reader.read_exact(&mut raw_data)?;
-        super::ggml_file::qtensor_from_ggml(self.ggml_dtype, &raw_data, self.shape.dims().to_vec())
-    }
-}
-
-#[derive(Debug)]
-pub struct Content {
-    pub magic: VersionedMagic,
-    pub metadata: HashMap<String, Value>,
-    pub tensor_infos: HashMap<String, TensorInfo>,
-    pub tensor_data_offset: u64,
-}
-
-fn read_string<R: std::io::Read>(reader: &mut R, magic: &VersionedMagic) -> Result<String> {
-    let len = match magic {
-        VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-        VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
-    };
-    let mut v = vec![0u8; len];
-    reader.read_exact(&mut v)?;
-    // GGUF strings are supposed to be non-null terminated but in practice this happens.
-    while let Some(0) = v.last() {
-        v.pop();
-    }
-    // GGUF strings are utf8 encoded but there are cases that don't seem to be valid.
-    Ok(String::from_utf8_lossy(&v).into_owned())
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum ValueType {
-    // The value is a 8-bit unsigned integer.
-    U8,
-    // The value is a 8-bit signed integer.
-    I8,
-    // The value is a 16-bit unsigned little-endian integer.
-    U16,
-    // The value is a 16-bit signed little-endian integer.
-    I16,
-    // The value is a 32-bit unsigned little-endian integer.
-    U32,
-    // The value is a 32-bit signed little-endian integer.
-    I32,
-    // The value is a 64-bit unsigned little-endian integer.
-    U64,
-    // The value is a 64-bit signed little-endian integer.
-    I64,
-    // The value is a 32-bit IEEE754 floating point number.
-    F32,
-    // The value is a 64-bit IEEE754 floating point number.
-    F64,
-    // The value is a boolean.
-    // 1-byte value where 0 is false and 1 is true.
-    // Anything else is invalid, and should be treated as either the model being invalid or the reader being buggy.
-    Bool,
-    // The value is a UTF-8 non-null-terminated string, with length prepended.
-    String,
-    // The value is an array of other values, with the length and type prepended.
-    ///
-    // Arrays can be nested, and the length of the array is the number of elements in the array, not the number of bytes.
-    Array,
-}
-
-#[derive(Debug, Clone)]
-pub enum Value {
-    U8(u8),
-    I8(i8),
-    U16(u16),
-    I16(i16),
-    U32(u32),
-    I32(i32),
-    U64(u64),
-    I64(i64),
-    F32(f32),
-    F64(f64),
-    Bool(bool),
-    String(String),
-    Array(Vec<Value>),
-}
-
-impl Value {
-    pub fn value_type(&self) -> ValueType {
-        match self {
-            Self::U8(_) => ValueType::U8,
-            Self::I8(_) => ValueType::I8,
-            Self::U16(_) => ValueType::U16,
-            Self::I16(_) => ValueType::I16,
-            Self::U32(_) => ValueType::U32,
-            Self::I32(_) => ValueType::I32,
-            Self::U64(_) => ValueType::U64,
-            Self::I64(_) => ValueType::I64,
-            Self::F32(_) => ValueType::F32,
-            Self::F64(_) => ValueType::F64,
-            Self::Bool(_) => ValueType::Bool,
-            Self::String(_) => ValueType::String,
-            Self::Array(_) => ValueType::Array,
-        }
-    }
-
-    pub fn to_u8(&self) -> Result<u8> {
-        match self {
-            Self::U8(v) => Ok(*v),
-            v => crate::bail!("not a u8 {v:?}"),
-        }
-    }
-
-    pub fn to_i8(&self) -> Result<i8> {
-        match self {
-            Self::I8(v) => Ok(*v),
-            v => crate::bail!("not a i8 {v:?}"),
-        }
-    }
-
-    pub fn to_u16(&self) -> Result<u16> {
-        match self {
-            Self::U16(v) => Ok(*v),
-            v => crate::bail!("not a u16 {v:?}"),
-        }
-    }
-
-    pub fn to_i16(&self) -> Result<i16> {
-        match self {
-            Self::I16(v) => Ok(*v),
-            v => crate::bail!("not a i16 {v:?}"),
-        }
-    }
-
-    pub fn to_u32(&self) -> Result<u32> {
-        match self {
-            Self::U32(v) => Ok(*v),
-            v => crate::bail!("not a u32 {v:?}"),
-        }
-    }
-
-    pub fn to_i32(&self) -> Result<i32> {
-        match self {
-            Self::I32(v) => Ok(*v),
-            v => crate::bail!("not a i32 {v:?}"),
-        }
-    }
-
-    pub fn to_u64(&self) -> Result<u64> {
-        match self {
-            Self::U64(v) => Ok(*v),
-            v => crate::bail!("not a u64 {v:?}"),
-        }
-    }
-
-    pub fn to_i64(&self) -> Result<i64> {
-        match self {
-            Self::I64(v) => Ok(*v),
-            v => crate::bail!("not a i64 {v:?}"),
-        }
-    }
-
-    pub fn to_f32(&self) -> Result<f32> {
-        match self {
-            Self::F32(v) => Ok(*v),
-            v => crate::bail!("not a f32 {v:?}"),
-        }
-    }
-
-    pub fn to_f64(&self) -> Result<f64> {
-        match self {
-            Self::F64(v) => Ok(*v),
-            v => crate::bail!("not a f64 {v:?}"),
-        }
-    }
-
-    pub fn to_bool(&self) -> Result<bool> {
-        match self {
-            Self::Bool(v) => Ok(*v),
-            v => crate::bail!("not a bool {v:?}"),
-        }
-    }
-
-    pub fn to_vec(&self) -> Result<&Vec<Value>> {
-        match self {
-            Self::Array(v) => Ok(v),
-            v => crate::bail!("not a vec {v:?}"),
-        }
-    }
-
-    pub fn to_string(&self) -> Result<&String> {
-        match self {
-            Self::String(v) => Ok(v),
-            v => crate::bail!("not a string {v:?}"),
-        }
-    }
-
-    fn read<R: std::io::Read>(
-        reader: &mut R,
-        value_type: ValueType,
-        magic: &VersionedMagic,
-    ) -> Result<Self> {
-        let v = match value_type {
-            ValueType::U8 => Self::U8(reader.read_u8()?),
-            ValueType::I8 => Self::I8(reader.read_i8()?),
-            ValueType::U16 => Self::U16(reader.read_u16::<LittleEndian>()?),
-            ValueType::I16 => Self::I16(reader.read_i16::<LittleEndian>()?),
-            ValueType::U32 => Self::U32(reader.read_u32::<LittleEndian>()?),
-            ValueType::I32 => Self::I32(reader.read_i32::<LittleEndian>()?),
-            ValueType::U64 => Self::U64(reader.read_u64::<LittleEndian>()?),
-            ValueType::I64 => Self::I64(reader.read_i64::<LittleEndian>()?),
-            ValueType::F32 => Self::F32(reader.read_f32::<LittleEndian>()?),
-            ValueType::F64 => Self::F64(reader.read_f64::<LittleEndian>()?),
-            ValueType::Bool => match reader.read_u8()? {
-                0 => Self::Bool(false),
-                1 => Self::Bool(true),
-                b => crate::bail!("unexpected bool value {b}"),
-            },
-            ValueType::String => Self::String(read_string(reader, magic)?),
-            ValueType::Array => {
-                let value_type = reader.read_u32::<LittleEndian>()?;
-                let value_type = ValueType::from_u32(value_type)?;
-                let len = match magic {
-                    VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-                    VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
-                };
-                let mut vs = Vec::with_capacity(len);
-                for _ in 0..len {
-                    vs.push(Value::read(reader, value_type, magic)?)
-                }
-                Self::Array(vs)
-            }
-        };
-        Ok(v)
-    }
-
-    fn write<W: std::io::Write>(&self, w: &mut W) -> Result<()> {
-        match self {
-            &Self::U8(v) => w.write_u8(v)?,
-            &Self::I8(v) => w.write_i8(v)?,
-            &Self::U16(v) => w.write_u16::<LittleEndian>(v)?,
-            &Self::I16(v) => w.write_i16::<LittleEndian>(v)?,
-            &Self::U32(v) => w.write_u32::<LittleEndian>(v)?,
-            &Self::I32(v) => w.write_i32::<LittleEndian>(v)?,
-            &Self::U64(v) => w.write_u64::<LittleEndian>(v)?,
-            &Self::I64(v) => w.write_i64::<LittleEndian>(v)?,
-            &Self::F32(v) => w.write_f32::<LittleEndian>(v)?,
-            &Self::F64(v) => w.write_f64::<LittleEndian>(v)?,
-            &Self::Bool(v) => w.write_u8(u8::from(v))?,
-            Self::String(v) => write_string(w, v.as_str())?,
-            Self::Array(v) => {
-                // The `Value` type does not enforce that all the values in an Array have the same
-                // type.
-                let value_type = if v.is_empty() {
-                    // Doesn't matter, the array is empty.
-                    ValueType::U32
-                } else {
-                    let value_type: std::collections::HashSet<_> =
-                        v.iter().map(|elem| elem.value_type()).collect();
-                    if value_type.len() != 1 {
-                        crate::bail!("multiple value-types in the same array {value_type:?}")
-                    }
-                    value_type.into_iter().next().unwrap()
-                };
-                w.write_u32::<LittleEndian>(value_type.to_u32())?;
-                w.write_u64::<LittleEndian>(v.len() as u64)?;
-                for elem in v.iter() {
-                    elem.write(w)?
-                }
-            }
-        }
-        Ok(())
-    }
-}
-
-impl ValueType {
-    fn from_u32(v: u32) -> Result<Self> {
-        let v = match v {
-            0 => Self::U8,
-            1 => Self::I8,
-            2 => Self::U16,
-            3 => Self::I16,
-            4 => Self::U32,
-            5 => Self::I32,
-            6 => Self::F32,
-            7 => Self::Bool,
-            8 => Self::String,
-            9 => Self::Array,
-            10 => Self::U64,
-            11 => Self::I64,
-            12 => Self::F64,
-            v => crate::bail!("unrecognized value-type {v:#08x}"),
-        };
-        Ok(v)
-    }
-
-    fn to_u32(self) -> u32 {
-        match self {
-            Self::U8 => 0,
-            Self::I8 => 1,
-            Self::U16 => 2,
-            Self::I16 => 3,
-            Self::U32 => 4,
-            Self::I32 => 5,
-            Self::F32 => 6,
-            Self::Bool => 7,
-            Self::String => 8,
-            Self::Array => 9,
-            Self::U64 => 10,
-            Self::I64 => 11,
-            Self::F64 => 12,
-        }
-    }
-}
-
-impl Content {
-    pub fn read<R: std::io::Seek + std::io::Read>(reader: &mut R) -> Result<Self> {
-        let magic = VersionedMagic::read(reader)?;
-
-        let tensor_count = match magic {
-            VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-            VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
-        };
-        let metadata_kv_count = match magic {
-            VersionedMagic::GgufV1 => reader.read_u32::<LittleEndian>()? as usize,
-            VersionedMagic::GgufV2 => reader.read_u64::<LittleEndian>()? as usize,
-        };
-
-        let mut metadata = HashMap::new();
-        for _idx in 0..metadata_kv_count {
-            let key = read_string(reader, &magic)?;
-            let value_type = reader.read_u32::<LittleEndian>()?;
-            let value_type = ValueType::from_u32(value_type)?;
-            let value = Value::read(reader, value_type, &magic)?;
-            metadata.insert(key, value);
-        }
-        let mut tensor_infos = HashMap::new();
-        for _idx in 0..tensor_count {
-            let tensor_name = read_string(reader, &magic)?;
-            let n_dimensions = reader.read_u32::<LittleEndian>()?;
-
-            let mut dimensions: Vec<usize> = match magic {
-                VersionedMagic::GgufV1 => {
-                    let mut dimensions = vec![0; n_dimensions as usize];
-                    reader.read_u32_into::<LittleEndian>(&mut dimensions)?;
-                    dimensions.into_iter().map(|c| c as usize).collect()
-                }
-                VersionedMagic::GgufV2 => {
-                    let mut dimensions = vec![0; n_dimensions as usize];
-                    reader.read_u64_into::<LittleEndian>(&mut dimensions)?;
-                    dimensions.into_iter().map(|c| c as usize).collect()
-                }
-            };
-
-            dimensions.reverse();
-            let ggml_dtype = reader.read_u32::<LittleEndian>()?;
-            let ggml_dtype = GgmlDType::from_u32(ggml_dtype)?;
-            let offset = reader.read_u64::<LittleEndian>()?;
-            tensor_infos.insert(
-                tensor_name,
-                TensorInfo {
-                    shape: crate::Shape::from(dimensions),
-                    offset,
-                    ggml_dtype,
-                },
-            );
-        }
-        let position = reader.stream_position()?;
-        let alignment = match metadata.get("general.alignment") {
-            Some(Value::U8(v)) => *v as u64,
-            Some(Value::U16(v)) => *v as u64,
-            Some(Value::U32(v)) => *v as u64,
-            Some(Value::I8(v)) if *v >= 0 => *v as u64,
-            Some(Value::I16(v)) if *v >= 0 => *v as u64,
-            Some(Value::I32(v)) if *v >= 0 => *v as u64,
-            _ => DEFAULT_ALIGNMENT,
-        };
-        let tensor_data_offset = (position + alignment - 1) / alignment * alignment;
-        Ok(Self {
-            magic,
-            metadata,
-            tensor_infos,
-            tensor_data_offset,
-        })
-    }
-
-    pub fn tensor<R: std::io::Seek + std::io::Read>(
-        &self,
-        reader: &mut R,
-        name: &str,
-    ) -> Result<QTensor> {
-        let tensor_info = match self.tensor_infos.get(name) {
-            Some(tensor_info) => tensor_info,
-            None => crate::bail!("cannot find tensor-infor for {name}"),
-        };
-        tensor_info.read(reader, self.tensor_data_offset)
-    }
-}
-
-fn write_string<W: std::io::Write>(w: &mut W, str: &str) -> Result<()> {
-    let bytes = str.as_bytes();
-    w.write_u64::<LittleEndian>(bytes.len() as u64)?;
-    w.write_all(bytes)?;
-    Ok(())
-}
-
-pub fn write<W: std::io::Seek + std::io::Write>(
-    w: &mut W,
-    metadata: &[(&str, &Value)],
-    tensors: &[(&str, &QTensor)],
-) -> Result<()> {
-    w.write_u32::<LittleEndian>(0x46554747)?;
-    w.write_u32::<LittleEndian>(2)?; // version 2.
-    w.write_u64::<LittleEndian>(tensors.len() as u64)?;
-    w.write_u64::<LittleEndian>(metadata.len() as u64)?;
-    for (name, value) in metadata.iter() {
-        write_string(w, name)?;
-        w.write_u32::<LittleEndian>(value.value_type().to_u32())?;
-        value.write(w)?;
-    }
-    let mut offset = 0usize;
-    let mut offsets = Vec::with_capacity(tensors.len());
-    for (name, tensor) in tensors.iter() {
-        write_string(w, name)?;
-        let dims = tensor.shape().dims();
-        w.write_u32::<LittleEndian>(dims.len() as u32)?;
-        for &dim in dims.iter().rev() {
-            w.write_u64::<LittleEndian>(dim as u64)?;
-        }
-        w.write_u32::<LittleEndian>(tensor.dtype().to_u32())?;
-        w.write_u64::<LittleEndian>(offset as u64)?;
-        offsets.push(offset);
-        let size_in_bytes = tensor.storage_size_in_bytes();
-        let padding = 31 - (31 + size_in_bytes) % 32;
-        offset += size_in_bytes + padding;
-    }
-    let pos = w.stream_position()? as usize;
-    let padding = 31 - (31 + pos) % 32;
-    w.write_all(&vec![0u8; padding])?;
-    let tensor_start_pos = w.stream_position()? as usize;
-    for (offset, (_name, tensor)) in offsets.iter().zip(tensors.iter()) {
-        let pos = w.stream_position()? as usize;
-        if tensor_start_pos + offset != pos {
-            crate::bail!(
-                "internal error, unexpected current position {tensor_start_pos} {offset} {pos}"
-            )
-        }
-        let data_ptr = tensor.as_ptr();
-        let size_in_bytes = tensor.storage_size_in_bytes();
-        let data = unsafe { std::slice::from_raw_parts(data_ptr, size_in_bytes) };
-        w.write_all(data)?;
-        let padding = 31 - (31 + size_in_bytes) % 32;
-        w.write_all(&vec![0u8; padding])?;
-    }
-    Ok(())
-}
--- a/candle-core/src/quantized/k_quants.rs
+++ b/candle-core/src/quantized/k_quants.rs
--- a/candle-core/src/quantized/mod.rs
+++ b/candle-core/src/quantized/mod.rs
@ -1,324 +0,0 @@
-use crate::{Device, Result, Shape, Tensor};
-
-#[cfg(target_feature = "avx")]
-pub mod avx;
-pub mod ggml_file;
-pub mod gguf_file;
-pub mod k_quants;
-#[cfg(target_feature = "neon")]
-pub mod neon;
-#[cfg(target_feature = "simd128")]
-pub mod simd128;
-pub mod utils;
-
-pub use k_quants::GgmlType;
-
-pub struct QTensor {
-    data: Box<dyn QuantizedType>,
-    shape: Shape,
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum GgmlDType {
-    F32,
-    F16,
-    Q4_0,
-    Q4_1,
-    Q5_0,
-    Q5_1,
-    Q8_0,
-    Q8_1,
-    Q2K,
-    Q3K,
-    Q4K,
-    Q5K,
-    Q6K,
-    Q8K,
-}
-
-impl GgmlDType {
-    pub(crate) fn from_u32(u: u32) -> Result<Self> {
-        let dtype = match u {
-            0 => Self::F32,
-            1 => Self::F16,
-            2 => Self::Q4_0,
-            3 => Self::Q4_1,
-            6 => Self::Q5_0,
-            7 => Self::Q5_1,
-            8 => Self::Q8_0,
-            9 => Self::Q8_1,
-            10 => Self::Q2K,
-            11 => Self::Q3K,
-            12 => Self::Q4K,
-            13 => Self::Q5K,
-            14 => Self::Q6K,
-            15 => Self::Q8K,
-            _ => crate::bail!("unknown dtype for tensor {u}"),
-        };
-        Ok(dtype)
-    }
-
-    pub(crate) fn to_u32(self) -> u32 {
-        match self {
-            Self::F32 => 0,
-            Self::F16 => 1,
-            Self::Q4_0 => 2,
-            Self::Q4_1 => 3,
-            Self::Q5_0 => 6,
-            Self::Q5_1 => 7,
-            Self::Q8_0 => 8,
-            Self::Q8_1 => 9,
-            Self::Q2K => 10,
-            Self::Q3K => 11,
-            Self::Q4K => 12,
-            Self::Q5K => 13,
-            Self::Q6K => 14,
-            Self::Q8K => 15,
-        }
-    }
-
-    /// The type size for blocks in bytes.
-    pub fn type_size(&self) -> usize {
-        use k_quants::*;
-        match self {
-            Self::F32 => 4,
-            Self::F16 => 2,
-            Self::Q4_0 => std::mem::size_of::<BlockQ4_0>(),
-            Self::Q4_1 => std::mem::size_of::<BlockQ4_1>(),
-            Self::Q5_0 => std::mem::size_of::<BlockQ5_0>(),
-            Self::Q5_1 => std::mem::size_of::<BlockQ5_1>(),
-            // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L932
-            Self::Q8_0 => std::mem::size_of::<BlockQ8_0>(),
-            Self::Q8_1 => std::mem::size_of::<BlockQ8_1>(),
-            Self::Q2K => std::mem::size_of::<BlockQ2K>(),
-            Self::Q3K => std::mem::size_of::<BlockQ3K>(),
-            Self::Q4K => std::mem::size_of::<BlockQ4K>(),
-            Self::Q5K => std::mem::size_of::<BlockQ5K>(),
-            Self::Q6K => std::mem::size_of::<BlockQ6K>(),
-            Self::Q8K => std::mem::size_of::<BlockQ8K>(),
-        }
-    }
-
-    /// The block size, i.e. the number of elements stored in each block.
-    pub fn blck_size(&self) -> usize {
-        match self {
-            Self::F32 => 1,
-            Self::F16 => 1,
-            Self::Q4_0 => k_quants::QK4_0,
-            Self::Q4_1 => k_quants::QK4_1,
-            Self::Q5_0 => k_quants::QK5_0,
-            Self::Q5_1 => k_quants::QK5_1,
-            Self::Q8_0 => k_quants::QK8_0,
-            Self::Q8_1 => k_quants::QK8_1,
-            Self::Q2K | Self::Q3K | Self::Q4K | Self::Q5K | Self::Q6K | Self::Q8K => k_quants::QK_K,
-        }
-    }
-}
-
-// A version of GgmlType without `vec_dot` so that it can be dyn boxed.
-pub trait QuantizedType: Send + Sync {
-    fn dtype(&self) -> GgmlDType;
-    fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()>;
-    fn to_float(&self, ys: &mut [f32]) -> Result<()>;
-    fn storage_size_in_bytes(&self) -> usize;
-    fn as_ptr(&self) -> *const u8;
-}
-
-impl<T: k_quants::GgmlType + Send + Sync> QuantizedType for Vec<T> {
-    fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()> {
-        k_quants::matmul(mkn, lhs, self.as_slice(), dst)
-    }
-
-    fn dtype(&self) -> GgmlDType {
-        T::DTYPE
-    }
-
-    fn to_float(&self, ys: &mut [f32]) -> Result<()> {
-        T::to_float(self.as_slice(), ys)
-    }
-
-    fn storage_size_in_bytes(&self) -> usize {
-        self.len() * std::mem::size_of::<T>()
-    }
-
-    fn as_ptr(&self) -> *const u8 {
-        self.as_ptr() as *const u8
-    }
-}
-
-impl std::fmt::Debug for QTensor {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, "QTensor[{:?}; {:?}]", self.shape, self.dtype())
-    }
-}
-
-fn check_shape<T: k_quants::GgmlType>(shape: &Shape) -> Result<()> {
-    let dims = shape.dims();
-    if dims.is_empty() {
-        crate::bail!("scalar tensor cannot be quantized {shape:?}")
-    }
-    if dims[dims.len() - 1] % T::BLCK_SIZE != 0 {
-        crate::bail!(
-            "quantized tensor must have their last dim divisible by block size {shape:?} {}",
-            T::BLCK_SIZE
-        )
-    }
-    Ok(())
-}
-
-impl QTensor {
-    pub fn new<S: Into<Shape>, T: k_quants::GgmlType + Send + Sync + 'static>(
-        data: Vec<T>,
-        shape: S,
-    ) -> Result<Self> {
-        let shape = shape.into();
-        check_shape::<T>(&shape)?;
-        Ok(Self {
-            data: Box::new(data),
-            shape,
-        })
-    }
-
-    pub fn quantize<T: k_quants::GgmlType + Send + Sync + 'static>(src: &Tensor) -> Result<Self> {
-        let shape = src.shape();
-        check_shape::<T>(shape)?;
-        let src = src
-            .to_dtype(crate::DType::F32)?
-            .flatten_all()?
-            .to_vec1::<f32>()?;
-        if src.len() % T::BLCK_SIZE != 0 {
-            crate::bail!(
-                "tensor size ({shape:?}) is not divisible by block size {}",
-                T::BLCK_SIZE
-            )
-        }
-        let mut data = vec![T::zeros(); src.len() / T::BLCK_SIZE];
-        T::from_float(&src, &mut data)?;
-        Ok(Self {
-            data: Box::new(data),
-            shape: shape.clone(),
-        })
-    }
-
-    pub fn dtype(&self) -> GgmlDType {
-        self.data.dtype()
-    }
-
-    pub fn rank(&self) -> usize {
-        self.shape.rank()
-    }
-
-    pub fn shape(&self) -> &Shape {
-        &self.shape
-    }
-
-    pub fn dequantize(&self, device: &Device) -> Result<Tensor> {
-        let mut f32_data = vec![0f32; self.shape.elem_count()];
-        self.data.to_float(&mut f32_data)?;
-        Tensor::from_vec(f32_data, &self.shape, device)
-    }
-
-    pub fn matmul_t(&self, mkn: (usize, usize, usize), lhs: &[f32], dst: &mut [f32]) -> Result<()> {
-        self.data.matmul_t(mkn, lhs, dst)
-    }
-
-    pub fn storage_size_in_bytes(&self) -> usize {
-        self.data.storage_size_in_bytes()
-    }
-
-    pub fn as_ptr(&self) -> *const u8 {
-        self.data.as_ptr()
-    }
-}
-
-#[derive(Clone, Debug)]
-pub enum QMatMul {
-    QTensor(std::sync::Arc<QTensor>),
-    Tensor(Tensor),
-}
-
-thread_local! {
-    static DEQUANTIZE_ALL: bool = {
-        match std::env::var("CANDLE_DEQUANTIZE_ALL") {
-            Ok(s) => {
-                !s.is_empty() && s != "0"
-            },
-            Err(_) => false,
-        }
-    }
-}
-
-impl QMatMul {
-    pub fn from_arc(qtensor: std::sync::Arc<QTensor>) -> Result<Self> {
-        let dequantize = match qtensor.dtype() {
-            GgmlDType::F32 | GgmlDType::F16 => true,
-            _ => DEQUANTIZE_ALL.with(|b| *b),
-        };
-        let t = if dequantize {
-            let tensor = qtensor.dequantize(&Device::Cpu)?;
-            Self::Tensor(tensor)
-        } else {
-            Self::QTensor(qtensor)
-        };
-        Ok(t)
-    }
-
-    pub fn from_qtensor(qtensor: QTensor) -> Result<Self> {
-        Self::from_arc(std::sync::Arc::new(qtensor))
-    }
-}
-
-impl crate::CustomOp1 for QTensor {
-    fn name(&self) -> &'static str {
-        "qmatmul"
-    }
-
-    fn cpu_fwd(
-        &self,
-        storage: &crate::CpuStorage,
-        layout: &crate::Layout,
-    ) -> Result<(crate::CpuStorage, Shape)> {
-        if !layout.is_contiguous() {
-            crate::bail!("input tensor is not contiguous {layout:?}")
-        }
-        let src_shape = layout.shape();
-        // self is transposed so n is first then k.
-        let (n, k) = self.shape.dims2()?;
-        if src_shape.rank() < 2 {
-            crate::bail!("input tensor has only one dimension {layout:?}")
-        }
-        let mut dst_shape = src_shape.dims().to_vec();
-        let last_k = dst_shape.pop().unwrap();
-        if last_k != k {
-            crate::bail!("input tensor {layout:?} incompatible with {:?}", self.shape)
-        }
-        dst_shape.push(n);
-        let dst_shape = Shape::from(dst_shape);
-        let storage = storage.as_slice::<f32>()?;
-        let storage =
-            &storage[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
-        let mut dst_storage = vec![0f32; dst_shape.elem_count()];
-        self.matmul_t(
-            (dst_shape.elem_count() / n, k, n),
-            storage,
-            &mut dst_storage,
-        )?;
-        Ok((crate::CpuStorage::F32(dst_storage), dst_shape))
-    }
-}
-
-impl QMatMul {
-    pub fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        match self {
-            Self::QTensor(t) => xs.apply_op1_no_bwd(t.as_ref()),
-            Self::Tensor(w) => {
-                let w = match *xs.dims() {
-                    [b1, b2, _, _] => w.broadcast_left((b1, b2))?.t()?,
-                    [bsize, _, _] => w.broadcast_left(bsize)?.t()?,
-                    _ => w.t()?,
-                };
-                xs.matmul(&w)
-            }
-        }
-    }
-}
--- a/candle-core/src/quantized/neon.rs
+++ b/candle-core/src/quantized/neon.rs
@ -1,756 +0,0 @@
-use super::k_quants::{
-    BlockQ2K, BlockQ3K, BlockQ4K, BlockQ4_0, BlockQ5K, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K,
-};
-use crate::Result;
-use byteorder::{ByteOrder, LittleEndian};
-
-#[allow(unused_imports)]
-#[cfg(target_arch = "arm")]
-use core::arch::arm::*;
-
-#[allow(unused_imports)]
-#[cfg(target_arch = "aarch64")]
-use core::arch::aarch64::*;
-
-#[inline(always)]
-pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
-    let qk = QK8_0;
-    let nb = n / qk;
-    if n % QK8_0 != 0 {
-        crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
-    }
-    if nb % 2 != 0 {
-        crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
-    }
-
-    unsafe {
-        let mut sumv0 = vdupq_n_f32(0.0f32);
-        let mut sumv1 = vdupq_n_f32(0.0f32);
-        for i in (0..nb).step_by(2) {
-            let x0 = &xs[i];
-            let x1 = &xs[i + 1];
-            let y0 = &ys[i];
-            let y1 = &ys[i + 1];
-
-            let m4b = vdupq_n_u8(0x0F);
-            let s8b = vdupq_n_s8(0x8);
-
-            let v0_0 = vld1q_u8(x0.qs.as_ptr());
-            let v0_1 = vld1q_u8(x1.qs.as_ptr());
-
-            // 4-bit -> 8-bit
-            let v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
-            let v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-            let v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
-            let v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-            // sub 8
-            let v0_0ls = vsubq_s8(v0_0l, s8b);
-            let v0_0hs = vsubq_s8(v0_0h, s8b);
-            let v0_1ls = vsubq_s8(v0_1l, s8b);
-            let v0_1hs = vsubq_s8(v0_1h, s8b);
-
-            // load y
-            let v1_0l = vld1q_s8(y0.qs.as_ptr());
-            let v1_0h = vld1q_s8(y0.qs.as_ptr().add(16));
-            let v1_1l = vld1q_s8(y1.qs.as_ptr());
-            let v1_1h = vld1q_s8(y1.qs.as_ptr().add(16));
-
-            // TODO: Support dotprod when it's available outside of nightly.
-            let pl0l = vmull_s8(vget_low_s8(v0_0ls), vget_low_s8(v1_0l));
-            let pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
-            let ph0l = vmull_s8(vget_low_s8(v0_0hs), vget_low_s8(v1_0h));
-            let ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
-
-            let pl1l = vmull_s8(vget_low_s8(v0_1ls), vget_low_s8(v1_1l));
-            let pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
-            let ph1l = vmull_s8(vget_low_s8(v0_1hs), vget_low_s8(v1_1h));
-            let ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
-
-            let pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-            let ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-            let pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-            let ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-            sumv0 = vmlaq_n_f32(
-                sumv0,
-                vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
-                x0.d.to_f32() * y0.d.to_f32(),
-            );
-            sumv1 = vmlaq_n_f32(
-                sumv1,
-                vcvtq_f32_s32(vaddq_s32(pl1, ph1)),
-                x1.d.to_f32() * y1.d.to_f32(),
-            );
-        }
-        Ok(vaddvq_f32(sumv0) + vaddvq_f32(sumv1))
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) -> Result<f32> {
-    let qk = QK8_0;
-    if n % QK8_0 != 0 {
-        crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
-    }
-    let nb = n / QK8_0;
-    if nb % 2 != 0 {
-        crate::bail!("vec_dot_q8_0_q8_0: {nb} is not even")
-    }
-    unsafe {
-        let mut sumv0 = vdupq_n_f32(0.0f32);
-        let mut sumv1 = vdupq_n_f32(0.0f32);
-        for i in (0..nb).step_by(2) {
-            let x0 = &xs[i];
-            let x1 = &xs[i + 1];
-            let y0 = &ys[i];
-            let y1 = &ys[i + 1];
-
-            let x0_0 = vld1q_s8(x0.qs.as_ptr());
-            let x0_1 = vld1q_s8(x0.qs.as_ptr().add(16));
-            let x1_0 = vld1q_s8(x1.qs.as_ptr());
-            let x1_1 = vld1q_s8(x1.qs.as_ptr().add(16));
-
-            // load y
-            let y0_0 = vld1q_s8(y0.qs.as_ptr());
-            let y0_1 = vld1q_s8(y0.qs.as_ptr().add(16));
-            let y1_0 = vld1q_s8(y1.qs.as_ptr());
-            let y1_1 = vld1q_s8(y1.qs.as_ptr().add(16));
-
-            // TODO dotprod once this is the intrinsics are.
-            let p0_0 = vmull_s8(vget_low_s8(x0_0), vget_low_s8(y0_0));
-            let p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
-            let p0_2 = vmull_s8(vget_low_s8(x0_1), vget_low_s8(y0_1));
-            let p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
-
-            let p1_0 = vmull_s8(vget_low_s8(x1_0), vget_low_s8(y1_0));
-            let p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
-            let p1_2 = vmull_s8(vget_low_s8(x1_1), vget_low_s8(y1_1));
-            let p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
-
-            let p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
-            let p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
-            let p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
-            let p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
-
-            sumv0 = vmlaq_n_f32(
-                sumv0,
-                vcvtq_f32_s32(vaddq_s32(p0, p1)),
-                x0.d.to_f32() * y0.d.to_f32(),
-            );
-            sumv1 = vmlaq_n_f32(
-                sumv1,
-                vcvtq_f32_s32(vaddq_s32(p2, p3)),
-                x1.d.to_f32() * y1.d.to_f32(),
-            );
-        }
-        Ok(vaddvq_f32(sumv0) + vaddvq_f32(sumv1))
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Result<f32> {
-    let qk = QK_K;
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q8k_q8k: {n} is not divisible by {qk}")
-    }
-
-    let mut sumf = 0f32;
-    for (xs, ys) in xs.iter().zip(ys.iter()) {
-        unsafe {
-            let mut sum_i = vdupq_n_s32(0);
-            let scale = xs.d * ys.d;
-            let xs = xs.qs.as_ptr();
-            let ys = ys.qs.as_ptr();
-            for i in (0..QK_K).step_by(16) {
-                let xs = vld1q_s8(xs.add(i));
-                let ys = vld1q_s8(ys.add(i));
-                let xy_lo = vmull_s8(vget_low_s8(xs), vget_low_s8(ys));
-                let xy_up = vmull_s8(vget_high_s8(xs), vget_high_s8(ys));
-
-                let xy = vaddq_s32(vpaddlq_s16(xy_lo), vpaddlq_s16(xy_up));
-                sum_i = vaddq_s32(sum_i, xy)
-            }
-            sumf += vaddvq_s32(sum_i) as f32 * scale
-        }
-    }
-    Ok(sumf)
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q6k_q8k: {n} is not divisible by {QK_K}")
-    }
-    let mut sum = 0f32;
-    unsafe {
-        let m4b = vdupq_n_u8(0xF);
-
-        let mone = vdupq_n_u8(3);
-
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d_all = x.d.to_f32();
-
-            let mut q6 = x.ql.as_ptr();
-            let mut qh = x.qh.as_ptr();
-            let mut q8 = y.qs.as_ptr();
-
-            let mut scale = x.scales.as_ptr();
-
-            let q8sums = vld1q_s16_x2(y.bsums.as_ptr());
-            let scales = vld1q_s8(scale);
-            let q6scales = int16x8x2_t(
-                vmovl_s8(vget_low_s8(scales)),
-                vmovl_s8(vget_high_s8(scales)),
-            );
-
-            let prod = vaddq_s32(
-                vaddq_s32(
-                    vmull_s16(vget_low_s16(q8sums.0), vget_low_s16(q6scales.0)),
-                    vmull_s16(vget_high_s16(q8sums.0), vget_high_s16(q6scales.0)),
-                ),
-                vaddq_s32(
-                    vmull_s16(vget_low_s16(q8sums.1), vget_low_s16(q6scales.1)),
-                    vmull_s16(vget_high_s16(q8sums.1), vget_high_s16(q6scales.1)),
-                ),
-            );
-            let isum_mins = vaddvq_s32(prod);
-
-            let mut isum = 0i32;
-
-            for _j in 0..QK_K / 128 {
-                let qhbits = vld1q_u8_x2(qh);
-                qh = qh.add(32);
-                let q6bits = vld1q_u8_x4(q6);
-                q6 = q6.add(64);
-                let q8bytes = vld1q_s8_x4(q8);
-                q8 = q8.add(64);
-
-                let q6h_0 = vshlq_n_u8(vandq_u8(mone, qhbits.0), 4);
-                let q6h_1 = vshlq_n_u8(vandq_u8(mone, qhbits.1), 4);
-                let shifted = vshrq_n_u8(qhbits.0, 2);
-                let q6h_2 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-                let shifted = vshrq_n_u8(qhbits.1, 2);
-                let q6h_3 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-                let q6bytes_0 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.0, m4b), q6h_0));
-                let q6bytes_1 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.1, m4b), q6h_1));
-                let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.2, m4b), q6h_2));
-                let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.3, m4b), q6h_3));
-
-                // TODO: dotprod
-
-                let p0 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q6bytes_0), vget_low_s8(q8bytes.0)),
-                    vmull_s8(vget_high_s8(q6bytes_0), vget_high_s8(q8bytes.0)),
-                );
-                let p1 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q6bytes_1), vget_low_s8(q8bytes.1)),
-                    vmull_s8(vget_high_s8(q6bytes_1), vget_high_s8(q8bytes.1)),
-                );
-                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s16(p0) as i32 * scale0 + vaddvq_s16(p1) as i32 * scale1;
-                scale = scale.add(2);
-
-                let p2 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q6bytes_2), vget_low_s8(q8bytes.2)),
-                    vmull_s8(vget_high_s8(q6bytes_2), vget_high_s8(q8bytes.2)),
-                );
-                let p3 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q6bytes_3), vget_low_s8(q8bytes.3)),
-                    vmull_s8(vget_high_s8(q6bytes_3), vget_high_s8(q8bytes.3)),
-                );
-                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s16(p2) as i32 * scale0 + vaddvq_s16(p3) as i32 * scale1;
-                scale = scale.add(2);
-
-                let q8bytes = vld1q_s8_x4(q8);
-                q8 = q8.add(64);
-
-                let shifted = vshrq_n_u8(qhbits.0, 4);
-                let q6h_0 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-                let shifted = vshrq_n_u8(qhbits.1, 4);
-                let q6h_1 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-                let shifted = vshrq_n_u8(qhbits.0, 6);
-                let q6h_2 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-                let shifted = vshrq_n_u8(qhbits.1, 6);
-                let q6h_3 = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-                let q6bytes_0 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.0, 4), q6h_0));
-                let q6bytes_1 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.1, 4), q6h_1));
-                let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.2, 4), q6h_2));
-                let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.3, 4), q6h_3));
-
-                // TODO: dotprod case.
-                let p0 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q6bytes_0), vget_low_s8(q8bytes.0)),
-                    vmull_s8(vget_high_s8(q6bytes_0), vget_high_s8(q8bytes.0)),
-                );
-                let p1 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q6bytes_1), vget_low_s8(q8bytes.1)),
-                    vmull_s8(vget_high_s8(q6bytes_1), vget_high_s8(q8bytes.1)),
-                );
-                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s16(p0) as i32 * scale0 + vaddvq_s16(p1) as i32 * scale1;
-                scale = scale.add(2);
-
-                let p2 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q6bytes_2), vget_low_s8(q8bytes.2)),
-                    vmull_s8(vget_high_s8(q6bytes_2), vget_high_s8(q8bytes.2)),
-                );
-                let p3 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q6bytes_3), vget_low_s8(q8bytes.3)),
-                    vmull_s8(vget_high_s8(q6bytes_3), vget_high_s8(q8bytes.3)),
-                );
-                let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
-                isum += vaddvq_s16(p2) as i32 * scale0 + vaddvq_s16(p3) as i32 * scale1;
-                scale = scale.add(2);
-            }
-            sum += d_all * y.d * ((isum - 32 * isum_mins) as f32);
-        }
-    }
-    Ok(sum)
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q5k_q8k: {n} is not divisible by {QK_K}")
-    }
-    let mut sumf = 0f32;
-    let mut utmp = [0u32; 4];
-    const KMASK1: u32 = 0x3f3f3f3f;
-    const KMASK2: u32 = 0x0f0f0f0f;
-    const KMASK3: u32 = 0x03030303;
-
-    unsafe {
-        let m4b = vdupq_n_u8(0xF);
-        let mone = vdupq_n_u8(1);
-        let mtwo = vdupq_n_u8(2);
-
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d = y.d * x.d.to_f32();
-            let dmin = y.d * x.dmin.to_f32();
-
-            let q8sums = vpaddq_s16(
-                vld1q_s16(y.bsums.as_ptr()),
-                vld1q_s16(y.bsums.as_ptr().add(8)),
-            );
-
-            LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
-
-            utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4);
-            let uaux = utmp[1] & KMASK1;
-            utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
-            utmp[2] = uaux;
-            utmp[0] &= KMASK1;
-
-            let mins8 = vld1_u8((utmp.as_ptr() as *const u8).add(8));
-            let mins = vreinterpretq_s16_u16(vmovl_u8(mins8));
-            let prod = vaddq_s32(
-                vmull_s16(vget_low_s16(q8sums), vget_low_s16(mins)),
-                vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)),
-            );
-            let sumi_mins = vaddvq_s32(prod);
-
-            let mut scales = utmp.as_ptr() as *const u8;
-
-            let mut q5 = x.qs.as_ptr();
-            let mut q8 = y.qs.as_ptr();
-
-            let mut qhbits = vld1q_u8_x2(x.qh.as_ptr());
-
-            let mut sumi = 0i32;
-
-            for _j in 0..QK_K / 64 {
-                let q5bits = vld1q_u8_x2(q5);
-                q5 = q5.add(32);
-                let q8bytes = vld1q_s8_x4(q8);
-                q8 = q8.add(64);
-
-                let q5h_0 = vshlq_n_u8(vandq_u8(mone, qhbits.0), 4);
-                let q5h_1 = vshlq_n_u8(vandq_u8(mone, qhbits.1), 4);
-                let q5h_2 = vshlq_n_u8(vandq_u8(mtwo, qhbits.0), 3);
-                let q5h_3 = vshlq_n_u8(vandq_u8(mtwo, qhbits.1), 3);
-                qhbits.0 = vshrq_n_u8(qhbits.0, 2);
-                qhbits.1 = vshrq_n_u8(qhbits.1, 2);
-
-                let q5bytes_0 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.0, m4b), q5h_0));
-                let q5bytes_1 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.1, m4b), q5h_1));
-                let q5bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.0, 4), q5h_2));
-                let q5bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.1, 4), q5h_3));
-
-                // TODO: dotprod
-
-                let p0 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q5bytes_0), vget_low_s8(q8bytes.0)),
-                    vmull_s8(vget_high_s8(q5bytes_0), vget_high_s8(q8bytes.0)),
-                );
-                let p1 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q5bytes_1), vget_low_s8(q8bytes.1)),
-                    vmull_s8(vget_high_s8(q5bytes_1), vget_high_s8(q8bytes.1)),
-                );
-                sumi += vaddvq_s16(vaddq_s16(p0, p1)) as i32 * *scales as i32;
-                scales = scales.add(1);
-
-                let p2 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q5bytes_2), vget_low_s8(q8bytes.2)),
-                    vmull_s8(vget_high_s8(q5bytes_2), vget_high_s8(q8bytes.2)),
-                );
-                let p3 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q5bytes_3), vget_low_s8(q8bytes.3)),
-                    vmull_s8(vget_high_s8(q5bytes_3), vget_high_s8(q8bytes.3)),
-                );
-                sumi += vaddvq_s16(vaddq_s16(p2, p3)) as i32 * *scales as i32;
-                scales = scales.add(1);
-            }
-            sumf += d * sumi as f32 - dmin * sumi_mins as f32;
-        }
-    }
-    Ok(sumf)
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q4k_q8k: {n} is not divisible by {QK_K}")
-    }
-    let mut sumf = 0f32;
-    let mut utmp = [0u32; 4];
-    let mut scales = [0u8; 16];
-    const KMASK1: u32 = 0x3f3f3f3f;
-    const KMASK2: u32 = 0x0f0f0f0f;
-    const KMASK3: u32 = 0x03030303;
-
-    unsafe {
-        let m4b = vdupq_n_u8(0xF);
-
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d = y.d * x.d.to_f32();
-            let dmin = y.d * x.dmin.to_f32();
-
-            let q8sums = vpaddq_s16(
-                vld1q_s16(y.bsums.as_ptr()),
-                vld1q_s16(y.bsums.as_ptr().add(8)),
-            );
-
-            LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
-
-            let mins8 = vld1_u32(
-                [
-                    utmp[1] & KMASK1,
-                    ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4),
-                ]
-                .as_ptr(),
-            );
-            utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
-            utmp[0] &= KMASK1;
-
-            let mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
-            let prod = vaddq_s32(
-                vmull_s16(vget_low_s16(q8sums), vget_low_s16(mins)),
-                vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)),
-            );
-            sumf -= dmin * vaddvq_s32(prod) as f32;
-
-            LittleEndian::write_u32_into(&utmp, &mut scales);
-
-            let mut q4 = x.qs.as_ptr();
-            let mut q8 = y.qs.as_ptr();
-
-            let mut sumi1 = 0i32;
-            let mut sumi2 = 0i32;
-
-            for j in 0..QK_K / 64 {
-                let q4bits = vld1q_u8_x2(q4);
-                q4 = q4.add(32);
-                // TODO: dotprod
-                let q8bytes = vld1q_s8_x2(q8);
-                q8 = q8.add(32);
-                let q4bytes = int8x16x2_t(
-                    vreinterpretq_s8_u8(vandq_u8(q4bits.0, m4b)),
-                    vreinterpretq_s8_u8(vandq_u8(q4bits.1, m4b)),
-                );
-                let p0 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q4bytes.0), vget_low_s8(q8bytes.0)),
-                    vmull_s8(vget_high_s8(q4bytes.0), vget_high_s8(q8bytes.0)),
-                );
-                let p1 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q4bytes.1), vget_low_s8(q8bytes.1)),
-                    vmull_s8(vget_high_s8(q4bytes.1), vget_high_s8(q8bytes.1)),
-                );
-                sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) as i32 * scales[2 * j] as i32;
-
-                let q8bytes = vld1q_s8_x2(q8);
-                q8 = q8.add(32);
-                let q4bytes = int8x16x2_t(
-                    vreinterpretq_s8_u8(vshrq_n_u8(q4bits.0, 4)),
-                    vreinterpretq_s8_u8(vshrq_n_u8(q4bits.1, 4)),
-                );
-                let p2 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q4bytes.0), vget_low_s8(q8bytes.0)),
-                    vmull_s8(vget_high_s8(q4bytes.0), vget_high_s8(q8bytes.0)),
-                );
-                let p3 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q4bytes.1), vget_low_s8(q8bytes.1)),
-                    vmull_s8(vget_high_s8(q4bytes.1), vget_high_s8(q8bytes.1)),
-                );
-                sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) as i32 * scales[2 * j + 1] as i32;
-            }
-            sumf += d * (sumi1 + sumi2) as f32;
-        }
-    }
-    Ok(sumf)
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q3k_q8k: {n} is not divisible by {QK_K}")
-    }
-    let mut sumf = 0f32;
-    let mut utmp = [0u32; 4];
-    let mut aux = [0u32; 3];
-    const KMASK1: u32 = 0x03030303;
-    const KMASK2: u32 = 0x0f0f0f0f;
-
-    unsafe {
-        let m3b = vdupq_n_u8(0x3);
-        let m0 = vdupq_n_u8(1);
-        let m1 = vshlq_n_u8(m0, 1);
-        let m2 = vshlq_n_u8(m0, 2);
-        let m3 = vshlq_n_u8(m0, 3);
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d = y.d * x.d.to_f32();
-            let mut q3 = x.qs.as_ptr();
-            let qh = x.hmask.as_ptr();
-            let mut q8 = y.qs.as_ptr();
-
-            let mut qhbits = vld1q_u8_x2(qh);
-
-            let mut isum = 0i32;
-
-            // Set up scales
-            LittleEndian::read_u32_into(&x.scales, &mut aux);
-
-            utmp[3] = ((aux[1] >> 4) & KMASK2) | (((aux[2] >> 6) & KMASK1) << 4);
-            utmp[2] = ((aux[0] >> 4) & KMASK2) | (((aux[2] >> 4) & KMASK1) << 4);
-            utmp[1] = (aux[1] & KMASK2) | (((aux[2] >> 2) & KMASK1) << 4);
-            utmp[0] = (aux[0] & KMASK2) | ((aux[2] & KMASK1) << 4);
-
-            let mut scale = utmp.as_mut_ptr() as *mut i8;
-            for j in 0..16 {
-                *scale.add(j) -= 32i8
-            }
-
-            for j in 0..QK_K / 128 {
-                let q3bits = vld1q_u8_x2(q3);
-                q3 = q3.add(32);
-                let q8bytes_1 = vld1q_s8_x4(q8);
-                q8 = q8.add(64);
-                let q8bytes_2 = vld1q_s8_x4(q8);
-                q8 = q8.add(64);
-
-                let q3h_0 = vshlq_n_u8(vbicq_u8(m0, qhbits.0), 2);
-                let q3h_1 = vshlq_n_u8(vbicq_u8(m0, qhbits.1), 2);
-                let q3h_2 = vshlq_n_u8(vbicq_u8(m1, qhbits.0), 1);
-                let q3h_3 = vshlq_n_u8(vbicq_u8(m1, qhbits.1), 1);
-
-                let q3bytes_0 = vsubq_s8(
-                    vreinterpretq_s8_u8(vandq_u8(q3bits.0, m3b)),
-                    vreinterpretq_s8_u8(q3h_0),
-                );
-                let q3bytes_1 = vsubq_s8(
-                    vreinterpretq_s8_u8(vandq_u8(q3bits.1, m3b)),
-                    vreinterpretq_s8_u8(q3h_1),
-                );
-                let q3bytes_2 = vsubq_s8(
-                    vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.0, 2), m3b)),
-                    vreinterpretq_s8_u8(q3h_2),
-                );
-                let q3bytes_3 = vsubq_s8(
-                    vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.1, 2), m3b)),
-                    vreinterpretq_s8_u8(q3h_3),
-                );
-
-                // TODO: dotprod
-                let p0 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q3bytes_0), vget_low_s8(q8bytes_1.0)),
-                    vmull_s8(vget_high_s8(q3bytes_0), vget_high_s8(q8bytes_1.0)),
-                );
-                let p1 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q3bytes_1), vget_low_s8(q8bytes_1.1)),
-                    vmull_s8(vget_high_s8(q3bytes_1), vget_high_s8(q8bytes_1.1)),
-                );
-                let p2 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q3bytes_2), vget_low_s8(q8bytes_1.2)),
-                    vmull_s8(vget_high_s8(q3bytes_2), vget_high_s8(q8bytes_1.2)),
-                );
-                let p3 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q3bytes_3), vget_low_s8(q8bytes_1.3)),
-                    vmull_s8(vget_high_s8(q3bytes_3), vget_high_s8(q8bytes_1.3)),
-                );
-                isum += vaddvq_s16(p0) as i32 * *scale as i32
-                    + vaddvq_s16(p1) as i32 * *scale.add(1) as i32
-                    + vaddvq_s16(p2) as i32 * *scale.add(2) as i32
-                    + vaddvq_s16(p3) as i32 * *scale.add(3) as i32;
-                scale = scale.add(4);
-
-                let q3h_0 = vbicq_u8(m2, qhbits.0);
-                let q3h_1 = vbicq_u8(m2, qhbits.1);
-                let q3h_2 = vshrq_n_u8(vbicq_u8(m3, qhbits.0), 1);
-                let q3h_3 = vshrq_n_u8(vbicq_u8(m3, qhbits.1), 1);
-
-                let q3bytes_0 = vsubq_s8(
-                    vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.0, 4), m3b)),
-                    vreinterpretq_s8_u8(q3h_0),
-                );
-                let q3bytes_1 = vsubq_s8(
-                    vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.1, 4), m3b)),
-                    vreinterpretq_s8_u8(q3h_1),
-                );
-                let q3bytes_2 = vsubq_s8(
-                    vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.0, 6), m3b)),
-                    vreinterpretq_s8_u8(q3h_2),
-                );
-                let q3bytes_3 = vsubq_s8(
-                    vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.1, 6), m3b)),
-                    vreinterpretq_s8_u8(q3h_3),
-                );
-
-                // TODO: dotprod
-                let p0 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q3bytes_0), vget_low_s8(q8bytes_2.0)),
-                    vmull_s8(vget_high_s8(q3bytes_0), vget_high_s8(q8bytes_2.0)),
-                );
-                let p1 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q3bytes_1), vget_low_s8(q8bytes_2.1)),
-                    vmull_s8(vget_high_s8(q3bytes_1), vget_high_s8(q8bytes_2.1)),
-                );
-                let p2 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q3bytes_2), vget_low_s8(q8bytes_2.2)),
-                    vmull_s8(vget_high_s8(q3bytes_2), vget_high_s8(q8bytes_2.2)),
-                );
-                let p3 = vaddq_s16(
-                    vmull_s8(vget_low_s8(q3bytes_3), vget_low_s8(q8bytes_2.3)),
-                    vmull_s8(vget_high_s8(q3bytes_3), vget_high_s8(q8bytes_2.3)),
-                );
-                isum += vaddvq_s16(p0) as i32 * *scale as i32
-                    + vaddvq_s16(p1) as i32 * *scale.add(1) as i32
-                    + vaddvq_s16(p2) as i32 * *scale.add(2) as i32
-                    + vaddvq_s16(p3) as i32 * *scale.add(3) as i32;
-                scale = scale.add(4);
-
-                if j == 0 {
-                    qhbits.0 = vshrq_n_u8(qhbits.0, 4);
-                    qhbits.1 = vshrq_n_u8(qhbits.1, 4);
-                }
-            }
-            sumf += d * isum as f32;
-        }
-    }
-    Ok(sumf)
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q2k_q8k: {n} is not divisible by {QK_K}")
-    }
-    let mut sumf = 0f32;
-    let mut aux = [0u8; 16];
-
-    unsafe {
-        let m3 = vdupq_n_u8(0x3);
-        let m4 = vdupq_n_u8(0xF);
-
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let d = y.d * x.d.to_f32();
-            let dmin = -y.d * x.dmin.to_f32();
-
-            let mut q2 = x.qs.as_ptr();
-            let mut q8 = y.qs.as_ptr();
-            let sc = x.scales.as_ptr();
-
-            let mins_and_scales = vld1q_u8(sc);
-            let scales = vandq_u8(mins_and_scales, m4);
-            vst1q_u8(aux.as_mut_ptr(), scales);
-
-            let mins = vshrq_n_u8(mins_and_scales, 4);
-            let q8sums = vld1q_s16_x2(y.bsums.as_ptr());
-            let mins16 = int16x8x2_t(
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins))),
-            );
-            let s0 = vaddq_s32(
-                vmull_s16(vget_low_s16(mins16.0), vget_low_s16(q8sums.0)),
-                vmull_s16(vget_high_s16(mins16.0), vget_high_s16(q8sums.0)),
-            );
-            let s1 = vaddq_s32(
-                vmull_s16(vget_low_s16(mins16.1), vget_low_s16(q8sums.1)),
-                vmull_s16(vget_high_s16(mins16.1), vget_high_s16(q8sums.1)),
-            );
-            sumf += dmin * vaddvq_s32(vaddq_s32(s0, s1)) as f32;
-
-            let mut isum = 0i32;
-            let mut is = 0usize;
-
-            // TODO: dotprod
-
-            for _j in 0..QK_K / 128 {
-                let q2bits = vld1q_u8_x2(q2);
-                q2 = q2.add(32);
-
-                let q8bytes = vld1q_s8_x2(q8);
-                q8 = q8.add(32);
-                let mut q2bytes = int8x16x2_t(
-                    vreinterpretq_s8_u8(vandq_u8(q2bits.0, m3)),
-                    vreinterpretq_s8_u8(vandq_u8(q2bits.1, m3)),
-                );
-                isum += multiply_accum_with_scale(&aux, is, 0, q2bytes, q8bytes);
-
-                let q8bytes = vld1q_s8_x2(q8);
-                q8 = q8.add(32);
-                q2bytes.0 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.0, 2), m3));
-                q2bytes.1 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.1, 2), m3));
-                isum += multiply_accum_with_scale(&aux, is, 2, q2bytes, q8bytes);
-
-                let q8bytes = vld1q_s8_x2(q8);
-                q8 = q8.add(32);
-                q2bytes.0 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.0, 4), m3));
-                q2bytes.1 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.1, 4), m3));
-                isum += multiply_accum_with_scale(&aux, is, 4, q2bytes, q8bytes);
-
-                let q8bytes = vld1q_s8_x2(q8);
-                q8 = q8.add(32);
-                q2bytes.0 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.0, 6), m3));
-                q2bytes.1 = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.1, 6), m3));
-                isum += multiply_accum_with_scale(&aux, is, 6, q2bytes, q8bytes);
-
-                is += 8;
-            }
-            sumf += d * isum as f32;
-        }
-    }
-    Ok(sumf)
-}
-
-#[inline(always)]
-unsafe fn multiply_accum_with_scale(
-    aux: &[u8; 16],
-    is: usize,
-    index: usize,
-    q2bytes: int8x16x2_t,
-    q8bytes: int8x16x2_t,
-) -> i32 {
-    let p1 = vaddq_s16(
-        vmull_s8(vget_low_s8(q2bytes.0), vget_low_s8(q8bytes.0)),
-        vmull_s8(vget_high_s8(q2bytes.0), vget_high_s8(q8bytes.0)),
-    );
-    let p2 = vaddq_s16(
-        vmull_s8(vget_low_s8(q2bytes.1), vget_low_s8(q8bytes.1)),
-        vmull_s8(vget_high_s8(q2bytes.1), vget_high_s8(q8bytes.1)),
-    );
-    vaddvq_s16(p1) as i32 * aux[is + index] as i32
-        + vaddvq_s16(p2) as i32 * aux[is + 1 + index] as i32
-}
--- a/candle-core/src/quantized/simd128.rs
+++ b/candle-core/src/quantized/simd128.rs
@ -1,427 +0,0 @@
-use super::k_quants::{BlockQ2K, BlockQ4K, BlockQ4_0, BlockQ6K, BlockQ8K, BlockQ8_0, QK8_0, QK_K};
-use crate::Result;
-use byteorder::{ByteOrder, LittleEndian};
-use half::f16;
-
-use core::arch::wasm32::*;
-
-#[inline(always)]
-pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
-    let qk = QK8_0;
-    if n % QK8_0 != 0 {
-        crate::bail!("vec_dot_q4_0_q8_0: {n} is not divisible by {qk}")
-    }
-    let nb = n / QK8_0;
-    if nb % 2 != 0 {
-        crate::bail!("vec_dot_q4_0_q8_0: {nb} is not even")
-    }
-    unsafe {
-        let mut acc = f32x4_splat(0.0f32);
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let x1234 = v128_load(x.qs.as_ptr() as *const v128);
-            let x12 = v128_and(x1234, u8x16_splat(0x0F));
-            let x12 = i8x16_sub(x12, i8x16_splat(8));
-            let x34 = u8x16_shr(x1234, 4);
-            let x34 = i8x16_sub(x34, i8x16_splat(8));
-
-            let x1 = i16x8_extend_low_i8x16(x12);
-            let y1 = i16x8_load_extend_i8x8(y.qs.as_ptr());
-            let sum_xy = i32x4_dot_i16x8(x1, y1);
-
-            let x2 = i16x8_extend_high_i8x16(x12);
-            let y2 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(8));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x2, y2));
-
-            let x3 = i16x8_extend_low_i8x16(x34);
-            let y3 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(16));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x3, y3));
-
-            let x4 = i16x8_extend_high_i8x16(x34);
-            let y4 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(24));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x4, y4));
-
-            let sum_xy = f32x4_convert_i32x4(sum_xy);
-
-            // f32x4_relaxed_madd is nightly only.
-            let d = f32x4_splat(f16::to_f32(x.d) * f16::to_f32(y.d));
-            let scaled = f32x4_mul(sum_xy, d);
-            acc = f32x4_add(acc, scaled)
-        }
-        let res = f32x4_extract_lane::<0>(acc)
-            + f32x4_extract_lane::<1>(acc)
-            + f32x4_extract_lane::<2>(acc)
-            + f32x4_extract_lane::<3>(acc);
-        Ok(res)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) -> Result<f32> {
-    let qk = QK8_0;
-    if n % QK8_0 != 0 {
-        crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
-    }
-    let nb = n / QK8_0;
-    if nb % 2 != 0 {
-        crate::bail!("vec_dot_q8_0_q8_0: {nb} is not even")
-    }
-    unsafe {
-        let mut acc = f32x4_splat(0.0f32);
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let x1 = i16x8_load_extend_i8x8(x.qs.as_ptr());
-            let y1 = i16x8_load_extend_i8x8(y.qs.as_ptr());
-            let sum_xy = i32x4_dot_i16x8(x1, y1);
-
-            let x2 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(8));
-            let y2 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(8));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x2, y2));
-
-            let x3 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(16));
-            let y3 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(16));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x3, y3));
-
-            let x4 = i16x8_load_extend_i8x8(x.qs.as_ptr().add(24));
-            let y4 = i16x8_load_extend_i8x8(y.qs.as_ptr().add(24));
-            let sum_xy = i32x4_add(sum_xy, i32x4_dot_i16x8(x4, y4));
-
-            let sum_xy = f32x4_convert_i32x4(sum_xy);
-
-            // f32x4_relaxed_madd is nightly only.
-            let d = f32x4_splat(f16::to_f32(x.d) * f16::to_f32(y.d));
-            let scaled = f32x4_mul(sum_xy, d);
-            acc = f32x4_add(acc, scaled)
-        }
-        let res = f32x4_extract_lane::<0>(acc)
-            + f32x4_extract_lane::<1>(acc)
-            + f32x4_extract_lane::<2>(acc)
-            + f32x4_extract_lane::<3>(acc);
-        Ok(res)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q2k_q8k: {n} is not divisible by {QK_K}")
-    }
-    unsafe {
-        let mut sumf = f32x4_splat(0f32);
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let mut q2: &[_] = &x.qs;
-            let mut q8: &[_] = &y.qs;
-            let sc = &x.scales;
-
-            let mut summs = i32x4_splat(0);
-            for i in (0..(QK_K / 16)).step_by(4) {
-                let bsums = i32x4_load_extend_i16x4(y.bsums.as_ptr().add(i));
-                let scales = i32x4_shr(
-                    i32x4(
-                        sc[i] as i32,
-                        sc[i + 1] as i32,
-                        sc[i + 2] as i32,
-                        sc[i + 3] as i32,
-                    ),
-                    4,
-                );
-                summs = i32x4_add(summs, i32x4_mul(bsums, scales))
-            }
-            let summs = f32x4_convert_i32x4(summs);
-
-            let dall = y.d * x.d.to_f32();
-            let dmin = y.d * x.dmin.to_f32();
-
-            let mut isum = i32x4_splat(0);
-            let mut is = 0;
-            for _ in 0..(QK_K / 128) {
-                let mut shift = 0;
-                for _ in 0..4 {
-                    let d = (sc[is] & 0xF) as i32;
-                    is += 1;
-                    let mut isuml = i16x8_splat(0);
-                    for l in (0..16).step_by(8) {
-                        let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(l));
-                        let q2 = i16x8_load_extend_u8x8(q2.as_ptr().add(l));
-                        let q2 = v128_and(i16x8_shr(q2, shift), i16x8_splat(3));
-                        isuml = i16x8_add(isuml, i16x8_mul(q2, q8))
-                    }
-                    let dd = i32x4_splat(d);
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_low_i16x8(isuml), dd));
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_high_i16x8(isuml), dd));
-                    let d = (sc[is] & 0xF) as i32;
-                    is += 1;
-                    let mut isuml = i16x8_splat(0);
-                    for l in (16..32).step_by(8) {
-                        let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(l));
-                        let q2 = i16x8_load_extend_u8x8(q2.as_ptr().add(l));
-                        let q2 = v128_and(i16x8_shr(q2, shift), i16x8_splat(3));
-                        isuml = i16x8_add(isuml, i16x8_mul(q2, q8))
-                    }
-                    let dd = i32x4_splat(d);
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_low_i16x8(isuml), dd));
-                    isum = i32x4_add(isum, i32x4_mul(i32x4_extend_high_i16x8(isuml), dd));
-                    shift += 2;
-                    // adjust the indexing
-                    q8 = &q8[32..];
-                }
-                // adjust the indexing
-                q2 = &q2[32..];
-            }
-            let isum = f32x4_convert_i32x4(isum);
-            sumf = f32x4_add(
-                sumf,
-                f32x4_sub(
-                    f32x4_mul(isum, f32x4_splat(dall)),
-                    f32x4_mul(summs, f32x4_splat(dmin)),
-                ),
-            );
-        }
-        let sumf = f32x4_extract_lane::<0>(sumf)
-            + f32x4_extract_lane::<1>(sumf)
-            + f32x4_extract_lane::<2>(sumf)
-            + f32x4_extract_lane::<3>(sumf);
-        Ok(sumf)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q4k_q8k: {n} is not divisible by {QK_K}")
-    }
-
-    const KMASK1: u32 = 0x3f3f3f3f;
-    const KMASK2: u32 = 0x0f0f0f0f;
-    const KMASK3: u32 = 0x03030303;
-
-    let mut utmp: [u32; 4] = [0; 4];
-    let mut scales: [u8; 8] = [0; 8];
-    let mut mins: [u8; 8] = [0; 8];
-
-    let mut aux8: [u8; QK_K] = [0; QK_K];
-    let mut sums = f32x4_splat(0f32);
-    unsafe {
-        for (y, x) in ys.iter().zip(xs.iter()) {
-            let q4 = &x.qs;
-            let q8 = &y.qs;
-
-            for j in 0..QK_K / 64 {
-                let q4_1 = v128_load(q4.as_ptr().add(32 * j) as *const v128);
-                let q4_2 = v128_load(q4.as_ptr().add(32 * j + 16) as *const v128);
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j) as *mut v128,
-                    v128_and(q4_1, u8x16_splat(0x0F)),
-                );
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j + 16) as *mut v128,
-                    v128_and(q4_2, u8x16_splat(0x0F)),
-                );
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j + 32) as *mut v128,
-                    u8x16_shr(q4_1, 4),
-                );
-                v128_store(
-                    aux8.as_mut_ptr().add(64 * j + 48) as *mut v128,
-                    u8x16_shr(q4_2, 4),
-                );
-            }
-
-            LittleEndian::read_u32_into(&x.scales, &mut utmp[0..3]);
-
-            utmp[3] = ((utmp[2] >> 4) & KMASK2) | (((utmp[1] >> 6) & KMASK3) << 4);
-            let uaux = utmp[1] & KMASK1;
-            utmp[1] = (utmp[2] & KMASK2) | (((utmp[0] >> 6) & KMASK3) << 4);
-            utmp[2] = uaux;
-            utmp[0] &= KMASK1;
-
-            //extract scales and mins
-            LittleEndian::write_u32_into(&utmp[0..2], &mut scales);
-            LittleEndian::write_u32_into(&utmp[2..4], &mut mins);
-
-            let mut sumi = i32x4_splat(0);
-            for j in (0..QK_K / 16).step_by(4) {
-                let bsums = i32x4_load_extend_i16x4(y.bsums.as_ptr().add(j));
-                let (m1, m2) = (mins[j / 2] as i32, mins[j / 2 + 1] as i32);
-                let mins = i32x4(m1, m1, m2, m2);
-                sumi = i32x4_add(sumi, i32x4_mul(bsums, mins));
-            }
-
-            let mut aux32 = i32x4_splat(0i32);
-            for (scale_i, scale) in scales.iter().enumerate() {
-                let scale = i32x4_splat(*scale as i32);
-                for j in 0..4 {
-                    let i = 32 * scale_i + 8 * j;
-                    let q8 = i16x8_load_extend_i8x8(q8.as_ptr().add(i));
-                    let aux8 = i16x8_load_extend_u8x8(aux8.as_ptr().add(i));
-                    let aux16 = i16x8_mul(q8, aux8);
-                    aux32 = i32x4_add(aux32, i32x4_mul(scale, i32x4_extend_low_i16x8(aux16)));
-                    aux32 = i32x4_add(aux32, i32x4_mul(scale, i32x4_extend_high_i16x8(aux16)));
-                }
-            }
-            let aux32 = f32x4_convert_i32x4(aux32);
-            let d = f32x4_splat(x.d.to_f32() * y.d);
-            sums = f32x4_add(sums, f32x4_mul(aux32, d));
-            let dmin = x.dmin.to_f32() * y.d;
-            let dmin = f32x4_splat(dmin);
-            let sumi = f32x4_convert_i32x4(sumi);
-            sums = f32x4_sub(sums, f32x4_mul(sumi, dmin));
-        }
-        let sums = f32x4_extract_lane::<0>(sums)
-            + f32x4_extract_lane::<1>(sums)
-            + f32x4_extract_lane::<2>(sums)
-            + f32x4_extract_lane::<3>(sums);
-        Ok(sums)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Result<f32> {
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q6k_q8k: {n} is not divisible by {QK_K}")
-    }
-
-    let mut aux8 = [0i8; QK_K];
-    unsafe {
-        let mut sums = f32x4_splat(0f32);
-
-        for (x, y) in xs.iter().zip(ys.iter()) {
-            let q4 = &x.ql;
-            let qh = &x.qh;
-            let q8 = &y.qs;
-            let mut aux32 = f32x4_splat(0f32);
-
-            for j in (0..QK_K).step_by(128) {
-                let aux8 = aux8.as_mut_ptr().add(j);
-                let q4 = &q4.as_ptr().add(j / 2);
-                let qh = &qh.as_ptr().add(j / 4);
-                for l in (0..32).step_by(16) {
-                    // aux8[l] = (((q4[l] & 0xF) | ((qh[l] & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        v128_and(v128_load(q4.add(l) as *const v128), u8x16_splat(0xF)),
-                        u8x16_shl(
-                            v128_and(v128_load(qh.add(l) as *const v128), u8x16_splat(3)),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-
-                    // aux8[l + 32] =
-                    //    (((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        v128_and(v128_load(q4.add(l + 32) as *const v128), u8x16_splat(0xF)),
-                        u8x16_shl(
-                            v128_and(
-                                u8x16_shr(v128_load(qh.add(l) as *const v128), 2),
-                                u8x16_splat(3),
-                            ),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l + 32) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-
-                    // aux8[l + 64] = (((q4[l] >> 4) | (((qh[l] >> 4) & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        u8x16_shr(v128_load(q4.add(l) as *const v128), 4),
-                        u8x16_shl(
-                            v128_and(
-                                u8x16_shr(v128_load(qh.add(l) as *const v128), 4),
-                                u8x16_splat(3),
-                            ),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l + 64) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-
-                    // aux8[l + 96] =
-                    //    (((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) as i32 - 32) as i8;
-                    let a8 = v128_or(
-                        u8x16_shr(v128_load(q4.add(l + 32) as *const v128), 4),
-                        u8x16_shl(
-                            v128_and(
-                                u8x16_shr(v128_load(qh.add(l) as *const v128), 6),
-                                u8x16_splat(3),
-                            ),
-                            4,
-                        ),
-                    );
-                    let a8_low = i16x8_sub(i16x8_extend_low_u8x16(a8), i16x8_splat(32));
-                    let a8_high = i16x8_sub(i16x8_extend_high_u8x16(a8), i16x8_splat(32));
-                    v128_store(
-                        aux8.add(l + 96) as *mut v128,
-                        i8x16_narrow_i16x8(a8_low, a8_high),
-                    );
-                }
-            }
-
-            for (j, &scale) in x.scales.iter().enumerate() {
-                let scale = f32x4_splat(scale as f32);
-                for offset in [0, 8] {
-                    let aux16 = i16x8_mul(
-                        i16x8_load_extend_i8x8(q8.as_ptr().add(16 * j + offset)),
-                        i16x8_load_extend_i8x8(aux8.as_ptr().add(16 * j + offset)),
-                    );
-                    aux32 = f32x4_add(
-                        aux32,
-                        f32x4_mul(f32x4_convert_i32x4(i32x4_extend_low_i16x8(aux16)), scale),
-                    );
-                    aux32 = f32x4_add(
-                        aux32,
-                        f32x4_mul(f32x4_convert_i32x4(i32x4_extend_high_i16x8(aux16)), scale),
-                    );
-                }
-            }
-
-            let d = f32x4_splat(x.d.to_f32() * y.d);
-            sums = f32x4_add(sums, f32x4_mul(aux32, d));
-        }
-        let sums = f32x4_extract_lane::<0>(sums)
-            + f32x4_extract_lane::<1>(sums)
-            + f32x4_extract_lane::<2>(sums)
-            + f32x4_extract_lane::<3>(sums);
-        Ok(sums)
-    }
-}
-
-#[inline(always)]
-pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Result<f32> {
-    let qk = QK_K;
-    if n % QK_K != 0 {
-        crate::bail!("vec_dot_q8k_q8k: {n} is not divisible by {qk}")
-    }
-
-    unsafe {
-        let mut acc = f32x4_splat(0.0f32);
-        for (xs, ys) in xs.iter().zip(ys.iter()) {
-            let x_qs = xs.qs.as_ptr();
-            let y_qs = ys.qs.as_ptr();
-            let mut sumi = i32x4_splat(0);
-            for j in (0..QK_K).step_by(8) {
-                let xs = i16x8_load_extend_i8x8(x_qs.add(j));
-                let ys = i16x8_load_extend_i8x8(y_qs.add(j));
-                let sum_xy = i32x4_dot_i16x8(xs, ys);
-                sumi = i32x4_add(sumi, sum_xy)
-            }
-            let d = f32x4_splat(xs.d * ys.d);
-            acc = f32x4_add(acc, f32x4_mul(f32x4_convert_i32x4(sumi), d))
-        }
-        let res = f32x4_extract_lane::<0>(acc)
-            + f32x4_extract_lane::<1>(acc)
-            + f32x4_extract_lane::<2>(acc)
-            + f32x4_extract_lane::<3>(acc);
-        Ok(res)
-    }
-}
--- a/candle-core/src/quantized/utils.rs
+++ b/candle-core/src/quantized/utils.rs
@ -1,326 +0,0 @@
-use crate::Result;
-
-pub(super) fn nearest_int(v: f32) -> i32 {
-    v.round() as i32
-}
-
-/// Validates that the input and output are the right size and returns an iterator which maps each
-/// input region `xs` to its corresponding output block in `ys`. Each output region is guaranteed
-/// to be `T::BLCK_SIZE` long.
-pub(super) fn group_for_quantization<'a, 'b, T: super::k_quants::GgmlType>(
-    xs: &'b [f32],
-    ys: &'a mut [T],
-) -> Result<Vec<(&'a mut T, &'b [f32])>> {
-    let block_size = T::BLCK_SIZE;
-    let dtype = T::DTYPE;
-
-    let expected_blocks = xs.len() / block_size;
-    let actual_blocks = ys.len();
-
-    // Validate that the input is the right size
-    if expected_blocks != actual_blocks {
-        crate::bail!("quantize {dtype:?}: expected {expected_blocks} blocks but only {actual_blocks} were provided!")
-    }
-
-    Ok(ys.iter_mut().zip(xs.chunks_exact(block_size)).collect())
-}
-
-/// Validates that the input and output are the right size and returns an iterator which maps each
-/// input block `xs` to its corresponding output region in `ys`. Each output region is guaranteed
-/// to be `T::BLCK_SIZE` long.
-pub(super) fn group_for_dequantization<'a, 'b, T: super::k_quants::GgmlType>(
-    xs: &'a [T],
-    ys: &'b mut [f32],
-) -> Result<Vec<(&'a T, &'b mut [f32])>> {
-    let block_size = T::BLCK_SIZE;
-    let dtype = T::DTYPE;
-
-    let actual_output_len = ys.len();
-    let expected_output_len = xs.len() * block_size;
-    // Validate that the output is the right size
-    if expected_output_len != actual_output_len {
-        crate::bail!("dequantize {dtype:?}: ys (len = {actual_output_len}) does not match the expected length of {expected_output_len}!")
-    }
-
-    // Zip the blocks and outputs together
-    Ok(xs.iter().zip(ys.chunks_exact_mut(block_size)).collect())
-}
-
-pub(super) fn get_scale_min_k4(j: usize, q: &[u8]) -> (u8, u8) {
-    if j < 4 {
-        let d = q[j] & 63;
-        let m = q[j + 4] & 63;
-        (d, m)
-    } else {
-        let d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
-        let m = (q[j + 4] >> 4) | ((q[j] >> 6) << 4);
-        (d, m)
-    }
-}
-
-pub(super) unsafe fn make_qx_quants(
-    n: usize,
-    nmax: i32,
-    x: *const f32,
-    ls: *mut i8,
-    rmse_type: i32,
-) -> f32 {
-    let mut max = 0f32;
-    let mut amax = 0f32;
-    for i in 0..n {
-        let x = *x.add(i);
-        let ax = x.abs();
-        if ax > amax {
-            amax = ax;
-            max = x;
-        }
-    }
-    if amax == 0. {
-        // all zero
-        for i in 0..n {
-            *ls.add(i) = 0;
-        }
-        return 0.;
-    }
-    let mut iscale = -(nmax as f32) / max;
-    if rmse_type == 0 {
-        for i in 0..n {
-            let x = *x.add(i);
-            let l = nearest_int(iscale * x);
-            *ls.add(i) = (nmax + l.clamp(-nmax, nmax - 1)) as i8;
-        }
-        return 1.0 / iscale;
-    }
-    let weight_type = rmse_type % 2;
-    let mut sumlx = 0f32;
-    let mut suml2 = 0f32;
-    for i in 0..n {
-        let x = *x.add(i);
-        let l = nearest_int(iscale * x);
-        let l = l.clamp(-nmax, nmax - 1);
-        *ls.add(i) = (l + nmax) as i8;
-        let w = if weight_type == 1 { x * x } else { 1.0 };
-        let l = l as f32;
-        sumlx += w * x * l;
-        suml2 += w * l * l;
-    }
-    let mut scale = sumlx / suml2;
-    let mut best = scale * sumlx;
-    for _itry in 0..3 {
-        let iscale = 1.0 / scale;
-        let mut slx = 0f32;
-        let mut sl2 = 0f32;
-        let mut changed = false;
-        for i in 0..n {
-            let x = *x.add(i);
-            let l = nearest_int(iscale * x);
-            let l = l.clamp(-nmax, nmax - 1);
-            if l + nmax != *ls.add(i) as i32 {
-                changed = true;
-            }
-            let w = if weight_type == 1 { x * x } else { 1f32 };
-            let l = l as f32;
-            slx += w * x * l;
-            sl2 += w * l * l;
-        }
-        if !changed || sl2 == 0.0 || slx * slx <= best * sl2 {
-            break;
-        }
-        for i in 0..n {
-            let x = *x.add(i);
-            let l = nearest_int(iscale * x);
-            *ls.add(i) = (nmax + l.clamp(-nmax, nmax - 1)) as i8;
-        }
-        sumlx = slx;
-        suml2 = sl2;
-        scale = sumlx / suml2;
-        best = scale * sumlx;
-    }
-    for _itry in 0..5 {
-        let mut n_changed = 0;
-        for i in 0..n {
-            let x = *x.add(i);
-            let w = if weight_type == 1 { x * x } else { 1. };
-            let l = *ls.add(i) as i32 - nmax;
-            let mut slx = sumlx - w * x * l as f32;
-            if slx > 0. {
-                let mut sl2 = suml2 - w * l as f32 * l as f32;
-                let new_l = nearest_int(x * sl2 / slx);
-                let new_l = new_l.clamp(-nmax, nmax - 1);
-                if new_l != l {
-                    slx += w * x * new_l as f32;
-                    sl2 += w * new_l as f32 * new_l as f32;
-                    if sl2 > 0. && slx * slx * suml2 > sumlx * sumlx * sl2 {
-                        *ls.add(i) = (nmax + new_l) as i8;
-                        sumlx = slx;
-                        suml2 = sl2;
-                        scale = sumlx / suml2;
-                        best = scale * sumlx;
-                        n_changed += 1;
-                    }
-                }
-            }
-        }
-        if n_changed == 0 {
-            break;
-        }
-    }
-    if rmse_type < 3 {
-        return scale;
-    }
-    for is in -4..4 {
-        if is == 0 {
-            continue;
-        }
-        iscale = -(nmax as f32 + 0.1f32 * is as f32) / max;
-        let mut sumlx = 0.;
-        let mut suml2 = 0.;
-        for i in 0..n {
-            let x = *x.add(i);
-            let l = nearest_int(iscale * x);
-            let l = l.clamp(-nmax, nmax - 1);
-            let w = if weight_type == 1 { x * x } else { 1. };
-            let l = l as f32;
-            sumlx += w * x * l;
-            suml2 += w * l * l;
-        }
-        if suml2 > 0. && sumlx * sumlx > best * suml2 {
-            for i in 0..n {
-                let x = *x.add(i);
-                let l = nearest_int(iscale * x);
-                *ls.add(i) = (nmax + l.clamp(-nmax, nmax - 1)) as i8;
-            }
-            scale = sumlx / suml2;
-            best = scale * sumlx;
-        }
-    }
-    scale
-}
-
-// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L224
-pub(super) fn make_qkx1_quants(nmax: i32, ntry: usize, x: &[f32]) -> (f32, f32) {
-    let n = x.len();
-    let mut l = vec![0; n];
-    // Get min/max
-    let min = *x
-        .iter()
-        .take(n)
-        .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&x[0]);
-    let max = *x.iter().max_by(|a, b| a.total_cmp(b)).unwrap_or(&x[0]);
-
-    // If min == max, all values are the same => nothing to do here
-    if max == min {
-        return (0.0, 0.0);
-    }
-
-    // Ensure min <= 0.0
-    let mut min = min.min(0.);
-
-    // Compute scale and inverse scale
-    let mut iscale = nmax as f32 / (max - min);
-    let mut scale = 1.0 / iscale;
-
-    for _ in 0..ntry {
-        let mut sumlx = 0.0;
-        let mut suml2 = 0;
-        let mut did_change = false;
-
-        for (i, value) in x.iter().enumerate().take(n) {
-            let li = nearest_int(iscale * (value - min)).clamp(0, nmax);
-            let clamped_li = li as u8;
-            if clamped_li != l[i] {
-                l[i] = clamped_li;
-                did_change = true;
-            }
-            sumlx += (value - min) * li as f32;
-            suml2 += li * li;
-        }
-        scale = sumlx / suml2 as f32;
-
-        let sum: f32 = x
-            .iter()
-            .take(n)
-            .zip(l.iter().take(n))
-            .map(|(xi, &li)| xi - scale * li as f32)
-            .sum();
-
-        min = sum / n as f32;
-        if min > 0.0 {
-            min = 0.0;
-        }
-        iscale = 1.0 / scale;
-        if !did_change {
-            break;
-        }
-    }
-    (scale, -min)
-}
-
-// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L165
-pub(super) fn make_q3_quants(x: &[f32], nmax: i32, do_rmse: bool) -> f32 {
-    let n = x.len();
-    let mut l = vec![0i8; n];
-
-    let mut max = 0.0;
-    let mut amax = 0.0;
-    for &xi in x.iter().take(n) {
-        let ax = xi.abs();
-        if ax > amax {
-            amax = ax;
-            max = xi;
-        }
-    }
-
-    if amax == 0.0 {
-        return 0.0;
-    }
-
-    let iscale = -(nmax as f32) / max;
-    if do_rmse {
-        let mut sumlx = 0.0;
-        let mut suml2 = 0.0;
-        for i in 0..n {
-            let li = (iscale * x[i]).round() as i32;
-            let li = li.clamp(-nmax, nmax - 1);
-            l[i] = li as i8;
-            let w = x[i] * x[i];
-            sumlx += w * x[i] * li as f32;
-            suml2 += w * (li * li) as f32;
-        }
-        for _ in 0..5 {
-            let mut n_changed = 0;
-            for i in 0..n {
-                let w = x[i] * x[i];
-                let mut slx = sumlx - w * x[i] * l[i] as f32;
-                if slx > 0.0 {
-                    let mut sl2 = suml2 - w * (l[i] as i32 * l[i] as i32) as f32;
-                    let mut new_l = (x[i] * sl2 / slx).round() as i32;
-                    new_l = new_l.clamp(-nmax, nmax - 1);
-                    if new_l != l[i] as i32 {
-                        slx += w * x[i] * new_l as f32;
-                        sl2 += w * (new_l * new_l) as f32;
-                        if sl2 > 0.0 && slx * slx * suml2 > sumlx * sumlx * sl2 {
-                            l[i] = new_l as i8;
-                            sumlx = slx;
-                            suml2 = sl2;
-                            n_changed += 1;
-                        }
-                    }
-                }
-            }
-            if n_changed == 0 {
-                break;
-            }
-        }
-        for li in l.iter_mut() {
-            *li += nmax as i8;
-        }
-        return sumlx / suml2;
-    }
-    for i in 0..n {
-        let li = (iscale * x[i]).round() as i32;
-        l[i] = (li.clamp(-nmax, nmax - 1) + nmax) as i8;
-    }
-    1.0 / iscale
-}
--- a/candle-core/src/safetensors.rs
+++ b/candle-core/src/safetensors.rs
@ -1,16 +1,13 @@
 use crate::{DType, Device, Error, Result, Tensor, WithDType};
 use safetensors::tensor as st;
-use safetensors::tensor::SafeTensors;
+pub use safetensors::tensor::SafeTensors;
 use std::borrow::Cow;
-use std::collections::HashMap;
-use std::path::Path;

 impl From<DType> for st::Dtype {
    fn from(value: DType) -> Self {
        match value {
            DType::U8 => st::Dtype::U8,
            DType::U32 => st::Dtype::U32,
-            DType::I64 => st::Dtype::I64,
            DType::BF16 => st::Dtype::BF16,
            DType::F16 => st::Dtype::F16,
            DType::F32 => st::Dtype::F32,
@ -25,7 +22,6 @@ impl TryFrom<st::Dtype> for DType {
        match value {
            st::Dtype::U8 => Ok(DType::U8),
            st::Dtype::U32 => Ok(DType::U32),
-            st::Dtype::I64 => Ok(DType::I64),
            st::Dtype::BF16 => Ok(DType::BF16),
            st::Dtype::F16 => Ok(DType::F16),
            st::Dtype::F32 => Ok(DType::F32),
@ -56,101 +52,40 @@ impl st::View for Tensor {
    }
 }

-impl st::View for &Tensor {
-    fn dtype(&self) -> st::Dtype {
-        (*self).dtype().into()
-    }
-    fn shape(&self) -> &[usize] {
-        self.dims()
-    }
-
-    fn data(&self) -> Cow<[u8]> {
-        // This copies data from GPU to CPU.
-        // TODO: Avoid the unwrap here.
-        Cow::Owned(convert_back(self).unwrap())
-    }
-
-    fn data_len(&self) -> usize {
-        let n: usize = self.dims().iter().product();
-        let bytes_per_element = (*self).dtype().size_in_bytes();
-        n * bytes_per_element
-    }
-}
-
 impl Tensor {
-    pub fn save_safetensors<P: AsRef<Path>>(&self, name: &str, filename: P) -> Result<()> {
+    pub fn save_safetensors<P: AsRef<std::path::Path>>(
+        &self,
+        name: &str,
+        filename: P,
+    ) -> Result<()> {
        let data = [(name, self.clone())];
        Ok(st::serialize_to_file(data, &None, filename.as_ref())?)
    }
 }

-fn convert_slice<T: WithDType>(data: &[u8], shape: &[usize], device: &Device) -> Result<Tensor> {
-    let size_in_bytes = T::DTYPE.size_in_bytes();
-    let elem_count = data.len() / size_in_bytes;
-    if (data.as_ptr() as usize) % size_in_bytes == 0 {
-        // SAFETY This is safe because we just checked that this
-        // was correctly aligned.
-        let data: &[T] =
-            unsafe { std::slice::from_raw_parts(data.as_ptr() as *const T, elem_count) };
-        Tensor::from_slice(data, shape, device)
-    } else {
-        // XXX: We need to specify `T` here, otherwise the compiler will infer u8 because of the following cast
-        // Making this vector too small to fit a full f16/f32/f64 weights, resulting in out-of-bounds access
-        let mut c: Vec<T> = Vec::with_capacity(elem_count);
-        // SAFETY: We just created c, so the allocated memory is necessarily
-        // contiguous and non overlapping with the view's data.
-        // We're downgrading the `c` pointer from T to u8, which removes alignment
-        // constraints.
-        unsafe {
-            std::ptr::copy_nonoverlapping(data.as_ptr(), c.as_mut_ptr() as *mut u8, data.len());
-            c.set_len(elem_count)
-        }
-        Tensor::from_slice(&c, shape, device)
-    }
-}
-
-fn convert_slice_with_cast<T: Sized + Copy, U: WithDType, F: Fn(T) -> Result<U>>(
-    data: &[u8],
-    shape: &[usize],
-    device: &Device,
-    conv: F,
-) -> Result<Tensor> {
-    let size_in_bytes = std::mem::size_of::<T>();
-    let elem_count = data.len() / size_in_bytes;
-    if (data.as_ptr() as usize) % size_in_bytes == 0 {
-        // SAFETY This is safe because we just checked that this
-        // was correctly aligned.
-        let data: &[T] =
-            unsafe { std::slice::from_raw_parts(data.as_ptr() as *const T, elem_count) };
-        let data = data.iter().map(|t| conv(*t)).collect::<Result<Vec<_>>>()?;
-        Tensor::from_vec(data, shape, device)
-    } else {
-        // XXX: We need to specify `T` here, otherwise the compiler will infer u8 because of the following cast
-        // Making this vector too small to fit a full f16/f32/f64 weights, resulting in out-of-bounds access
-        let mut c: Vec<T> = Vec::with_capacity(elem_count);
-        // SAFETY: We just created c, so the allocated memory is necessarily
-        // contiguous and non overlapping with the view's data.
-        // We're downgrading the `c` pointer from T to u8, which removes alignment
-        // constraints.
-        unsafe {
-            std::ptr::copy_nonoverlapping(data.as_ptr(), c.as_mut_ptr() as *mut u8, data.len());
-            c.set_len(elem_count)
-        }
-        let c = c.into_iter().map(conv).collect::<Result<Vec<_>>>()?;
-        Tensor::from_vec(c, shape, device)
-    }
-}
-
-fn convert_with_cast_<T: Sized + Copy, U: WithDType, F: Fn(T) -> Result<U>>(
-    view: &st::TensorView<'_>,
-    device: &Device,
-    conv: F,
-) -> Result<Tensor> {
-    convert_slice_with_cast::<T, U, F>(view.data(), view.shape(), device, conv)
-}
-
 fn convert_<T: WithDType>(view: &st::TensorView<'_>, device: &Device) -> Result<Tensor> {
-    convert_slice::<T>(view.data(), view.shape(), device)
+    let v = view.data();
+    let size_in_bytes = T::DTYPE.size_in_bytes();
+    let elem_count = v.len() / size_in_bytes;
+    if (v.as_ptr() as usize) % size_in_bytes == 0 {
+        // SAFETY This is safe because we just checked that this
+        // was correctly aligned.
+        let data: &[T] = unsafe { std::slice::from_raw_parts(v.as_ptr() as *const T, elem_count) };
+        Tensor::from_slice(data, view.shape(), device)
+    } else {
+        // XXX: We need to specify `T` here, otherwise the compiler will infer u8 because of the following cast
+        // Making this vector too small to fit a full f16/f32/f64 weights, resulting in out-of-bounds access
+        let mut c: Vec<T> = Vec::with_capacity(elem_count);
+        // SAFETY: We just created c, so the allocated memory is necessarily
+        // contiguous and non overlapping with the view's data.
+        // We're downgrading the `c` pointer from T to u8, which removes alignment
+        // constraints.
+        unsafe {
+            std::ptr::copy_nonoverlapping(v.as_ptr(), c.as_mut_ptr() as *mut u8, v.len());
+            c.set_len(elem_count)
+        }
+        Tensor::from_slice(&c, view.shape(), device)
+    }
 }

 fn convert_back_<T: WithDType>(mut vs: Vec<T>) -> Vec<u8> {
@ -177,38 +112,10 @@ impl<'a> Load for st::TensorView<'a> {
    }
 }

-impl Tensor {
-    pub fn from_raw_buffer(
-        data: &[u8],
-        dtype: DType,
-        shape: &[usize],
-        device: &Device,
-    ) -> Result<Self> {
-        match dtype {
-            DType::U8 => convert_slice::<u8>(data, shape, device),
-            DType::U32 => convert_slice::<u32>(data, shape, device),
-            DType::I64 => convert_slice::<i64>(data, shape, device),
-            DType::BF16 => convert_slice::<half::bf16>(data, shape, device),
-            DType::F16 => convert_slice::<half::f16>(data, shape, device),
-            DType::F32 => convert_slice::<f32>(data, shape, device),
-            DType::F64 => convert_slice::<f64>(data, shape, device),
-        }
-    }
-}
-
-fn convert(view: &st::TensorView<'_>, device: &Device) -> Result<Tensor> {
+pub fn convert(view: &st::TensorView<'_>, device: &Device) -> Result<Tensor> {
    match view.dtype() {
        st::Dtype::U8 => convert_::<u8>(view, device),
-        st::Dtype::U16 => {
-            let conv = |x| Ok(u32::from(x));
-            convert_with_cast_::<u16, u32, _>(view, device, conv)
-        }
-        st::Dtype::U32 => convert_::<u32>(view, device),
-        st::Dtype::I32 => {
-            let conv = |x| Ok(i64::from(x));
-            convert_with_cast_::<i32, i64, _>(view, device, conv)
-        }
-        st::Dtype::I64 => convert_::<i64>(view, device),
+        st::Dtype::U32 => convert_::<u8>(view, device),
        st::Dtype::BF16 => convert_::<half::bf16>(view, device),
        st::Dtype::F16 => convert_::<half::f16>(view, device),
        st::Dtype::F32 => convert_::<f32>(view, device),
@ -217,13 +124,12 @@ fn convert(view: &st::TensorView<'_>, device: &Device) -> Result<Tensor> {
    }
 }

-fn convert_back(tensor: &Tensor) -> Result<Vec<u8>> {
+pub fn convert_back(tensor: &Tensor) -> Result<Vec<u8>> {
    // TODO: This makes an unnecessary copy when the tensor is on the cpu.
    let tensor = tensor.flatten_all()?;
    match tensor.dtype() {
        DType::U8 => Ok(convert_back_::<u8>(tensor.to_vec1()?)),
        DType::U32 => Ok(convert_back_::<u32>(tensor.to_vec1()?)),
-        DType::I64 => Ok(convert_back_::<i64>(tensor.to_vec1()?)),
        DType::F16 => Ok(convert_back_::<half::f16>(tensor.to_vec1()?)),
        DType::BF16 => Ok(convert_back_::<half::bf16>(tensor.to_vec1()?)),
        DType::F32 => Ok(convert_back_::<f32>(tensor.to_vec1()?)),
@ -231,158 +137,7 @@ fn convert_back(tensor: &Tensor) -> Result<Vec<u8>> {
    }
 }

-pub fn load<P: AsRef<Path>>(filename: P, device: &Device) -> Result<HashMap<String, Tensor>> {
-    let data = std::fs::read(filename.as_ref())?;
-    load_buffer(&data[..], device)
-}
-
-pub fn load_buffer(data: &[u8], device: &Device) -> Result<HashMap<String, Tensor>> {
-    let st = safetensors::SafeTensors::deserialize(data)?;
-    st.tensors()
-        .into_iter()
-        .map(|(name, view)| Ok((name, view.load(device)?)))
-        .collect()
-}
-
-pub fn save<K: AsRef<str> + Ord + std::fmt::Display, P: AsRef<Path>>(
-    tensors: &HashMap<K, Tensor>,
-    filename: P,
-) -> Result<()> {
-    Ok(st::serialize_to_file(tensors, &None, filename.as_ref())?)
-}
-
-#[derive(yoke::Yokeable)]
-struct SafeTensors_<'a>(SafeTensors<'a>);
-
-pub struct MmapedSafetensors {
-    safetensors: Vec<yoke::Yoke<SafeTensors_<'static>, memmap2::Mmap>>,
-    routing: Option<HashMap<String, usize>>,
-}
-
-impl MmapedSafetensors {
-    /// Creates a wrapper around a memory mapped file and deserialize the safetensors header.
-    ///
-    /// # Safety
-    ///
-    /// The unsafe is inherited from [`memmap2::MmapOptions`].
-    pub unsafe fn new<P: AsRef<Path>>(p: P) -> Result<Self> {
-        let p = p.as_ref();
-        let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
-        let file = memmap2::MmapOptions::new()
-            .map(&file)
-            .map_err(|e| Error::from(e).with_path(p))?;
-        let safetensors = yoke::Yoke::<SafeTensors_<'static>, memmap2::Mmap>::try_attach_to_cart(
-            file,
-            |data: &[u8]| {
-                let st = safetensors::SafeTensors::deserialize(data)
-                    .map_err(|e| Error::from(e).with_path(p))?;
-                Ok::<_, Error>(SafeTensors_(st))
-            },
-        )?;
-        Ok(Self {
-            safetensors: vec![safetensors],
-            routing: None,
-        })
-    }
-
-    /// Creates a wrapper around multiple memory mapped file and deserialize the safetensors headers.
-    ///
-    /// If a tensor name appears in multiple files, the last entry is returned.
-    ///
-    /// # Safety
-    ///
-    /// The unsafe is inherited from [`memmap2::MmapOptions`].
-    pub unsafe fn multi<P: AsRef<Path>>(paths: &[P]) -> Result<Self> {
-        let mut routing = HashMap::new();
-        let mut safetensors = vec![];
-        for (index, p) in paths.iter().enumerate() {
-            let p = p.as_ref();
-            let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
-            let file = memmap2::MmapOptions::new()
-                .map(&file)
-                .map_err(|e| Error::from(e).with_path(p))?;
-            let data = yoke::Yoke::<SafeTensors_<'static>, memmap2::Mmap>::try_attach_to_cart(
-                file,
-                |data: &[u8]| {
-                    let st = safetensors::SafeTensors::deserialize(data)
-                        .map_err(|e| Error::from(e).with_path(p))?;
-                    Ok::<_, Error>(SafeTensors_(st))
-                },
-            )?;
-            for k in data.get().0.names() {
-                routing.insert(k.to_string(), index);
-            }
-            safetensors.push(data)
-        }
-        Ok(Self {
-            safetensors,
-            routing: Some(routing),
-        })
-    }
-
-    pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
-        self.get(name)?.load(dev)
-    }
-
-    pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
-        let mut tensors = vec![];
-        for safetensors in self.safetensors.iter() {
-            tensors.push(safetensors.get().0.tensors())
-        }
-        tensors.into_iter().flatten().collect()
-    }
-
-    pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
-        let index = match &self.routing {
-            None => 0,
-            Some(routing) => {
-                let index = routing.get(name).ok_or_else(|| {
-                    Error::CannotFindTensor {
-                        path: name.to_string(),
-                    }
-                    .bt()
-                })?;
-                *index
-            }
-        };
-        Ok(self.safetensors[index].get().0.tensor(name)?)
-    }
-}
-
-pub struct BufferedSafetensors {
-    safetensors: yoke::Yoke<SafeTensors_<'static>, Vec<u8>>,
-}
-
-impl BufferedSafetensors {
-    /// Creates a wrapper around a binary buffer and deserialize the safetensors header.
-    pub fn new(buffer: Vec<u8>) -> Result<Self> {
-        let safetensors = yoke::Yoke::<SafeTensors_<'static>, Vec<u8>>::try_attach_to_cart(
-            buffer,
-            |data: &[u8]| {
-                let st = safetensors::SafeTensors::deserialize(data)?;
-                Ok::<_, Error>(SafeTensors_(st))
-            },
-        )?;
-        Ok(Self { safetensors })
-    }
-
-    pub fn load(&self, name: &str, dev: &Device) -> Result<Tensor> {
-        self.get(name)?.load(dev)
-    }
-
-    pub fn tensors(&self) -> Vec<(String, st::TensorView<'_>)> {
-        self.safetensors.get().0.tensors()
-    }
-
-    pub fn get(&self, name: &str) -> Result<st::TensorView<'_>> {
-        Ok(self.safetensors.get().0.tensor(name)?)
-    }
-}
-
-pub struct MmapedFile {
-    path: std::path::PathBuf,
-    inner: memmap2::Mmap,
-}
+pub struct MmapedFile(memmap2::Mmap);

 impl MmapedFile {
    /// Creates a wrapper around a memory mapped file from which you can retrieve
@ -391,21 +146,14 @@ impl MmapedFile {
    /// # Safety
    ///
    /// The unsafe is inherited from [`memmap2::MmapOptions`].
-    pub unsafe fn new<P: AsRef<Path>>(p: P) -> Result<Self> {
-        let p = p.as_ref();
-        let file = std::fs::File::open(p).map_err(|e| Error::from(e).with_path(p))?;
-        let inner = memmap2::MmapOptions::new()
-            .map(&file)
-            .map_err(|e| Error::from(e).with_path(p))?;
-        Ok(Self {
-            inner,
-            path: p.to_path_buf(),
-        })
+    pub unsafe fn new<P: AsRef<std::path::Path>>(p: P) -> Result<Self> {
+        let file = std::fs::File::open(p)?;
+        let mmap = memmap2::MmapOptions::new().map(&file)?;
+        Ok(Self(mmap))
    }

    pub fn deserialize(&self) -> Result<SafeTensors<'_>> {
-        let st = safetensors::SafeTensors::deserialize(&self.inner)
-            .map_err(|e| Error::from(e).with_path(&self.path))?;
+        let st = safetensors::SafeTensors::deserialize(&self.0)?;
        Ok(st)
    }
 }
@ -425,15 +173,11 @@ mod tests {
    }

    #[test]
-    fn save_load_multiple_tensors() {
+    fn save_multiple_tensors() {
        let t = Tensor::zeros((2, 2), DType::F32, &Device::Cpu).unwrap();
        let u = Tensor::zeros((1, 2), DType::F32, &Device::Cpu).unwrap();
        let map: HashMap<_, _> = [("t", t), ("u", u)].into_iter().collect();
-        save(&map, "multi.safetensors").unwrap();
-
-        let weights = load("multi.safetensors", &Device::Cpu).unwrap();
-        assert_eq!(weights.get("t").unwrap().dims(), &[2, 2]);
-        assert_eq!(weights.get("u").unwrap().dims(), &[1, 2]);
+        st::serialize_to_file(map, &None, std::path::Path::new("multi.safetensors")).unwrap();
        let bytes = std::fs::read("multi.safetensors").unwrap();
        assert_eq!(bytes, b"x\0\0\0\0\0\0\0{\"t\":{\"dtype\":\"F32\",\"shape\":[2,2],\"data_offsets\":[0,16]},\"u\":{\"dtype\":\"F32\",\"shape\":[1,2],\"data_offsets\":[16,24]}}      \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
        std::fs::remove_file("multi.safetensors").unwrap();
--- a/candle-core/src/scalar.rs
+++ b/candle-core/src/scalar.rs
@ -1,23 +0,0 @@
-use crate::{Result, Tensor, WithDType};
-
-pub enum TensorScalar {
-    Tensor(Tensor),
-    Scalar(Tensor),
-}
-
-pub trait TensorOrScalar {
-    fn to_tensor_scalar(self) -> Result<TensorScalar>;
-}
-
-impl TensorOrScalar for &Tensor {
-    fn to_tensor_scalar(self) -> Result<TensorScalar> {
-        Ok(TensorScalar::Tensor(self.clone()))
-    }
-}
-
-impl<T: WithDType> TensorOrScalar for T {
-    fn to_tensor_scalar(self) -> Result<TensorScalar> {
-        let scalar = Tensor::new(self, &crate::Device::Cpu)?;
-        Ok(TensorScalar::Scalar(scalar))
-    }
-}
--- a/candle-core/src/shape.rs
+++ b/candle-core/src/shape.rs
@ -1,5 +1,3 @@
-//! The shape of a tensor is a tuple with the size of each of its dimensions.
-#![allow(clippy::redundant_closure_call)]
 use crate::{Error, Result};

 #[derive(Clone, PartialEq, Eq)]
@ -43,12 +41,6 @@ impl From<usize> for Shape {
    }
 }

-impl From<(usize,)> for Shape {
-    fn from(d1: (usize,)) -> Self {
-        Self(vec![d1.0])
-    }
-}
-
 impl From<(usize, usize)> for Shape {
    fn from(d12: (usize, usize)) -> Self {
        Self(vec![d12.0, d12.1])
@ -73,14 +65,6 @@ impl From<(usize, usize, usize, usize, usize)> for Shape {
    }
 }

-impl From<(usize, usize, usize, usize, usize, usize)> for Shape {
-    fn from(d123456: (usize, usize, usize, usize, usize, usize)) -> Self {
-        Self(vec![
-            d123456.0, d123456.1, d123456.2, d123456.3, d123456.4, d123456.5,
-        ])
-    }
-}
-
 impl From<Vec<usize>> for Shape {
    fn from(dims: Vec<usize>) -> Self {
        Self(dims)
@ -89,31 +73,20 @@ impl From<Vec<usize>> for Shape {

 macro_rules! extract_dims {
    ($fn_name:ident, $cnt:tt, $dims:expr, $out_type:ty) => {
-        pub fn $fn_name(dims: &[usize]) -> Result<$out_type> {
-            if dims.len() != $cnt {
-                Err(Error::UnexpectedNumberOfDims {
-                    expected: $cnt,
-                    got: dims.len(),
-                    shape: Shape::from(dims),
-                }
-                .bt())
-            } else {
-                Ok($dims(dims))
-            }
-        }
-
        impl Shape {
            pub fn $fn_name(&self) -> Result<$out_type> {
-                $fn_name(self.0.as_slice())
+                if self.0.len() != $cnt {
+                    Err(Error::UnexpectedNumberOfDims {
+                        expected: $cnt,
+                        got: self.0.len(),
+                        shape: self.clone(),
+                    }
+                    .bt())
+                } else {
+                    Ok($dims(&self.0))
+                }
            }
        }
-
-        impl crate::Tensor {
-            pub fn $fn_name(&self) -> Result<$out_type> {
-                self.shape().$fn_name()
-            }
-        }
-
        impl std::convert::TryInto<$out_type> for Shape {
            type Error = crate::Error;
            fn try_into(self) -> std::result::Result<$out_type, Self::Error> {
@ -128,7 +101,6 @@ impl Shape {
        Self(dims.to_vec())
    }

-    /// The rank is the number of dimensions, 0 for a scalar value, 1 for a vector, etc.
    pub fn rank(&self) -> usize {
        self.0.len()
    }
@ -137,12 +109,10 @@ impl Shape {
        self.0
    }

-    /// The dimensions as a slice of `usize`.
    pub fn dims(&self) -> &[usize] {
        &self.0
    }

-    /// The total number of elements, this is the product of all dimension sizes.
    pub fn elem_count(&self) -> usize {
        self.0.iter().product()
    }
@ -194,75 +164,10 @@ impl Shape {
        true
    }

-    /// Modifies the shape by adding a list of additional dimensions at the end of the existing
-    /// dimensions.
    pub fn extend(mut self, additional_dims: &[usize]) -> Self {
        self.0.extend(additional_dims);
        self
    }
-
-    /// Check whether the two shapes are compatible for broadcast, and if it is the case return the
-    /// broadcasted shape. This is to be used for binary pointwise ops.
-    pub(crate) fn broadcast_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<Shape> {
-        let lhs = self;
-        let lhs_dims = lhs.dims();
-        let rhs_dims = rhs.dims();
-        let lhs_ndims = lhs_dims.len();
-        let rhs_ndims = rhs_dims.len();
-        let bcast_ndims = usize::max(lhs_ndims, rhs_ndims);
-        let mut bcast_dims = vec![0; bcast_ndims];
-        for (idx, bcast_value) in bcast_dims.iter_mut().enumerate() {
-            let rev_idx = bcast_ndims - idx;
-            let l_value = if lhs_ndims < rev_idx {
-                1
-            } else {
-                lhs_dims[lhs_ndims - rev_idx]
-            };
-            let r_value = if rhs_ndims < rev_idx {
-                1
-            } else {
-                rhs_dims[rhs_ndims - rev_idx]
-            };
-            *bcast_value = if l_value == r_value {
-                l_value
-            } else if l_value == 1 {
-                r_value
-            } else if r_value == 1 {
-                l_value
-            } else {
-                Err(Error::ShapeMismatchBinaryOp {
-                    lhs: lhs.clone(),
-                    rhs: rhs.clone(),
-                    op,
-                }
-                .bt())?
-            }
-        }
-        Ok(Shape::from(bcast_dims))
-    }
-
-    pub(crate) fn broadcast_shape_matmul(&self, rhs: &Self) -> Result<(Shape, Shape)> {
-        let lhs = self;
-        let lhs_dims = lhs.dims();
-        let rhs_dims = rhs.dims();
-        if lhs_dims.len() < 2 || rhs_dims.len() < 2 {
-            crate::bail!("only 2d matrixes are supported {lhs:?} {rhs:?}")
-        }
-        let (m, lhs_k) = (lhs_dims[lhs_dims.len() - 2], lhs_dims[lhs_dims.len() - 1]);
-        let (rhs_k, n) = (rhs_dims[rhs_dims.len() - 2], rhs_dims[rhs_dims.len() - 1]);
-        if lhs_k != rhs_k {
-            crate::bail!("different inner dimensions in broadcast matmul {lhs:?} {rhs:?}")
-        }
-
-        let lhs_b = Self::from(&lhs_dims[..lhs_dims.len() - 2]);
-        let rhs_b = Self::from(&rhs_dims[..rhs_dims.len() - 2]);
-        let bcast = lhs_b.broadcast_shape_binary_op(&rhs_b, "broadcast_matmul")?;
-        let bcast_dims = bcast.dims();
-
-        let bcast_lhs = [bcast_dims, &[m, lhs_k]].concat();
-        let bcast_rhs = [bcast_dims, &[rhs_k, n]].concat();
-        Ok((Shape::from(bcast_lhs), Shape::from(bcast_rhs)))
-    }
 }

 pub trait Dim {
@ -423,56 +328,23 @@ impl<D1: Dim, D2: Dim, D3: Dim> Dims for (D1, D2, D3) {
    }
 }

-impl<D1: Dim, D2: Dim, D3: Dim, D4: Dim> Dims for (D1, D2, D3, D4) {
-    fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
-        let d0 = self.0.to_index(shape, op)?;
-        let d1 = self.1.to_index(shape, op)?;
-        let d2 = self.2.to_index(shape, op)?;
-        let d3 = self.3.to_index(shape, op)?;
-        Ok(vec![d0, d1, d2, d3])
-    }
-}
-
-impl<D1: Dim, D2: Dim, D3: Dim, D4: Dim, D5: Dim> Dims for (D1, D2, D3, D4, D5) {
-    fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
-        let d0 = self.0.to_index(shape, op)?;
-        let d1 = self.1.to_index(shape, op)?;
-        let d2 = self.2.to_index(shape, op)?;
-        let d3 = self.3.to_index(shape, op)?;
-        let d4 = self.4.to_index(shape, op)?;
-        Ok(vec![d0, d1, d2, d3, d4])
-    }
-}
-
-impl<D1: Dim, D2: Dim, D3: Dim, D4: Dim, D5: Dim, D6: Dim> Dims for (D1, D2, D3, D4, D5, D6) {
-    fn to_indexes_internal(self, shape: &Shape, op: &'static str) -> Result<Vec<usize>> {
-        let d0 = self.0.to_index(shape, op)?;
-        let d1 = self.1.to_index(shape, op)?;
-        let d2 = self.2.to_index(shape, op)?;
-        let d3 = self.3.to_index(shape, op)?;
-        let d4 = self.4.to_index(shape, op)?;
-        let d5 = self.5.to_index(shape, op)?;
-        Ok(vec![d0, d1, d2, d3, d4, d5])
-    }
-}
-
-extract_dims!(dims0, 0, |_: &[usize]| (), ());
-extract_dims!(dims1, 1, |d: &[usize]| d[0], usize);
-extract_dims!(dims2, 2, |d: &[usize]| (d[0], d[1]), (usize, usize));
+extract_dims!(r0, 0, |_: &Vec<usize>| (), ());
+extract_dims!(r1, 1, |d: &[usize]| d[0], usize);
+extract_dims!(r2, 2, |d: &[usize]| (d[0], d[1]), (usize, usize));
 extract_dims!(
-    dims3,
+    r3,
    3,
    |d: &[usize]| (d[0], d[1], d[2]),
    (usize, usize, usize)
 );
 extract_dims!(
-    dims4,
+    r4,
    4,
    |d: &[usize]| (d[0], d[1], d[2], d[3]),
    (usize, usize, usize, usize)
 );
 extract_dims!(
-    dims5,
+    r5,
    5,
    |d: &[usize]| (d[0], d[1], d[2], d[3], d[4]),
    (usize, usize, usize, usize, usize)
@ -494,171 +366,3 @@ mod tests {
        assert_eq!(shape.stride_contiguous(), [458 * 792, 458, 1]);
    }
 }
-
-pub trait ShapeWithOneHole {
-    fn into_shape(self, el_count: usize) -> Result<Shape>;
-}
-
-impl<S: Into<Shape>> ShapeWithOneHole for S {
-    fn into_shape(self, _el_count: usize) -> Result<Shape> {
-        Ok(self.into())
-    }
-}
-
-impl ShapeWithOneHole for ((),) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        Ok(el_count.into())
-    }
-}
-
-impl ShapeWithOneHole for ((), usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let ((), d1) = self;
-        if el_count % d1 != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d1}")
-        }
-        Ok((el_count / d1, d1).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, ()) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, ()) = self;
-        if el_count % d1 != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d1}")
-        }
-        Ok((d1, el_count / d1).into())
-    }
-}
-
-impl ShapeWithOneHole for ((), usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let ((), d1, d2) = self;
-        let d = d1 * d2;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((el_count / d, d1, d2).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, (), usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, (), d2) = self;
-        let d = d1 * d2;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((d1, el_count / d, d2).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, ()) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, ()) = self;
-        let d = d1 * d2;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((d1, d2, el_count / d).into())
-    }
-}
-
-impl ShapeWithOneHole for ((), usize, usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let ((), d1, d2, d3) = self;
-        let d = d1 * d2 * d3;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((el_count / d, d1, d2, d3).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, (), usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, (), d2, d3) = self;
-        let d = d1 * d2 * d3;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((d1, el_count / d, d2, d3).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, (), usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, (), d3) = self;
-        let d = d1 * d2 * d3;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((d1, d2, el_count / d, d3).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, usize, ()) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, d3, ()) = self;
-        let d = d1 * d2 * d3;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((d1, d2, d3, el_count / d).into())
-    }
-}
-
-impl ShapeWithOneHole for ((), usize, usize, usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let ((), d1, d2, d3, d4) = self;
-        let d = d1 * d2 * d3 * d4;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((el_count / d, d1, d2, d3, d4).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, (), usize, usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, (), d2, d3, d4) = self;
-        let d = d1 * d2 * d3 * d4;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((d1, el_count / d, d2, d3, d4).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, (), usize, usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, (), d3, d4) = self;
-        let d = d1 * d2 * d3 * d4;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((d1, d2, el_count / d, d3, d4).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, usize, (), usize) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, d3, (), d4) = self;
-        let d = d1 * d2 * d3 * d4;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((d1, d2, d3, el_count / d, d4).into())
-    }
-}
-
-impl ShapeWithOneHole for (usize, usize, usize, usize, ()) {
-    fn into_shape(self, el_count: usize) -> Result<Shape> {
-        let (d1, d2, d3, d4, ()) = self;
-        let d = d1 * d2 * d3 * d4;
-        if el_count % d != 0 {
-            crate::bail!("tensor number of elements {el_count} is not divisible by {d}")
-        }
-        Ok((d1, d2, d3, d4, el_count / d).into())
-    }
-}
--- a/candle-core/src/storage.rs
+++ b/candle-core/src/storage.rs
@ -1,6 +1,5 @@
 use crate::backend::BackendStorage;
-use crate::op::{self, CmpOp, CustomOp1, CustomOp2, CustomOp3, ReduceOp};
-use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, Result, Shape};
+use crate::{op, CpuStorage, CudaStorage, DType, Device, Error, Layout, Result, Shape};

 // We do not want to implement Clone on Storage as cloning may fail because of
 // out of memory. Instead try_clone should be used.
@ -68,19 +67,6 @@ impl Storage {
        }
    }

-    pub(crate) fn powf(&self, layout: &Layout, alpha: f64) -> Result<Self> {
-        match self {
-            Storage::Cpu(storage) => {
-                let storage = storage.powf(layout, alpha)?;
-                Ok(Self::Cpu(storage))
-            }
-            Self::Cuda(storage) => {
-                let storage = storage.powf(layout, alpha)?;
-                Ok(Self::Cuda(storage))
-            }
-        }
-    }
-
    pub(crate) fn elu(&self, layout: &Layout, alpha: f64) -> Result<Self> {
        match self {
            Storage::Cpu(storage) => {
@ -94,48 +80,26 @@ impl Storage {
        }
    }

-    pub(crate) fn cmp(
-        &self,
-        op: CmpOp,
-        rhs: &Self,
-        lhs_layout: &Layout,
-        rhs_layout: &Layout,
-    ) -> Result<Self> {
-        self.same_device(rhs, "cmp")?;
-        self.same_dtype(rhs, "cmp")?;
-        match (self, rhs) {
-            (Storage::Cpu(lhs), Storage::Cpu(rhs)) => {
-                let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
+    pub(crate) fn sum(&self, layout: &Layout, s: &[usize]) -> Result<Self> {
+        match self {
+            Storage::Cpu(storage) => {
+                let storage = storage.sum(layout, s)?;
                Ok(Self::Cpu(storage))
            }
-            (Self::Cuda(lhs), Self::Cuda(rhs)) => {
-                let storage = lhs.cmp(op, rhs, lhs_layout, rhs_layout)?;
+            Self::Cuda(storage) => {
+                let storage = storage.sum(layout, s)?;
                Ok(Self::Cuda(storage))
            }
-            (lhs, rhs) => {
-                // Should not happen because of the same device check above but we're defensive
-                // anyway.
-                Err(Error::DeviceMismatchBinaryOp {
-                    lhs: lhs.device().location(),
-                    rhs: rhs.device().location(),
-                    op: "cmp",
-                }
-                .bt())
-            }
        }
    }

-    pub(crate) fn reduce_op(&self, op: ReduceOp, layout: &Layout, s: &[usize]) -> Result<Self> {
+    // This assumes a contiguous layout and no offset.
+    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) -> Result<()> {
        match self {
-            Storage::Cpu(storage) => {
-                let storage = storage.reduce_op(op, layout, s)?;
-                Ok(Self::Cpu(storage))
-            }
-            Self::Cuda(storage) => {
-                let storage = storage.reduce_op(op, layout, s)?;
-                Ok(Self::Cuda(storage))
-            }
+            Storage::Cpu(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
+            Self::Cuda(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
        }
+        Ok(())
    }

    pub(crate) fn to_dtype(&self, layout: &Layout, dtype: DType) -> Result<Self> {
@ -151,65 +115,8 @@ impl Storage {
        }
    }

-    pub(crate) fn apply_op1(&self, l: &Layout, c: &dyn CustomOp1) -> Result<(Self, Shape)> {
-        match self {
-            Self::Cpu(storage) => {
-                let (storage, shape) = c.cpu_fwd(storage, l)?;
-                Ok((Self::Cpu(storage), shape))
-            }
-            Self::Cuda(storage) => {
-                let (storage, shape) = c.cuda_fwd(storage, l)?;
-                Ok((Self::Cuda(storage), shape))
-            }
-        }
-    }
-
-    pub(crate) fn apply_op2(
-        &self,
-        l1: &Layout,
-        t2: &Self,
-        l2: &Layout,
-        c: &dyn CustomOp2,
-    ) -> Result<(Self, Shape)> {
-        self.same_device(t2, c.name())?;
-        match (self, t2) {
-            (Self::Cpu(s1), Self::Cpu(s2)) => {
-                let (s, shape) = c.cpu_fwd(s1, l1, s2, l2)?;
-                Ok((Self::Cpu(s), shape))
-            }
-            (Self::Cuda(s1), Self::Cuda(s2)) => {
-                let (s, shape) = c.cuda_fwd(s1, l1, s2, l2)?;
-                Ok((Self::Cuda(s), shape))
-            }
-            _ => unreachable!(),
-        }
-    }
-
-    pub(crate) fn apply_op3(
-        &self,
-        l1: &Layout,
-        t2: &Self,
-        l2: &Layout,
-        t3: &Self,
-        l3: &Layout,
-        c: &dyn CustomOp3,
-    ) -> Result<(Self, Shape)> {
-        self.same_device(t2, c.name())?;
-        self.same_device(t3, c.name())?;
-        match (self, t2, t3) {
-            (Self::Cpu(s1), Self::Cpu(s2), Self::Cpu(s3)) => {
-                let (s, shape) = c.cpu_fwd(s1, l1, s2, l2, s3, l3)?;
-                Ok((Self::Cpu(s), shape))
-            }
-            (Self::Cuda(s1), Self::Cuda(s2), Self::Cuda(s3)) => {
-                let (s, shape) = c.cuda_fwd(s1, l1, s2, l2, s3, l3)?;
-                Ok((Self::Cuda(s), shape))
-            }
-            _ => unreachable!(),
-        }
-    }
-
-    pub(crate) fn unary_impl<B: op::UnaryOpT>(&self, layout: &Layout) -> Result<Self> {
+    pub(crate) fn unary_impl<B: op::UnaryOp>(&self, layout: &Layout) -> Result<Self> {
+        // TODO: Different code path for the contiguous case?
        match self {
            Storage::Cpu(storage) => {
                let storage = storage.unary_impl::<B>(layout)?;
@ -222,7 +129,7 @@ impl Storage {
        }
    }

-    pub(crate) fn binary_impl<B: op::BinaryOpT>(
+    pub(crate) fn binary_impl<B: op::BinaryOp>(
        &self,
        rhs: &Self,
        lhs_layout: &Layout,
@ -279,122 +186,6 @@ impl Storage {
        }
    }

-    pub(crate) fn conv2d(
-        &self,
-        l: &Layout,
-        kernel: &Self,
-        kernel_l: &Layout,
-        params: &crate::conv::ParamsConv2D,
-    ) -> Result<Self> {
-        self.same_device(kernel, "conv2d")?;
-        self.same_dtype(kernel, "conv2d")?;
-        match (self, &kernel) {
-            (Storage::Cpu(inp), Storage::Cpu(kernel)) => {
-                let s = inp.conv2d(l, kernel, kernel_l, params)?;
-                Ok(Self::Cpu(s))
-            }
-            (Storage::Cuda(inp), Storage::Cuda(kernel)) => {
-                let s = inp.conv2d(l, kernel, kernel_l, params)?;
-                Ok(Self::Cuda(s))
-            }
-            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
-                lhs: lhs.device().location(),
-                rhs: rhs.device().location(),
-                op: "conv2d",
-            }
-            .bt()),
-        }
-    }
-
-    pub(crate) fn conv_transpose2d(
-        &self,
-        l: &Layout,
-        kernel: &Self,
-        kernel_l: &Layout,
-        params: &crate::conv::ParamsConvTranspose2D,
-    ) -> Result<Self> {
-        self.same_device(kernel, "conv_transpose2d")?;
-        self.same_dtype(kernel, "conv_transpose2d")?;
-        match (self, &kernel) {
-            (Storage::Cpu(inp), Storage::Cpu(kernel)) => {
-                let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
-                Ok(Self::Cpu(s))
-            }
-            (Storage::Cuda(inp), Storage::Cuda(kernel)) => {
-                let s = inp.conv_transpose2d(l, kernel, kernel_l, params)?;
-                Ok(Self::Cuda(s))
-            }
-            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
-                lhs: lhs.device().location(),
-                rhs: rhs.device().location(),
-                op: "conv_transpose2d",
-            }
-            .bt()),
-        }
-    }
-
-    pub(crate) fn avg_pool2d(
-        &self,
-        layout: &Layout,
-        kernel_size: (usize, usize),
-        stride: (usize, usize),
-    ) -> Result<Self> {
-        match self {
-            Storage::Cpu(storage) => {
-                let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
-                Ok(Self::Cpu(storage))
-            }
-            Self::Cuda(storage) => {
-                let storage = storage.avg_pool2d(layout, kernel_size, stride)?;
-                Ok(Self::Cuda(storage))
-            }
-        }
-    }
-
-    pub(crate) fn max_pool2d(
-        &self,
-        layout: &Layout,
-        kernel_size: (usize, usize),
-        stride: (usize, usize),
-    ) -> Result<Self> {
-        match self {
-            Storage::Cpu(storage) => {
-                let storage = storage.max_pool2d(layout, kernel_size, stride)?;
-                Ok(Self::Cpu(storage))
-            }
-            Self::Cuda(storage) => {
-                let storage = storage.max_pool2d(layout, kernel_size, stride)?;
-                Ok(Self::Cuda(storage))
-            }
-        }
-    }
-
-    pub(crate) fn upsample_nearest1d(&self, layout: &Layout, sz: usize) -> Result<Self> {
-        match self {
-            Storage::Cpu(storage) => {
-                let storage = storage.upsample_nearest1d(layout, sz)?;
-                Ok(Self::Cpu(storage))
-            }
-            Self::Cuda(storage) => {
-                let storage = storage.upsample_nearest1d(layout, sz)?;
-                Ok(Self::Cuda(storage))
-            }
-        }
-    }
-
-    pub(crate) fn upsample_nearest2d(&self, layout: &Layout, h: usize, w: usize) -> Result<Self> {
-        match self {
-            Storage::Cpu(storage) => {
-                let storage = storage.upsample_nearest2d(layout, h, w)?;
-                Ok(Self::Cpu(storage))
-            }
-            Self::Cuda(storage) => {
-                let storage = storage.upsample_nearest2d(layout, h, w)?;
-                Ok(Self::Cuda(storage))
-            }
-        }
-    }
-
    pub(crate) fn where_cond(
        &self,
        layout: &Layout,
@ -424,96 +215,21 @@ impl Storage {
        }
    }

-    pub(crate) fn gather(
-        &self,
-        l: &Layout,
-        indexes: &Self,
-        indexes_l: &Layout,
-        d: usize,
-    ) -> Result<Self> {
-        self.same_device(indexes, "index-add")?;
-        match (self, indexes) {
-            (Self::Cpu(s), Self::Cpu(indexes)) => {
-                let storage = s.gather(l, indexes, indexes_l, d)?;
-                Ok(Self::Cpu(storage))
-            }
-            (Self::Cuda(s), Self::Cuda(indexes)) => {
-                let storage = s.gather(l, indexes, indexes_l, d)?;
-                Ok(Self::Cuda(storage))
-            }
-            _ => unreachable!(),
-        }
-    }
-
-    pub(crate) fn scatter_add(
-        &self,
-        l: &Layout,
-        indexes: &Self,
-        indexes_l: &Layout,
-        source: &Self,
-        source_l: &Layout,
-        d: usize,
-    ) -> Result<Self> {
-        self.same_device(indexes, "scatter-add")?;
-        self.same_device(source, "scatter-add")?;
-        match (self, indexes, source) {
-            (Self::Cpu(s), Self::Cpu(indexes), Self::Cpu(source)) => {
-                let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Cpu(storage))
-            }
-            (Self::Cuda(s), Self::Cuda(indexes), Self::Cuda(source)) => {
-                let storage = s.scatter_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Cuda(storage))
-            }
-            _ => unreachable!(),
-        }
-    }
-
-    pub(crate) fn index_add(
-        &self,
-        l: &Layout,
-        indexes: &Self,
-        indexes_l: &Layout,
-        source: &Self,
-        source_l: &Layout,
-        d: usize,
-    ) -> Result<Self> {
-        self.same_device(indexes, "index-add")?;
-        self.same_device(source, "index-add")?;
-        match (self, indexes, source) {
-            (Self::Cpu(s), Self::Cpu(indexes), Self::Cpu(source)) => {
-                let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Cpu(storage))
-            }
-            (Self::Cuda(s), Self::Cuda(indexes), Self::Cuda(source)) => {
-                let storage = s.index_add(l, indexes, indexes_l, source, source_l, d)?;
-                Ok(Self::Cuda(storage))
-            }
-            _ => unreachable!(),
-        }
-    }
-
-    pub(crate) fn index_select(
-        &self,
-        rhs: &Self,
-        lhs_l: &Layout,
-        rhs_l: &Layout,
-        d: usize,
-    ) -> Result<Self> {
-        self.same_device(rhs, "index-select")?;
+    pub(crate) fn embedding(&self, layout: &Layout, rhs: &Self, rhs_l: &Layout) -> Result<Self> {
+        self.same_device(rhs, "embedding")?;
        match (self, rhs) {
-            (Self::Cpu(lhs), Self::Cpu(rhs)) => {
-                let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
+            (Storage::Cpu(lhs), Storage::Cpu(rhs)) => {
+                let storage = lhs.embedding(layout, rhs, rhs_l)?;
                Ok(Self::Cpu(storage))
            }
            (Self::Cuda(lhs), Self::Cuda(rhs)) => {
-                let storage = lhs.index_select(rhs, lhs_l, rhs_l, d)?;
+                let storage = lhs.embedding(layout, rhs, rhs_l)?;
                Ok(Self::Cuda(storage))
            }
            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
                lhs: lhs.device().location(),
                rhs: rhs.device().location(),
-                op: "index-select",
+                op: "embedding",
            }
            .bt()),
        }
--- a/candle-core/src/tensor.rs
+++ b/candle-core/src/tensor.rs
--- a/candle-core/src/utils.rs
+++ b/candle-core/src/utils.rs
@ -11,30 +11,16 @@ pub fn get_num_threads() -> usize {
    }
 }

-pub fn has_accelerate() -> bool {
-    cfg!(feature = "accelerate")
-}
-
 pub fn has_mkl() -> bool {
-    cfg!(feature = "mkl")
+    #[cfg(feature = "mkl")]
+    return true;
+    #[cfg(not(feature = "mkl"))]
+    return false;
 }

 pub fn cuda_is_available() -> bool {
-    cfg!(feature = "cuda")
-}
-
-pub fn with_avx() -> bool {
-    cfg!(target_feature = "avx")
-}
-
-pub fn with_neon() -> bool {
-    cfg!(target_feature = "neon")
-}
-
-pub fn with_simd128() -> bool {
-    cfg!(target_feature = "simd128")
-}
-
-pub fn with_f16c() -> bool {
-    cfg!(target_feature = "f16c")
+    #[cfg(feature = "cuda")]
+    return true;
+    #[cfg(not(feature = "cuda"))]
+    return false;
 }
--- a/candle-core/src/variable.rs
+++ b/candle-core/src/variable.rs
@ -34,50 +34,25 @@ impl Var {
        Ok(Self(inner))
    }

-    pub fn from_tensor(t: &Tensor) -> Result<Self> {
-        let inner = t.make_var()?;
-        Ok(Self(inner))
-    }
-
-    pub fn rand_f64<S: Into<Shape>>(
+    pub fn rand<S: Into<Shape>>(
+        s: S,
+        dtype: DType,
+        device: &Device,
        lo: f64,
        up: f64,
-        s: S,
-        dtype: DType,
-        device: &Device,
    ) -> Result<Self> {
-        let inner = Tensor::rand_f64_impl(lo, up, s, dtype, device, true)?;
+        let inner = Tensor::rand_impl(s, dtype, device, lo, up, true)?;
        Ok(Self(inner))
    }

-    pub fn randn_f64<S: Into<Shape>>(
+    pub fn randn<S: Into<Shape>>(
+        s: S,
+        dtype: DType,
+        device: &Device,
        mean: f64,
        std: f64,
-        s: S,
-        dtype: DType,
-        device: &Device,
    ) -> Result<Self> {
-        let inner = Tensor::randn_f64_impl(mean, std, s, dtype, device, true)?;
-        Ok(Self(inner))
-    }
-
-    pub fn rand<S: Into<Shape>, T: crate::FloatDType>(
-        lo: T,
-        up: T,
-        s: S,
-        device: &Device,
-    ) -> Result<Self> {
-        let inner = Tensor::rand_impl(lo, up, s, device, true)?;
-        Ok(Self(inner))
-    }
-
-    pub fn randn<S: Into<Shape>, T: crate::FloatDType>(
-        mean: T,
-        std: T,
-        s: S,
-        device: &Device,
-    ) -> Result<Self> {
-        let inner = Tensor::randn_impl(mean, std, s, device, true)?;
+        let inner = Tensor::randn_impl(s, dtype, device, mean, std, true)?;
        Ok(Self(inner))
    }

--- a/candle-core/tests/conv_tests.rs
+++ b/candle-core/tests/conv_tests.rs
@ -1,495 +0,0 @@
-use anyhow::Result;
-use candle_core::{test_device, test_utils, Device, IndexOp, Tensor};
-
-/* This test is based on the following script.
-import torch
-torch.manual_seed(4242)
-
-t = torch.randn((1, 4, 5))
-w = torch.randn((2, 4, 3))
-print(t.flatten())
-print(w.flatten())
-res = torch.nn.functional.conv1d(t, w)
-print(res.flatten())
-res = torch.nn.functional.conv1d(t, w, padding=1)
-print(res.flatten())
-*/
-fn conv1d(dev: &Device) -> Result<()> {
-    let t = Tensor::new(
-        &[
-            0.4056f32, -0.8689, -0.0773, -1.5630, 1.2279, -0.9287, -1.7030, 0.1370, 0.1866, 0.4145,
-            1.8025, -0.1536, 2.2013, -0.6836, 0.2477, 1.3127, -0.6957, 0.3278, -1.0124, 0.5599,
-        ],
-        dev,
-    )?
-    .reshape((1, 4, 5))?;
-    let w = Tensor::new(
-        &[
-            -0.8404f32, -0.3490, 0.0130, 1.3123, 0.1763, -1.9249, 1.4270, 0.9421, 0.8670, -0.7181,
-            -1.1111, 0.8869, -1.2429, 1.8357, 1.6052, -1.3844, 0.3951, -1.2036, 0.6686, 1.6261,
-            -0.6451, -0.0840, -1.4247, 0.5512,
-        ],
-        dev,
-    )?
-    .reshape((2, 4, 3))?;
-    let res = t.conv1d(&w, 0, 1, 1, 1)?;
-    assert_eq!(res.dims(), [1, 2, 3]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [2.6357, -1.3336, 4.1393, -1.1784, 3.5675, 0.5069]
-    );
-    let res = t.conv1d(&w, /*padding*/ 1, 1, 1, 1)?;
-    assert_eq!(res.dims(), [1, 2, 5]);
-    // Same as pytorch default padding: use zeros.
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352]
-    );
-    Ok(())
-}
-
-fn conv1d_small(dev: &Device) -> Result<()> {
-    let t = Tensor::new(&[0.4056f32, -0.8689, -0.0773, -1.5630], dev)?.reshape((1, 1, 4))?;
-    let w = Tensor::new(&[1f32, 0., 0.], dev)?.reshape((1, 1, 3))?;
-    let res = t.conv1d(&w, 0, 1, 1, 1)?;
-    assert_eq!(res.dims(), [1, 1, 2]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [0.4056, -0.8689]
-    );
-    let res = t.conv1d(&w, /*padding*/ 1, 1, 1, 1)?;
-    assert_eq!(res.dims(), [1, 1, 4]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [0.0, 0.4056, -0.8689, -0.0773],
-    );
-    Ok(())
-}
-
-/* This test is based on the following script.
-import torch
-torch.manual_seed(4242)
-
-t = torch.randn((1, 4, 5, 5))
-w = torch.randn((2, 4, 3, 3))
-print(t.flatten())
-print(w.flatten())
-res = torch.nn.functional.conv2d(t, w)
-print(res.flatten())
-
-w_t = w.transpose(0, 1)
-res = torch.nn.functional.conv_transpose2d(t, w_t)
-print(res.shape)
-print(res)
-
-res = torch.nn.functional.conv2d(t, w, dilation=2)
-print(res.shape)
-print(res[0])
-
-res = torch.nn.functional.conv_transpose2d(t, w_t, dilation=2)
-print(res.shape)
-print(res)
-*/
-fn conv2d(dev: &Device) -> Result<()> {
-    let t = Tensor::new(
-        &[
-            0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616,
-            1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699, 0.0823, 0.3526, 0.6843, 0.2395,
-            1.2279, -0.9287, -1.7030, 0.1370, 0.6047, 0.3770, -0.6266, 0.3529, 2.2013, -0.6836,
-            0.2477, 1.3127, -0.2260, 0.2622, -1.2974, -0.8140, -0.8404, -0.3490, 0.0130, 1.3123,
-            1.7569, -0.3956, -1.8255, 0.1727, -0.3538, 2.6941, 1.0529, 0.4219, -0.2071, 1.1586,
-            0.4717, 0.3865, -0.5690, -0.5010, -0.1310, 0.7796, 0.6630, -0.2021, 2.6090, 0.2049,
-            0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712,
-            0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790,
-            -0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006,
-            -0.8000, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
-        ],
-        dev,
-    )?;
-    let w = Tensor::new(
-        &[
-            -0.9325f32, 0.6451, -0.8537, 0.2378, 0.8764, -0.1832, 0.2987, -0.6488, -0.2273,
-            -2.4184, -0.1192, -0.4821, -0.5079, -0.5766, -2.4729, 1.6734, 0.4558, 0.2851, 1.1514,
-            -0.9013, 1.0662, -0.1817, -0.0259, 0.1709, 0.5367, 0.7513, 0.8086, -2.2586, -0.5027,
-            0.9141, -1.3086, -1.3343, -1.5669, -0.1657, 0.7958, 0.1432, 0.3896, -0.4501, 0.1667,
-            0.0714, -0.0952, 1.2970, -0.1674, -0.3178, 1.0677, 0.3060, 0.7080, 0.1914, 1.1679,
-            -0.3602, 1.9265, -1.8626, -0.5112, -0.0982, 0.2621, 0.6565, 0.5908, 1.0089, -0.1646,
-            1.8032, -0.6286, 0.2016, -0.3370, 1.2555, 0.8009, -0.6488, -0.4652, -1.5685, 1.5860,
-            0.5583, 0.4623, 0.6026,
-        ],
-        dev,
-    )?;
-    let t = t.reshape((1, 4, 5, 5))?;
-    let w = w.reshape((2, 4, 3, 3))?;
-    let res = t.conv2d(&w, 0, 1, 1, 1)?;
-    assert_eq!(res.dims(), [1, 2, 3, 3]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [
-            -4.2812, 2.0923, 5.2187, 7.5184, 0.752, -14.9426, 10.0087, 4.391, 0.2918, 1.6715,
-            10.389, 3.6023, -4.2808, 0.2672, 5.3646, -5.2023, -2.1955, -9.4075
-        ]
-    );
-    let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
-    assert_eq!(res.dims(), [1, 2, 7, 7]);
-    assert_eq!(
-        test_utils::to_vec3_round(&res.i(0)?, 4)?,
-        [
-            [
-                [-1.9918, 2.6797, -0.4599, -1.6037, 1.4131, -2.4012, 2.9277],
-                [1.8016, -3.5361, 1.0757, 3.5395, -8.2168, -3.2023, 0.5375],
-                [0.8243, 1.8675, 7.8929, -4.0746, -6.4415, 5.1139, 1.6889],
-                [0.2722, 8.9679, 3.3477, 1.8514, -4.2896, -3.8228, -7.5632],
-                [-8.5412, -5.8142, -7.1587, -1.6095, 0.4651, 0.2748, -2.0985],
-                [2.0833, -0.6482, -12.1692, -4.1284, -2.9765, -0.0656, -4.5114],
-                [5.307, 2.6957, 2.3087, 1.0478, 0.7808, -1.1519, -0.9579]
-            ],
-            [
-                [1.089, 0.1872, -0.6408, -0.9897, 0.8503, 1.1019, -0.9211],
-                [-0.1741, -0.2915, 4.2472, 1.9417, 1.65, 0.6303, -4.7131],
-                [1.6555, 2.4026, -2.9293, 2.9953, 0.5328, 3.5873, -0.9621],
-                [-1.4289, -3.2787, 4.1747, -6.0341, -4.6341, -5.7945, 4.142],
-                [7.5973, 6.4431, 5.9872, 2.1639, -8.6566, 3.3143, -3.4059],
-                [-0.8775, -3.048, 11.6543, 0.6442, 2.3218, -0.4765, 1.1516],
-                [-5.5423, -2.5188, 1.0754, -0.0563, -2.9386, -1.1504, 1.0171]
-            ]
-        ]
-    );
-    // Dilations.
-    let res = t.conv2d(&w, 0, 1, 2, 1)?;
-    assert_eq!(res.dims(), [1, 2, 1, 1]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [2.45, -2.3504],
-    );
-
-    // Transpose and dilations.
-    let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 2)?;
-    assert_eq!(res.dims(), [1, 2, 9, 9]);
-    assert_eq!(
-        test_utils::to_vec3_round(&res.i(0)?, 4)?,
-        [
-            [
-                [-1.9918, 3.1652, -0.6778, -4.3442, 4.4351, 0.6652, -3.0124, -0.6031, 2.9277],
-                [2.7036, -1.7156, -0.3969, 1.0516, 1.6381, -2.8886, -0.205, 2.4682, -1.0499],
-                [-0.9459, 3.1631, 3.707, -4.8369, -8.5166, -1.4496, -2.7559, -3.2698, 1.4376],
-                [-0.2157, 3.7786, -2.0252, -4.2633, 3.6731, -1.5142, 5.9391, -0.2622, -0.141],
-                [-6.8121, -3.1744, 1.5945, 3.0637, -9.6088, 1.4446, 2.9489, -3.0082, -7.3822],
-                [0.2371, 3.3303, 0.3861, 2.2646, -4.6784, 4.1235, -0.0109, 0.3176, -0.03],
-                [-2.5339, -2.9564, -3.4518, -4.4594, -9.1873, -1.9709, -0.4676, 0.51, -3.5024],
-                [4.007, 0.3067, -2.2954, 1.1105, -0.1992, 1.6372, -2.9268, 0.2807, -1.2787],
-                [5.307, 1.1317, 1.3518, 0.9049, 3.8116, -0.4075, -0.8874, -0.2241, -0.9579]
-            ],
-            [
-                [1.089, -0.6483, 0.0726, -0.4752, -1.3283, 1.7103, 1.0703, 0.1076, -0.9211],
-                [-0.8629, 0.1376, 0.3202, 2.0955, 0.9696, 2.8988, -1.0012, 1.5049, -0.1278],
-                [1.9286, -1.5255, -2.9563, 2.4589, 3.3611, -0.6951, 0.3525, -1.7724, -5.9861],
-                [1.1226, 2.1561, 3.6417, 4.7546, -0.692, 4.4126, -5.1902, 6.0805, 2.3185],
-                [1.0111, 0.3604, 0.6432, -3.6605, 7.9517, -9.2955, -5.2988, -3.7803, -2.0642],
-                [3.3172, -1.7967, -3.6576, -2.0942, 1.3158, 0.112, -1.7405, 2.9167, 0.7957],
-                [5.1001, 1.8995, -1.8639, 1.1262, 9.9629, 2.683, -3.6319, -1.1607, 0.5856],
-                [-4.8445, -0.5642, 4.2317, 0.0856, 1.2267, -0.5712, 1.736, 1.0997, 0.6908],
-                [-5.5423, -1.1831, -1.2176, 0.0843, 0.0446, -0.7545, -2.4798, -0.0827, 1.0171]
-            ]
-        ]
-    );
-    Ok(())
-}
-
-/* This test is based on the following script.
-import torch
-torch.manual_seed(4242)
-
-t = torch.randn((1, 2, 3, 3))
-w = torch.randn((1, 2, 1, 1))
-print(t.flatten())
-print(w.flatten())
-res = torch.nn.functional.conv2d(t, w)
-print(res.flatten())
-
-w_t = w.transpose(0, 1)
-res = torch.nn.functional.conv_transpose2d(t, w_t)
-print(res.shape)
-print(res.flatten())
-
-t_t = w.transpose(0, 1)
-res = torch.nn.functional.conv_transpose2d(t_t, w)
-print(res.shape)
-print(res.flatten())
-*/
-fn conv2d_small(dev: &Device) -> Result<()> {
-    let t = Tensor::new(
-        &[
-            0.4056f32, -0.8689, 0.6843, 0.2395, 1.2279, -0.9287, -1.7030, 0.1370, 0.1866, 0.4145,
-            -0.6266, 0.3529, 2.2013, -0.6836, 0.2477, 1.3127, -0.6957, 0.3278,
-        ],
-        dev,
-    )?;
-    let w = Tensor::new(&[-0.9259f32, 1.3017], dev)?;
-    let t = t.reshape((1, 2, 3, 3))?;
-    let w = w.reshape((1, 2, 1, 1))?;
-    let res = t.conv2d(&w, 0, 1, 1, 1)?;
-    assert_eq!(res.dims(), [1, 1, 3, 3]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [0.164, -0.0111, -0.1742, 2.6437, -2.0268, 1.1823, 3.2855, -1.0324, 0.2539]
-    );
-    let res = t.conv2d(&w, 2, 1, 1, 1)?;
-    assert_eq!(res.dims(), [1, 1, 7, 7]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [
-            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
-            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1640, -0.0111, -0.1742, 0.0000, 0.0000,
-            0.0000, 0.0000, 2.6437, -2.0268, 1.1823, 0.0000, 0.0000, 0.0000, 0.0000, 3.2855,
-            -1.0324, 0.2539, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
-            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000
-        ]
-    );
-    let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
-    assert_eq!(res.dims(), [1, 1, 3, 3]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [0.164, -0.0111, -0.1742, 2.6437, -2.0268, 1.1823, 3.2855, -1.0324, 0.2539],
-    );
-    let res = t.transpose(0, 1)?.conv_transpose2d(&w, 0, 0, 1, 1)?;
-    assert_eq!(res.dims(), [2, 2, 3, 3]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [
-            -0.3755, 0.8045, -0.6336, -0.2218, -1.1369, 0.8599, 1.5768, -0.1268, -0.1728, 0.528,
-            -1.131, 0.8908, 0.3118, 1.5984, -1.2089, -2.2168, 0.1783, 0.2429, -0.3838, 0.5802,
-            -0.3268, -2.0382, 0.6329, -0.2293, -1.2154, 0.6441, -0.3035, 0.5396, -0.8156, 0.4594,
-            2.8654, -0.8898, 0.3224, 1.7087, -0.9056, 0.4267
-        ]
-    );
-    Ok(())
-}
-
-fn conv2d_smaller(dev: &Device) -> Result<()> {
-    let t = Tensor::new(
-        &[
-            0.4056f32, -0.8689, 0.6843, 0.2395, 1.2279, -0.9287, -1.7030, 0.1370, 0.1866,
-        ],
-        dev,
-    )?;
-    let w = Tensor::new(&[1f32, 1., 1., 1., 1., 1., 1., 1., 1.], dev)?;
-    let t = t.reshape((1, 1, 3, 3))?;
-    let w = w.reshape((1, 1, 3, 3))?;
-    let res = t.conv2d(&w, 0, 1, 1, 1)?;
-    assert_eq!(res.dims(), [1, 1, 1, 1]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [-0.6197]
-    );
-    Ok(())
-}
-
-/* This test is based on the following script.
-import torch
-torch.manual_seed(4242)
-
-t = torch.randn((1, 2, 4, 2))
-w = torch.randn((1, 2, 1, 1))
-print(t.flatten())
-print(w.flatten())
-res = torch.nn.functional.conv2d(t, w)
-print(res.flatten())
-*/
-fn conv2d_non_square(dev: &Device) -> Result<()> {
-    let t = Tensor::new(
-        &[
-            0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616,
-            1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699,
-        ],
-        dev,
-    )?;
-    let w = Tensor::new(&[-1.1351f32, 1.3841], dev)?;
-    let t = t.reshape((1, 2, 4, 2))?;
-    let w = w.reshape((1, 2, 1, 1))?;
-    let res = t.conv2d(&w, 0, 1, 1, 1)?;
-    assert_eq!(res.dims(), [1, 1, 4, 2]);
-    assert_eq!(
-        test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
-        [0.2312, 5.2238, 2.3772, 1.9076, 2.0256, -0.5776, -1.6028, -1.467]
-    );
-    Ok(())
-}
-
-/*
-import torch
-torch.manual_seed(4242)
-
-t = torch.randn((1, 4, 5, 5), requires_grad=True)
-w = torch.randn((2, 4, 3, 3), requires_grad=True)
-print(t.flatten())
-print(w.flatten())
-res = torch.nn.functional.conv2d(t, w)
-print(res.flatten())
-loss = (res ** 2).sum()
-print(loss)
-loss.backward()
-print(t.grad.shape)
-print(t.grad.flatten())
-print(w.grad.shape)
-print(w.grad.flatten())
-
-t.grad.zero_()
-w.grad.zero_()
-res = torch.nn.functional.conv2d(t, w, stride=2)
-print(res.flatten())
-loss = (res ** 2).sum()
-print(loss)
-loss.backward()
-print(t.grad.shape)
-print(t.grad[0])
-print(w.grad.shape)
-print(w.grad[0])
-*/
-fn conv2d_grad(dev: &Device) -> Result<()> {
-    use candle_core::Var;
-    let t = Var::from_slice(
-        &[
-            0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616,
-            1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699, 0.0823, 0.3526, 0.6843, 0.2395,
-            1.2279, -0.9287, -1.7030, 0.1370, 0.6047, 0.3770, -0.6266, 0.3529, 2.2013, -0.6836,
-            0.2477, 1.3127, -0.2260, 0.2622, -1.2974, -0.8140, -0.8404, -0.3490, 0.0130, 1.3123,
-            1.7569, -0.3956, -1.8255, 0.1727, -0.3538, 2.6941, 1.0529, 0.4219, -0.2071, 1.1586,
-            0.4717, 0.3865, -0.5690, -0.5010, -0.1310, 0.7796, 0.6630, -0.2021, 2.6090, 0.2049,
-            0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712,
-            0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790,
-            -0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006,
-            -0.8000, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085,
-        ],
-        (1, 4, 5, 5),
-        dev,
-    )?;
-    let w = Var::from_slice(
-        &[
-            -0.9325f32, 0.6451, -0.8537, 0.2378, 0.8764, -0.1832, 0.2987, -0.6488, -0.2273,
-            -2.4184, -0.1192, -0.4821, -0.5079, -0.5766, -2.4729, 1.6734, 0.4558, 0.2851, 1.1514,
-            -0.9013, 1.0662, -0.1817, -0.0259, 0.1709, 0.5367, 0.7513, 0.8086, -2.2586, -0.5027,
-            0.9141, -1.3086, -1.3343, -1.5669, -0.1657, 0.7958, 0.1432, 0.3896, -0.4501, 0.1667,
-            0.0714, -0.0952, 1.2970, -0.1674, -0.3178, 1.0677, 0.3060, 0.7080, 0.1914, 1.1679,
-            -0.3602, 1.9265, -1.8626, -0.5112, -0.0982, 0.2621, 0.6565, 0.5908, 1.0089, -0.1646,
-            1.8032, -0.6286, 0.2016, -0.3370, 1.2555, 0.8009, -0.6488, -0.4652, -1.5685, 1.5860,
-            0.5583, 0.4623, 0.6026,
-        ],
-        (2, 4, 3, 3),
-        dev,
-    )?;
-    let res = t.conv2d(&w, 0, 1, 1, 1)?;
-    let loss = res.sqr()?.sum_all()?;
-    assert_eq!(test_utils::to_vec0_round(&loss, 2)?, 741.12f32);
-    let grads = loss.backward()?;
-    let grad_t = grads.get(&t).unwrap();
-    let grad_w = grads.get(&w).unwrap();
-    assert_eq!(grad_t.dims(), [1, 4, 5, 5]);
-    assert_eq!(grad_w.dims(), [2, 4, 3, 3]);
-    assert_eq!(
-        test_utils::to_vec1_round(&grad_t.flatten_all()?, 2)?,
-        [
-            9.29, -2.84, -5.71, 3.38, -7.71, -19.15, 7.02, 29.1, 9.34, 34.73, -22.87, 24.35,
-            -39.88, -14.01, 21.08, 9.94, 13.63, -34.68, 11.21, -6.26, 7.72, -6.32, -16.64, -1.08,
-            -20.22, 21.73, -0.37, -4.06, 5.82, -3.65, -30.73, 14.55, 87.7, 31.6, 4.53, -89.78,
-            -75.37, -57.43, -7.56, 92.96, 18.79, -4.63, -159.75, -42.47, -47.26, 52.88, 37.32,
-            49.0, 12.82, 2.01, -8.98, 20.18, 16.62, 12.06, 15.38, 20.0, 2.57, -15.22, 72.62,
-            -10.75, 2.25, -31.2, 3.75, -0.2, 9.76, -0.68, 5.21, -40.44, -22.59, -61.61, 17.28,
-            20.41, 37.55, 5.23, 6.81, 23.54, 23.62, -9.99, -9.13, 4.87, -35.06, -26.1, 63.48,
-            25.81, -39.21, -70.68, -46.96, 2.33, 41.81, 82.42, -28.63, -11.78, -35.33, -10.28,
-            -28.57, -9.13, 7.21, -9.05, -9.62, -11.25
-        ]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(&grad_w.flatten_all()?, 2)?,
-        [
-            -28.92, -22.88, -141.23, 73.35, 61.07, 47.81, -20.0, -73.71, -41.82, -13.59, 21.5,
-            28.72, 28.57, -46.85, -90.19, 143.61, 16.68, 7.43, 18.88, -90.81, -20.29, 54.79, 82.63,
-            22.94, 77.81, -16.39, -13.2, 9.34, -40.39, -26.62, 5.33, -60.91, 9.09, -59.37, 7.08,
-            58.64, 5.55, 20.52, 2.5, -17.25, -6.8, 22.21, 30.15, -7.52, -37.46, 5.67, 22.58, 9.03,
-            47.05, 17.61, 37.31, -98.13, -14.61, -4.8, -6.36, 44.69, 23.34, 8.37, -13.52, 80.05,
-            -34.24, -16.36, -12.31, 1.92, -33.62, -14.1, -49.23, -7.39, 11.5, -9.98, 9.66, 29.6
-        ]
-    );
-
-    // Same as before but with stride.
-    let res = t.conv2d(&w, 0, 2, 1, 1)?;
-    let loss = res.sqr()?.sum_all()?;
-    assert_eq!(test_utils::to_vec0_round(&loss, 2)?, 277.16f32);
-    let grads = loss.backward()?;
-    let grad_t = grads.get(&t).unwrap();
-    let grad_w = grads.get(&w).unwrap();
-    assert_eq!(grad_t.dims(), [1, 4, 5, 5]);
-    assert_eq!(grad_w.dims(), [2, 4, 3, 3]);
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_t.i(0)?, 2)?,
-        [
-            [
-                [9.29, -7.03, 0.94, 3.49, -7.71],
-                [-1.8, -7.82, 8.9, 8.46, 7.43],
-                [-25.84, 22.09, -19.27, -0.22, 1.69],
-                [4.02, 18.53, -18.37, 2.3, -24.51],
-                [7.72, -9.68, -12.34, 5.6, -20.22]
-            ],
-            [
-                [21.73, 3.39, -18.27, 3.86, -3.65],
-                [8.25, 3.73, 30.73, -8.61, -11.93],
-                [-72.15, -15.36, -17.53, -12.32, -1.61],
-                [-22.32, -7.79, -91.82, 6.44, -37.69],
-                [52.88, 14.44, 42.75, 9.88, 2.01]
-            ],
-            [
-                [-8.98, 9.91, 6.75, -4.68, 15.38],
-                [4.93, -0.33, 9.94, -1.46, 14.78],
-                [13.62, -30.63, 3.96, -3.58, -4.48],
-                [-14.13, 1.19, -34.43, 3.08, -33.83],
-                [17.28, 12.94, 31.83, -3.35, 6.81]
-            ],
-            [
-                [23.54, 6.98, -24.52, 0.52, 4.87],
-                [9.65, 6.18, 1.71, -25.23, -4.93],
-                [-54.99, -23.66, 3.19, -3.73, 18.58],
-                [-21.35, -10.39, -39.88, 28.73, -30.76],
-                [-9.13, 11.12, -14.0, -8.23, -11.25]
-            ]
-        ]
-    );
-    assert_eq!(
-        test_utils::to_vec3_round(&grad_w.i(0)?, 2)?,
-        [
-            [
-                [28.34, -7.91, -45.75],
-                [21.03, 3.86, 29.86],
-                [0.72, -36.58, -35.28]
-            ],
-            [
-                [-16.04, 11.53, -16.38],
-                [29.62, -16.32, -48.35],
-                [57.5, 28.29, 25.81]
-            ],
-            [
-                [2.93, -19.6, 1.57],
-                [27.15, 53.88, -24.64],
-                [12.74, -22.6, -26.2]
-            ],
-            [
-                [-0.18, -14.86, -6.82],
-                [-19.55, -2.72, 45.9],
-                [-2.54, 36.97, 27.11]
-            ]
-        ]
-    );
-    Ok(())
-}
-
-test_device!(conv1d, conv1d_cpu, conv1d_gpu);
-test_device!(conv1d_small, conv1d_small_cpu, conv1d_small_gpu);
-test_device!(conv2d, conv2d_cpu, conv2d_gpu);
-test_device!(
-    conv2d_non_square,
-    conv2d_non_square_cpu,
-    conv2d_non_square_gpu
-);
-test_device!(conv2d_small, conv2d_small_cpu, conv2d_small_gpu);
-test_device!(conv2d_smaller, conv2d_smaller_cpu, conv2d_smaller_gpu);
-test_device!(conv2d_grad, conv2d_grad_cpu, conv2d_grad_gpu);
--- a/candle-core/tests/custom_op_tests.rs
+++ b/candle-core/tests/custom_op_tests.rs
@ -1,114 +0,0 @@
-use candle_core::backend::BackendStorage;
-use candle_core::cpu_backend;
-use candle_core::test_utils::to_vec1_round;
-use candle_core::{CpuStorage, CustomOp1, DType, Device, Error, Layout, Result, Shape, Tensor};
-
-fn fwd<T: num_traits::Float>(v: T, alpha: f64) -> T {
-    if v.is_sign_positive() {
-        v
-    } else {
-        let alpha = T::from(alpha).unwrap_or(T::nan());
-        (v.exp() - T::one()) * alpha
-    }
-}
-
-struct Elu {
-    alpha: f64,
-}
-
-impl CustomOp1 for Elu {
-    fn name(&self) -> &'static str {
-        "elu"
-    }
-
-    fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Shape)> {
-        let storage = candle_core::map_dtype!(
-            "elu",
-            s,
-            |s| cpu_backend::unary_map(s, l, |v| fwd(v, self.alpha)),
-            (BF16, F16, F32, F64)
-        );
-        Ok((storage, l.shape().clone()))
-    }
-}
-
-#[test]
-fn custom_op1_no_backward() -> Result<()> {
-    let cpu = &Device::Cpu;
-    let t = Tensor::arange(0u32, 12u32, cpu)?.to_dtype(DType::F32)?;
-    let t = (t - 5.)?;
-    let elu_t = t.apply_op1_no_bwd(&Elu { alpha: 1. })?;
-    assert_eq!(
-        to_vec1_round(&elu_t, 4)?,
-        &[-0.9933, -0.9817, -0.9502, -0.8647, -0.6321, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
-    );
-    Ok(())
-}
-
-// Define a similar struct as Elu but with backward support.
-fn bwd<T: num_traits::Float>(v: T, alpha: f64) -> T {
-    if v.is_sign_positive() {
-        T::one()
-    } else {
-        let alpha = T::from(alpha).unwrap_or(T::nan());
-        v.exp() * alpha
-    }
-}
-
-struct EluBackward {
-    alpha: f64,
-}
-
-impl CustomOp1 for EluBackward {
-    fn name(&self) -> &'static str {
-        "elu-bwd"
-    }
-
-    fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Shape)> {
-        let storage = candle_core::map_dtype!(
-            "elu-bwd",
-            s,
-            |s| cpu_backend::unary_map(s, l, |v| bwd(v, self.alpha)),
-            (BF16, F16, F32, F64)
-        );
-        Ok((storage, l.shape().clone()))
-    }
-}
-
-struct EluWithBackward(Elu);
-
-impl EluWithBackward {
-    fn new(alpha: f64) -> Self {
-        Self(Elu { alpha })
-    }
-}
-
-impl CustomOp1 for EluWithBackward {
-    fn name(&self) -> &'static str {
-        "elu"
-    }
-
-    fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Shape)> {
-        self.0.cpu_fwd(s, l)
-    }
-
-    fn bwd(&self, arg: &Tensor, _res: &Tensor, grad_res: &Tensor) -> Result<Option<Tensor>> {
-        let alpha = self.0.alpha;
-        let bwd = arg.apply_op1(EluBackward { alpha })?;
-        Ok(Some(grad_res.mul(&bwd)?))
-    }
-}
-
-#[test]
-fn custom_op1_with_backward() -> Result<()> {
-    let cpu = &Device::Cpu;
-    let t = candle_core::Var::new(&[-2f32, 0f32, 2f32], cpu)?;
-    let elu_t = t.apply_op1(EluWithBackward::new(2.))?;
-    assert_eq!(to_vec1_round(&elu_t, 4)?, &[-1.7293, 0.0, 2.0]);
-
-    let grads = elu_t.backward()?;
-    let grad_x = grads.get(&t).unwrap();
-    assert_eq!(to_vec1_round(grad_x, 4)?, [0.2707, 1.0, 1.0]);
-
-    Ok(())
-}
--- a/candle-core/tests/display_tests.rs
+++ b/candle-core/tests/display_tests.rs
@ -1,5 +1,5 @@
 use anyhow::Result;
-use candle_core::{DType, Device::Cpu, Tensor};
+use candle::{DType, Device::Cpu, Tensor};

 #[test]
 fn display_scalar() -> Result<()> {
--- a/candle-core/tests/grad_tests.rs
+++ b/candle-core/tests/grad_tests.rs
@ -1,5 +1,6 @@
 use anyhow::{Context, Result};
-use candle_core::{test_device, test_utils, Device, Shape, Tensor, Var};
+use candle::{Device, Shape, Var};
+mod test_utils;

 fn simple_grad(device: &Device) -> Result<()> {
    let x = Var::new(&[3f32, 1., 4.], device)?;
@ -78,168 +79,7 @@ fn grad_descent(device: &Device) -> Result<()> {
    Ok(())
 }

-fn unary_grad(device: &Device) -> Result<()> {
-    let x = Var::new(&[3f32, 1., 4., 0.15], device)?;
-    let x = x.as_tensor();
-    let y = (x.log()? + 1.)?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [2.0986, 1.0, 2.3863, -0.8971]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [0.3333, 1.0, 0.25, 6.6667]
-    );
-    let y = x.exp()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(
-        y.to_vec1::<f32>()?,
-        [20.085537, 2.7182817, 54.59815, 1.1618342]
-    );
-    assert_eq!(
-        grad_x.to_vec1::<f32>()?,
-        [20.085537, 2.7182817, 54.59815, 1.1618342]
-    );
-    let y = x.exp()?.sqr()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(
-        y.to_vec1::<f32>()?,
-        [403.4288, 7.3890557, 2980.9578, 1.3498588]
-    );
-    // exp(x)^2 = exp(2*x)
-    assert_eq!(
-        grad_x.to_vec1::<f32>()?,
-        [806.8576, 14.778111, 5961.9155, 2.6997175]
-    );
-    let y = x.sin()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [0.1411, 0.8415, -0.7568, 0.1494],
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [-0.99, 0.5403, -0.6536, 0.9888],
-    );
-    let y = x.cos()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [-0.99, 0.5403, -0.6536, 0.9888],
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 4)?,
-        [-0.1411, -0.8415, 0.7568, -0.1494],
-    );
-    let y = x.sqr()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(y.to_vec1::<f32>()?, [9.0, 1.0, 16.0, 0.0225]);
-    assert_eq!(grad_x.to_vec1::<f32>()?, [6.0, 2.0, 8.0, 0.3]);
-    let y = x.sqr()?.sqrt()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(y.to_vec1::<f32>()?, [3.0, 1.0, 4.0, 0.15]);
-    assert_eq!(test_utils::to_vec1_round(grad_x, 4)?, [1.0, 1.0, 1.0, 1.0]);
-    let y = x.neg()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(y.to_vec1::<f32>()?, [-3.0, -1.0, -4.0, -0.15]);
-    assert_eq!(grad_x.to_vec1::<f32>()?, [-1.0, -1.0, -1.0, -1.0]);
-    let y = x.affine(0.2, 1.)?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(y.to_vec1::<f32>()?, [1.6, 1.2, 1.8, 1.03]);
-    assert_eq!(grad_x.to_vec1::<f32>()?, [0.2, 0.2, 0.2, 0.2]);
-    let y = Tensor::new(1f32, device)?.broadcast_div(x)?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(
-        test_utils::to_vec1_round(&y, 4)?,
-        [0.3333, 1.0, 0.25, 6.6667]
-    );
-    assert_eq!(
-        grad_x.to_vec1::<f32>()?,
-        [-0.11111111, -1.0, -0.0625, -44.444443],
-    );
-    let y = x.broadcast_div(&Tensor::new(0.5f32, device)?)?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(y.to_vec1::<f32>()?, [6., 2., 8., 0.3]);
-    assert_eq!(grad_x.to_vec1::<f32>()?, [2., 2., 2., 2.]);
-
-    let x = Var::new(&[3f32, 1., 4., 0.15], device)?;
-    let y = x.powf(2.5)?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(test_utils::to_vec1_round(&y, 2)?, [15.59, 1.0, 32.0, 0.01]);
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 2)?,
-        [12.99, 2.5, 20.0, 0.15]
-    );
-
-    let y = x.tanh()?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(&x).context("no grad for x")?;
-    assert_eq!(test_utils::to_vec1_round(&y, 2)?, [1.0, 0.76, 1.0, 0.15]);
-    assert_eq!(
-        test_utils::to_vec1_round(grad_x, 2)?,
-        [0.01, 0.42, 0.0, 0.98],
-    );
-    Ok(())
-}
-
-fn binary_grad(device: &Device) -> Result<()> {
-    let x = Var::new(&[3f32, 1., -4., -1.], device)?;
-    let x = x.as_tensor();
-    // leaky relu
-    let y = x.maximum(&(x * 0.1)?)?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(x.to_vec1::<f32>()?, [3., 1., -4., -1.]);
-    assert_eq!(y.to_vec1::<f32>()?, [3., 1., -0.4, -0.1]);
-    assert_eq!(grad_x.to_vec1::<f32>()?, [1., 1., 0.1, 0.1]);
-
-    let y = x.minimum(&(x * 0.1)?)?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(y.to_vec1::<f32>()?, [0.3, 0.1, -4., -1.]);
-    assert_eq!(grad_x.to_vec1::<f32>()?, [0.1, 0.1, 1., 1.]);
-
-    // This one is easy to mess up, we want the gradient to be one as it is the identity function.
-    let y = x.minimum(x)?;
-    let grads = y.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    assert_eq!(y.to_vec1::<f32>()?, [3., 1., -4., -1.]);
-    assert_eq!(grad_x.to_vec1::<f32>()?, [1., 1., 1., 1.]);
-
-    let x_var = Var::new(&[3f32, 1., -4., -1., 5., 9.], device)?;
-    let x = x_var.as_tensor();
-    let y_var = Var::new(&[2f32, 7., 1.], device)?;
-    let y = y_var.as_tensor();
-
-    let ss = x
-        .reshape((2, 3))?
-        .slice_scatter0(&y.reshape((1, 3))?, 1)?
-        .sqr()?;
-    let grads = ss.backward()?;
-    let grad_x = grads.get(x).context("no grad for x")?;
-    let grad_y = grads.get(y).context("no grad for y")?;
-    assert_eq!(ss.to_vec2::<f32>()?, [[9., 1., 16.], [4., 49., 1.]]);
-    assert_eq!(grad_x.to_vec1::<f32>()?, [6.0, 2.0, -8.0, 0.0, 0.0, 0.0]);
-    assert_eq!(grad_y.to_vec1::<f32>()?, [4.0, 14.0, 2.0]);
-    Ok(())
-}
-
 test_device!(simple_grad, simple_grad_cpu, simple_grad_gpu);
 test_device!(sum_grad, sum_grad_cpu, sum_grad_gpu);
 test_device!(matmul_grad, matmul_grad_cpu, matmul_grad_gpu);
 test_device!(grad_descent, grad_descent_cpu, grad_descent_gpu);
-test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu);
-test_device!(binary_grad, binary_grad_cpu, binary_grad_gpu);
--- a/candle-core/tests/indexing_tests.rs
+++ b/candle-core/tests/indexing_tests.rs
@ -1,5 +1,7 @@
 use anyhow::Result;
-use candle_core::{Device, IndexOp, Tensor};
+use candle::{Device, IndexOp, Tensor};
+
+mod test_utils;

 #[test]
 fn integer_index() -> Result<()> {
@ -56,19 +58,6 @@ fn range_index() -> Result<()> {
    let result = tensor.i(..=1)?;
    assert_eq!(result.dims(), &[2, 3]);
    assert_eq!(result.to_vec2::<u32>()?, &[[0, 1, 2], [3, 4, 5]]);
-
-    // Empty range
-    let result = tensor.i(1..1)?;
-    assert_eq!(result.dims(), &[0, 3]);
-    let empty: [[u32; 3]; 0] = [];
-    assert_eq!(result.to_vec2::<u32>()?, &empty);
-
-    // Similar to PyTorch, allow empty ranges when the computed length is negative.
-    #[allow(clippy::reversed_empty_ranges)]
-    let result = tensor.i(1..0)?;
-    assert_eq!(result.dims(), &[0, 3]);
-    let empty: [[u32; 3]; 0] = [];
-    assert_eq!(result.to_vec2::<u32>()?, &empty);
    Ok(())
 }

--- a/candle-core/tests/layout_tests.rs
+++ b/candle-core/tests/layout_tests.rs
@ -1,5 +1,5 @@
-use candle::{test_device, Device, IndexOp, Result, Tensor};
-use candle_core as candle;
+mod test_utils;
+use candle::{Device, IndexOp, Result, Tensor};

 fn contiguous(device: &Device) -> Result<()> {
    let tensor = Tensor::arange(0u32, 24u32, device)?.reshape((2, 3, 4))?;
--- a/candle-core/tests/pool_tests.rs
+++ b/candle-core/tests/pool_tests.rs
@ -1,112 +0,0 @@
-use candle_core::{test_device, test_utils, Device, IndexOp, Result, Tensor};
-
-// https://github.com/huggingface/candle/issues/364
-fn avg_pool2d(dev: &Device) -> Result<()> {
-    let data: Vec<f32> = vec![
-        1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
-    ];
-    let t = Tensor::from_vec(data, (1, 1, 4, 4), dev)?;
-    let pool = t.avg_pool2d(2)?.squeeze(0)?.squeeze(0)?;
-    assert_eq!(pool.to_vec2::<f32>()?, [[0.5f32, 1.], [1., 1.]]);
-
-    let data: Vec<f32> = vec![
-        1., 2., 1., 3., 0., 0., 1., 1., 1., 1., 1., 1., 5., 1., 1., 1.,
-    ];
-    let t = Tensor::from_vec(data, (1, 1, 2, 8), dev)?;
-    let pool = t.avg_pool2d(2)?.squeeze(0)?.squeeze(0)?;
-    assert_eq!(pool.to_vec2::<f32>()?, [[5. / 4., 6. / 4., 6. / 4., 1.]]);
-    Ok(())
-}
-
-fn max_pool2d(dev: &Device) -> Result<()> {
-    let data: Vec<f32> = vec![
-        1., 2., 1., 3., 0., 0., 1., 1., 1., 1., 1., 1., 5., 1., 1., 1.,
-    ];
-    let t = Tensor::from_vec(data, (1, 1, 4, 4), dev)?;
-
-    let pool = t.max_pool2d(2)?.squeeze(0)?.squeeze(0)?;
-    assert_eq!(pool.to_vec2::<f32>()?, [[2f32, 3.], [5., 1.]]);
-
-    let t = t.reshape((1, 1, 2, 8))?;
-    let pool = t.max_pool2d(2)?.squeeze(0)?.squeeze(0)?;
-    assert_eq!(pool.to_vec2::<f32>()?, [[2.0, 3.0, 5.0, 1.0]]);
-    Ok(())
-}
-
-/* This test corresponds to the following PyTorch script.
-import torch
-torch.manual_seed(4242)
-
-t = torch.randn((1, 2, 4, 4))
-print(t.flatten())
-res = torch.nn.functional.avg_pool2d(t, 2)
-print(res)
-*/
-fn avg_pool2d_pytorch(dev: &Device) -> Result<()> {
-    let t = Tensor::new(
-        &[
-            0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616,
-            1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699, 0.0823, 0.3526, 0.6843, 0.2395,
-            1.2279, -0.9287, -1.7030, 0.1370, 0.6047, 0.3770, -0.6266, 0.3529, 2.2013, -0.6836,
-            0.2477, 1.3127,
-        ],
-        dev,
-    )?
-    .reshape((1, 2, 4, 4))?;
-    let pool = t.avg_pool2d(2)?.squeeze(0)?;
-    assert_eq!(
-        test_utils::to_vec3_round(&pool, 4)?,
-        [
-            [[-1.1926, -0.0395], [0.2688, 0.1871]],
-            [[0.1835, -0.1606], [0.6249, 0.3217]]
-        ]
-    );
-    let pool = t.avg_pool2d(3)?.squeeze(0)?;
-    assert_eq!(
-        test_utils::to_vec3_round(&pool, 4)?,
-        [[[0.085]], [[0.0078]]]
-    );
-
-    let t = t.reshape((1, 1, 4, 8))?;
-    let pool = t.avg_pool2d(2)?.squeeze(0)?.squeeze(0)?;
-    assert_eq!(
-        test_utils::to_vec2_round(&pool, 4)?,
-        [
-            [0.7745, 0.0276, -1.6983, 0.12],
-            [0.3542, 0.1625, 0.4542, -0.0014]
-        ]
-    );
-    Ok(())
-}
-
-fn upsample_nearest2d(dev: &Device) -> Result<()> {
-    let t = Tensor::arange(0f32, 6f32, dev)?.reshape((1, 1, 2, 3))?;
-    let upsampled = t.upsample_nearest2d(4, 6)?.i(0)?.i(0)?;
-    assert_eq!(
-        t.i(0)?.i(0)?.to_vec2::<f32>()?,
-        [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]
-    );
-    assert_eq!(
-        upsampled.to_vec2::<f32>()?,
-        [
-            [0.0, 0.0, 1.0, 1.0, 2.0, 2.0],
-            [0.0, 0.0, 1.0, 1.0, 2.0, 2.0],
-            [3.0, 3.0, 4.0, 4.0, 5.0, 5.0],
-            [3.0, 3.0, 4.0, 4.0, 5.0, 5.0]
-        ]
-    );
-    Ok(())
-}
-
-test_device!(avg_pool2d, avg_pool2d_cpu, avg_pool2d_gpu);
-test_device!(
-    avg_pool2d_pytorch,
-    avg_pool2d_pytorch_cpu,
-    avg_pool2d_pytorch_gpu
-);
-test_device!(max_pool2d, max_pool2d_cpu, max_pool2d_gpu);
-test_device!(
-    upsample_nearest2d,
-    upsample_nearest2d_cpu,
-    upsample_nearest2d_gpu
-);
--- a/candle-core/tests/quantized_tests.rs
+++ b/candle-core/tests/quantized_tests.rs
@ -1,722 +0,0 @@
-use candle_core::{
-    quantized::{self, GgmlDType},
-    test_utils::to_vec2_round,
-    Device, Result, Tensor,
-};
-use quantized::{k_quants, GgmlType};
-use rand::prelude::*;
-
-const GGML_TEST_SIZE: usize = 32 * 128;
-
-const GGML_MAX_QUANTIZATION_TOTAL_ERROR: f32 = 0.002;
-const GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS: f32 = 0.0075;
-const GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS: f32 = 0.0040;
-const GGML_MAX_DOT_PRODUCT_ERROR: f32 = 0.02;
-
-#[test]
-fn quantized_matmul() -> Result<()> {
-    let cpu = &Device::Cpu;
-    let (m, k, n) = (3, 64, 4);
-    let lhs = (0..(m * k)).map(|v| v as f32).collect::<Vec<_>>();
-    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), cpu)?;
-    let mut dst = vec![42.; 3 * 4];
-    let mut rhs_t = vec![k_quants::BlockQ4_0::zeros(); 8];
-    let rhs = (0..(k * n)).map(|v| v as f32).collect::<Vec<_>>();
-    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), cpu)?.t()?;
-    k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_t)?;
-    k_quants::matmul((m, k, n), &lhs, &rhs_t, &mut dst)?;
-    assert_eq!(
-        dst.iter().map(|x| x.round()).collect::<Vec<_>>(),
-        &[
-            85120.0, 214562.0, 345455.0, 474748.0, 213475.0, 604465.0, 1000686.0, 1388317.0,
-            341876.0, 994283.0, 1655709.0, 2301518.0
-        ]
-    );
-    let mm = tensor_lhs.matmul(&tensor_rhs)?;
-    assert_eq!(
-        mm.to_vec2::<f32>()?,
-        &[
-            [85344.0, 214368.0, 343392.0, 472416.0],
-            [214368.0, 605536.0, 996704.0, 1387872.0],
-            [343392.0, 996704.0, 1650016.0, 2303328.0]
-        ]
-    );
-
-    let qtensor = quantized::QTensor::new(rhs_t, (4, 64))?;
-    let matmul = quantized::QMatMul::from_qtensor(qtensor)?;
-    let res = matmul.forward(&tensor_lhs)?;
-    assert_eq!(
-        to_vec2_round(&res, 0)?,
-        &[
-            [85120.0, 214562.0, 345455.0, 474748.0],
-            [213475.0, 604465.0, 1000686.0, 1388317.0],
-            [341876.0, 994283.0, 1655709.0, 2301518.0]
-        ]
-    );
-
-    Ok(())
-}
-
-#[test]
-fn quantized_matmul_neg() -> Result<()> {
-    let cpu = &Device::Cpu;
-    let (m, k, n) = (3, 64, 4);
-    let lhs = (0..(m * k))
-        .map(|v| v as f32 - (m * k) as f32 / 2.0)
-        .collect::<Vec<_>>();
-    let tensor_lhs = Tensor::from_slice(&lhs, (m, k), cpu)?;
-    let mut dst = vec![42.; 3 * 4];
-    let mut rhs_t = vec![k_quants::BlockQ4_0::zeros(); 8];
-    let rhs = (0..k * n)
-        .map(|v| v as f32 - (k * n) as f32 / 3.0)
-        .collect::<Vec<_>>();
-    let tensor_rhs = Tensor::from_slice(&rhs, (n, k), cpu)?.t()?;
-    k_quants::BlockQ4_0::from_float(&rhs, &mut rhs_t)?;
-    k_quants::matmul((m, k, n), &lhs, &rhs_t, &mut dst)?;
-    assert_eq!(
-        dst.iter().map(|x| x.round()).collect::<Vec<_>>(),
-        &[
-            243524.0, -19596.0, -285051.0, -549815.0, 23777.0, 21651.0, 19398.0, 18367.0,
-            -196472.0, 63012.0, 324585.0, 587902.0
-        ]
-    );
-    let mm = tensor_lhs.matmul(&tensor_rhs)?;
-    assert_eq!(
-        to_vec2_round(&mm, 0)?,
-        &[
-            [244064.0, -20128.0, -284320.0, -548512.0],
-            [23563.0, 21515.0, 19467.0, 17419.0],
-            [-196939.0, 63157.0, 323253.0, 583349.0]
-        ]
-    );
-
-    let qtensor = quantized::QTensor::new(rhs_t, (4, 64))?;
-    let matmul = quantized::QMatMul::from_qtensor(qtensor)?;
-    let res = matmul.forward(&tensor_lhs)?;
-    assert_eq!(
-        to_vec2_round(&res, 0)?,
-        &[
-            [243524.0, -19596.0, -285051.0, -549815.0],
-            [23777.0, 21651.0, 19398.0, 18367.0],
-            [-196472.0, 63012.0, 324585.0, 587902.0]
-        ]
-    );
-
-    Ok(())
-}
-
-#[test]
-fn quantize_q4_0() -> Result<()> {
-    use k_quants::BlockQ4_0;
-
-    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-    let mut dst = vec![0f32; 32 * 4];
-    let mut quant = vec![BlockQ4_0::zeros(); 4];
-    BlockQ4_0::from_float(&src, &mut quant)?;
-    BlockQ4_0::to_float(&quant, dst.as_mut_slice())?;
-    assert_eq!(
-        dst,
-        &[
-            -0.0, -0.0, 3.875, 3.875, 3.875, 3.875, 7.75, 7.75, 7.75, 7.75, 11.625, 11.625, 11.625,
-            11.625, 15.5, 15.5, 15.5, 15.5, 19.375, 19.375, 19.375, 19.375, 23.25, 23.25, 23.25,
-            23.25, 27.125, 27.125, 27.125, 27.125, 31.0, 31.0, 31.5, 31.5, 31.5, 31.5, 39.375,
-            39.375, 39.375, 39.375, 39.375, 39.375, 39.375, 39.375, 47.25, 47.25, 47.25, 47.25,
-            47.25, 47.25, 47.25, 47.25, 55.125, 55.125, 55.125, 55.125, 55.125, 55.125, 55.125,
-            55.125, 63.0, 63.0, 63.0, 63.0, 59.375, 59.375, 71.25, 71.25, 71.25, 71.25, 71.25,
-            71.25, 71.25, 71.25, 71.25, 71.25, 71.25, 71.25, 83.125, 83.125, 83.125, 83.125,
-            83.125, 83.125, 83.125, 83.125, 83.125, 83.125, 83.125, 83.125, 95.0, 95.0, 95.0, 95.0,
-            95.0, 95.0, 95.25, 95.25, 95.25, 95.25, 95.25, 95.25, 95.25, 95.25, 111.125, 111.125,
-            111.125, 111.125, 111.125, 111.125, 111.125, 111.125, 111.125, 111.125, 111.125,
-            111.125, 111.125, 111.125, 111.125, 111.125, 127.0, 127.0, 127.0, 127.0, 127.0, 127.0,
-            127.0, 127.0
-        ]
-    );
-    ggml_quantization_error_test::<BlockQ4_0>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
-    Ok(())
-}
-
-#[test]
-fn quantize_q4_1() -> Result<()> {
-    use k_quants::BlockQ4_1;
-
-    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-    let mut dst = vec![0f32; 32 * 4];
-    let mut quant = vec![BlockQ4_1::zeros(); 4];
-    BlockQ4_1::from_float(&src, &mut quant)?;
-    BlockQ4_1::to_float(&quant, dst.as_mut_slice())?;
-    assert_eq!(
-        round_vector(&dst),
-        &[
-            0.0, 0.0, 2.066, 2.066, 4.133, 4.133, 6.199, 6.199, 8.266, 8.266, 10.332, 10.332,
-            12.398, 12.398, 14.465, 14.465, 16.531, 16.531, 18.598, 18.598, 20.664, 20.664, 22.73,
-            22.73, 24.797, 24.797, 26.863, 26.863, 28.93, 28.93, 30.996, 30.996, 32.0, 32.0,
-            34.066, 34.066, 36.133, 36.133, 38.199, 38.199, 40.266, 40.266, 42.332, 42.332, 44.398,
-            44.398, 46.465, 46.465, 48.531, 48.531, 50.598, 50.598, 52.664, 52.664, 54.73, 54.73,
-            56.797, 56.797, 58.863, 58.863, 60.93, 60.93, 62.996, 62.996, 64.0, 64.0, 66.066,
-            66.066, 68.133, 68.133, 70.199, 70.199, 72.266, 72.266, 74.332, 74.332, 76.398, 76.398,
-            78.465, 78.465, 80.531, 80.531, 82.598, 82.598, 84.664, 84.664, 86.73, 86.73, 88.797,
-            88.797, 90.863, 90.863, 92.93, 92.93, 94.996, 94.996, 96.0, 96.0, 98.066, 98.066,
-            100.133, 100.133, 102.199, 102.199, 104.266, 104.266, 106.332, 106.332, 108.398,
-            108.398, 110.465, 110.465, 112.531, 112.531, 114.598, 114.598, 116.664, 116.664,
-            118.73, 118.73, 120.797, 120.797, 122.863, 122.863, 124.93, 124.93, 126.996, 126.996
-        ]
-    );
-    ggml_quantization_error_test::<BlockQ4_1>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
-    Ok(())
-}
-
-#[test]
-fn quantize_q5_0() -> Result<()> {
-    use k_quants::BlockQ5_0;
-
-    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-    let mut dst = vec![0f32; 32 * 4];
-    let mut quant = vec![BlockQ5_0::zeros(); 4];
-    BlockQ5_0::from_float(&src, &mut quant)?;
-    BlockQ5_0::to_float(&quant, dst.as_mut_slice())?;
-    assert_eq!(
-        round_vector(&dst),
-        &[
-            -0.0, 1.938, 1.938, 3.875, 3.875, 5.813, 5.813, 7.75, 7.75, 9.688, 9.688, 11.625,
-            11.625, 13.563, 13.563, 15.5, 15.5, 17.438, 17.438, 19.375, 19.375, 21.313, 21.313,
-            23.25, 23.25, 25.188, 25.188, 27.125, 27.125, 29.063, 29.063, 31.0, 31.5, 31.5, 35.438,
-            35.438, 35.438, 35.438, 39.375, 39.375, 39.375, 39.375, 43.313, 43.313, 43.313, 43.313,
-            47.25, 47.25, 47.25, 47.25, 51.188, 51.188, 51.188, 51.188, 55.125, 55.125, 55.125,
-            55.125, 59.063, 59.063, 59.063, 59.063, 63.0, 63.0, 65.313, 65.313, 65.313, 65.313,
-            65.313, 71.25, 71.25, 71.25, 71.25, 71.25, 71.25, 77.188, 77.188, 77.188, 77.188,
-            77.188, 77.188, 83.125, 83.125, 83.125, 83.125, 83.125, 83.125, 89.063, 89.063, 89.063,
-            89.063, 89.063, 89.063, 95.0, 95.0, 95.0, 95.25, 95.25, 95.25, 95.25, 103.188, 103.188,
-            103.188, 103.188, 103.188, 103.188, 103.188, 103.188, 111.125, 111.125, 111.125,
-            111.125, 111.125, 111.125, 111.125, 111.125, 119.063, 119.063, 119.063, 119.063,
-            119.063, 119.063, 119.063, 119.063, 127.0, 127.0, 127.0, 127.0
-        ]
-    );
-    ggml_quantization_error_test::<BlockQ5_0>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
-    Ok(())
-}
-
-#[test]
-fn quantize_q5_1() -> Result<()> {
-    use k_quants::BlockQ5_1;
-
-    let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
-    let mut dst = vec![0f32; 32 * 4];
-    let mut quant = vec![BlockQ5_1::zeros(); 4];
-    BlockQ5_1::from_float(&src, &mut quant)?;
-    BlockQ5_1::to_float(&quant, dst.as_mut_slice())?;
-    assert_eq!(
-        dst,
-        &[
-            0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
-            16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0,
-            30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0,
-            44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0,
-            58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0,
-            72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0,
-            86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0,
-            100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0,
-            112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0,
-            124.0, 125.0, 126.0, 127.0
-        ]
-    );
-
-    ggml_quantization_error_test::<BlockQ5_1>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
-    Ok(())
-}
-
-/// Generates a small test vector ranging from -`bound` to `bound` with `size` steps
-fn get_test_vector(bound: f32, size: usize) -> (Vec<f32>, Vec<f32>) {
-    assert!(
-        size % crate::quantized::k_quants::QK_K == 0,
-        "size must be a multiple of {}",
-        crate::quantized::k_quants::QK_K
-    );
-
-    let src = (0..size)
-        .map(|v| (v as f32 - size as f32 / 2.) * bound / (size as f32 / 2.))
-        .collect::<Vec<_>>();
-
-    let dst = vec![0f32; size];
-    assert_eq!([src[0], src[size / 2]], [-bound, 0.0]);
-    (src, dst)
-}
-
-/// Round a vector
-fn round_vector(values: &[f32]) -> Vec<f32> {
-    values
-        .iter()
-        .map(|x| (1000. * x).round() / 1000.)
-        .collect::<Vec<_>>()
-}
-
-fn compare_with_error(values: &[f32], expected: &[f32], tolerance: f32) {
-    for (i, (value, expected_value)) in values.iter().zip(expected.iter()).enumerate() {
-        let difference = (value - expected_value).abs();
-
-        assert!(
-            difference < tolerance,
-            "Error at index {}: value = {}, expected = {}. Difference = {} exceeds tolerance = {}.",
-            i,
-            value,
-            expected_value,
-            difference,
-            tolerance
-        );
-    }
-}
-
-/// Creates a vector simillarly to the one used in GGML unit tests: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L26-L30
-fn create_ggml_like_vector(offset: f32) -> Vec<f32> {
-    (0..GGML_TEST_SIZE)
-        .map(|i| 0.1 + 2.0 * (i as f32 + offset).cos())
-        .collect()
-}
-
-/// Calculates the root mean square error between two vectors
-fn calculate_rmse(a: &[f32], b: &[f32]) -> f32 {
-    assert_eq!(a.len(), b.len());
-    let sum = a
-        .iter()
-        .zip(b)
-        .map(|(a, b)| (a - b).powi(2))
-        .sum::<f32>()
-        .sqrt();
-    sum / a.len() as f32
-}
-
-/// Mirrores the GGML quanitzation unit test: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L43-L50
-fn ggml_quantization_error_test<T: GgmlType>(max_error: f32) -> Result<()> {
-    let src = create_ggml_like_vector(0.0);
-    let mut dst = vec![0.0; GGML_TEST_SIZE];
-    let _quant = quantize_roundtrip::<T>(src.as_slice(), dst.as_mut_slice())?;
-    let error = calculate_rmse(src.as_slice(), dst.as_slice());
-    if error > max_error {
-        candle_core::bail!(
-            "Quantization error {} exceeds max error {}",
-            error,
-            max_error
-        );
-    }
-    Ok(())
-}
-
-fn quantize_roundtrip<T: GgmlType>(src: &[f32], dst: &mut [f32]) -> Result<Vec<T>> {
-    let mut quant = vec![T::zeros(); src.len() / T::BLCK_SIZE];
-    T::from_float(src, &mut quant)?;
-    T::to_float(&quant, dst)?;
-    Ok(quant)
-}
-
-#[test]
-fn quantize_q2k() -> Result<()> {
-    use k_quants::BlockQ2K;
-
-    let (src, mut dst) = get_test_vector(0.5, 1024);
-    let _quant = quantize_roundtrip::<BlockQ2K>(src.as_slice(), dst.as_mut_slice())?;
-    compare_with_error(dst.as_slice(), src.as_slice(), 0.1);
-
-    // Test some specific values
-    assert_eq!(
-        [src[0], src[128], src[256], src[512], src[800], src[1023]],
-        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
-    );
-    let dst = round_vector(&dst);
-    assert_eq!(
-        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
-        [-0.499, -0.366, -0.249, 0.0, 0.295, 0.492]
-    );
-
-    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
-    let _quant_big = quantize_roundtrip::<BlockQ2K>(src_big.as_slice(), dst_big.as_mut_slice())?;
-    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 6.0);
-
-    ggml_quantization_error_test::<BlockQ2K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR_2BITS)?;
-    Ok(())
-}
-
-#[test]
-fn quantize_q3k() -> Result<()> {
-    use k_quants::BlockQ3K;
-
-    let (src, mut dst) = get_test_vector(0.5, 1024);
-    let _quant = quantize_roundtrip::<BlockQ3K>(src.as_slice(), dst.as_mut_slice())?;
-    compare_with_error(dst.as_slice(), src.as_slice(), 0.03);
-
-    // Test some specific values
-    assert_eq!(
-        [src[0], src[128], src[256], src[512], src[800], src[1023]],
-        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
-    );
-    let dst = round_vector(&dst);
-    assert_eq!(
-        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
-        [-0.493, -0.37, -0.243, -0.0, 0.292, 0.492]
-    );
-
-    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
-    let _quant_big = quantize_roundtrip::<BlockQ3K>(src_big.as_slice(), dst_big.as_mut_slice())?;
-    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 3.5);
-
-    ggml_quantization_error_test::<BlockQ3K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR_3BITS)?;
-    Ok(())
-}
-
-#[test]
-fn quantize_q4k() -> Result<()> {
-    use k_quants::BlockQ4K;
-
-    let (src, mut dst) = get_test_vector(0.5, 1024);
-    let _quant = quantize_roundtrip::<BlockQ4K>(src.as_slice(), dst.as_mut_slice())?;
-    compare_with_error(dst.as_slice(), src.as_slice(), 0.017);
-
-    // Test some specific values
-    assert_eq!(
-        [src[0], src[128], src[256], src[512], src[800], src[1023]],
-        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
-    );
-    let dst = round_vector(&dst);
-    assert_eq!(
-        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
-        [-0.5, -0.373, -0.25, 0.0, 0.288, 0.498]
-    );
-
-    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
-    let _quant_big = quantize_roundtrip::<BlockQ4K>(src_big.as_slice(), dst_big.as_mut_slice())?;
-    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 4.5);
-
-    ggml_quantization_error_test::<BlockQ4K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
-    Ok(())
-}
-
-#[test]
-fn quantize_q5k() -> Result<()> {
-    use k_quants::BlockQ5K;
-
-    let (src, mut dst) = get_test_vector(0.5, 1024);
-    let _quant = quantize_roundtrip::<BlockQ5K>(src.as_slice(), dst.as_mut_slice())?;
-    compare_with_error(dst.as_slice(), src.as_slice(), 0.008);
-
-    // Test some specific values
-    assert_eq!(
-        [src[0], src[128], src[256], src[512], src[800], src[1023]],
-        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
-    );
-    let dst = round_vector(&dst);
-    assert_eq!(
-        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
-        [-0.499, -0.372, -0.249, 0.001, 0.279, 0.499]
-    );
-
-    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
-    let _quant_big = quantize_roundtrip::<BlockQ5K>(src_big.as_slice(), dst_big.as_mut_slice())?;
-    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 2.5);
-
-    ggml_quantization_error_test::<BlockQ5K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
-
-    Ok(())
-}
-
-#[test]
-fn quantize_q6k() -> Result<()> {
-    use k_quants::BlockQ6K;
-
-    let (src, mut dst) = get_test_vector(0.5, 1024);
-    let _quant = quantize_roundtrip::<BlockQ6K>(src.as_slice(), dst.as_mut_slice())?;
-    compare_with_error(dst.as_slice(), src.as_slice(), 0.008);
-
-    // Test some specific values
-    assert_eq!(
-        [src[0], src[128], src[256], src[512], src[800], src[1023]],
-        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
-    );
-    let dst = round_vector(&dst);
-    assert_eq!(
-        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
-        [-0.497, -0.372, -0.25, -0.0, 0.284, 0.5]
-    );
-
-    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
-    let _quant_big = quantize_roundtrip::<BlockQ6K>(src_big.as_slice(), dst_big.as_mut_slice())?;
-    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 2.0);
-
-    ggml_quantization_error_test::<BlockQ6K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
-
-    Ok(())
-}
-
-#[test]
-fn quantize_q8k() -> Result<()> {
-    use k_quants::BlockQ8K;
-
-    let (src, mut dst) = get_test_vector(0.5, 1024);
-    let _quant = quantize_roundtrip::<BlockQ8K>(src.as_slice(), dst.as_mut_slice())?;
-    compare_with_error(dst.as_slice(), src.as_slice(), 0.003);
-
-    // Test some specific values
-    assert_eq!(
-        [src[0], src[128], src[256], src[512], src[800], src[1023]],
-        [-0.5, -0.375, -0.25, 0.0, 0.28125, 0.49902344]
-    );
-    let dst = round_vector(&dst);
-    assert_eq!(
-        [dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
-        [-0.5, -0.375, -0.25, -0.0, 0.281, 0.499]
-    );
-
-    let (src_big, mut dst_big) = get_test_vector(128.0, 1024);
-    let _quant_big = quantize_roundtrip::<BlockQ8K>(src_big.as_slice(), dst_big.as_mut_slice())?;
-    compare_with_error(dst_big.as_slice(), src_big.as_slice(), 0.6);
-
-    ggml_quantization_error_test::<BlockQ8K>(GGML_MAX_QUANTIZATION_TOTAL_ERROR)?;
-
-    Ok(())
-}
-
-/// Very simple dot product implementation
-fn vec_dot_reference(a: &[f32], b: &[f32]) -> f32 {
-    a.iter().zip(b).map(|(a, b)| a * b).sum()
-}
-
-/// Returns the error achieved by the GGML matmul unit test.
-fn ggml_reference_matmul_error(dtype: GgmlDType) -> Result<f32> {
-    let err = match dtype {
-        GgmlDType::F16 => 0.000010,
-        GgmlDType::Q2K => 0.004086,
-        GgmlDType::Q3K => 0.016148,
-        GgmlDType::Q4K => 0.002425,
-        GgmlDType::Q5K => 0.000740,
-        GgmlDType::Q6K => 0.000952,
-        GgmlDType::Q4_0 => 0.001143,
-        GgmlDType::Q4_1 => 0.007784,
-        GgmlDType::Q5_0 => 0.001353,
-        GgmlDType::Q5_1 => 0.001363,
-        GgmlDType::Q8_0 => 0.000092,
-
-        // Not from the ggml repo.
-        GgmlDType::Q8K => 0.00065,
-        _ => candle_core::bail!("No GGML results for quantization type {dtype:?}",),
-    };
-    Ok(err)
-}
-
-/// Mirrores the GGML matmul unit test: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L76-L91
-fn ggml_matmul_error_test<T: GgmlType>() -> Result<()> {
-    let a = create_ggml_like_vector(0.0);
-    let b = create_ggml_like_vector(1.0);
-    let length = a.len();
-
-    let mut a_quant = vec![T::zeros(); length / T::BLCK_SIZE];
-    let mut b_quant = vec![T::VecDotType::zeros(); length / T::VecDotType::BLCK_SIZE];
-    T::from_float(&a, &mut a_quant)?;
-    T::VecDotType::from_float(&b, &mut b_quant)?;
-
-    let result = T::vec_dot(length, &a_quant, &b_quant)?;
-    let result_unopt = T::vec_dot_unopt(length, &a_quant, &b_quant)?;
-    let reference_result = vec_dot_reference(&a, &b);
-
-    if (result - result_unopt).abs() / length as f32 > 1e-6 {
-        candle_core::bail!(
-            "the opt and unopt vec-dot returned different values, opt {result}, unopt {result_unopt}"
-        )
-    }
-
-    let error = (result - reference_result).abs() / length as f32;
-
-    let ggml_error = ggml_reference_matmul_error(T::DTYPE)?;
-
-    if !error.is_finite() || error > GGML_MAX_DOT_PRODUCT_ERROR {
-        candle_core::bail!(
-            "Dot product error {error} exceeds max error {GGML_MAX_DOT_PRODUCT_ERROR}",
-        );
-    }
-
-    // We diverge slightly due to different rounding behavior / f16 to f32 conversions in GGML
-    // => we use a slightly higher error threshold
-    const ERROR_LENIENCY: f32 = 0.00001;
-    if error - ERROR_LENIENCY > ggml_error {
-        candle_core::bail!(
-            "Dot product error {} exceeds ggml reference error {}",
-            error,
-            ggml_error
-        );
-    }
-    Ok(())
-}
-
-/// generates random tensors of size `m x k` and `n x k` and calculates their expected matrix multiplication result.
-fn get_random_tensors(
-    m: usize,
-    k: usize,
-    n: usize,
-    device: &Device,
-) -> Result<(Tensor, Tensor, Tensor)> {
-    let mut rng = StdRng::seed_from_u64(314159265358979);
-
-    let lhs = (0..m * k)
-        .map(|_| rng.gen::<f32>() - 0.5)
-        .collect::<Vec<_>>();
-    let rhs = (0..n * k)
-        .map(|_| rng.gen::<f32>() - 0.5)
-        .collect::<Vec<_>>();
-
-    let lhs = Tensor::from_vec(lhs, (m, k), device)?;
-    let rhs = Tensor::from_vec(rhs, (n, k), device)?;
-
-    let mm = lhs.matmul(&rhs.t()?)?;
-    Ok((lhs, rhs, mm))
-}
-
-#[test]
-fn quantized_matmul_q2k() -> Result<()> {
-    use k_quants::BlockQ2K;
-
-    let cpu = &Device::Cpu;
-    let (m, k, n) = (11, 512, 21);
-    let (lhs, rhs, mm) = get_random_tensors(m, k, n, cpu)?;
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);
-
-    let rhs = quantized::QTensor::quantize::<BlockQ2K>(&rhs)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
-    let mm = rhs.forward(&lhs)?;
-
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [0.916, 0.422, 0.215, 1.668]);
-
-    ggml_matmul_error_test::<BlockQ2K>()?;
-
-    Ok(())
-}
-
-#[test]
-fn quantized_matmul_q3k() -> Result<()> {
-    use k_quants::BlockQ3K;
-
-    let cpu = &Device::Cpu;
-    let (m, k, n) = (11, 512, 21);
-    let (lhs, rhs, mm) = get_random_tensors(m, k, n, cpu)?;
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);
-
-    let rhs = quantized::QTensor::quantize::<BlockQ3K>(&rhs)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
-    let mm = rhs.forward(&lhs)?;
-
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.029, 1.418, -0.314, 1.495]);
-
-    ggml_matmul_error_test::<BlockQ3K>()?;
-
-    Ok(())
-}
-
-#[test]
-fn quantized_matmul_q4k() -> Result<()> {
-    use k_quants::BlockQ4K;
-
-    let cpu = &Device::Cpu;
-    let (m, k, n) = (11, 512, 21);
-    let (lhs, rhs, mm) = get_random_tensors(m, k, n, cpu)?;
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);
-
-    let rhs = quantized::QTensor::quantize::<BlockQ4K>(&rhs)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
-    let mm = rhs.forward(&lhs)?;
-
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.125, 1.435, -0.201, 1.589]);
-
-    ggml_matmul_error_test::<BlockQ4K>()?;
-
-    Ok(())
-}
-
-#[test]
-fn quantized_matmul_q5k() -> Result<()> {
-    use k_quants::BlockQ5K;
-
-    let cpu = &Device::Cpu;
-    let (m, k, n) = (11, 512, 21);
-    let (lhs, rhs, mm) = get_random_tensors(m, k, n, cpu)?;
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);
-
-    let rhs = quantized::QTensor::quantize::<BlockQ5K>(&rhs)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
-    let mm = rhs.forward(&lhs)?;
-
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.192, 1.491, -0.18, 1.743]);
-
-    //Expected: 0.000740408897
-    ggml_matmul_error_test::<BlockQ5K>()?;
-
-    Ok(())
-}
-
-#[test]
-fn quantized_matmul_q6k() -> Result<()> {
-    use k_quants::BlockQ6K;
-
-    let cpu = &Device::Cpu;
-    let (m, k, n) = (11, 512, 21);
-    let (lhs, rhs, mm) = get_random_tensors(m, k, n, cpu)?;
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);
-
-    let rhs = quantized::QTensor::quantize::<BlockQ6K>(&rhs)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
-    let mm = rhs.forward(&lhs)?;
-
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.324, 1.49, -0.164, 1.741]);
-
-    ggml_matmul_error_test::<BlockQ6K>()?;
-    Ok(())
-}
-
-#[test]
-fn quantized_matmul_q8k() -> Result<()> {
-    use k_quants::BlockQ8K;
-
-    let cpu = &Device::Cpu;
-    let (m, k, n) = (11, 512, 21);
-    let (lhs, rhs, mm) = get_random_tensors(m, k, n, cpu)?;
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.262, 1.513, -0.208, 1.702]);
-
-    let rhs = quantized::QTensor::quantize::<BlockQ8K>(&rhs)?;
-    let rhs = quantized::QMatMul::from_qtensor(rhs)?;
-    let mm = rhs.forward(&lhs)?;
-
-    assert_eq!(mm.dims(), [m, n]);
-    let dst = mm.flatten_all()?.to_vec1::<f32>()?;
-    let dst = round_vector(&[dst[0], dst[m * n / 3], dst[m * n * 2 / 3], dst[m * n - 1]]);
-    assert_eq!(dst, [1.266, 1.504, -0.204, 1.7]);
-
-    ggml_matmul_error_test::<BlockQ8K>()?;
-    Ok(())
-}
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@ -1,41 +1,18 @@
-use candle_core::{test_device, test_utils, DType, Device, IndexOp, Result, Tensor};
+mod test_utils;
+use candle::{DType, Device, IndexOp, Result, Tensor};
+use test_utils::to_vec3_round;

 fn zeros(device: &Device) -> Result<()> {
    let tensor = Tensor::zeros((5, 2), DType::F32, device)?;
-    let (dim1, dim2) = tensor.dims2()?;
+    let (dim1, dim2) = tensor.shape().r2()?;
    assert_eq!(dim1, 5);
    assert_eq!(dim2, 2);
    Ok(())
 }

-fn ones(device: &Device) -> Result<()> {
-    assert_eq!(
-        Tensor::ones((2, 3), DType::U8, device)?.to_vec2::<u8>()?,
-        [[1, 1, 1], [1, 1, 1]],
-    );
-    assert_eq!(
-        Tensor::ones((2, 3), DType::U32, device)?.to_vec2::<u32>()?,
-        [[1, 1, 1], [1, 1, 1]],
-    );
-    assert_eq!(
-        Tensor::ones((2, 3), DType::I64, device)?.to_vec2::<i64>()?,
-        [[1, 1, 1], [1, 1, 1]],
-    );
-    assert_eq!(
-        Tensor::ones((2, 3), DType::F32, device)?.to_vec2::<f32>()?,
-        [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]],
-    );
-    assert_eq!(
-        Tensor::ones((2, 3), DType::F64, device)?.to_vec2::<f64>()?,
-        [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]],
-    );
-
-    Ok(())
-}
-
 fn add_mul(device: &Device) -> Result<()> {
    let tensor = Tensor::new(&[3f32, 1., 4.], device)?;
-    let dim1 = tensor.dims1()?;
+    let dim1 = tensor.shape().r1()?;
    assert_eq!(dim1, 3);
    let content: Vec<f32> = tensor.to_vec1()?;
    assert_eq!(content, [3., 1., 4.]);
@ -51,79 +28,20 @@ fn add_mul(device: &Device) -> Result<()> {
 fn tensor_2d(device: &Device) -> Result<()> {
    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
    let tensor = Tensor::new(data, device)?;
-    let dims = tensor.dims2()?;
+    let dims = tensor.shape().r2()?;
    assert_eq!(dims, (2, 5));
    let content: Vec<Vec<f32>> = tensor.to_vec2()?;
    assert_eq!(content, data);
    Ok(())
 }

-fn clamp(device: &Device) -> Result<()> {
-    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
-    let tensor = Tensor::new(data, device)?;
-    let tensor = tensor.clamp(1.5, 6.2)?;
-    assert_eq!(
-        tensor.to_vec2::<f32>()?,
-        [[3.0, 1.5, 4.0, 1.5, 5.0], [2.0, 1.5, 6.2, 6.2, 2.0]],
-    );
-    Ok(())
-}
-
-fn unary_op(device: &Device) -> Result<()> {
-    let data = &[[-3f32, 1., 4., -0.1, 0.5], [2.7, -1.8, -0.28, 1.8, 2.8]];
-    let tensor = Tensor::new(data, device)?;
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.gelu()?, 4)?,
-        [
-            [-0.0036, 0.8412, 3.9999, -0.046, 0.3457],
-            [2.6911, -0.0647, -0.1091, 1.7353, 2.7933]
-        ]
-    );
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.gelu_erf()?, 4)?,
-        [
-            [-0.004, 0.8413, 3.9999, -0.046, 0.3457],
-            [2.6906, -0.0647, -0.1091, 1.7353, 2.7928]
-        ]
-    );
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.erf()?, 4)?,
-        [
-            [-1.0, 0.8427, 1.0, -0.1125, 0.5205],
-            [0.9999, -0.9891, -0.3079, 0.9891, 0.9999]
-        ]
-    );
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.ceil()?, 4)?,
-        [[-3.0, 1.0, 4.0, -0.0, 1.0], [3.0, -1.0, -0.0, 2.0, 3.0]]
-    );
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.floor()?, 4)?,
-        [[-3.0, 1.0, 4.0, -1.0, 0.0], [2.0, -2.0, -1.0, 1.0, 2.0]]
-    );
-    assert_eq!(
-        test_utils::to_vec2_round(&tensor.round()?, 4)?,
-        [[-3.0, 1.0, 4.0, -0.0, 1.0], [3.0, -2.0, -0.0, 2.0, 3.0]]
-    );
-    let tensor = Tensor::new(&[2997.9246, 314.15926f32], device)?;
-    assert_eq!(
-        test_utils::to_vec1_round(&tensor.round_to(2)?, 4)?,
-        [2997.92, 314.16]
-    );
-    assert_eq!(
-        test_utils::to_vec1_round(&tensor.round_to(-2)?, 4)?,
-        [3000.0, 300.]
-    );
-    Ok(())
-}
-
 fn binary_op(device: &Device) -> Result<()> {
    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
-    let tensor1 = Tensor::new(data, device)?;
+    let tensor = Tensor::new(data, device)?;
    let data2 = &[[5f32, 5., 5., 5., 5.], [2., 1., 7., 8., 2.]];
    let tensor2 = Tensor::new(data2, device)?;
-    let tensor = (&tensor1 + (&tensor1 * &tensor1)? / (&tensor1 + &tensor2))?;
-    let dims = tensor.dims2()?;
+    let tensor = (&tensor + (&tensor * &tensor)? / (&tensor + &tensor2))?;
+    let dims = tensor.shape().r2()?;
    assert_eq!(dims, (2, 5));
    let content: Vec<Vec<f32>> = tensor.to_vec2()?;
    assert_eq!(content[0], [4.125, 1.1666666, 5.7777777, 1.1666666, 7.5]);
@ -132,24 +50,13 @@ fn binary_op(device: &Device) -> Result<()> {
    let tensor = (&tensor - &tensor)?;
    let content: Vec<Vec<f32>> = tensor.to_vec2()?;
    assert_eq!(content[0], [0., 0., 0., 0., 0.]);
-
-    let min = tensor1.minimum(&(&tensor2 * 0.5)?)?;
-    let max = tensor1.maximum(&(&tensor2 * 0.5)?)?;
-    assert_eq!(
-        min.to_vec2::<f32>()?,
-        [[2.5, 1.0, 2.5, 1.0, 2.5], [1.0, 0.5, 3.5, 4.0, 1.0]],
-    );
-    assert_eq!(
-        max.to_vec2::<f32>()?,
-        [[3.0, 2.5, 4.0, 2.5, 5.0], [2.0, 1.0, 7.0, 8.0, 2.0]]
-    );
    Ok(())
 }

 fn transpose(device: &Device) -> Result<()> {
    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
    let tensor = Tensor::new(data, device)?.t()?;
-    let dims = tensor.dims2()?;
+    let dims = tensor.shape().r2()?;
    assert_eq!(dims, (5, 2));
    assert_eq!(
        tensor.to_vec2::<f32>()?,
@ -161,6 +68,42 @@ fn transpose(device: &Device) -> Result<()> {
    Ok(())
 }

+fn softmax(device: &Device) -> Result<()> {
+    let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
+    let tensor = Tensor::new(data, device)?;
+    let t0 = tensor.log()?.softmax(0)?;
+    let t1 = tensor.log()?.softmax(1)?;
+    let t2 = tensor.log()?.softmax(2)?;
+    assert_eq!(
+        to_vec3_round(t0, 4)?,
+        &[
+            // 3/5, 1/2, 4/11
+            [[0.6, 0.5, 0.3636], [0.1111, 0.7143, 0.5294]],
+            // 2/5, 1/2, 7/11
+            [[0.4, 0.5, 0.6364], [0.8889, 0.2857, 0.4706]]
+        ]
+    );
+    assert_eq!(
+        to_vec3_round(t1, 4)?,
+        &[
+            // 3/4, 1/6, 4/13
+            [[0.75, 0.1667, 0.3077], [0.25, 0.8333, 0.6923]],
+            // 2/10, 1/3, 7/15
+            [[0.2, 0.3333, 0.4667], [0.8, 0.6667, 0.5333]]
+        ]
+    );
+    assert_eq!(
+        to_vec3_round(t2, 4)?,
+        &[
+            // (3, 1, 4) / 8, (1, 5, 9) / 15
+            [[0.375, 0.125, 0.5], [0.0667, 0.3333, 0.6]],
+            // (2, 1, 7) / 10, (8, 2, 8) / 18
+            [[0.2, 0.1, 0.7], [0.4444, 0.1111, 0.4444]]
+        ]
+    );
+    Ok(())
+}
+
 fn sum(device: &Device) -> Result<()> {
    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
    let tensor = Tensor::new(data, device)?;
@ -258,278 +201,6 @@ fn sum(device: &Device) -> Result<()> {
    Ok(())
 }

-fn min(device: &Device) -> Result<()> {
-    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
-    let tensor = Tensor::new(data, device)?;
-    assert_eq!(
-        tensor.min_keepdim(2)?.to_vec3::<u32>()?,
-        &[[[1], [1]], [[1], [2]]]
-    );
-    assert_eq!(
-        tensor.min_keepdim(0)?.to_vec3::<u32>()?,
-        &[[[2, 1, 4], [1, 2, 8]]],
-    );
-    let data: Vec<u32> = (200..4000u32).collect();
-    let tensor = Tensor::new(data.as_slice(), device)?;
-    assert_eq!(tensor.min_keepdim(0)?.to_vec1::<u32>()?, &[200]);
-    let tensor = tensor.reshape((1900, 2))?;
-    assert_eq!(
-        tensor.min_keepdim(0)?.min_keepdim(1)?.to_vec2::<u32>()?,
-        &[[200]]
-    );
-    assert_eq!(
-        tensor.min_keepdim(1)?.min_keepdim(0)?.to_vec2::<u32>()?,
-        &[[200]]
-    );
-    assert_eq!(tensor.min_keepdim(0)?.to_vec2::<u32>()?, &[[200, 201]]);
-
-    // Make the tensor non contiguous.
-    let tensor = tensor.t()?.contiguous()?.t()?;
-    assert_eq!(
-        tensor.min_keepdim(0)?.min_keepdim(1)?.to_vec2::<u32>()?,
-        &[[200]]
-    );
-    assert_eq!(
-        tensor.min_keepdim(1)?.min_keepdim(0)?.to_vec2::<u32>()?,
-        &[[200]]
-    );
-    assert_eq!(tensor.min_keepdim(0)?.to_vec2::<u32>()?, &[[200, 201]]);
-
-    let t1 = tensor.reshape((190, 5, 4))?;
-    let t2 = t1.transpose(0, 2)?.contiguous()?.transpose(0, 2)?;
-    for tensor in [t1, t2] {
-        assert_eq!(
-            tensor
-                .min_keepdim(0)?
-                .min_keepdim(2)?
-                .min_keepdim(1)?
-                .to_vec3::<u32>()?,
-            &[[[200]]]
-        );
-        assert_eq!(
-            tensor.min_keepdim(0)?.to_vec3::<u32>()?,
-            &[[
-                [200, 201, 202, 203],
-                [204, 205, 206, 207],
-                [208, 209, 210, 211],
-                [212, 213, 214, 215],
-                [216, 217, 218, 219]
-            ]]
-        );
-    }
-    Ok(())
-}
-
-fn max(device: &Device) -> Result<()> {
-    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
-    let tensor = Tensor::new(data, device)?;
-    assert_eq!(
-        tensor.max_keepdim(2)?.to_vec3::<u32>()?,
-        &[[[4], [9]], [[7], [8]]]
-    );
-    assert_eq!(
-        tensor.max_keepdim(0)?.to_vec3::<u32>()?,
-        &[[[3, 1, 7], [8, 5, 9]]],
-    );
-    let data: Vec<u32> = (200..4000u32).collect();
-    let tensor = Tensor::new(data.as_slice(), device)?;
-    assert_eq!(tensor.max_keepdim(0)?.to_vec1::<u32>()?, &[3999]);
-    let tensor = tensor.reshape((1900, 2))?;
-    assert_eq!(
-        tensor.max_keepdim(0)?.max_keepdim(1)?.to_vec2::<u32>()?,
-        &[[3999]]
-    );
-    assert_eq!(
-        tensor.max_keepdim(1)?.max_keepdim(0)?.to_vec2::<u32>()?,
-        &[[3999]]
-    );
-    assert_eq!(tensor.max_keepdim(0)?.to_vec2::<u32>()?, &[[3998, 3999]]);
-
-    // Make the tensor non contiguous.
-    let tensor = tensor.t()?.contiguous()?.t()?;
-    assert_eq!(
-        tensor.max_keepdim(0)?.max_keepdim(1)?.to_vec2::<u32>()?,
-        &[[3999]]
-    );
-    assert_eq!(
-        tensor.max_keepdim(1)?.max_keepdim(0)?.to_vec2::<u32>()?,
-        &[[3999]]
-    );
-    assert_eq!(tensor.max_keepdim(0)?.to_vec2::<u32>()?, &[[3998, 3999]]);
-
-    let t1 = tensor.reshape((190, 5, 4))?;
-    let t2 = t1.transpose(0, 2)?.contiguous()?.transpose(0, 2)?;
-    for tensor in [t1, t2] {
-        assert_eq!(
-            tensor
-                .max_keepdim(0)?
-                .max_keepdim(2)?
-                .max_keepdim(1)?
-                .to_vec3::<u32>()?,
-            &[[[3999]]]
-        );
-        assert_eq!(
-            tensor.max_keepdim(0)?.to_vec3::<u32>()?,
-            &[[
-                [3980, 3981, 3982, 3983],
-                [3984, 3985, 3986, 3987],
-                [3988, 3989, 3990, 3991],
-                [3992, 3993, 3994, 3995],
-                [3996, 3997, 3998, 3999]
-            ]]
-        );
-    }
-    Ok(())
-}
-
-fn argmin(device: &Device) -> Result<()> {
-    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
-    let tensor = Tensor::new(data, device)?;
-    assert_eq!(
-        tensor.argmin_keepdim(2)?.to_vec3::<u32>()?,
-        &[[[1], [0]], [[1], [1]]]
-    );
-    assert_eq!(
-        tensor.argmin_keepdim(0)?.to_vec3::<u32>()?,
-        &[[[1, 0, 0], [0, 1, 1]]],
-    );
-    let data: Vec<u32> = (200..4000u32).collect();
-    let tensor = Tensor::new(data.as_slice(), device)?;
-    assert_eq!(tensor.argmin_keepdim(0)?.to_vec1::<u32>()?, &[0]);
-    let tensor = tensor.reshape((1900, 2))?;
-    assert_eq!(
-        tensor
-            .argmin_keepdim(0)?
-            .argmin_keepdim(1)?
-            .to_vec2::<u32>()?,
-        &[[0]]
-    );
-    assert_eq!(
-        tensor
-            .argmin_keepdim(1)?
-            .argmin_keepdim(0)?
-            .to_vec2::<u32>()?,
-        &[[0]]
-    );
-    assert_eq!(tensor.argmin_keepdim(0)?.to_vec2::<u32>()?, &[[0, 0]]);
-
-    // Make the tensor non contiguous.
-    let tensor = tensor.t()?.contiguous()?.t()?;
-    assert_eq!(
-        tensor
-            .argmin_keepdim(0)?
-            .argmin_keepdim(1)?
-            .to_vec2::<u32>()?,
-        &[[0]]
-    );
-    assert_eq!(
-        tensor
-            .argmin_keepdim(1)?
-            .argmin_keepdim(0)?
-            .to_vec2::<u32>()?,
-        &[[0]]
-    );
-    assert_eq!(tensor.argmin_keepdim(0)?.to_vec2::<u32>()?, &[[0, 0]]);
-
-    let t1 = tensor.reshape((190, 5, 4))?;
-    let t2 = t1.transpose(0, 2)?.contiguous()?.transpose(0, 2)?;
-    for tensor in [t1, t2] {
-        assert_eq!(
-            tensor
-                .argmin_keepdim(0)?
-                .argmin_keepdim(2)?
-                .argmin_keepdim(1)?
-                .to_vec3::<u32>()?,
-            &[[[0]]]
-        );
-        assert_eq!(
-            tensor.argmin_keepdim(0)?.to_vec3::<u32>()?,
-            &[[
-                [0, 0, 0, 0],
-                [0, 0, 0, 0],
-                [0, 0, 0, 0],
-                [0, 0, 0, 0],
-                [0, 0, 0, 0],
-            ]]
-        );
-    }
-    Ok(())
-}
-
-fn argmax(device: &Device) -> Result<()> {
-    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
-    let tensor = Tensor::new(data, device)?;
-    assert_eq!(
-        tensor.argmax_keepdim(2)?.to_vec3::<u32>()?,
-        &[[[2], [2]], [[2], [0]]]
-    );
-    assert_eq!(
-        tensor.argmax_keepdim(0)?.to_vec3::<u32>()?,
-        &[[[0, 0, 1], [1, 0, 0]]],
-    );
-    let data: Vec<u32> = (200..4000u32).collect();
-    let tensor = Tensor::new(data.as_slice(), device)?;
-    assert_eq!(tensor.argmax_keepdim(0)?.to_vec1::<u32>()?, &[3799]);
-    let tensor = tensor.reshape((1900, 2))?;
-    assert_eq!(
-        tensor
-            .argmax_keepdim(0)?
-            .argmax_keepdim(1)?
-            .to_vec2::<u32>()?,
-        &[[0]]
-    );
-    assert_eq!(
-        tensor
-            .argmax_keepdim(1)?
-            .argmax_keepdim(0)?
-            .to_vec2::<u32>()?,
-        &[[0]]
-    );
-    assert_eq!(tensor.argmax_keepdim(0)?.to_vec2::<u32>()?, &[[1899, 1899]]);
-
-    // Make the tensor non contiguous.
-    let tensor = tensor.t()?.contiguous()?.t()?;
-    assert_eq!(
-        tensor
-            .argmax_keepdim(0)?
-            .argmax_keepdim(1)?
-            .to_vec2::<u32>()?,
-        &[[0]]
-    );
-    assert_eq!(
-        tensor
-            .argmax_keepdim(1)?
-            .argmax_keepdim(0)?
-            .to_vec2::<u32>()?,
-        &[[0]]
-    );
-    assert_eq!(tensor.argmax_keepdim(0)?.to_vec2::<u32>()?, &[[1899, 1899]]);
-
-    let t1 = tensor.reshape((190, 5, 4))?;
-    let t2 = t1.transpose(0, 2)?.contiguous()?.transpose(0, 2)?;
-    for tensor in [t1, t2] {
-        assert_eq!(
-            tensor
-                .argmax_keepdim(0)?
-                .argmax_keepdim(2)?
-                .argmax_keepdim(1)?
-                .to_vec3::<u32>()?,
-            &[[[0]]]
-        );
-        assert_eq!(
-            tensor.argmax_keepdim(0)?.to_vec3::<u32>()?,
-            &[[
-                [189, 189, 189, 189],
-                [189, 189, 189, 189],
-                [189, 189, 189, 189],
-                [189, 189, 189, 189],
-                [189, 189, 189, 189],
-            ]]
-        );
-    }
-    Ok(())
-}
-
 fn narrow(device: &Device) -> Result<()> {
    let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
    let tensor = Tensor::new(data, device)?;
@ -599,11 +270,7 @@ fn cat(device: &Device) -> Result<()> {
            [2.0, 7.0, 1.0, 8.0, 2.0]
        ]
    );
-    // PyTorch equivalent:
-    //     import torch
-    //     t1 = torch.tensor([[3, 1, 4, 1, 5], [2, 7, 1, 8, 2]])
-    //     t2 = torch.tensor([[5]*5, [2, 7, 1, 8, 2]])
-    //     torch.cat([t1.t(), t2.t()], dim=1).t()
+    // TODO: This is not the expected answer, to be fixed!
    assert_eq!(
        Tensor::cat(&[&t1.t()?, &t2.t()?], 1)?
            .t()?
@ -615,6 +282,7 @@ fn cat(device: &Device) -> Result<()> {
            [2.0, 7.0, 1.0, 8.0, 2.0]
        ]
    );
+    // TODO: This is not the expected answer, to be fixed!
    assert_eq!(
        Tensor::cat(&[&t1, &t2], 1)?.to_vec2::<f32>()?,
        [
@ -628,233 +296,8 @@ fn cat(device: &Device) -> Result<()> {
 fn embeddings(device: &Device) -> Result<()> {
    let ids = Tensor::new(&[0u32, 2u32, 1u32], device)?;
    let t = Tensor::new(&[[0f32, 1f32], [2f32, 3f32], [4f32, 5f32]], device)?;
-    let hs = t.embedding(&ids)?;
+    let hs = Tensor::embedding(&ids, &t)?;
    assert_eq!(hs.to_vec2::<f32>()?, &[[0.0, 1.0], [4.0, 5.0], [2.0, 3.0]]);
-    let hs = t.index_select(&ids, 0)?;
-    assert_eq!(hs.to_vec2::<f32>()?, &[[0.0, 1.0], [4.0, 5.0], [2.0, 3.0]]);
-    Ok(())
-}
-
-fn cmp(device: &Device) -> Result<()> {
-    let t1 = Tensor::new(&[[0f32, 1f32], [2f32, 3f32], [4f32, 5f32]], device)?;
-    let t2 = Tensor::new(&[[1f32, 0f32], [3f32, 3f32], [4f32, 7f32]], device)?;
-    assert_eq!(t1.eq(&t2)?.to_vec2::<u8>()?, &[[0, 0], [0, 1], [1, 0]]);
-    assert_eq!(t1.ne(&t2)?.to_vec2::<u8>()?, &[[1, 1], [1, 0], [0, 1]]);
-    assert_eq!(t1.le(&t2)?.to_vec2::<u8>()?, &[[1, 0], [1, 1], [1, 1]]);
-    assert_eq!(t1.lt(&t2)?.to_vec2::<u8>()?, &[[1, 0], [1, 0], [0, 1]]);
-    assert_eq!(t1.gt(&t2)?.to_vec2::<u8>()?, &[[0, 1], [0, 0], [0, 0]]);
-    assert_eq!(t1.ge(&t2)?.to_vec2::<u8>()?, &[[0, 1], [0, 1], [1, 0]]);
-    Ok(())
-}
-
-fn index_select(device: &Device) -> Result<()> {
-    let ids = Tensor::new(&[0u32, 2u32, 1u32], device)?;
-    let t = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        &[
-            [0.0, 1.0, 2.0],
-            [3.0, 4.0, 5.0],
-            [6.0, 7.0, 8.0],
-            [9.0, 10.0, 11.0]
-        ]
-    );
-    let hs = t.index_select(&ids, 1)?;
-    assert_eq!(
-        hs.to_vec2::<f32>()?,
-        &[
-            [0.0, 2.0, 1.0],
-            [3.0, 5.0, 4.0],
-            [6.0, 8.0, 7.0],
-            [9.0, 11.0, 10.0]
-        ]
-    );
-    let hs = t.index_select(&ids, 0)?;
-    assert_eq!(
-        hs.to_vec2::<f32>()?,
-        &[[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]]
-    );
-    // Prior to https://github.com/huggingface/candle/pull/1022
-    // There would be a bug where the last values in the result tensor would be set to 0.
-    let ids = Tensor::new(&[0u32, 2u32, 1u32, 0u32, 2u32, 1u32], device)?;
-    let hs = t.index_select(&ids, 0)?;
-    assert_eq!(
-        hs.to_vec2::<f32>()?,
-        &[
-            [0.0, 1.0, 2.0],
-            [6.0, 7.0, 8.0],
-            [3.0, 4.0, 5.0],
-            [0.0, 1.0, 2.0],
-            [6.0, 7.0, 8.0],
-            [3.0, 4.0, 5.0],
-        ]
-    );
-
-    // Test when selecting dim > 0 with ids size different from elem count of
-    // target dim in source/input.
-    let ids = Tensor::new(&[1u32, 0u32, 1u32], device)?;
-    let t = Tensor::arange(1f32, 5f32, device)?.reshape((2, 2))?;
-    assert_eq!(t.to_vec2::<f32>()?, &[[1.0, 2.0], [3.0, 4.0]]);
-    let hs = t.index_select(&ids, 1)?;
-    assert_eq!(hs.to_vec2::<f32>()?, &[[2.0, 1.0, 2.0], [4.0, 3.0, 4.0]]);
-
-    Ok(())
-}
-
-fn index_add(device: &Device) -> Result<()> {
-    let ids = Tensor::new(&[0u32, 1u32, 1u32], device)?;
-    let t = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        &[
-            [0.0, 1.0, 2.0],
-            [3.0, 4.0, 5.0],
-            [6.0, 7.0, 8.0],
-            [9.0, 10.0, 11.0]
-        ]
-    );
-    let init = Tensor::ones((4, 2), DType::F32, device)?;
-    let hs = init.index_add(&ids, &t, 1)?;
-    assert_eq!(
-        hs.to_vec2::<f32>()?,
-        &[[1.0, 4.0], [4.0, 10.0], [7.0, 16.0], [10.0, 22.0]],
-    );
-    let init = Tensor::zeros((4, 2), DType::F32, device)?;
-    let ids = Tensor::new(&[1u32, 0u32, 0u32], device)?;
-    let hs = init.index_add(&ids, &t, 1)?;
-    assert_eq!(
-        hs.to_vec2::<f32>()?,
-        &[[3.0, 0.0], [9.0, 3.0], [15.0, 6.0], [21.0, 9.0]],
-    );
-
-    let init = Tensor::zeros((6, 3), DType::F32, device)?;
-    let ids = Tensor::new(&[5u32, 0u32, 1u32, 0u32], device)?;
-    let hs = init.index_add(&ids, &t, 0)?;
-    assert_eq!(
-        hs.to_vec2::<f32>()?,
-        &[
-            [12.0, 14.0, 16.0],
-            [6.0, 7.0, 8.0],
-            [0.0, 0.0, 0.0],
-            [0.0, 0.0, 0.0],
-            [0.0, 0.0, 0.0],
-            [0.0, 1.0, 2.0]
-        ]
-    );
-    Ok(())
-}
-
-fn slice_scatter(device: &Device) -> Result<()> {
-    let t = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        &[
-            [0.0, 1.0, 2.0],
-            [3.0, 4.0, 5.0],
-            [6.0, 7.0, 8.0],
-            [9.0, 10.0, 11.0]
-        ]
-    );
-    let src = Tensor::arange(100f32, 106f32, device)?.reshape((2, 3))?;
-    assert_eq!(
-        t.slice_scatter0(&src, 0)?.to_vec2::<f32>()?,
-        &[
-            [100.0, 101.0, 102.0],
-            [103.0, 104.0, 105.0],
-            [6.0, 7.0, 8.0],
-            [9.0, 10.0, 11.0]
-        ]
-    );
-    assert_eq!(
-        t.slice_scatter0(&src, 1)?.to_vec2::<f32>()?,
-        &[
-            [0.0, 1.0, 2.0],
-            [100.0, 101.0, 102.0],
-            [103.0, 104.0, 105.0],
-            [9.0, 10.0, 11.0]
-        ]
-    );
-    assert_eq!(
-        t.slice_scatter0(&src, 2)?.to_vec2::<f32>()?,
-        &[
-            [0.0, 1.0, 2.0],
-            [3.0, 4.0, 5.0],
-            [100.0, 101.0, 102.0],
-            [103.0, 104.0, 105.0],
-        ]
-    );
-    Ok(())
-}
-
-fn scatter_add(device: &Device) -> Result<()> {
-    let t = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        &[
-            [0.0, 1.0, 2.0],
-            [3.0, 4.0, 5.0],
-            [6.0, 7.0, 8.0],
-            [9.0, 10.0, 11.0]
-        ]
-    );
-    let ids = Tensor::new(&[[0u32, 1, 2], [3, 4, 0], [3, 3, 1], [2, 0, 4]], device)?;
-    let init = Tensor::ones((4, 5), DType::F32, device)?;
-    let hs = init.scatter_add(&ids, &t, 1)?;
-    assert_eq!(
-        hs.to_vec2::<f32>()?,
-        &[
-            [1.0, 2.0, 3.0, 1.0, 1.0],
-            [6.0, 1.0, 1.0, 4.0, 5.0],
-            [1.0, 9.0, 1.0, 14.0, 1.0],
-            [11.0, 1.0, 10.0, 1.0, 12.0]
-        ]
-    );
-
-    let init = Tensor::ones((6, 3), DType::F32, device)?;
-    let hs = init.scatter_add(&ids, &t, 0)?;
-    assert_eq!(
-        hs.to_vec2::<f32>()?,
-        &[
-            [1.0, 11.0, 6.0],
-            [1.0, 2.0, 9.0],
-            [10.0, 1.0, 3.0],
-            [10.0, 8.0, 1.0],
-            [1.0, 5.0, 12.0],
-            [1.0, 1.0, 1.0]
-        ]
-    );
-    Ok(())
-}
-
-fn gather(device: &Device) -> Result<()> {
-    let ids = Tensor::new(&[[0u32], [2u32], [1u32], [0u32]], device)?;
-    let t = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
-    assert_eq!(
-        t.to_vec2::<f32>()?,
-        &[
-            [0.0, 1.0, 2.0],
-            [3.0, 4.0, 5.0],
-            [6.0, 7.0, 8.0],
-            [9.0, 10.0, 11.0]
-        ]
-    );
-    let hs = t.gather(&ids, 1)?;
-    assert_eq!(hs.to_vec2::<f32>()?, &[[0.0], [5.0], [7.0], [9.0]]);
-    let ids = Tensor::new(
-        &[[0u32, 0u32], [2u32, 0u32], [1u32, 1u32], [0u32, 2u32]],
-        device,
-    )?;
-    let hs = t.gather(&ids, 1)?;
-    assert_eq!(
-        hs.to_vec2::<f32>()?,
-        &[[0.0, 0.0], [5.0, 3.0], [7.0, 7.0], [9.0, 11.0]]
-    );
-    let ids = Tensor::new(&[[0u32, 2u32, 0u32]], device)?;
-    let hs = t.gather(&ids, 0)?;
-    assert_eq!(hs.to_vec2::<f32>()?, &[[0.0, 7.0, 2.0]]);
-    let ids = Tensor::new(&[[0u32, 2u32, 0u32], [0u32, 1u32, 1u32]], device)?;
-    let hs = t.gather(&ids, 0)?;
-    assert_eq!(hs.to_vec2::<f32>()?, &[[0.0, 7.0, 2.0], [0.0, 4.0, 5.0]]);
    Ok(())
 }

@ -907,25 +350,6 @@ fn matmul(device: &Device) -> Result<()> {
    Ok(())
 }

-fn broadcast_matmul(device: &Device) -> Result<()> {
-    let lhs = Tensor::randn(0f32, 1f32, (3, 1, 4, 5), device)?;
-    let rhs = Tensor::randn(0f32, 1f32, (6, 5, 2), device)?;
-    let out = lhs.broadcast_matmul(&rhs)?;
-    assert_eq!(out.dims(), &[3, 6, 4, 2]);
-    for idx1 in 0..3 {
-        for idx2 in 0..6 {
-            let out = out.i((idx1, idx2))?;
-            let lhs = lhs.i((idx1, 0))?;
-            let rhs = rhs.i(idx2)?;
-            let out2 = lhs.matmul(&rhs);
-            let sum_diff2 = (out - out2)?.sqr()?.sum_all()?;
-            // With cuda, we see errors of up to ~1e-12.
-            assert!(sum_diff2.to_vec0::<f32>()? < 1e-6)
-        }
-    }
-    Ok(())
-}
-
 fn broadcasting(device: &Device) -> Result<()> {
    let t1 = Tensor::arange(0f32, 24f32, device)?.reshape((4, 2, 3))?;
    let t2 = Tensor::new(&[100f32, 200f32], device)?;
@ -1027,49 +451,16 @@ fn broadcasting(device: &Device) -> Result<()> {
    Ok(())
 }

-fn randn(device: &Device) -> Result<()> {
-    let tensor = Tensor::randn(0f32, 1f32, (5, 3), device)?;
-    assert_eq!(tensor.dims(), [5, 3]);
-    let tensor = Tensor::rand(0f32, 1f32, (5, 3), device)?;
-    assert_eq!(tensor.dims(), [5, 3]);
-    Ok(())
-}
-
 test_device!(zeros, zeros_cpu, zeros_gpu);
-test_device!(ones, ones_cpu, ones_gpu);
 test_device!(add_mul, add_mul_cpu, add_mul_gpu);
 test_device!(tensor_2d, tensor_2d_cpu, tensor_2d_gpu);
 test_device!(narrow, narrow_cpu, narrow_gpu);
 test_device!(broadcast, broadcast_cpu, broadcast_gpu);
 test_device!(cat, cat_cpu, cat_gpu);
 test_device!(sum, sum_cpu, sum_gpu);
-test_device!(min, min_cpu, min_gpu);
-test_device!(max, max_cpu, max_gpu);
-test_device!(argmax, argmax_cpu, argmax_gpu);
-test_device!(argmin, argmin_cpu, argmin_gpu);
 test_device!(transpose, transpose_cpu, transpose_gpu);
-test_device!(unary_op, unary_op_cpu, unary_op_gpu);
 test_device!(binary_op, binary_op_cpu, binary_op_gpu);
+test_device!(softmax, softmax_cpu, softmax_gpu);
 test_device!(embeddings, embeddings_cpu, embeddings_gpu);
-test_device!(cmp, cmp_cpu, cmp_gpu);
 test_device!(matmul, matmul_cpu, matmul_gpu);
-test_device!(broadcast_matmul, broadcast_matmul_cpu, broadcast_matmul_gpu);
 test_device!(broadcasting, broadcasting_cpu, broadcasting_gpu);
-test_device!(index_select, index_select_cpu, index_select_gpu);
-test_device!(index_add, index_add_cpu, index_add_gpu);
-test_device!(gather, gather_cpu, gather_gpu);
-test_device!(scatter_add, scatter_add_cpu, scatter_add_gpu);
-test_device!(slice_scatter, slice_scatter_cpu, slice_scatter_gpu);
-test_device!(randn, randn_cpu, randn_gpu);
-test_device!(clamp, clamp_cpu, clamp_gpu);
-
-// There was originally a bug on the CPU implementation for randn
-// https://github.com/huggingface/candle/issues/381
-#[test]
-fn randn_hasneg() -> Result<()> {
-    let t = Tensor::randn(0f32, 1f32, 200, &Device::Cpu)?.to_vec1::<f32>()?;
-    if t.iter().all(|&v| v >= 0.) {
-        candle_core::bail!("all values in tensors are non-negative")
-    }
-    Ok(())
-}
--- a/candle-core/tests/test_utils.rs
+++ b/candle-core/tests/test_utils.rs
@ -1,4 +1,6 @@
-use crate::{Result, Tensor};
+#![allow(dead_code)]
+
+use candle::{Result, Tensor};

 #[macro_export]
 macro_rules! test_device {
@ -18,30 +20,7 @@ macro_rules! test_device {
    };
 }

-pub fn to_vec0_round(t: &Tensor, digits: i32) -> Result<f32> {
-    let b = 10f32.powi(digits);
-    let t = t.to_vec0::<f32>()?;
-    Ok(f32::round(t * b) / b)
-}
-
-pub fn to_vec1_round(t: &Tensor, digits: i32) -> Result<Vec<f32>> {
-    let b = 10f32.powi(digits);
-    let t = t.to_vec1::<f32>()?;
-    let t = t.iter().map(|t| f32::round(t * b) / b).collect();
-    Ok(t)
-}
-
-pub fn to_vec2_round(t: &Tensor, digits: i32) -> Result<Vec<Vec<f32>>> {
-    let b = 10f32.powi(digits);
-    let t = t.to_vec2::<f32>()?;
-    let t = t
-        .iter()
-        .map(|t| t.iter().map(|t| f32::round(t * b) / b).collect())
-        .collect();
-    Ok(t)
-}
-
-pub fn to_vec3_round(t: &Tensor, digits: i32) -> Result<Vec<Vec<Vec<f32>>>> {
+pub fn to_vec3_round(t: Tensor, digits: i32) -> Result<Vec<Vec<Vec<f32>>>> {
    let b = 10f32.powi(digits);
    let t = t.to_vec3::<f32>()?;
    let t = t
--- a/Show More
+++ b/Show More