Refactor the hierarchy.

2025-06-21 20:22:49 +00:00 · 2023-06-27 11:57:27 +02:00
parent 6c4a960b15
commit d7f729fb8f
41 changed files with 35 additions and 33 deletions
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@ -0,0 +1,32 @@
+[package]
+name = "candle"
+version = "0.1.0"
+edition = "2021"
+
+description = "Minimalist ML framework."
+repository = "https://github.com/LaurentMazare/candle"
+keywords = ["blas", "tensor", "machine-learning"]
+categories = ["science"]
+# SPDX license expression: the `/` separator is deprecated by Cargo in favour of `OR`.
+license = "MIT OR Apache-2.0"
+readme = "README.md"
+
+[dependencies]
+safetensors = "0.3.1"
+thiserror = "1"
+cudarc = { version = "0.9.9", optional = true, features = ["f16"] }
+candle-kernels = { path = "../candle-kernels", optional = true }
+gemm = "0.15.4"
+zip = { version = "0.6.6", default-features=false }
+byteorder = "1.4.3"
+half = { version = "2.3.1", features = ["num-traits"] }
+num-traits = "0.2.15"
+
+[dev-dependencies]
+anyhow = "1"
+clap = { version = "4.2.4", features = ["derive"] }
+rand = "0.8.5"
+tokenizers = { version = "0.13.3", default-features=false, features=["onig"] }
+
+[features]
+# NOTE(review): `cuda` is enabled by default, so a plain `cargo build` pulls in `cudarc`
+# and `candle-kernels`; presumably that needs a CUDA toolchain — confirm this is intended.
+default = ["cuda"]
+# Turns on the two optional CUDA dependencies declared above.
+cuda = ["dep:cudarc", "dep:candle-kernels"]
--- a/candle-core/LICENSE
+++ b/candle-core/LICENSE
@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/candle-core/Makefile
+++ b/candle-core/Makefile
@ -0,0 +1,12 @@
+# None of the targets below produce a file named after themselves.
+.PHONY: clean-ptx clean test all
+
+# Delete the PTX files found under `target`, blank the kernels crate's lib.rs and touch
+# its build.rs. The kernels crate is the `../candle-kernels` path dependency of this crate.
+# NOTE(review): `target` is resolved relative to this directory — confirm that is where
+# the build output lands after the hierarchy refactor.
+clean-ptx:
+	find target -name "*.ptx" -type f -delete
+	echo "" > ../candle-kernels/src/lib.rs
+	touch ../candle-kernels/build.rs
+
+# Remove all cargo build artifacts.
+clean:
+	cargo clean
+
+# Run the test suite.
+test:
+	cargo test
+
+all: test
--- a/candle-core/README.md
+++ b/candle-core/README.md
@ -0,0 +1,2 @@
+# candle
+Minimalist ML framework for Rust
--- a/candle-core/examples/basics.rs
+++ b/candle-core/examples/basics.rs
@ -0,0 +1,21 @@
+use anyhow::Result;
+use candle::{Device, Tensor};
+
+/// Concatenates the transposes of two 2x5 matrices along dim 1, transposes the result
+/// back, and checks that the rows of both inputs come out stacked in order.
+fn main() -> Result<()> {
+    let cpu = Device::Cpu;
+    let first = Tensor::new(&[[3f32, 1., 4., 1., 5.], [2., 7., 1., 8., 2.]], &cpu)?;
+    let second = Tensor::new(&[[5f32, 5., 5., 5., 5.], [2., 7., 1., 8., 2.]], &cpu)?;
+    let stacked = Tensor::cat(&[&first.t()?, &second.t()?], 1)?.t()?;
+    let rows = stacked.to_vec2::<f32>()?;
+    assert_eq!(
+        rows,
+        [
+            [3.0, 1.0, 4.0, 1.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0],
+            [5.0, 5.0, 5.0, 5.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0]
+        ]
+    );
+    Ok(())
+}
--- a/candle-core/examples/cuda_basics.rs
+++ b/candle-core/examples/cuda_basics.rs
@ -0,0 +1,31 @@
+use anyhow::Result;
+use candle::{Device, Tensor};
+
+/// Runs a few tensor operations on CUDA device 0 and prints the results, then repeats
+/// the matrix product on the CPU.
+fn main() -> Result<()> {
+    let gpu = Device::new_cuda(0)?;
+
+    // Embedding lookup with the ids [0, 2, 1] into a 3x2 table.
+    let indexes = Tensor::new(&[0u32, 2u32, 1u32], &gpu)?;
+    let table = Tensor::new(&[[0f32, 1f32], [2f32, 3f32], [4f32, 5f32]], &gpu)?;
+    let embedded = Tensor::embedding(&indexes, &table)?;
+    // The `Result` itself is printed here: a conversion error is shown, not propagated.
+    println!("> {:?}", embedded.to_vec2::<f32>());
+
+    // Element-wise arithmetic on two vectors of five elements.
+    let lhs = Tensor::new(&[3f32, 1., 4., 1., 5.], &gpu)?;
+    println!("{:?}", lhs.to_vec1::<f32>()?);
+    let rhs = Tensor::new(&[2f32, 7., 1., 8., 2.], &gpu)?;
+    let combined = (rhs + lhs * 3.)?;
+    println!("{:?}", combined.to_vec1::<f32>()?);
+    println!("{:?}", combined.sqrt()?.to_vec1::<f32>()?);
+
+    // (4x2) x (2x3) matrix product on the GPU.
+    let gpu_a = Tensor::new(&[[11f32, 22.], [33., 44.], [55., 66.], [77., 78.]], &gpu)?;
+    let gpu_b = Tensor::new(&[[1f32, 2., 3.], [4., 5., 6.]], &gpu)?;
+    println!("{:?}", gpu_b.to_vec2::<f32>()?);
+    let gpu_product = gpu_a.matmul(&gpu_b)?;
+    println!("{:?}", gpu_product.to_vec2::<f32>()?);
+
+    // The same product on the CPU.
+    let cpu = Device::Cpu;
+    let cpu_a = Tensor::new(&[[11f32, 22.], [33., 44.], [55., 66.], [77., 78.]], &cpu)?;
+    let cpu_b = Tensor::new(&[[1f32, 2., 3.], [4., 5., 6.]], &cpu)?;
+    println!("{:?}", cpu_b.to_vec2::<f32>()?);
+    let cpu_product = cpu_a.matmul(&cpu_b)?;
+    println!("{:?}", cpu_product.to_vec2::<f32>()?);
+    Ok(())
+}
--- a/candle-core/examples/llama/convert_checkpoint.py
+++ b/candle-core/examples/llama/convert_checkpoint.py
@ -0,0 +1,68 @@
+# Adapted from https://github.com/Lightning-AI/lit-llama/blob/main/scripts/convert_checkpoint.py
+import sys
+import torch
+import numpy as np
+from typing import Dict
+from pathlib import Path
+
+def tr(v):
+    return np.ascontiguousarray(np.transpose(v))
+
+def convert_state_dict(state_dict: Dict[str, torch.Tensor], dtype: torch.dtype = torch.float32) -> Dict[str, torch.Tensor]:
+    print("start conv")
+
+    def get_and_remove(key, transpose=False):
+        v = state_dict[key].to(dtype).numpy()
+        if transpose:
+            v = tr(v)
+        del state_dict[key]
+        return v
+
+    converted = {}
+    converted["transformer.wte.weight"] = get_and_remove("tok_embeddings.weight")
+    converted["lm_head.weight"] = get_and_remove("output.weight", transpose=True)
+    converted["transformer.ln_f.scale"] = get_and_remove("norm.weight")
+
+    for layer_idx in sorted(set([k.split(".")[1] for k in state_dict if k.startswith("layers")])):
+        print(layer_idx)
+
+        # attention
+        # the wq, wk, wv from the FB model are stacked in our model as c_attn
+        converted[f"transformer.h.{layer_idx}.attn.c_attn.weight"] = tr(np.concatenate(
+            (
+                get_and_remove(f"layers.{layer_idx}.attention.wq.weight"),
+                get_and_remove(f"layers.{layer_idx}.attention.wk.weight"),
+                get_and_remove(f"layers.{layer_idx}.attention.wv.weight"),
+            )
+        ))
+        converted[f"transformer.h.{layer_idx}.attn.c_proj.weight"] = tr(get_and_remove(
+            f"layers.{layer_idx}.attention.wo.weight"
+            ))
+        # mlp
+        converted[f"transformer.h.{layer_idx}.mlp.c_fc1.weight"] = get_and_remove(
+            f"layers.{layer_idx}.feed_forward.w1.weight", transpose=True,
+            )
+        converted[f"transformer.h.{layer_idx}.mlp.c_proj.weight"] = get_and_remove(
+            f"layers.{layer_idx}.feed_forward.w2.weight", transpose=True,
+            )
+        converted[f"transformer.h.{layer_idx}.mlp.c_fc2.weight"] = get_and_remove(
+            f"layers.{layer_idx}.feed_forward.w3.weight", transpose=True,
+            )
+        # rms norm
+        converted[f"transformer.h.{layer_idx}.rms_1.scale"] = get_and_remove(f"layers.{layer_idx}.attention_norm.weight")
+        converted[f"transformer.h.{layer_idx}.rms_2.scale"] = get_and_remove(f"layers.{layer_idx}.ffn_norm.weight")
+    return converted
+
+def convert_weights(llama_ckpt, *, output_npz: Path = Path("llama.npz"), dtype: str = "float32") -> None:
+    dt = getattr(torch, dtype, None)
+    if not isinstance(dt, torch.dtype):
+        raise ValueError(f"{dtype} is not a valid dtype.")
+    checkpoint = torch.load(llama_ckpt, map_location="cpu")
+    converted = convert_state_dict(checkpoint, dtype=dt)
+    del checkpoint
+    np.savez(output_npz, **converted)
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        raise ValueError(f"usage: convert_checkpoint.py ..../LLaMA/7B/consolidated.00.pth")
+    convert_weights(sys.argv[1])
--- a/candle-core/examples/llama/main.rs
+++ b/candle-core/examples/llama/main.rs
@ -0,0 +1,485 @@
+// An implementation of LLaMA https://github.com/facebookresearch/llama
+//
+// This is based on nanoGPT in a similar way to:
+// https://github.com/Lightning-AI/lit-llama/blob/main/lit_llama/model.py
+//
+// The tokenizer config can be retrieved from:
+// https://huggingface.co/hf-internal-testing/llama-tokenizer/raw/main/tokenizer.json
+//
+// In order to convert the llama weights to a .npz file, run:
+// python examples/llama/convert_checkpoint.py ..../LLaMA/7B/consolidated.00.pth
+
+// TODO: This does not use a batch dimension. If adding it back, be cautious about the
+// transposition operations.
+use anyhow::{Error as E, Result};
+use clap::Parser;
+
+use candle::{DType, Device, Tensor};
+
+mod var_store;
+use var_store::VarBuilder;
+
+// Number of positions in the precomputed rotary table, and the maximum number of trailing
+// tokens fed to the model at each generation step.
+const CONTEXT_SIZE: usize = 512;
+const START_PROMPT: &str = r"
+EDWARD:
+I wonder how our princely father 'scaped,
+Or whether he be 'scaped away or no
+From Clifford's and Northumberland's pursuit:
+Had he been ta'en, we should have heard the news;
+Had he been slain, we should have heard the news;
+Or had he 'scaped, methinks we should have heard
+The happy tidings of his good escape.
+How fares my brother? why is he so sad?
+
+RICHARD:
+I cannot joy, until I be resolved
+Where our right valiant father is become.
+I saw him in the battle range about;
+And watch'd him how he singled Clifford forth.
+Methought he bore him in the thickest troop
+As doth a lion in a herd of neat;
+Or as a bear, encompass'd round with dogs,
+Who having pinch'd a few and made them cry,
+The rest stand all aloof, and bark at him.
+So fared our father with his enemies;
+So fled his enemies my warlike father:
+Methinks, 'tis prize enough to be his son.
+See how the morning opes her golden gates,
+And takes her farewell of the glorious sun!
+How well resembles it the prime of youth,
+Trimm'd like a younker prancing to his love!
+
+EDWARD:
+Dazzle mine eyes, or do I see three suns?
+
+RICHARD:
+Three glorious suns, each one a perfect sun;
+Not separated with the racking clouds,
+But sever'd in a pale clear-shining sky.
+See, see! they join, embrace, and seem to kiss,
+As if they vow'd some league inviolable:
+Now are they but one lamp, one light, one sun.
+In this the heaven figures some event.
+
+EDWARD:
+'Tis wondrous strange, the like yet never heard of.
+I think it cites us, brother, to the field,
+That we, the sons of brave Plantagenet,
+Each one already blazing by our meeds,
+Should notwithstanding join our lights together
+And over-shine the earth as this the world.
+Whate'er it bodes, henceforward will I bear
+Upon my target three fair-shining suns.
+";
+
+/// Hyper-parameters of one LLaMA model size.
+#[allow(dead_code)]
+struct Config {
+    block_size: usize,
+    vocab_size: usize,
+    n_layer: usize,
+    n_head: usize,
+    n_embd: usize,
+}
+
+#[allow(dead_code)]
+impl Config {
+    /// Shared constructor: every preset uses `block_size = 4096` and `vocab_size = 32000`.
+    fn preset(n_layer: usize, n_head: usize, n_embd: usize) -> Self {
+        Self {
+            block_size: 4096,
+            vocab_size: 32000,
+            n_layer,
+            n_head,
+            n_embd,
+        }
+    }
+
+    /// 7B preset: 32 layers, 32 heads, embedding width 4096.
+    fn config_7b() -> Self {
+        Self::preset(32, 32, 4096)
+    }
+
+    /// 13B preset: 40 layers, 40 heads, embedding width 5120.
+    fn config_13b() -> Self {
+        Self::preset(40, 40, 5120)
+    }
+
+    /// 30B preset: 60 layers, 52 heads, embedding width 6656.
+    fn config_30b() -> Self {
+        Self::preset(60, 52, 6656)
+    }
+
+    /// 65B preset: 80 layers, 64 heads, embedding width 8192.
+    fn config_65b() -> Self {
+        Self::preset(80, 64, 8192)
+    }
+}
+
+/// Lookup table from token indexes to embedding rows.
+struct Embedding {
+    embeddings: Tensor,
+}
+
+impl Embedding {
+    /// Fetches the `weight` variable of shape `(vocab_size, n_embd)` from `vb`.
+    fn new(mut vb: VarBuilder, vocab_size: usize, n_embd: usize) -> Result<Self> {
+        Ok(Self {
+            embeddings: vb.var("weight", (vocab_size, n_embd))?,
+        })
+    }
+
+    /// Converts the table to `F32` and looks `indexes` up in it.
+    fn forward(&self, indexes: &Tensor) -> Result<Tensor> {
+        let table = self.embeddings.to_dtype(DType::F32)?;
+        let looked_up = Tensor::embedding(indexes, &table)?;
+        Ok(looked_up)
+    }
+}
+
+/// Linear layer: a weight matrix and an optional bias.
+struct Linear {
+    ws: Tensor,
+    bs: Option<Tensor>,
+}
+
+impl Linear {
+    /// Fetches `weight` of shape `(in_size, out_size)` and `bias` of shape `out_size`.
+    #[allow(dead_code)]
+    fn new(mut vb: VarBuilder, in_size: usize, out_size: usize) -> Result<Self> {
+        let weight = vb.var("weight", (in_size, out_size))?;
+        let bias = vb.var("bias", out_size)?;
+        Ok(Self {
+            ws: weight,
+            bs: Some(bias),
+        })
+    }
+
+    /// Fetches only `weight` of shape `(in_size, out_size)`; the layer has no bias.
+    fn new_no_bias(mut vb: VarBuilder, in_size: usize, out_size: usize) -> Result<Self> {
+        let weight = vb.var("weight", (in_size, out_size))?;
+        Ok(Self {
+            ws: weight,
+            bs: None,
+        })
+    }
+
+    /// Multiplies `x` by the `F32` weights and, when a bias is present, broadcast-adds it.
+    fn forward(&self, x: &Tensor) -> Result<Tensor> {
+        let weight = self.ws.to_dtype(DType::F32)?;
+        let projected = x.matmul(&weight)?;
+        if let Some(bias) = &self.bs {
+            let bias = bias.to_dtype(DType::F32)?;
+            Ok(projected.broadcast_add(&bias)?)
+        } else {
+            Ok(projected)
+        }
+    }
+}
+
+/// Root-mean-square normalisation followed by a learned per-feature scale.
+struct RmsNorm {
+    // Scale vector, fetched under the name "scale".
+    scale: Tensor,
+    // Number of features; used to broadcast `scale` in `forward`.
+    size: usize,
+}
+
+impl RmsNorm {
+    /// Fetches the `scale` variable of shape `[size]` from `vb`.
+    fn new(mut vb: VarBuilder, size: usize) -> Result<Self> {
+        let scale = vb.var("scale", &[size])?;
+        Ok(Self { scale, size })
+    }
+
+    /// Divides a rank-2 input `(seq_len, hidden_size)` by the square root of its mean
+    /// square over dim 1 (plus 1e-5), then multiplies by the `F32` scale.
+    fn forward(&self, x: &Tensor) -> Result<Tensor> {
+        let (seq_len, hidden_size) = x.shape().r2()?;
+        // Mean of the squared values over dim 1.
+        let norm_x = ((x * x)?.sum(&[1])? / hidden_size as f64)?;
+        // NOTE(review): assumes `sum` keeps the reduced dim so this broadcast is valid — confirm.
+        let norm_x = norm_x.broadcast_as((seq_len, hidden_size))?;
+        // 1e-5 is added before the square root to keep the division well defined.
+        let x_normed = (x / (norm_x + 1e-5)?.sqrt()?)?;
+        let scale = self
+            .scale
+            .to_dtype(DType::F32)?
+            .broadcast_as((seq_len, self.size))?;
+        Ok((scale * x_normed)?)
+    }
+}
+
+/// Feed-forward block built from three bias-free linear layers.
+struct Mlp {
+    // Projection whose output goes through `silu` in `forward`.
+    c_fc1: Linear,
+    // Projection multiplied element-wise with the `silu` output.
+    c_fc2: Linear,
+    // Final projection applied to that product.
+    c_proj: Linear,
+}
+
+/// Computes `xs / (exp(-xs) + 1)` element-wise.
+fn silu(xs: &Tensor) -> Result<Tensor> {
+    let denominator = (xs.neg()?.exp()? + 1.0)?;
+    Ok((xs / denominator)?)
+}
+
+impl Mlp {
+    /// Builds the three bias-free projections under `c_fc1`, `c_fc2` and `c_proj`.
+    ///
+    /// The hidden width is `8 * n_embd / 3` rounded up to the next multiple of 256.
+    fn new(vb: VarBuilder, n_embd: usize) -> Result<Self> {
+        let raw_hidden = 8 * n_embd / 3;
+        let hidden = (raw_hidden - 1) / 256 * 256 + 256;
+        Ok(Self {
+            c_fc1: Linear::new_no_bias(&vb / "c_fc1", n_embd, hidden)?,
+            c_fc2: Linear::new_no_bias(&vb / "c_fc2", n_embd, hidden)?,
+            c_proj: Linear::new_no_bias(&vb / "c_proj", hidden, n_embd)?,
+        })
+    }
+
+    /// Returns `c_proj(silu(c_fc1(x)) * c_fc2(x))`.
+    fn forward(&self, x: &Tensor) -> Result<Tensor> {
+        let gate = silu(&self.c_fc1.forward(x)?)?;
+        let up = self.c_fc2.forward(x)?;
+        let gated = (gate * up)?;
+        self.c_proj.forward(&gated)
+    }
+}
+
+/// Selects between a constant and `on_false` according to `mask`.
+///
+/// The scalar `on_true` is created on the device of `on_false`, broadcast to the shape of
+/// `mask`, and passed with `on_false` to `mask.where_cond`.
+fn masked_fill(on_false: &Tensor, mask: &Tensor, on_true: f32) -> Result<Tensor> {
+    let fill = Tensor::new(on_true, &on_false.device())?;
+    let fill = fill.broadcast_as(mask.shape().dims())?;
+    Ok(mask.where_cond(&fill, on_false)?)
+}
+
+/// Multi-head causal self-attention with rotary position embeddings.
+struct CausalSelfAttention {
+    // Fused projection producing the query, key and value in one matmul.
+    c_attn: Linear,
+    // Output projection.
+    c_proj: Linear,
+    n_head: usize,
+    n_embd: usize,
+}
+
+impl CausalSelfAttention {
+    /// Builds the bias-free `c_attn` (`n_embd -> 3 * n_embd`) and `c_proj`
+    /// (`n_embd -> n_embd`) layers.
+    fn new(vb: VarBuilder, n_head: usize, n_embd: usize) -> Result<Self> {
+        let c_attn = Linear::new_no_bias(&vb / "c_attn", n_embd, 3 * n_embd)?;
+        let c_proj = Linear::new_no_bias(&vb / "c_proj", n_embd, n_embd)?;
+        Ok(Self {
+            c_attn,
+            c_proj,
+            n_head,
+            n_embd,
+        })
+    }
+
+    /// Rotates consecutive pairs of the last dim of `x` by the angles stored in `freqs_cis`.
+    ///
+    /// `x` is expected as `(n_head, seq_len, head_dim)` and `freqs_cis` as
+    /// `(1, positions, head_dim / 2, 2)` holding cos in slot 0 and sin in slot 1 of its
+    /// last dim. When the table holds more positions than `x` has, only the first
+    /// `seq_len` are used, so inputs shorter than the table are supported.
+    fn apply_rotary_emb(&self, x: &Tensor, freqs_cis: &Tensor) -> Result<Tensor> {
+        let mut dims = x.dims().to_vec();
+        // Restrict the table to the positions present in `x`: broadcasting a longer
+        // table against a shorter sequence cannot succeed.
+        let seq_len = dims[1];
+        let narrowed;
+        let freqs_cis = if seq_len < freqs_cis.dims()[1] {
+            narrowed = freqs_cis.narrow(1, 0, seq_len)?;
+            &narrowed
+        } else {
+            freqs_cis
+        };
+        // Split the last dim into (head_dim / 2, 2) so each pair is a (re, im) couple.
+        let v = dims.pop().unwrap();
+        dims.push(v / 2);
+        dims.push(2);
+        let x = x.reshape(dims)?;
+        let rank = x.rank();
+        let re_x = x.narrow(rank - 1, 0, 1)?;
+        let im_x = x.narrow(rank - 1, 1, 1)?;
+        let re_f = freqs_cis
+            .narrow(rank - 1, 0, 1)?
+            .broadcast_as(re_x.shape())?;
+        let im_f = freqs_cis
+            .narrow(rank - 1, 1, 1)?
+            .broadcast_as(im_x.shape())?;
+        // Complex multiplication (re_x + i im_x) * (re_f + i im_f).
+        let re = ((&re_x * &re_f)? - (&im_x * &im_f)?)?;
+        let im = ((&re_x * &im_f)? + (&im_x * &re_f)?)?;
+        let rope = Tensor::cat(&[&re, &im], rank - 1)?;
+        // TODO: Add the flatten op.
+        // Merge the two trailing dims back into a single head_dim.
+        let mut dims = rope.dims().to_vec();
+        let v1 = dims.pop().unwrap();
+        let v2 = dims.pop().unwrap();
+        dims.push(v1 * v2);
+        let rope = rope.reshape(dims)?;
+        Ok(rope)
+    }
+
+    /// Applies causal attention to a rank-2 input `(t, c)` and returns a `(t, c)` tensor.
+    fn forward(&self, x: &Tensor, freqs_cis: &Tensor) -> Result<Tensor> {
+        let (t, c) = x.shape().r2()?;
+        let qkv = self.c_attn.forward(x)?;
+        let n_embd = self.n_embd;
+        // The fused projection is laid out as [q | k | v] along dim 1.
+        let q = qkv.narrow(1, 0, n_embd)?;
+        let k = qkv.narrow(1, n_embd, n_embd)?;
+        let v = qkv.narrow(1, 2 * n_embd, n_embd)?;
+        // (t, c) -> (t, n_head, head_dim) -> (n_head, t, head_dim)
+        let target_dim = [t, self.n_head, c / self.n_head];
+        let k = k.reshape(target_dim.as_slice())?.transpose(0, 1)?;
+        let q = q.reshape(target_dim.as_slice())?.transpose(0, 1)?;
+        let v = v.reshape(target_dim.as_slice())?.transpose(0, 1)?;
+        let q = self.apply_rotary_emb(&q, freqs_cis)?;
+        let k = self.apply_rotary_emb(&k, freqs_cis)?;
+        let k_shape = k.shape();
+        // Scores are scaled by the square root of the head dimension.
+        let att = (q.matmul(&k.t()?)? / (*k_shape.dims().last().unwrap() as f64).sqrt())?;
+        let device = x.device();
+        // TODO: If we support bool or u8 tensors, this would be better.
+        // mask[i][j] == 1 when j > i, i.e. for positions that lie in the future.
+        let mask: Vec<_> = (0..t)
+            .flat_map(|i| (0..t).map(move |j| u32::from(j > i)))
+            .collect();
+        // Once lower_triangle is available, use the following:
+        //let mask = Tensor::new(1u32, &device)?
+        //    .broadcast_as(&[t, t])?
+        //    .lower_triangle()?
+        let mask = Tensor::from_slice(&mask, (t, t), &device)?.broadcast_as(att.shape())?;
+        // Masked scores become -inf before the softmax over the last dim.
+        let att = masked_fill(&att, &mask, f32::NEG_INFINITY)?;
+        let att = att.softmax(att.rank() - 1)?;
+        // Convert to contiguous as matmul doesn't support strided vs for now.
+        let y = att.matmul(&v.contiguous()?)?;
+        // (n_head, t, head_dim) -> (t, n_head, head_dim) -> (t, c)
+        let y = y.transpose(0, 1)?.reshape(&[t, c])?;
+        let y = self.c_proj.forward(&y)?;
+        Ok(y)
+    }
+}
+
+/// One transformer layer: normalised attention and normalised MLP, each with a residual.
+struct Block {
+    rms_1: RmsNorm,
+    attn: CausalSelfAttention,
+    rms_2: RmsNorm,
+    mlp: Mlp,
+}
+
+impl Block {
+    /// Builds the sub-modules under the `rms_1`, `attn`, `rms_2` and `mlp` prefixes of `vb`.
+    fn new(vb: VarBuilder, config: &Config) -> Result<Self> {
+        let width = config.n_embd;
+        Ok(Self {
+            rms_1: RmsNorm::new(&vb / "rms_1", width)?,
+            attn: CausalSelfAttention::new(&vb / "attn", config.n_head, width)?,
+            rms_2: RmsNorm::new(&vb / "rms_2", width)?,
+            mlp: Mlp::new(&vb / "mlp", width)?,
+        })
+    }
+
+    /// Returns `h + mlp(rms_2(h))` where `h = x + attn(rms_1(x))`.
+    fn forward(&self, x: &Tensor, freqs_cis: &Tensor) -> Result<Tensor> {
+        let normed = self.rms_1.forward(x)?;
+        let after_attn = (self.attn.forward(&normed, freqs_cis)? + x)?;
+        let normed = self.rms_2.forward(&after_attn)?;
+        Ok((self.mlp.forward(&normed)? + after_attn)?)
+    }
+}
+
+/// The full model: token embedding, transformer blocks, final norm and output head.
+struct Llama {
+    wte: Embedding,
+    blocks: Vec<Block>,
+    ln_f: RmsNorm,
+    lm_head: Linear,
+}
+
+impl Llama {
+    /// Builds the model from `vb`: `lm_head` at the root, everything else under
+    /// `transformer` (`wte`, `h.<i>` for each layer, `ln_f`).
+    fn new(vb: VarBuilder, config: &Config) -> Result<Self> {
+        let lm_head = Linear::new_no_bias(&vb / "lm_head", config.n_embd, config.vocab_size)?;
+        let wte = Embedding::new(
+            &vb / "transformer" / "wte",
+            config.vocab_size,
+            config.n_embd,
+        )?;
+        let mut blocks = Vec::with_capacity(config.n_layer);
+        for layer_idx in 0..config.n_layer {
+            blocks.push(Block::new(&vb / "transformer" / "h" / layer_idx, config)?);
+        }
+        let ln_f = RmsNorm::new(&vb / "transformer" / "ln_f", config.n_embd)?;
+        Ok(Self {
+            wte,
+            blocks,
+            ln_f,
+            lm_head,
+        })
+    }
+
+    /// Runs the model on a rank-1 tensor of token ids and returns the logits of the last
+    /// position as a rank-1 tensor of length `vocab_size`.
+    fn forward(&self, x: &Tensor, freqs_cis: &Tensor) -> Result<Tensor> {
+        // TODO: Support for mini-batches? (i.e. r2)
+        let seq_len = x.shape().r1()?;
+        let mut hidden = self.wte.forward(x)?;
+        for layer in &self.blocks {
+            hidden = layer.forward(&hidden, freqs_cis)?;
+        }
+        let hidden = self.ln_f.forward(&hidden)?;
+        // Only the last position is needed to predict the next token.
+        let last = hidden.narrow(0, seq_len - 1, 1)?;
+        let logits = self.lm_head.forward(&last)?;
+        let (rows, vocab_size) = logits.shape().r2()?;
+        assert_eq!(rows, 1);
+        Ok(logits.reshape(vocab_size)?)
+    }
+}
+
+/// Builds the rotary table of shape `(1, CONTEXT_SIZE, head_dim / 2, 2)`.
+///
+/// The last dim holds the cosine (slot 0) and sine (slot 1) of `position * theta`, with
+/// `theta = 1 / 10000^(i / head_dim)` for every even `i` below `head_dim`.
+fn precompute_freqs_cis(config: &Config, device: &Device) -> Result<Tensor> {
+    let head_dim = config.n_embd / config.n_head;
+    let inv_freq: Vec<f32> = (0..head_dim)
+        .step_by(2)
+        .map(|i| 1f32 / 10000f32.powf(i as f32 / head_dim as f32))
+        .collect();
+    let positions: Vec<f32> = (0..CONTEXT_SIZE).map(|p| p as f32).collect();
+    let n_freq = inv_freq.len();
+    let inv_freq = Tensor::new(inv_freq.as_slice(), device)?;
+    let positions = Tensor::new(positions.as_slice(), device)?;
+    // Outer product: one row per position, one column per frequency.
+    let angles = positions
+        .reshape((CONTEXT_SIZE, 1))?
+        .matmul(&inv_freq.reshape((1, n_freq))?)?;
+    let table_shape = [1, CONTEXT_SIZE, head_dim / 2, 1];
+    let cos = angles.cos()?.reshape(&table_shape)?;
+    let sin = angles.sin()?.reshape(&table_shape)?;
+    Ok(Tensor::cat(&[&cos, &sin], cos.rank() - 1)?)
+}
+
+// Command-line options of the LLaMA example, parsed through clap's derive API.
+// The `///` comments on the fields are the help text shown to the user, so they are
+// part of the program's output.
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    cpu: bool,
+
+    /// The temperature used to generate samples.
+    // The logits are divided by this value before the softmax in `main`.
+    #[arg(long, default_value_t = 1.0)]
+    temperature: f64,
+
+    /// The length of the sample to generate (in tokens).
+    #[arg(long, default_value_t = 100)]
+    sample_len: usize,
+}
+
+/// Loads the tokenizer and the weights, then samples `sample_len` tokens after
+/// `START_PROMPT`, printing each token as it is drawn and the decoded sample at the end.
+fn main() -> Result<()> {
+    use rand::prelude::*;
+    use tokenizers::Tokenizer;
+
+    let args = Args::parse();
+    // CUDA device 0 is used unless `--cpu` is given.
+    let device = if args.cpu {
+        Device::Cpu
+    } else {
+        Device::new_cuda(0)?
+    };
+    println!("loading tokenizer config");
+    // The tokenizer file is read from the current working directory.
+    let tokenizer = Tokenizer::from_file("llama-tokenizer.json").map_err(E::msg)?;
+    let mut tokens = tokenizer
+        .encode(START_PROMPT, true)
+        .map_err(E::msg)?
+        .get_ids()
+        .to_vec();
+
+    // Weights are optional: when `llama.npz` is missing the model is built from zeros.
+    let weight_path = std::path::Path::new("llama.npz");
+    let weights = if weight_path.exists() {
+        println!("loading weights from {weight_path:?}");
+        let start_load = std::time::Instant::now();
+        let tensors = Tensor::read_npz(weight_path)?;
+        println!("loaded weights in {:?}", start_load.elapsed());
+        let tensors: std::collections::HashMap<String, Tensor> = tensors.into_iter().collect();
+        Some(tensors)
+    } else {
+        println!("cannot find {weight_path:?}, using zero weights");
+        None
+    };
+    let vb = VarBuilder::new::<f32>(&device, weights);
+
+    println!("building the model");
+    let config = Config::config_7b();
+    let llama = Llama::new(vb, &config)?;
+
+    println!("pre-computing the positional embeddings");
+    let freqs_cis = precompute_freqs_cis(&config, &device)?;
+    println!("starting the inference loop");
+    let mut new_tokens = vec![];
+    let mut rng = thread_rng();
+    for index in 0..args.sample_len {
+        // Feed at most the last CONTEXT_SIZE tokens; the whole context is re-run each step.
+        let ctxt = &tokens[tokens.len().saturating_sub(CONTEXT_SIZE)..];
+        let input = Tensor::new(ctxt, &device)?;
+        let logits = llama.forward(&input, &freqs_cis)?;
+        // Temperature-scaled softmax; despite its name `logits_v` holds probabilities.
+        let prs = (&logits / args.temperature)?.softmax(logits.rank() - 1)?;
+        let logits_v: Vec<f32> = prs.to_vec1()?;
+        // Draw the next token at random, weighted by those probabilities.
+        let distr = rand::distributions::WeightedIndex::new(&logits_v)?;
+        let next_token = distr.sample(&mut rng) as u32;
+        tokens.push(next_token);
+        new_tokens.push(next_token);
+        println!(
+            "{} token: {} '{}'",
+            index + 1,
+            next_token,
+            tokenizer.decode(vec![next_token], true).map_err(E::msg)?
+        );
+    }
+    // Print the generated tokens only (the prompt is not included).
+    println!(
+        "----\n{}\n----",
+        tokenizer.decode(new_tokens, true).map_err(E::msg)?
+    );
+    Ok(())
+}
--- a/candle-core/examples/llama/var_store.rs
+++ b/candle-core/examples/llama/var_store.rs
@ -0,0 +1,92 @@
+use candle::{DType, Device, Result, Shape, Tensor, WithDType};
+use std::collections::HashMap;
+use std::sync::Arc;
+
+/// Metadata recorded for every variable requested through a `VarBuilder`.
+#[allow(dead_code)]
+#[derive(Clone)]
+struct NamedVar {
+    // Dot-separated name under which the variable was requested.
+    path: String,
+    // Default dtype of the builder at the time the variable was requested.
+    dtype: DType,
+    // Shape that was passed when the variable was requested.
+    shape: Shape,
+}
+
+/// Hands out named tensors, either zero-initialised or looked up in a map of pre-loaded
+/// tensors, while keeping a record of every variable that was requested.
+#[derive(Clone)]
+pub struct VarBuilder {
+    // Name components leading to this builder; joined with "." to form variable names.
+    path: Vec<String>,
+    // Record of the requested variables. The `Rc` is cloned (not the vec) when a sub-builder
+    // is derived with `/`, so all derived builders append to the same list.
+    vars: std::rc::Rc<std::cell::RefCell<Vec<NamedVar>>>,
+    // Dtype used for zero-initialised variables and stored in the variable record.
+    default_dtype: DType,
+    // Device on which the returned tensors are placed.
+    default_device: Device,
+    // Optional pre-loaded tensors keyed by full variable name; when `None`, variables are
+    // created as zeros.
+    tensors: Arc<Option<HashMap<String, Tensor>>>,
+}
+
+/// Snapshot of the variables recorded by a `VarBuilder`, as returned by
+/// `VarBuilder::into_store`.
+#[allow(dead_code)]
+pub struct VarStore {
+    // Copy of the variable records taken when the store was created.
+    vars: Vec<NamedVar>,
+}
+
+impl VarBuilder {
+    /// Creates a root builder with an empty path.
+    ///
+    /// `B` selects the default dtype, `device` is where the returned tensors are placed and
+    /// `tensors` optionally provides pre-loaded values keyed by full variable name. When
+    /// `tensors` is `None`, every variable is created filled with zeros.
+    pub fn new<B: WithDType>(device: &Device, tensors: Option<HashMap<String, Tensor>>) -> Self {
+        Self {
+            path: vec![],
+            vars: std::rc::Rc::new(std::cell::RefCell::new(vec![])),
+            default_dtype: B::DTYPE,
+            default_device: device.clone(),
+            tensors: Arc::new(tensors),
+        }
+    }
+
+    /// Returns the number of variables recorded so far, including the ones requested through
+    /// builders derived from the same root with `/`.
+    pub fn len(&self) -> usize {
+        self.vars.borrow().len()
+    }
+
+    /// Returns the tensor for the variable named `s` below the current path and records it.
+    ///
+    /// The full name is the path components and `s` joined with `.`; a builder with an empty
+    /// path uses `s` alone (no leading dot). Without pre-loaded tensors the result is a zero
+    /// tensor of the given shape and the default dtype; otherwise the pre-loaded tensor is
+    /// moved to the default device.
+    ///
+    /// # Errors
+    /// Propagates failures of the zero initialisation or of the device transfer.
+    ///
+    /// # Panics
+    /// Panics if pre-loaded tensors were provided but none is stored under the full name.
+    pub fn var<S: Into<Shape>>(&mut self, s: &str, shape: S) -> Result<Tensor> {
+        let shape = shape.into();
+        // Avoid producing ".name" for variables requested directly on the root builder.
+        let path = if self.path.is_empty() {
+            s.to_string()
+        } else {
+            format!("{}.{s}", self.path.join("."))
+        };
+        let parameter = match self.tensors.as_ref() {
+            None => Tensor::zeros(&shape, self.default_dtype, &self.default_device)?,
+            Some(tensors) => match tensors.get(&path) {
+                Some(tensor) => tensor.to_device(&self.default_device)?,
+                None => panic!("cannot find tensor for {path}"),
+            },
+        };
+        // Only record the variable once the tensor has been obtained successfully.
+        self.vars.borrow_mut().push(NamedVar {
+            path,
+            dtype: self.default_dtype,
+            shape,
+        });
+        Ok(parameter)
+    }
+
+    /// Consumes the builder and returns a store holding a copy of the recorded variables.
+    pub fn into_store(self) -> VarStore {
+        let vars = self.vars.borrow();
+        VarStore {
+            vars: vars.to_vec(),
+        }
+    }
+}
+
+impl<S: ToString> std::ops::Div<S> for &VarBuilder {
+    type Output = VarBuilder;
+
+    /// Derives a sub-builder whose path is this builder's path extended with `rhs`.
+    ///
+    /// The dtype, device, pre-loaded tensors and the shared variable record are carried over.
+    fn div(self, rhs: S) -> VarBuilder {
+        let mut child = self.clone();
+        child.path.push(rhs.to_string());
+        child
+    }
+}
+
+impl<S: ToString> std::ops::Div<S> for VarBuilder {
+    type Output = VarBuilder;
+
+    /// By-value variant of the `/` operator; forwards to the implementation on references.
+    fn div(self, rhs: S) -> VarBuilder {
+        <&VarBuilder as std::ops::Div<S>>::div(&self, rhs)
+    }
+}
--- a/candle-core/src/backprop.rs
+++ b/candle-core/src/backprop.rs
@ -0,0 +1,274 @@
+use crate::{op::Op, Error, Result, Tensor, TensorId};
+use std::collections::HashMap;
+
+impl Tensor {
+    /// Return the nodes that lead to this value and track a gradient, in a topologically
+    /// sorted vec: each element comes before the nodes it depends on, so the first element,
+    /// if any, is `self`.
+    /// Nodes that do not depend on any variable are left out of the result.
+    /// This assumes that the op graph is a DAG.
+    fn sorted_nodes(&self) -> Vec<&Tensor> {
+        // The vec of sorted nodes is passed as an owned value rather than a mutable reference
+        // to get around some lifetime limitations.
+        // The returned bool tells whether `node` tracks a gradient, i.e. is a variable or
+        // depends on one. `already_seen` caches that answer per tensor id so that shared
+        // sub-graphs are only visited once.
+        fn walk<'a>(
+            node: &'a Tensor,
+            nodes: Vec<&'a Tensor>,
+            already_seen: &mut HashMap<TensorId, bool>,
+        ) -> (bool, Vec<&'a Tensor>) {
+            if let Some(&tg) = already_seen.get(&node.id()) {
+                return (tg, nodes);
+            }
+            let mut track_grad = false;
+            let mut nodes = if node.is_variable() {
+                // Do not call recursively on the "leaf" nodes.
+                track_grad = true;
+                nodes
+            } else if let Some(op) = node.op() {
+                // Visit every argument of the op; the node tracks a gradient as soon as one
+                // of its arguments does.
+                match op {
+                    Op::WhereCond(t1, t2, t3) => {
+                        let (tg, nodes) = walk(t1, nodes, already_seen);
+                        track_grad |= tg;
+                        let (tg, nodes) = walk(t2, nodes, already_seen);
+                        track_grad |= tg;
+                        let (tg, nodes) = walk(t3, nodes, already_seen);
+                        track_grad |= tg;
+                        nodes
+                    }
+                    Op::Add(lhs, rhs)
+                    | Op::Mul(lhs, rhs)
+                    | Op::Sub(lhs, rhs)
+                    | Op::Div(lhs, rhs)
+                    | Op::Embedding(lhs, rhs)
+                    | Op::Matmul(lhs, rhs) => {
+                        let (tg, nodes) = walk(lhs, nodes, already_seen);
+                        track_grad |= tg;
+                        let (tg, nodes) = walk(rhs, nodes, already_seen);
+                        track_grad |= tg;
+                        nodes
+                    }
+                    Op::Cat(args, _) => args.iter().fold(nodes, |nodes, arg| {
+                        let (tg, nodes) = walk(arg, nodes, already_seen);
+                        track_grad |= tg;
+                        nodes
+                    }),
+                    Op::Affine { arg, mul, .. } => {
+                        // With a zero multiplier the output does not depend on the argument,
+                        // so the argument is not visited.
+                        if *mul == 0. {
+                            nodes
+                        } else {
+                            let (tg, nodes) = walk(arg, nodes, already_seen);
+                            track_grad |= tg;
+                            nodes
+                        }
+                    }
+                    Op::Reshape(node)
+                    | Op::Broadcast(node)
+                    | Op::Sum(node, _)
+                    | Op::ToDType(node)
+                    | Op::ToDevice(node)
+                    | Op::Transpose(node, _, _)
+                    | Op::Narrow(node, _, _, _)
+                    | Op::Softmax(node, _)
+                    | Op::Sqr(node)
+                    | Op::Sqrt(node)
+                    | Op::Gelu(node)
+                    | Op::Exp(node)
+                    | Op::Log(node)
+                    | Op::Sin(node)
+                    | Op::Cos(node)
+                    | Op::Abs(node)
+                    | Op::Neg(node) => {
+                        let (tg, nodes) = walk(node, nodes, already_seen);
+                        track_grad |= tg;
+                        nodes
+                    }
+                }
+            } else {
+                // Neither a variable nor the result of an op: nothing to track.
+                nodes
+            };
+            already_seen.insert(node.id(), track_grad);
+            // The node is pushed after its arguments; the final reverse puts it before them.
+            if track_grad {
+                nodes.push(node);
+            }
+            (track_grad, nodes)
+        }
+        let (_tg, mut nodes) = walk(self, vec![], &mut HashMap::new());
+        nodes.reverse();
+        nodes
+    }
+
+    /// Runs back-propagation from this tensor and returns the accumulated gradients.
+    ///
+    /// The gradient of `self` with respect to itself is initialised with ones. The nodes
+    /// returned by `sorted_nodes` are then processed in order: the gradient of each
+    /// non-variable node is removed from the store and propagated to the arguments of its op,
+    /// so that the returned store ends up holding the gradients of the variables.
+    ///
+    /// # Errors
+    /// Returns `Error::BackwardNotSupported` when the graph contains one of `where_cond`,
+    /// `embedding`, `broadcast`, `sum`, `abs`, `narrow`, `softmax`, `reshape` or `gelu`, and
+    /// propagates any error raised by the tensor operations used to compute the gradients.
+    ///
+    /// # Panics
+    /// Panics if no gradient has been recorded for a node by the time it is processed.
+    pub fn backward(&self) -> Result<GradStore> {
+        let sorted_nodes = self.sorted_nodes();
+        let mut grads = GradStore::new();
+        grads.insert(self, self.ones_like()?);
+        for node in sorted_nodes.iter() {
+            if node.is_variable() {
+                continue;
+            }
+            let grad = grads.remove(node).unwrap();
+            // TODO: We should perform all these operations in place (or at least not track the
+            // whole graph).
+            // The only drawback would be if we wanted to support grad of grad but this is out of
+            // scope.
+            if let Some(op) = node.op() {
+                match op {
+                    Op::Add(lhs, rhs) => {
+                        let lhs_sum_grad = grads.or_insert(lhs)?;
+                        *lhs_sum_grad = lhs_sum_grad.add(&grad)?;
+                        let rhs_sum_grad = grads.or_insert(rhs)?;
+                        *rhs_sum_grad = rhs_sum_grad.add(&grad)?;
+                    }
+                    Op::Sub(lhs, rhs) => {
+                        let lhs_sum_grad = grads.or_insert(lhs)?;
+                        *lhs_sum_grad = lhs_sum_grad.add(&grad)?;
+                        let rhs_sum_grad = grads.or_insert(rhs)?;
+                        *rhs_sum_grad = rhs_sum_grad.sub(&grad)?;
+                    }
+                    Op::Mul(lhs, rhs) => {
+                        let lhs_grad = grad.mul(rhs)?;
+                        let lhs_sum_grad = grads.or_insert(lhs)?;
+                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
+                        let rhs_grad = grad.mul(lhs)?;
+                        let rhs_sum_grad = grads.or_insert(rhs)?;
+                        *rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
+                    }
+                    Op::Div(lhs, rhs) => {
+                        // d(l / r)/dl = 1 / r
+                        let lhs_grad = grad.div(rhs)?;
+                        let lhs_sum_grad = grads.or_insert(lhs)?;
+                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
+                        // d(l / r)/dr = -l / r^2, hence the subtraction.
+                        let rhs_grad = grad.mul(lhs)?.div(&rhs.sqr()?)?;
+                        let rhs_sum_grad = grads.or_insert(rhs)?;
+                        *rhs_sum_grad = rhs_sum_grad.sub(&rhs_grad)?;
+                    }
+                    Op::WhereCond(_pred, _t, _f) => {
+                        return Err(Error::BackwardNotSupported { op: "where_cond" })
+                    }
+                    Op::Embedding(_lhs, _rhs) => {
+                        return Err(Error::BackwardNotSupported { op: "embedding" })
+                    }
+                    Op::Matmul(lhs, rhs) => {
+                        // Skipping checks, the op went ok, we can skip
+                        // the matmul size checks for now.
+
+                        let lhs_grad = grad.matmul(&rhs.t()?)?;
+                        let lhs_sum_grad = grads.or_insert(lhs)?;
+                        *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
+
+                        let rhs_grad = lhs.t()?.matmul(&grad)?;
+                        let rhs_sum_grad = grads.or_insert(rhs)?;
+                        *rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
+                    }
+                    Op::Cat(args, dim) => {
+                        // Each argument receives the slice of the gradient that it produced.
+                        let mut start_idx = 0;
+                        for arg in args {
+                            let len = arg.dims()[*dim];
+                            let arg_grad = grad.narrow(*dim, start_idx, len)?;
+                            let sum_grad = grads.or_insert(arg)?;
+                            *sum_grad = sum_grad.add(&arg_grad)?;
+                            start_idx += len;
+                        }
+                    }
+                    Op::Broadcast(_arg) => {
+                        return Err(Error::BackwardNotSupported { op: "broadcast" })
+                    }
+                    Op::Sum(_arg, _sum_dims) => {
+                        return Err(Error::BackwardNotSupported { op: "sum" })
+                    }
+                    Op::ToDType(arg) => {
+                        // The incoming gradient has the dtype of the output, it has to be
+                        // converted back to the dtype of the argument before accumulating.
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.add(&grad.to_dtype(arg.dtype())?)?
+                    }
+                    Op::Affine { arg, mul, .. } => {
+                        let arg_grad = grad.affine(*mul, 0.)?;
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.add(&arg_grad)?
+                    }
+                    Op::Log(arg) => {
+                        // d(ln x)/dx = 1 / x
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.add(&(&grad / arg)?)?
+                    }
+                    Op::Sin(arg) => {
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.add(&(&grad * arg.cos())?)?
+                    }
+                    Op::Cos(arg) => {
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.sub(&(&grad * arg.sin())?)?
+                    }
+                    Op::Abs(_args) => return Err(Error::BackwardNotSupported { op: "abs" }),
+                    Op::Exp(arg) => {
+                        // d(exp x)/dx = exp x, which is the value of the node itself.
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.add(&(&grad * *node)?)?
+                    }
+                    Op::Neg(arg) => {
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.sub(&grad)?
+                    }
+                    Op::Narrow(_arg, _, _, _) => {
+                        return Err(Error::BackwardNotSupported { op: "narrow" })
+                    }
+                    Op::Softmax(_arg, _) => {
+                        return Err(Error::BackwardNotSupported { op: "softmax" })
+                    }
+                    Op::Reshape(_arg) => return Err(Error::BackwardNotSupported { op: "reshape" }),
+                    Op::Gelu(_) => return Err(Error::BackwardNotSupported { op: "gelu" }),
+                    Op::Sqr(arg) => {
+                        // d(x^2)/dx = 2 x
+                        let arg_grad = arg.mul(&grad)?.affine(2., 0.)?;
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.add(&arg_grad)?
+                    }
+                    Op::Sqrt(arg) => {
+                        // d(sqrt x)/dx = 1 / (2 sqrt x): divide by the node value, not by x.
+                        let arg_grad = grad.div(node)?.affine(0.5, 0.)?;
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.add(&arg_grad)?
+                    }
+                    Op::ToDevice(arg) => {
+                        // Move the gradient to the device holding the accumulator.
+                        let sum_grad = grads.or_insert(arg)?;
+                        let arg_grad = grad.to_device(&sum_grad.device())?;
+                        *sum_grad = sum_grad.add(&arg_grad)?
+                    }
+                    Op::Transpose(arg, dim1, dim2) => {
+                        // Swapping the same two dimensions again undoes the transpose.
+                        let arg_grad = grad.transpose(*dim1, *dim2)?;
+                        let sum_grad = grads.or_insert(arg)?;
+                        *sum_grad = sum_grad.add(&arg_grad)?
+                    }
+                };
+            }
+        }
+        Ok(grads)
+    }
+}
+
+/// Gradient tensors keyed by the id of the tensor they belong to.
+pub struct GradStore(HashMap<TensorId, Tensor>);
+
+impl GradStore {
+    /// Creates a store that holds no gradient.
+    fn new() -> Self {
+        Self(HashMap::new())
+    }
+
+    /// Looks up the gradient stored for the tensor with the given id.
+    pub fn get_id(&self, id: TensorId) -> Option<&Tensor> {
+        let Self(by_id) = self;
+        by_id.get(&id)
+    }
+
+    /// Looks up the gradient stored for `tensor`.
+    pub fn get(&self, tensor: &Tensor) -> Option<&Tensor> {
+        let Self(by_id) = self;
+        by_id.get(&tensor.id())
+    }
+
+    /// Takes the gradient stored for `tensor` out of the store, if there is one.
+    pub fn remove(&mut self, tensor: &Tensor) -> Option<Tensor> {
+        let Self(by_id) = self;
+        by_id.remove(&tensor.id())
+    }
+
+    /// Stores `grad` as the gradient of `tensor` and returns the gradient it replaces, if any.
+    pub fn insert(&mut self, tensor: &Tensor, grad: Tensor) -> Option<Tensor> {
+        let Self(by_id) = self;
+        by_id.insert(tensor.id(), grad)
+    }
+
+    /// Returns a mutable reference to the gradient of `tensor`, first inserting the result of
+    /// `tensor.zeros_like()` when no gradient is stored yet.
+    fn or_insert(&mut self, tensor: &Tensor) -> Result<&mut Tensor> {
+        use std::collections::hash_map::Entry;
+        match self.0.entry(tensor.id()) {
+            Entry::Occupied(slot) => Ok(slot.into_mut()),
+            Entry::Vacant(slot) => Ok(slot.insert(tensor.zeros_like()?)),
+        }
+    }
+}
--- a/candle-core/src/cpu_backend.rs
+++ b/candle-core/src/cpu_backend.rs
@ -0,0 +1,886 @@
+use crate::op::{BinaryOp, UnaryOp};
+use crate::{DType, Error, Result, Shape, StridedIndex};
+use gemm::{gemm, Parallelism};
+use half::{bf16, f16};
+
+// TODO: Think about whether we would be better off with a dtype and
+// a buffer as an owned slice of bytes.
+// TODO: Maybe we should not implement [Clone] here and instead have an explicit allocator +
+// intercept the oom errors to avoid panicking and provide a proper error.
+/// Buffer of tensor elements held in host memory, with one variant per supported element
+/// type.
+#[derive(Debug, Clone)]
+pub enum CpuStorage {
+    U32(Vec<u32>),
+    BF16(Vec<bf16>),
+    F16(Vec<f16>),
+    F32(Vec<f32>),
+    F64(Vec<f64>),
+}
+
+/// Element-wise selection: each output element is taken from `t` where the matching element
+/// of `pred` is strictly positive and from `f` otherwise.
+///
+/// `stride`, `stride_t` and `stride_f` are the strides of `pred`, `t` and `f`, all three being
+/// interpreted with the same `shape`. The output is laid out in the iteration order of
+/// `StridedIndex`.
+fn wcond<T: Copy>(
+    pred: &[u32],
+    shape: &Shape,
+    stride: &[usize],
+    t: &[T],
+    stride_t: &[usize],
+    f: &[T],
+    stride_f: &[usize],
+) -> Vec<T> {
+    let all_contiguous = [stride, stride_t, stride_f]
+        .iter()
+        .all(|s| shape.is_contiguous(s));
+    if all_contiguous {
+        // Fast path: the three buffers can be walked with the same flat index.
+        let len = shape.elem_count();
+        let (pred, on_true, on_false) = (&pred[..len], &t[..len], &f[..len]);
+        (0..len)
+            .map(|i| if pred[i] > 0 { on_true[i] } else { on_false[i] })
+            .collect()
+    } else {
+        let dims = shape.dims();
+        StridedIndex::new(dims, stride)
+            .zip(StridedIndex::new(dims, stride_t))
+            .zip(StridedIndex::new(dims, stride_f))
+            .map(|((p_idx, t_idx), f_idx)| if pred[p_idx] > 0 { t[t_idx] } else { f[f_idx] })
+            .collect()
+    }
+}
+
+// Applies the generic function `$fn` to the buffer held by the `CpuStorage` `$v`,
+// instantiating it with the element type of the active variant and forwarding the extra
+// arguments. `$fn` has to return a `Result` (the `?` propagates its error out of the calling
+// function) and the value is wrapped back in the same variant. The macro expands to an
+// `Ok(..)` expression.
+macro_rules! map1 {
+    ($v: expr, $fn: ident, $( $args:expr ),*) => {{
+        let v = match $v {
+            CpuStorage::BF16(__s) => CpuStorage::BF16($fn::<bf16>(__s, $($args),*)?),
+            CpuStorage::F16(__s) => CpuStorage::F16($fn::<f16>(__s, $($args),*)?),
+            CpuStorage::F32(__s) => CpuStorage::F32($fn::<f32>(__s, $($args),*)?),
+            CpuStorage::F64(__s) => CpuStorage::F64($fn::<f64>(__s, $($args),*)?),
+            CpuStorage::U32(__s) => CpuStorage::U32($fn::<u32>(__s, $($args),*)?),
+        };
+        Ok(v)
+    }};
+}
+
+/// Accumulates the elements of `src` into a zero-initialised buffer of
+/// `dst_shape.elem_count()` elements.
+///
+/// The source is walked through `StridedIndex::new(src_dims, stride)`; the position of each
+/// element in that walk is mapped by `to_dst_index` to the destination slot it is added to.
+fn sum_impl1<T: Copy + num_traits::NumAssign>(
+    src: &[T],
+    dst_shape: &Shape,
+    src_dims: &[usize],
+    stride: &[usize],
+    to_dst_index: impl Fn(usize) -> usize,
+) -> Result<Vec<T>> {
+    let mut acc = vec![T::zero(); dst_shape.elem_count()];
+    StridedIndex::new(src_dims, stride)
+        .enumerate()
+        .for_each(|(position, strided)| acc[to_dst_index(position)] += src[strided]);
+    Ok(acc)
+}
+
+/// Applies `f` to every element of `vs` described by `shape` and `stride` and collects the
+/// results in the iteration order of `StridedIndex`.
+fn unary_map<T: Copy, U: Copy, F: FnMut(T) -> U>(
+    vs: &[T],
+    shape: &Shape,
+    stride: &[usize],
+    mut f: F,
+) -> Vec<U> {
+    if !shape.is_contiguous(stride) {
+        return StridedIndex::new(shape.dims(), stride)
+            .map(|i| f(vs[i]))
+            .collect();
+    }
+    // Contiguous layout: the first `elem_count` elements are already in the right order.
+    let elems = &vs[..shape.elem_count()];
+    elems.iter().copied().map(f).collect()
+}
+
+/// Combines `lhs` and `rhs` element-wise with `f`.
+///
+/// Both operands are interpreted with the same `shape` but each with its own stride. The
+/// output is laid out in the iteration order of `StridedIndex`.
+fn binary_map<T: Copy, F: FnMut(T, T) -> T>(
+    shape: &Shape,
+    lhs_stride: &[usize],
+    rhs_stride: &[usize],
+    lhs: &[T],
+    rhs: &[T],
+    mut f: F,
+) -> Vec<T> {
+    let len = shape.elem_count();
+    let mut out = Vec::with_capacity(len);
+    if shape.is_contiguous(lhs_stride) && shape.is_contiguous(rhs_stride) {
+        // Fast path: both operands can be walked with the same flat index.
+        for i in 0..len {
+            out.push(f(lhs[i], rhs[i]));
+        }
+    } else {
+        let dims = shape.dims();
+        let index_pairs = StridedIndex::new(dims, lhs_stride).zip(StridedIndex::new(dims, rhs_stride));
+        for (l, r) in index_pairs {
+            out.push(f(lhs[l], rhs[r]));
+        }
+    }
+    out
+}
+
+/// Gathers rows of `hidden_size` elements from `vs`, one row per id read from `ids`.
+///
+/// The ids are visited through `StridedIndex::new(shape.dims(), stride)` and the selected
+/// rows are appended one after the other.
+///
+/// # Errors
+/// Returns `Error::InvalidIndex` as soon as an id is not smaller than `vocab_size`, and
+/// propagates the conversion error if an id does not fit in a `usize`.
+fn take_impl1<T: Copy>(
+    vs: &[T],
+    ids: &[u32],
+    shape: &Shape,
+    stride: &[usize],
+    vocab_size: usize,
+    hidden_size: usize,
+) -> Result<Vec<T>> {
+    let mut gathered = Vec::with_capacity(shape.elem_count() * hidden_size);
+    for position in StridedIndex::new(shape.dims(), stride) {
+        let row: usize = ids[position].try_into()?;
+        if row >= vocab_size {
+            return Err(Error::InvalidIndex {
+                index: row,
+                vocab_size,
+                op: "take",
+            });
+        }
+        let start = hidden_size * row;
+        gathered.extend_from_slice(&vs[start..start + hidden_size]);
+    }
+    Ok(gathered)
+}
+
+/// Copies elements of `src`, starting at `src_offset`, into `dst`, starting at `dst_offset`.
+///
+/// With a contiguous source layout, as many elements as fit in both the remaining source and
+/// the remaining destination are copied in one go. Otherwise the source is walked through
+/// `StridedIndex` and the copy stops once the end of `dst` is reached.
+fn copy_strided_src_<T: Copy + std::fmt::Display>(
+    src: &[T],
+    dst: &mut [T],
+    dst_offset: usize,
+    src_shape: &Shape,
+    src_stride: &[usize],
+    src_offset: usize,
+) {
+    let src = &src[src_offset..];
+    if !src_shape.is_contiguous(src_stride) {
+        let strided = StridedIndex::new(src_shape.dims(), src_stride);
+        for (position, src_index) in strided.enumerate() {
+            match dst.get_mut(position + dst_offset) {
+                Some(slot) => *slot = src[src_index],
+                // The destination is full.
+                None => break,
+            }
+        }
+        return;
+    }
+    let count = (dst.len() - dst_offset).min(src.len());
+    dst[dst_offset..][..count].copy_from_slice(&src[..count]);
+}
+
+impl CpuStorage {
+    /// Returns the `DType` tag matching the element type of the active variant.
+    pub fn dtype(&self) -> DType {
+        match *self {
+            CpuStorage::F64(_) => DType::F64,
+            CpuStorage::F32(_) => DType::F32,
+            CpuStorage::F16(_) => DType::F16,
+            CpuStorage::BF16(_) => DType::BF16,
+            CpuStorage::U32(_) => DType::U32,
+        }
+    }
+
+    /// Borrows the buffer as a slice of `D`; the conversion is delegated to
+    /// `WithDType::cpu_storage_as_slice`.
+    pub fn as_slice<D: crate::WithDType>(&self) -> Result<&[D]> {
+        <D as crate::WithDType>::cpu_storage_as_slice(self)
+    }
+
+    /// Mutably borrows the buffer as a slice of `D`; the conversion is delegated to
+    /// `WithDType::cpu_storage_as_mut_slice`.
+    pub fn as_mut_slice<D: crate::WithDType>(&mut self) -> Result<&mut [D]> {
+        <D as crate::WithDType>::cpu_storage_as_mut_slice(self)
+    }
+
+    /// Converts the elements described by `shape` and `stride` to `dtype` and returns them in
+    /// a new storage, laid out in the iteration order used by `unary_map`.
+    ///
+    /// Conversions between the two half precision types go through `f32`, as do the
+    /// conversions between the half precision types and `u32`.
+    pub(crate) fn to_dtype(&self, shape: &Shape, stride: &[usize], dtype: DType) -> Result<Self> {
+        // TODO: find a way around the quadratic number of cases below.
+        let converted = match dtype {
+            DType::BF16 => Self::BF16(match self {
+                Self::U32(vs) => unary_map(vs, shape, stride, |v| bf16::from_f32(v as f32)),
+                Self::BF16(vs) => unary_map(vs, shape, stride, |v| v),
+                Self::F16(vs) => unary_map(vs, shape, stride, |v| bf16::from_f32(v.to_f32())),
+                Self::F32(vs) => unary_map(vs, shape, stride, bf16::from_f32),
+                Self::F64(vs) => unary_map(vs, shape, stride, bf16::from_f64),
+            }),
+            DType::F16 => Self::F16(match self {
+                Self::U32(vs) => unary_map(vs, shape, stride, |v| f16::from_f32(v as f32)),
+                Self::BF16(vs) => unary_map(vs, shape, stride, |v| f16::from_f32(v.to_f32())),
+                Self::F16(vs) => unary_map(vs, shape, stride, |v| v),
+                Self::F32(vs) => unary_map(vs, shape, stride, f16::from_f32),
+                Self::F64(vs) => unary_map(vs, shape, stride, f16::from_f64),
+            }),
+            DType::F32 => Self::F32(match self {
+                Self::U32(vs) => unary_map(vs, shape, stride, |v| v as f32),
+                Self::BF16(vs) => unary_map(vs, shape, stride, |v| v.to_f32()),
+                Self::F16(vs) => unary_map(vs, shape, stride, |v| v.to_f32()),
+                Self::F32(vs) => unary_map(vs, shape, stride, |v| v),
+                Self::F64(vs) => unary_map(vs, shape, stride, |v| v as f32),
+            }),
+            DType::U32 => Self::U32(match self {
+                Self::U32(vs) => unary_map(vs, shape, stride, |v| v),
+                Self::BF16(vs) => unary_map(vs, shape, stride, |v| v.to_f32() as u32),
+                Self::F16(vs) => unary_map(vs, shape, stride, |v| v.to_f32() as u32),
+                Self::F32(vs) => unary_map(vs, shape, stride, |v| v as u32),
+                Self::F64(vs) => unary_map(vs, shape, stride, |v| v as u32),
+            }),
+            DType::F64 => Self::F64(match self {
+                Self::U32(vs) => unary_map(vs, shape, stride, |v| v as f64),
+                Self::BF16(vs) => unary_map(vs, shape, stride, |v| v.to_f64()),
+                Self::F16(vs) => unary_map(vs, shape, stride, |v| v.to_f64()),
+                Self::F32(vs) => unary_map(vs, shape, stride, |v| v as f64),
+                Self::F64(vs) => unary_map(vs, shape, stride, |v| v),
+            }),
+        };
+        Ok(converted)
+    }
+
+    /// Sums the elements over the dimensions listed in `sum_dims`.
+    ///
+    /// The destination shape is the source shape with every summed dimension set to 1, and
+    /// the result uses the same storage variant as `self`.
+    pub(crate) fn sum(&self, shape: &Shape, stride: &[usize], sum_dims: &[usize]) -> Result<Self> {
+        let src_dims = shape.dims();
+        let mut dst_dims = src_dims.to_vec();
+        for &sum_dim in sum_dims.iter() {
+            dst_dims[sum_dim] = 1;
+        }
+        let dst_shape = Shape::from(dst_dims);
+        let mut sum_dims = sum_dims.to_vec();
+        // Sort the sum_dims as they have to be processed from left to right when converting the
+        // indexes.
+        sum_dims.sort();
+        // For each summed dimension: its size and the number of elements spanned by the
+        // dimensions on its right.
+        let sum_dims_and_stride: Vec<_> = sum_dims
+            .iter()
+            .map(|&d| (src_dims[d], src_dims[d + 1..].iter().product::<usize>()))
+            .collect();
+        // Maps the position of an element in the source iteration order to the flat index of
+        // the destination element it contributes to.
+        let to_dst_index = |unstr_index: usize| {
+            // TODO: Optimize, the following does lots of slow division.
+            let mut dst_index = unstr_index;
+            // Set the sum_dims indexes to 0.
+            for &(dim, stride) in sum_dims_and_stride.iter() {
+                // The compiler is able to optimize the following in a single divmod op.
+                let (pre, post) = (dst_index / stride, dst_index % stride);
+                // `pre / dim` drops the coordinate along the summed dimension.
+                dst_index = (pre / dim) * stride + post;
+            }
+            dst_index
+        };
+        // TODO: Maybe provide an implementation with higher precision accumulators?
+        map1!(self, sum_impl1, &dst_shape, src_dims, stride, to_dst_index)
+    }
+
+    /// Divides, in place, every element by the sum of the elements that share its position
+    /// along all dimensions other than `dim`.
+    ///
+    /// The sums are accumulated in `f64` and converted back to the element type before the
+    /// division. `U32` storages are left untouched.
+    ///
+    /// [self] has to store its data in a contiguous way for the given `shape`.
+    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) -> Result<()> {
+        // Normalises each slice along the reduced dimension. `outer` and `inner` are the
+        // products of the dimensions before and after it, `len` is its size.
+        fn normalize<T, ToF64, FromF64>(
+            storage: &mut [T],
+            outer: usize,
+            len: usize,
+            inner: usize,
+            to_f64: ToF64,
+            from_f64: FromF64,
+        ) where
+            T: Copy + std::ops::DivAssign,
+            ToF64: Fn(T) -> f64,
+            FromF64: Fn(f64) -> T,
+        {
+            for pre_idx in 0..outer {
+                for post_idx in 0..inner {
+                    // Consecutive elements of a slice are `inner` apart in the buffer.
+                    let start = pre_idx * inner * len + post_idx;
+                    let mut sum = 0f64;
+                    let mut idx = start;
+                    for _ in 0..len {
+                        sum += to_f64(storage[idx]);
+                        idx += inner
+                    }
+                    let sum = from_f64(sum);
+                    let mut idx = start;
+                    for _ in 0..len {
+                        storage[idx] /= sum;
+                        idx += inner
+                    }
+                }
+            }
+        }
+
+        let dims = shape.dims();
+        let elem_per_slice = dims[dim];
+        let prod_pre_dim = dims[..dim].iter().product::<usize>();
+        let prod_post_dim = dims[dim + 1..].iter().product::<usize>();
+        match self {
+            Self::BF16(storage) => normalize(
+                storage.as_mut_slice(),
+                prod_pre_dim,
+                elem_per_slice,
+                prod_post_dim,
+                |v: bf16| v.to_f64(),
+                bf16::from_f64,
+            ),
+            Self::F16(storage) => normalize(
+                storage.as_mut_slice(),
+                prod_pre_dim,
+                elem_per_slice,
+                prod_post_dim,
+                |v: f16| v.to_f64(),
+                f16::from_f64,
+            ),
+            Self::F32(storage) => normalize(
+                storage.as_mut_slice(),
+                prod_pre_dim,
+                elem_per_slice,
+                prod_post_dim,
+                |v: f32| v as f64,
+                |s: f64| s as f32,
+            ),
+            Self::F64(storage) => normalize(
+                storage.as_mut_slice(),
+                prod_pre_dim,
+                elem_per_slice,
+                prod_post_dim,
+                |v: f64| v,
+                |s: f64| s,
+            ),
+            Self::U32(_) => {}
+        }
+        Ok(())
+    }
+
+    /// Computes `v * mul + add` for every element described by `shape` and `stride` and
+    /// returns the result in a new storage of the same variant.
+    ///
+    /// `mul` and `add` are converted to the element type before the computation.
+    pub(crate) fn affine_impl(
+        &self,
+        shape: &Shape,
+        stride: &[usize],
+        mul: f64,
+        add: f64,
+    ) -> Result<Self> {
+        let out = match self {
+            Self::U32(vs) => {
+                let (m, a) = (mul as u32, add as u32);
+                Self::U32(unary_map(vs, shape, stride, |v| v * m + a))
+            }
+            Self::BF16(vs) => {
+                let (m, a) = (bf16::from_f64(mul), bf16::from_f64(add));
+                Self::BF16(unary_map(vs, shape, stride, |v| v * m + a))
+            }
+            Self::F16(vs) => {
+                let (m, a) = (f16::from_f64(mul), f16::from_f64(add));
+                Self::F16(unary_map(vs, shape, stride, |v| v * m + a))
+            }
+            Self::F32(vs) => {
+                let (m, a) = (mul as f32, add as f32);
+                Self::F32(unary_map(vs, shape, stride, |v| v * m + a))
+            }
+            Self::F64(vs) => Self::F64(unary_map(vs, shape, stride, |v| v * mul + add)),
+        };
+        Ok(out)
+    }
+
+    /// Applies the unary operation `B` to the storage through `unary_map`.
+    ///
+    /// The per-dtype function of `B` (`B::bf16`, `B::f16`, `B::f32`, `B::f64` or `B::u32`) is
+    /// selected from the dtype held by `self`; the output keeps that dtype.
+    pub(crate) fn unary_impl<B: UnaryOp>(&self, shape: &Shape, stride: &[usize]) -> Result<Self> {
+        let mapped = match self {
+            Self::U32(xs) => Self::U32(unary_map(xs, shape, stride, B::u32)),
+            Self::F64(xs) => Self::F64(unary_map(xs, shape, stride, B::f64)),
+            Self::F32(xs) => Self::F32(unary_map(xs, shape, stride, B::f32)),
+            Self::F16(xs) => Self::F16(unary_map(xs, shape, stride, B::f16)),
+            Self::BF16(xs) => Self::BF16(unary_map(xs, shape, stride, B::bf16)),
+        };
+        Ok(mapped)
+    }
+
+    /// Applies the binary operation `B` to `self` and `rhs` through `binary_map`.
+    ///
+    /// Both operands must hold the same dtype; the matching per-dtype function of `B` is
+    /// used and the output keeps that dtype.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::DTypeMismatchBinaryOp` (tagged with `B::NAME`) when the two storages
+    /// hold different dtypes.
+    pub(crate) fn binary_impl<B: BinaryOp>(
+        &self,
+        rhs: &Self,
+        shape: &Shape,
+        lhs_stride: &[usize],
+        rhs_stride: &[usize],
+    ) -> Result<Self> {
+        let combined = match (self, rhs) {
+            (Self::U32(l), Self::U32(r)) => {
+                Self::U32(binary_map(shape, lhs_stride, rhs_stride, l, r, B::u32))
+            }
+            (Self::F64(l), Self::F64(r)) => {
+                Self::F64(binary_map(shape, lhs_stride, rhs_stride, l, r, B::f64))
+            }
+            (Self::F32(l), Self::F32(r)) => {
+                Self::F32(binary_map(shape, lhs_stride, rhs_stride, l, r, B::f32))
+            }
+            (Self::F16(l), Self::F16(r)) => {
+                Self::F16(binary_map(shape, lhs_stride, rhs_stride, l, r, B::f16))
+            }
+            (Self::BF16(l), Self::BF16(r)) => {
+                Self::BF16(binary_map(shape, lhs_stride, rhs_stride, l, r, B::bf16))
+            }
+            _ => {
+                // Callers are expected to have checked the dtypes already, so this is a
+                // defensive fallback.
+                return Err(Error::DTypeMismatchBinaryOp {
+                    lhs: self.dtype(),
+                    rhs: rhs.dtype(),
+                    op: B::NAME,
+                });
+            }
+        };
+        Ok(combined)
+    }
+
+    /// Copies the elements of `self` into `dst` by delegating to `copy_strided_src_`.
+    ///
+    /// The source is described by `src_shape`, `src_stride` and `src_offset`; `dst_offset`
+    /// is forwarded unchanged as the destination offset.
+    ///
+    /// # Panics
+    ///
+    /// Panics when `src_shape.rank()` differs from `src_stride.len()`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::DTypeMismatchBinaryOp` (op `"copy_strided"`) when `self` and `dst`
+    /// hold different dtypes.
+    pub(crate) fn copy_strided_src(
+        &self,
+        dst: &mut Self,
+        dst_offset: usize,
+        src_shape: &Shape,
+        src_stride: &[usize],
+        src_offset: usize,
+    ) -> Result<()> {
+        if src_stride.len() != src_shape.rank() {
+            panic!("incoherent shape and strides {src_shape:?} {src_stride:?}")
+        }
+        match (self, dst) {
+            (Self::F64(from), Self::F64(to)) => {
+                copy_strided_src_(from, to, dst_offset, src_shape, src_stride, src_offset)
+            }
+            (Self::F32(from), Self::F32(to)) => {
+                copy_strided_src_(from, to, dst_offset, src_shape, src_stride, src_offset)
+            }
+            (Self::F16(from), Self::F16(to)) => {
+                copy_strided_src_(from, to, dst_offset, src_shape, src_stride, src_offset)
+            }
+            (Self::BF16(from), Self::BF16(to)) => {
+                copy_strided_src_(from, to, dst_offset, src_shape, src_stride, src_offset)
+            }
+            (Self::U32(from), Self::U32(to)) => {
+                copy_strided_src_(from, to, dst_offset, src_shape, src_stride, src_offset)
+            }
+            (from, to) => {
+                // Callers are expected to have checked the dtypes already, so this is a
+                // defensive fallback.
+                return Err(Error::DTypeMismatchBinaryOp {
+                    lhs: from.dtype(),
+                    rhs: to.dtype(),
+                    op: "copy_strided",
+                });
+            }
+        }
+        Ok(())
+    }
+
+    /// Builds a new storage from `t` and `f` using `self` as the predicate, via `wcond`.
+    ///
+    /// `self` is read as a `u32` slice and described by `shape`/`stride`; `t` and `f` are
+    /// described by `stride_t` and `stride_f`. The result has the dtype shared by `t` and `f`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error from `as_slice::<u32>` on `self`, and returns
+    /// `Error::DTypeMismatchBinaryOp` (op `"where_cond"`) when `t` and `f` hold different
+    /// dtypes.
+    pub(crate) fn where_cond(
+        &self,
+        shape: &Shape,
+        stride: &[usize],
+        t: &Self,
+        stride_t: &[usize],
+        f: &Self,
+        stride_f: &[usize],
+    ) -> Result<Self> {
+        // TODO: Support types that could be casted to a boolean.
+        let pred = self.as_slice::<u32>()?;
+        let selected = match (t, f) {
+            (Self::U32(on_true), Self::U32(on_false)) => {
+                Self::U32(wcond(pred, shape, stride, on_true, stride_t, on_false, stride_f))
+            }
+            (Self::F64(on_true), Self::F64(on_false)) => {
+                Self::F64(wcond(pred, shape, stride, on_true, stride_t, on_false, stride_f))
+            }
+            (Self::F32(on_true), Self::F32(on_false)) => {
+                Self::F32(wcond(pred, shape, stride, on_true, stride_t, on_false, stride_f))
+            }
+            (Self::F16(on_true), Self::F16(on_false)) => {
+                Self::F16(wcond(pred, shape, stride, on_true, stride_t, on_false, stride_f))
+            }
+            (Self::BF16(on_true), Self::BF16(on_false)) => {
+                Self::BF16(wcond(pred, shape, stride, on_true, stride_t, on_false, stride_f))
+            }
+            _ => {
+                return Err(Error::DTypeMismatchBinaryOp {
+                    lhs: t.dtype(),
+                    rhs: f.dtype(),
+                    op: "where_cond",
+                })
+            }
+        };
+        Ok(selected)
+    }
+
+    /// Embedding lookup: reads `self` as `u32` ids and dispatches `take_impl1` over `vs`
+    /// through the `map1!` macro.
+    ///
+    /// `shape` and `stride` describe the ids; `vocab_size` and `hidden_size` are forwarded
+    /// to `take_impl1`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error from `as_slice::<u32>` on `self`, as well as any error produced
+    /// by the dispatched `take_impl1`.
+    pub(crate) fn embedding_impl(
+        &self,
+        shape: &Shape,
+        stride: &[usize],
+        vs: &Self,
+        hidden_size: usize,
+        vocab_size: usize,
+    ) -> Result<Self> {
+        let token_ids = self.as_slice::<u32>()?;
+        map1!(vs, take_impl1, token_ids, shape, stride, vocab_size, hidden_size)
+    }
+
+    /// Batched matrix multiplication of `self` by `rhs` using the `gemm` kernel.
+    ///
+    /// `(b, m, n, k)` are the batch count and the gemm dimensions: consecutive batch entries
+    /// are `m * k` elements apart in `self`, `n * k` apart in `rhs` and `m * n` apart in the
+    /// output. The row and column strides of each operand are taken from the last two
+    /// entries of `lhs_stride` / `rhs_stride`; the output is written contiguously.
+    ///
+    /// Only `F16`, `F32` and `F64` storages are handled.
+    ///
+    /// # Errors
+    ///
+    /// * `Error::UnexpectedStriding` when the stride slices have fewer than two entries,
+    ///   when their lengths differ, or when the batch strides are not `[m * k]` / `[n * k]`.
+    /// * `Error::DTypeMismatchBinaryOp` (op `"matmul"`) for any other dtype combination,
+    ///   including matching `BF16` or `U32` operands.
+    pub(crate) fn matmul_impl(
+        &self,
+        rhs: &Self,
+        (b, m, n, k): (usize, usize, usize, usize),
+        lhs_stride: &[usize],
+        rhs_stride: &[usize],
+    ) -> Result<Self> {
+        let a_skip: usize = m * k;
+        let b_skip: usize = n * k;
+        let c_skip: usize = m * n;
+
+        let rank = lhs_stride.len();
+        // Both stride slices are indexed with `rank - 1` and `rank - 2` below, so they must
+        // have the same length and at least two entries. Reject anything else instead of
+        // panicking on an out-of-bounds index or an underflowing subtraction.
+        if rank < 2 || rhs_stride.len() != rank {
+            return Err(Error::UnexpectedStriding);
+        }
+        let lhs_cs = lhs_stride[rank - 1];
+        let lhs_rs = lhs_stride[rank - 2];
+
+        let rhs_cs = rhs_stride[rank - 1];
+        let rhs_rs = rhs_stride[rank - 2];
+
+        if rank > 2 {
+            let lhs_batch_stride = &lhs_stride[..rank - 2];
+            let rhs_batch_stride = &rhs_stride[..rank - 2];
+
+            if lhs_batch_stride != [a_skip] || rhs_batch_stride != [b_skip] {
+                // Temporary error before we support arbitrary striding.
+                return Err(Error::UnexpectedStriding);
+            }
+        }
+
+        let dst_shape: Shape = (m, n).into();
+        let dst_strides = dst_shape.stride_contiguous();
+        let dst_rs = dst_strides[0];
+        let dst_cs = dst_strides[1];
+
+        // Runs one gemm per batch entry and evaluates to the freshly allocated output
+        // vector. The body is identical for every float type, only the zero/one constants
+        // differ, so it is written once here and expanded per dtype below.
+        macro_rules! batched_gemm {
+            ($lhs:expr, $rhs:expr, $zero:expr, $one:expr) => {{
+                let mut dst = vec![$zero; b * m * n];
+                for step in 0..b {
+                    let lhs_p = &$lhs[step * a_skip..];
+                    let rhs_p = &$rhs[step * b_skip..];
+                    let dst_p = &mut dst[step * c_skip..];
+                    // SAFETY: `dst_p` covers at least `m * n` elements of `dst`. The reads
+                    // from `lhs_p` / `rhs_p` rely on the caller-provided dims and strides
+                    // describing memory inside the storages; this is not re-validated here.
+                    unsafe {
+                        gemm(
+                            // m: usize,
+                            m,
+                            // n: usize,
+                            n,
+                            // k: usize,
+                            k,
+                            // dst: *mut T,
+                            dst_p.as_mut_ptr(),
+                            // dst_cs: isize,
+                            dst_cs as isize,
+                            // dst_rs: isize,
+                            dst_rs as isize,
+                            // read_dst: bool,
+                            false,
+                            // lhs: *const T,
+                            lhs_p.as_ptr(),
+                            // lhs_cs: isize,
+                            lhs_cs as isize,
+                            // lhs_rs: isize,
+                            lhs_rs as isize,
+                            // rhs: *const T,
+                            rhs_p.as_ptr(),
+                            // rhs_cs: isize,
+                            rhs_cs as isize,
+                            // rhs_rs: isize,
+                            rhs_rs as isize,
+                            // alpha: T,
+                            $one,
+                            // beta: T,
+                            $one,
+                            // conj_dst: bool,
+                            false,
+                            // conj_lhs: bool,
+                            false,
+                            // conj_rhs: bool,
+                            true,
+                            // parallelism: Parallelism
+                            Parallelism::None,
+                        )
+                    }
+                }
+                dst
+            }};
+        }
+
+        match (self, rhs) {
+            (CpuStorage::F16(lhs), CpuStorage::F16(rhs)) => {
+                Ok(Self::F16(batched_gemm!(lhs, rhs, f16::ZERO, f16::ONE)))
+            }
+            (CpuStorage::F32(lhs), CpuStorage::F32(rhs)) => {
+                Ok(Self::F32(batched_gemm!(lhs, rhs, 0f32, 1f32)))
+            }
+            (CpuStorage::F64(lhs), CpuStorage::F64(rhs)) => {
+                Ok(Self::F64(batched_gemm!(lhs, rhs, 0f64, 1f64)))
+            }
+            _ => {
+                // Callers are expected to have checked the dtypes already, so this is a
+                // defensive fallback.
+                Err(Error::DTypeMismatchBinaryOp {
+                    lhs: self.dtype(),
+                    rhs: rhs.dtype(),
+                    op: "matmul",
+                })
+            }
+        }
+    }
+
+    /// Creates a storage of the requested `dtype` with `shape.elem_count()` elements, all
+    /// set to one.
+    pub(crate) fn ones_impl(shape: &Shape, dtype: DType) -> Self {
+        let len = shape.elem_count();
+        match dtype {
+            DType::F64 => Self::F64(vec![1f64; len]),
+            DType::F32 => Self::F32(vec![1f32; len]),
+            DType::F16 => Self::F16(vec![f16::ONE; len]),
+            DType::BF16 => Self::BF16(vec![bf16::ONE; len]),
+            DType::U32 => Self::U32(vec![1u32; len]),
+        }
+    }
+
+    /// Creates a storage of the requested `dtype` with `shape.elem_count()` elements, all
+    /// set to zero.
+    pub(crate) fn zeros_impl(shape: &Shape, dtype: DType) -> Self {
+        let len = shape.elem_count();
+        match dtype {
+            DType::F64 => Self::F64(vec![0f64; len]),
+            DType::F32 => Self::F32(vec![0f32; len]),
+            DType::F16 => Self::F16(vec![f16::ZERO; len]),
+            DType::BF16 => Self::BF16(vec![bf16::ZERO; len]),
+            DType::U32 => Self::U32(vec![0u32; len]),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{Device, Tensor};
+
+    /// Checks `Tensor::matmul` on the CPU device for square, outer-product, rectangular and
+    /// batched inputs.
+    #[test]
+    fn simple_matmul() -> Result<()> {
+        let cpu = Device::Cpu;
+
+        // (2, 2) x (2, 2)
+        let lhs_values = vec![1.0f32, 2.0, 3.0, 4.0];
+        let rhs_values = vec![1.0f32, 2.0, 3.0, 4.0];
+        let lhs = Tensor::from_slice(&lhs_values, (2, 2), &cpu)?;
+        let rhs = Tensor::from_slice(&rhs_values, (2, 2), &cpu)?;
+        let product = lhs.matmul(&rhs)?;
+        assert_eq!(product.to_vec2::<f32>()?, &[&[7.0f32, 10.0], &[15.0, 22.0]]);
+
+        // (2, 1) x (1, 2)
+        let lhs_values = vec![1.0f32, 2.0];
+        let rhs_values = vec![3.0f32, 4.0];
+        let lhs = Tensor::from_slice(&lhs_values, (2, 1), &cpu)?;
+        let rhs = Tensor::from_slice(&rhs_values, (1, 2), &cpu)?;
+        let product = lhs.matmul(&rhs)?;
+        assert_eq!(product.to_vec2::<f32>()?, &[&[3.0, 4.0], &[6.0, 8.0]]);
+
+        // (2, 3) x (3, 2)
+        let lhs_values: Vec<_> = (0..6).map(|i| i as f32).collect();
+        let rhs_values: Vec<_> = (0..6).map(|i| (i + 2) as f32).collect();
+        let lhs = Tensor::from_slice(&lhs_values, (2, 3), &cpu)?;
+        let rhs = Tensor::from_slice(&rhs_values, (3, 2), &cpu)?;
+        let product = lhs.matmul(&rhs)?;
+        assert_eq!(product.to_vec2::<f32>()?, &[&[16., 19.], &[52., 64.]]);
+
+        // Batched: (2, 2, 3) x (2, 3, 2)
+        let lhs_values: Vec<_> = (0..12).map(|i| i as f32).collect();
+        let rhs_values: Vec<_> = (0..12).map(|i| (i + 2) as f32).collect();
+        let lhs = Tensor::from_slice(&lhs_values, (2, 2, 3), &cpu)?;
+        let rhs = Tensor::from_slice(&rhs_values, (2, 3, 2), &cpu)?;
+        let product = lhs.matmul(&rhs)?;
+        assert_eq!(
+            product.to_vec3::<f32>()?,
+            &[&[&[16., 19.], &[52., 64.]], &[&[214., 235.], &[304., 334.]]]
+        );
+        Ok(())
+    }
+}
--- a/candle-core/src/cuda_backend.rs
+++ b/candle-core/src/cuda_backend.rs
@ -0,0 +1,978 @@
+use crate::{CpuStorage, DType, Shape};
+use candle_kernels as kernels;
+use cudarc::cublas::{Gemm, GemmConfig, StridedBatchedConfig};
+use cudarc::driver::{CudaFunction, CudaSlice, LaunchAsync, LaunchConfig};
+use half::{bf16, f16};
+use std::sync::Arc;
+
+/// Errors produced by the CUDA backend, either wrapped from cudarc or raised by this module.
+#[derive(thiserror::Error, Debug)]
+pub enum CudaError {
+    /// Error reported by the cudarc driver API, displayed transparently.
+    #[error(transparent)]
+    Cuda(#[from] cudarc::driver::DriverError),
+
+    /// Error reported by cudarc's NVRTC compilation layer, displayed transparently.
+    #[error(transparent)]
+    Compiler(#[from] cudarc::nvrtc::CompileError),
+
+    /// Error reported by cudarc's cuBLAS bindings, displayed transparently.
+    #[error(transparent)]
+    Cublas(#[from] cudarc::cublas::result::CublasError),
+
+    /// The operation named `op` was given a non-contiguous tensor.
+    #[error("{op} only supports contiguous tensors")]
+    RequiresContiguous { op: &'static str },
+
+    /// The kernel `module_name` could not be retrieved from the device.
+    #[error("missing kernel '{module_name}'")]
+    MissingKernel { module_name: String },
+
+    /// Internal error carrying a static description.
+    #[error("internal error '{0}'")]
+    InternalError(&'static str),
+
+    /// The strides passed to a matmul do not match a layout supported by the backend.
+    #[error("matmul is only supported for contiguous tensors lstride: {lhs_stride:?} rstride: {rhs_stride:?} mnk: {mnk:?}")]
+    MatMulNonContiguous {
+        lhs_stride: Vec<usize>,
+        rhs_stride: Vec<usize>,
+        mnk: (usize, usize, usize),
+    },
+
+    /// A storage had dtype `got` where `expected` was required; `msg` gives the context.
+    #[error("{msg}, expected: {expected:?}, got: {got:?}")]
+    UnexpectedDType {
+        msg: &'static str,
+        expected: DType,
+        got: DType,
+    },
+
+    /// Loading the PTX for `module_name` failed with the driver error `cuda`.
+    #[error("{cuda} when loading {module_name}")]
+    Load {
+        cuda: cudarc::driver::DriverError,
+        module_name: String,
+    },
+}
+
+type Result<T> = std::result::Result<T, CudaError>;
+
+/// Process-wide unique identifier attached to each cuda device handle.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub(crate) struct DeviceId(usize);
+
+impl DeviceId {
+    /// Returns a fresh identifier taken from a global atomic counter that starts at 1.
+    fn new() -> Self {
+        // https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805
+        use std::sync::atomic::{AtomicUsize, Ordering};
+        static NEXT_ID: AtomicUsize = AtomicUsize::new(1);
+        let id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+        Self(id)
+    }
+}
+
+/// Handle to a CUDA device: a cudarc driver device, a cuBLAS handle and a unique id.
+///
+/// Cloning is cheap: the driver device and the cuBLAS handle sit behind `Arc`s and the id
+/// is copied, so clones compare equal under `same_id`.
+#[derive(Debug, Clone)]
+pub struct CudaDevice {
+    // Identifier assigned when the handle is created; compared by `same_id`.
+    id: DeviceId,
+    // Underlying cudarc device; also exposed through `Deref`.
+    device: Arc<cudarc::driver::CudaDevice>,
+    // cuBLAS handle created for `device`.
+    #[allow(dead_code)]
+    blas: Arc<cudarc::cublas::CudaBlas>,
+}
+
+/// Gives direct access to the methods of the wrapped cudarc device.
+impl std::ops::Deref for CudaDevice {
+    type Target = Arc<cudarc::driver::CudaDevice>;
+
+    fn deref(&self) -> &Arc<cudarc::driver::CudaDevice> {
+        let Self { device, .. } = self;
+        device
+    }
+}
+
+impl CudaDevice {
+    /// Creates a handle for the CUDA device with the given `ordinal`.
+    ///
+    /// A cuBLAS handle is created for the device and a fresh `DeviceId` is assigned.
+    ///
+    /// # Errors
+    ///
+    /// Propagates errors from creating the cudarc device or the cuBLAS handle.
+    pub(crate) fn new(ordinal: usize) -> Result<Self> {
+        let device = cudarc::driver::CudaDevice::new(ordinal)?;
+        let blas = cudarc::cublas::CudaBlas::new(device.clone())?;
+        Ok(Self {
+            id: DeviceId::new(),
+            device,
+            blas: Arc::new(blas),
+        })
+    }
+
+    /// Returns true when both handles carry the same `DeviceId`.
+    pub(crate) fn same_id(&self, rhs: &Self) -> bool {
+        self.id == rhs.id
+    }
+
+    /// Returns the ordinal reported by the underlying cudarc device.
+    pub(crate) fn ordinal(&self) -> usize {
+        self.device.ordinal()
+    }
+
+    /// Allocates a zero-initialized storage of `dtype` with `shape.elem_count()` elements.
+    ///
+    /// # Errors
+    ///
+    /// Propagates allocation errors from cudarc.
+    pub(crate) fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
+        let elem_count = shape.elem_count();
+        let slice = match dtype {
+            DType::U32 => {
+                let data = self.alloc_zeros::<u32>(elem_count)?;
+                CudaStorageSlice::U32(data)
+            }
+            DType::BF16 => {
+                let data = self.alloc_zeros::<bf16>(elem_count)?;
+                CudaStorageSlice::BF16(data)
+            }
+            DType::F16 => {
+                let data = self.alloc_zeros::<f16>(elem_count)?;
+                CudaStorageSlice::F16(data)
+            }
+            DType::F32 => {
+                let data = self.alloc_zeros::<f32>(elem_count)?;
+                CudaStorageSlice::F32(data)
+            }
+            DType::F64 => {
+                let data = self.alloc_zeros::<f64>(elem_count)?;
+                CudaStorageSlice::F64(data)
+            }
+        };
+        Ok(CudaStorage {
+            slice,
+            device: self.clone(),
+        })
+    }
+
+    /// Allocates a storage of `dtype` with `shape.elem_count()` elements and fills it with
+    /// `v` by launching the matching `fill_*` kernel from `kernels::FILL`.
+    ///
+    /// `v` is converted from `f64` to the element type (`as` casts for `u32`/`f32`,
+    /// `from_f64` for `bf16`/`f16`).
+    ///
+    /// # Errors
+    ///
+    /// Propagates allocation, kernel loading and kernel launch errors.
+    pub(crate) fn const_impl(&self, v: f64, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
+        let elem_count = shape.elem_count();
+        // NOTE(review): `as u32` truncates element counts above `u32::MAX`; confirm that the
+        // fill kernels tolerate a launch configuration derived from the truncated value.
+        let cfg = LaunchConfig::for_num_elems(elem_count as u32);
+        let slice = match dtype {
+            DType::U32 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<u32>(elem_count) }?;
+                let func = self.get_or_load_func("fill_u32", kernels::FILL)?;
+                let params = (&data, v as u32, elem_count);
+                // SAFETY: ffi.
+                unsafe { func.launch(cfg, params) }?;
+                CudaStorageSlice::U32(data)
+            }
+            DType::BF16 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<bf16>(elem_count) }?;
+                let func = self.get_or_load_func("fill_bf16", kernels::FILL)?;
+                let params = (&data, bf16::from_f64(v), elem_count);
+                // SAFETY: ffi.
+                unsafe { func.launch(cfg, params) }?;
+                CudaStorageSlice::BF16(data)
+            }
+            DType::F16 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<f16>(elem_count) }?;
+                let func = self.get_or_load_func("fill_f16", kernels::FILL)?;
+                let params = (&data, f16::from_f64(v), elem_count);
+                // SAFETY: ffi.
+                unsafe { func.launch(cfg, params) }?;
+                CudaStorageSlice::F16(data)
+            }
+            DType::F32 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<f32>(elem_count) }?;
+                let func = self.get_or_load_func("fill_f32", kernels::FILL)?;
+                let params = (&data, v as f32, elem_count);
+                // SAFETY: ffi.
+                unsafe { func.launch(cfg, params) }?;
+                CudaStorageSlice::F32(data)
+            }
+            DType::F64 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<f64>(elem_count) }?;
+                let func = self.get_or_load_func("fill_f64", kernels::FILL)?;
+                let params = (&data, v, elem_count);
+                // SAFETY: ffi.
+                unsafe { func.launch(cfg, params) }?;
+                CudaStorageSlice::F64(data)
+            }
+        };
+        Ok(CudaStorage {
+            slice,
+            device: self.clone(),
+        })
+    }
+
+    /// Creates a storage of `dtype` filled with ones; shorthand for `const_impl(1., ..)`.
+    pub(crate) fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
+        self.const_impl(1., shape, dtype)
+    }
+
+    /// Copies a CPU storage to this device with `htod_sync_copy`, keeping its dtype.
+    ///
+    /// # Errors
+    ///
+    /// Propagates errors from the host-to-device copy.
+    pub(crate) fn cuda_from_cpu_storage(&self, storage: &CpuStorage) -> Result<CudaStorage> {
+        let slice = match storage {
+            CpuStorage::U32(storage) => {
+                let data = self.htod_sync_copy(storage)?;
+                CudaStorageSlice::U32(data)
+            }
+            CpuStorage::BF16(storage) => {
+                let data = self.htod_sync_copy(storage)?;
+                CudaStorageSlice::BF16(data)
+            }
+            CpuStorage::F16(storage) => {
+                let data = self.htod_sync_copy(storage)?;
+                CudaStorageSlice::F16(data)
+            }
+            CpuStorage::F32(storage) => {
+                let data = self.htod_sync_copy(storage)?;
+                CudaStorageSlice::F32(data)
+            }
+            CpuStorage::F64(storage) => {
+                let data = self.htod_sync_copy(storage)?;
+                CudaStorageSlice::F64(data)
+            }
+        };
+        Ok(CudaStorage {
+            slice,
+            device: self.clone(),
+        })
+    }
+
+    /// Returns the kernel named `module_name`, loading it from `ptx` on first use.
+    ///
+    /// The same string is used both as the module name and as the function name.
+    ///
+    /// # Errors
+    ///
+    /// * `CudaError::Load` when loading the PTX fails.
+    /// * `CudaError::MissingKernel` when the function cannot be retrieved afterwards.
+    fn get_or_load_func(&self, module_name: &str, ptx: &'static str) -> Result<CudaFunction> {
+        if !self.has_func(module_name, module_name) {
+            // Leaking the string here is a bit sad but we need a &'static str and this is only
+            // done once per kernel name.
+            let static_module_name = Box::leak(module_name.to_string().into_boxed_str());
+            self.load_ptx(ptx.into(), module_name, &[static_module_name])
+                .map_err(|cuda| CudaError::Load {
+                    cuda,
+                    module_name: module_name.to_string(),
+                })?;
+        }
+        self.get_func(module_name, module_name)
+            // Build the error lazily: `ok_or` would evaluate its argument eagerly and so
+            // allocate the `String` on every call, including the successful ones.
+            .ok_or_else(|| CudaError::MissingKernel {
+                module_name: module_name.to_string(),
+            })
+    }
+}
+
+/// Device buffer tagged with its element type; one variant per supported `DType`.
+#[derive(Debug)]
+enum CudaStorageSlice {
+    U32(CudaSlice<u32>),
+    BF16(CudaSlice<bf16>),
+    F16(CudaSlice<f16>),
+    F32(CudaSlice<f32>),
+    F64(CudaSlice<f64>),
+}
+
+/// Tensor storage living on a CUDA device: a typed device buffer plus the device handle
+/// it is associated with.
+#[derive(Debug)]
+pub struct CudaStorage {
+    // Typed device buffer holding the elements.
+    slice: CudaStorageSlice,
+    // Handle of the device associated with `slice`.
+    device: CudaDevice,
+}
+
+/// Builds the cuBLAS strided-batched gemm configuration for a row-major matmul.
+///
+/// `(b, m, n, k)` are the batch count and the matmul dimensions: the lhs matrices are
+/// `m x k`, the rhs matrices are `k x n` and the outputs are `m x n`. cuBLAS works on
+/// column-major data, so the operands are swapped: the rhs is passed as cuBLAS operand `a`
+/// and the lhs as operand `b`, with `m` and `n` exchanged in the resulting config.
+///
+/// Each operand must have its last two dims either contiguous or transposed-contiguous;
+/// the transposed case is mapped to `CUBLAS_OP_T`.
+///
+/// # Errors
+///
+/// Returns `CudaError::MatMulNonContiguous` when a stride slice has fewer than two entries
+/// or when the last two strides of an operand match neither supported layout.
+fn gemm_config<T>(
+    alpha: T,
+    beta: T,
+    (b, m, n, k): (usize, usize, usize, usize),
+    lhs_stride: &[usize],
+    rhs_stride: &[usize],
+) -> Result<StridedBatchedConfig<T>> {
+    // https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-gemm
+    use cudarc::cublas::sys::cublasOperation_t;
+
+    let unsupported_layout = || CudaError::MatMulNonContiguous {
+        lhs_stride: lhs_stride.to_vec(),
+        rhs_stride: rhs_stride.to_vec(),
+        mnk: (m, n, k),
+    };
+
+    // The last two strides of each operand are read below; fail cleanly instead of
+    // underflowing `len() - 2` when an operand has fewer than two dims.
+    if lhs_stride.len() < 2 || rhs_stride.len() < 2 {
+        return Err(unsupported_layout());
+    }
+
+    let rhs_m1 = rhs_stride[rhs_stride.len() - 1];
+    let rhs_m2 = rhs_stride[rhs_stride.len() - 2];
+    let lhs_m1 = lhs_stride[lhs_stride.len() - 1];
+    let lhs_m2 = lhs_stride[lhs_stride.len() - 2];
+    // The a tensor has dims batching, k, n (rhs)
+    let (lda, transa) = if rhs_m1 == 1 && rhs_m2 == n {
+        (n as i32, cublasOperation_t::CUBLAS_OP_N)
+    } else if rhs_m1 == k && rhs_m2 == 1 {
+        (k as i32, cublasOperation_t::CUBLAS_OP_T)
+    } else {
+        return Err(unsupported_layout());
+    };
+    // The b tensor has dims batching, m, k (lhs)
+    let (ldb, transb) = if lhs_m1 == 1 && lhs_m2 == k {
+        (k as i32, cublasOperation_t::CUBLAS_OP_N)
+    } else if lhs_m1 == m && lhs_m2 == 1 {
+        (m as i32, cublasOperation_t::CUBLAS_OP_T)
+    } else {
+        return Err(unsupported_layout());
+    };
+    // The setup below was copied from:
+    // https://github.com/lebedov/scikit-cuda/blob/7e7300474286019c917a6c8a4bca59405c64fbce/tests/test_cublas.py#L531
+    let gemm = GemmConfig {
+        alpha,
+        beta,
+        m: n as i32,
+        n: m as i32,
+        k: k as i32,
+        lda,
+        ldb,
+        ldc: n as i32,
+        transa,
+        transb,
+    };
+    Ok(StridedBatchedConfig {
+        batch_size: b as i32,
+        gemm,
+        // Batch strides follow the operand mapping above: `a` is the rhs, whose matrices
+        // hold `n * k` elements, and `b` is the lhs, whose matrices hold `m * k` elements.
+        stride_a: (n * k) as i64,
+        stride_b: (m * k) as i64,
+        stride_c: (m * n) as i64,
+    })
+}
+
+impl CudaStorage {
+    /// Duplicates the storage by calling `try_clone` on the device buffer; the device
+    /// handle is cloned alongside it.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error returned by the buffer's `try_clone`.
+    pub fn try_clone(&self) -> Result<Self> {
+        let Self { slice, device } = self;
+        let slice = match slice {
+            CudaStorageSlice::F64(buf) => CudaStorageSlice::F64(buf.try_clone()?),
+            CudaStorageSlice::F32(buf) => CudaStorageSlice::F32(buf.try_clone()?),
+            CudaStorageSlice::F16(buf) => CudaStorageSlice::F16(buf.try_clone()?),
+            CudaStorageSlice::BF16(buf) => CudaStorageSlice::BF16(buf.try_clone()?),
+            CudaStorageSlice::U32(buf) => CudaStorageSlice::U32(buf.try_clone()?),
+        };
+        Ok(Self {
+            slice,
+            device: device.clone(),
+        })
+    }
+
+    /// Returns the `DType` matching the variant of the underlying device buffer.
+    pub fn dtype(&self) -> DType {
+        match &self.slice {
+            CudaStorageSlice::F64(_) => DType::F64,
+            CudaStorageSlice::F32(_) => DType::F32,
+            CudaStorageSlice::F16(_) => DType::F16,
+            CudaStorageSlice::BF16(_) => DType::BF16,
+            CudaStorageSlice::U32(_) => DType::U32,
+        }
+    }
+
+    /// Returns the device handle stored alongside the buffer.
+    pub fn device(&self) -> &CudaDevice {
+        let Self { device, .. } = self;
+        device
+    }
+
+    /// Casts the storage to `dtype` by launching the `cast_{src}_{dst}` kernel from
+    /// `kernels::CAST`.
+    ///
+    /// The dims of `shape` followed by `stride` are uploaded to the device as a single
+    /// buffer and handed to the kernel together with the element count, the number of dims,
+    /// the input device pointer and a freshly allocated output buffer.
+    ///
+    /// # Errors
+    ///
+    /// Propagates errors from the host-to-device copy, kernel loading, allocation and
+    /// kernel launch.
+    pub(crate) fn to_dtype(&self, shape: &Shape, stride: &[usize], dtype: DType) -> Result<Self> {
+        use cudarc::driver::DevicePtr;
+        let device = self.device();
+        let dims = shape.dims();
+        let num_dims = dims.len();
+        let elem_count = shape.elem_count();
+        let launch_cfg = LaunchConfig::for_num_elems(elem_count as u32);
+        let dims_and_strides = device.htod_copy([dims, stride].concat())?;
+        let src_ptr = match &self.slice {
+            CudaStorageSlice::F64(src) => src.device_ptr(),
+            CudaStorageSlice::F32(src) => src.device_ptr(),
+            CudaStorageSlice::F16(src) => src.device_ptr(),
+            CudaStorageSlice::BF16(src) => src.device_ptr(),
+            CudaStorageSlice::U32(src) => src.device_ptr(),
+        };
+        let kernel_name = format!("cast_{}_{}", self.dtype().as_str(), dtype.as_str());
+        let cast_fn = device.get_or_load_func(&kernel_name, kernels::CAST)?;
+        let slice = match dtype {
+            DType::F64 => {
+                // The output buffer is allocated uninitialized and passed to the kernel.
+                let dst = unsafe { device.alloc::<f64>(elem_count) }?;
+                let args = (elem_count, num_dims, &dims_and_strides, *src_ptr, &dst);
+                unsafe { cast_fn.launch(launch_cfg, args) }?;
+                CudaStorageSlice::F64(dst)
+            }
+            DType::F32 => {
+                let dst = unsafe { device.alloc::<f32>(elem_count) }?;
+                let args = (elem_count, num_dims, &dims_and_strides, *src_ptr, &dst);
+                unsafe { cast_fn.launch(launch_cfg, args) }?;
+                CudaStorageSlice::F32(dst)
+            }
+            DType::F16 => {
+                let dst = unsafe { device.alloc::<f16>(elem_count) }?;
+                let args = (elem_count, num_dims, &dims_and_strides, *src_ptr, &dst);
+                unsafe { cast_fn.launch(launch_cfg, args) }?;
+                CudaStorageSlice::F16(dst)
+            }
+            DType::BF16 => {
+                let dst = unsafe { device.alloc::<bf16>(elem_count) }?;
+                let args = (elem_count, num_dims, &dims_and_strides, *src_ptr, &dst);
+                unsafe { cast_fn.launch(launch_cfg, args) }?;
+                CudaStorageSlice::BF16(dst)
+            }
+            DType::U32 => {
+                let dst = unsafe { device.alloc::<u32>(elem_count) }?;
+                let args = (elem_count, num_dims, &dims_and_strides, *src_ptr, &dst);
+                unsafe { cast_fn.launch(launch_cfg, args) }?;
+                CudaStorageSlice::U32(dst)
+            }
+        };
+        Ok(Self {
+            slice,
+            device: device.clone(),
+        })
+    }
+
+    /// Launches the `affine_<dtype>` kernel from `kernels::AFFINE` over this storage and
+    /// returns the result as a newly allocated storage on the same device.
+    ///
+    /// `shape` and `stride` describe the input layout and are uploaded to the device as a
+    /// single concatenated buffer. `mul` and `add` are converted to the element type of the
+    /// storage before being handed to the kernel; for `u32` this is a plain `as` cast.
+    ///
+    /// # Errors
+    /// Propagates any failure from the host-to-device copy, the kernel lookup, the
+    /// allocation or the launch.
+    pub(crate) fn affine_impl(
+        &self,
+        shape: &Shape,
+        stride: &[usize],
+        mul: f64,
+        add: f64,
+    ) -> Result<Self> {
+        let dims = shape.dims();
+        let n = shape.elem_count();
+        let launch_cfg = LaunchConfig::for_num_elems(n as u32);
+        let dev = self.device();
+        let layout = dev.htod_copy([dims, stride].concat())?;
+        // Every dtype runs the same sequence; only the kernel name, the element type and the
+        // scalar conversion differ.
+        macro_rules! affine {
+            ($variant:ident, $ty:ty, $kernel:literal, $src:expr, $mul:expr, $add:expr) => {{
+                let func = dev.get_or_load_func($kernel, kernels::AFFINE)?;
+                // SAFETY: Set later by running the kernel.
+                let dst = unsafe { dev.alloc::<$ty>(n) }?;
+                let params = (n, dims.len(), &layout, $src, &dst, $mul, $add);
+                // SAFETY: ffi.
+                unsafe { func.launch(launch_cfg, params) }?;
+                CudaStorageSlice::$variant(dst)
+            }};
+        }
+        let slice = match &self.slice {
+            CudaStorageSlice::U32(src) => {
+                affine!(U32, u32, "affine_u32", src, mul as u32, add as u32)
+            }
+            CudaStorageSlice::BF16(src) => affine!(
+                BF16,
+                bf16,
+                "affine_bf16",
+                src,
+                bf16::from_f64(mul),
+                bf16::from_f64(add)
+            ),
+            CudaStorageSlice::F16(src) => affine!(
+                F16,
+                f16,
+                "affine_f16",
+                src,
+                f16::from_f64(mul),
+                f16::from_f64(add)
+            ),
+            CudaStorageSlice::F32(src) => {
+                affine!(F32, f32, "affine_f32", src, mul as f32, add as f32)
+            }
+            CudaStorageSlice::F64(src) => affine!(F64, f64, "affine_f64", src, mul, add),
+        };
+        Ok(Self {
+            slice,
+            device: dev.clone(),
+        })
+    }
+
+    /// Launches the `sum_<dtype>` kernel from `kernels::REDUCE` to reduce this storage over
+    /// `sum_dims`, returning a new zero-initialised-then-filled storage on the same device.
+    ///
+    /// The device receives one concatenated buffer holding, in order: the source dims, the
+    /// source strides, the length of each summed dim, and for each summed dim the product of
+    /// the dims to its right.
+    ///
+    /// # Panics
+    /// Panics if an entry of `sum_dims` is not a valid index into the shape, or if a summed
+    /// dimension has length zero (division by zero).
+    pub(crate) fn sum(&self, shape: &Shape, stride: &[usize], sum_dims: &[usize]) -> Result<Self> {
+        let src_dims = shape.dims();
+        let el = shape.elem_count();
+        // The output element count is the input count divided by each summed dim's length.
+        let dst_el = sum_dims.iter().fold(el, |acc, &d| acc / src_dims[d]);
+        // Sort the sum_dims as they have to be processed from left to right when converting the
+        // indexes.
+        let mut ordered = sum_dims.to_vec();
+        ordered.sort();
+        let mut lens: Vec<usize> = Vec::with_capacity(ordered.len());
+        let mut inner: Vec<usize> = Vec::with_capacity(ordered.len());
+        for &d in ordered.iter() {
+            lens.push(src_dims[d]);
+            inner.push(src_dims[d + 1..].iter().product::<usize>());
+        }
+        let launch_cfg = LaunchConfig::for_num_elems(el as u32);
+        let dev = self.device();
+        let info = dev.htod_copy([src_dims, stride, lens.as_slice(), inner.as_slice()].concat())?;
+        // Identical launch sequence for every dtype; only kernel name and element type vary.
+        macro_rules! reduce {
+            ($variant:ident, $ty:ty, $kernel:literal, $src:expr) => {{
+                let func = dev.get_or_load_func($kernel, kernels::REDUCE)?;
+                let dst = dev.alloc_zeros::<$ty>(dst_el)?;
+                let params = (el, src_dims.len(), ordered.len(), &info, $src, &dst);
+                // SAFETY: ffi.
+                unsafe { func.launch(launch_cfg, params) }?;
+                CudaStorageSlice::$variant(dst)
+            }};
+        }
+        let slice = match &self.slice {
+            CudaStorageSlice::U32(src) => reduce!(U32, u32, "sum_u32", src),
+            CudaStorageSlice::BF16(src) => reduce!(BF16, bf16, "sum_bf16", src),
+            CudaStorageSlice::F16(src) => reduce!(F16, f16, "sum_f16", src),
+            CudaStorageSlice::F32(src) => reduce!(F32, f32, "sum_f32", src),
+            CudaStorageSlice::F64(src) => reduce!(F64, f64, "sum_f64", src),
+        };
+        Ok(Self {
+            slice,
+            device: dev.clone(),
+        })
+    }
+
+    /// Not implemented for the CUDA backend yet: both arguments are ignored and an
+    /// `InternalError` is always returned.
+    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()> {
+        let not_implemented = CudaError::InternalError("TODO: implement divide_by_sum_over_dim");
+        Err(not_implemented)
+    }
+
+    /// Applies the unary op `U` element-wise by launching the kernel named by
+    /// `U::KERNEL_<DTYPE>` from `kernels::UNARY`, returning a new storage on the same device.
+    ///
+    /// `shape` and `stride` describe the input layout and are uploaded to the device as a
+    /// single concatenated buffer.
+    ///
+    /// # Panics
+    /// Panics (via `todo!`) for `u32` storages, for which no unary kernels exist.
+    pub(crate) fn unary_impl<U: crate::op::UnaryOp>(
+        &self,
+        shape: &Shape,
+        stride: &[usize],
+    ) -> Result<Self> {
+        let dims = shape.dims();
+        let n = shape.elem_count();
+        let launch_cfg = LaunchConfig::for_num_elems(n as u32);
+        let dev = &self.device;
+        let layout = dev.htod_copy([dims, stride].concat())?;
+        // Shared launch sequence; only the kernel name and element type change per dtype.
+        macro_rules! unary {
+            ($variant:ident, $ty:ty, $kernel:expr, $src:expr) => {{
+                let func = dev.get_or_load_func($kernel, kernels::UNARY)?;
+                // SAFETY: Set later by running the kernel.
+                let dst = unsafe { dev.alloc::<$ty>(n) }?;
+                let params = (n, dims.len(), &layout, $src, &dst);
+                // SAFETY: ffi.
+                unsafe { func.launch(launch_cfg, params) }?;
+                CudaStorageSlice::$variant(dst)
+            }};
+        }
+        let slice = match &self.slice {
+            CudaStorageSlice::U32(_arg) => todo!("No unary kernels for u32"),
+            CudaStorageSlice::BF16(src) => unary!(BF16, bf16, U::KERNEL_BF16, src),
+            CudaStorageSlice::F16(src) => unary!(F16, f16, U::KERNEL_F16, src),
+            CudaStorageSlice::F32(src) => unary!(F32, f32, U::KERNEL_F32, src),
+            CudaStorageSlice::F64(src) => unary!(F64, f64, U::KERNEL_F64, src),
+        };
+        Ok(Self {
+            slice,
+            device: dev.clone(),
+        })
+    }
+
+    /// Applies the binary op `B` element-wise to `self` and `rhs` by launching the kernel
+    /// named by `B::KERNEL_<DTYPE>` from `kernels::BINARY`.
+    ///
+    /// `shape` is shared by both operands while each has its own stride; dims and both
+    /// strides are uploaded to the device as one concatenated buffer. The result is a new
+    /// storage on the device of `self`.
+    ///
+    /// # Errors
+    /// Returns `CudaError::InternalError` when the two storages do not hold the same dtype,
+    /// and propagates any cudarc failure.
+    pub(crate) fn binary_impl<B: crate::op::BinaryOp>(
+        &self,
+        rhs: &Self,
+        shape: &Shape,
+        lhs_stride: &[usize],
+        rhs_stride: &[usize],
+    ) -> Result<Self> {
+        let dims = shape.dims();
+        let n = shape.elem_count();
+        let launch_cfg = LaunchConfig::for_num_elems(n as u32);
+        let dev = self.device();
+        let layout = dev.htod_copy([dims, lhs_stride, rhs_stride].concat())?;
+        // Shared launch sequence; only the kernel name and element type change per dtype.
+        macro_rules! binary {
+            ($variant:ident, $ty:ty, $kernel:expr, $a:expr, $b:expr) => {{
+                let func = dev.get_or_load_func($kernel, kernels::BINARY)?;
+                // SAFETY: Set later by running the kernel.
+                let dst = unsafe { dev.alloc::<$ty>(n) }?;
+                let params = (n, dims.len(), &layout, $a, $b, &dst);
+                // SAFETY: ffi
+                unsafe { func.launch(launch_cfg, params) }?;
+                CudaStorageSlice::$variant(dst)
+            }};
+        }
+        let slice = match (&self.slice, &rhs.slice) {
+            (CudaStorageSlice::BF16(a), CudaStorageSlice::BF16(b)) => {
+                binary!(BF16, bf16, B::KERNEL_BF16, a, b)
+            }
+            (CudaStorageSlice::F16(a), CudaStorageSlice::F16(b)) => {
+                binary!(F16, f16, B::KERNEL_F16, a, b)
+            }
+            (CudaStorageSlice::F32(a), CudaStorageSlice::F32(b)) => {
+                binary!(F32, f32, B::KERNEL_F32, a, b)
+            }
+            (CudaStorageSlice::F64(a), CudaStorageSlice::F64(b)) => {
+                binary!(F64, f64, B::KERNEL_F64, a, b)
+            }
+            (CudaStorageSlice::U32(a), CudaStorageSlice::U32(b)) => {
+                binary!(U32, u32, B::KERNEL_U32, a, b)
+            }
+            // The dtypes should have been checked at this point so this is an internal error.
+            _ => return Err(CudaError::InternalError("dtype mismatch in binary op")),
+        };
+        Ok(Self {
+            slice,
+            device: dev.clone(),
+        })
+    }
+
+    /// Copies the device buffer back to the host with a synchronous device-to-host copy and
+    /// wraps it in the `CpuStorage` variant matching the dtype.
+    ///
+    /// # Errors
+    /// Propagates any failure reported by `dtoh_sync_copy`.
+    pub(crate) fn to_cpu_storage(&self) -> Result<CpuStorage> {
+        let host = match &self.slice {
+            CudaStorageSlice::U32(data) => CpuStorage::U32(data.device().dtoh_sync_copy(data)?),
+            CudaStorageSlice::BF16(data) => CpuStorage::BF16(data.device().dtoh_sync_copy(data)?),
+            CudaStorageSlice::F16(data) => CpuStorage::F16(data.device().dtoh_sync_copy(data)?),
+            CudaStorageSlice::F32(data) => CpuStorage::F32(data.device().dtoh_sync_copy(data)?),
+            CudaStorageSlice::F64(data) => CpuStorage::F64(data.device().dtoh_sync_copy(data)?),
+        };
+        Ok(host)
+    }
+
+    /// Element-wise select: `self` holds the `u32` condition values and the output is built
+    /// from `t` and `f` by the `where_<dtype>` kernel from `kernels::TERNARY`.
+    ///
+    /// `shape` is shared by the three inputs; `stride`, `stride_t` and `stride_f` are the
+    /// strides of the condition, `t` and `f` respectively. Dims and the three strides are
+    /// uploaded to the device as one concatenated buffer. The result is a new storage on the
+    /// device of `self`.
+    ///
+    /// # Errors
+    /// * `CudaError::UnexpectedDType` when `self` is not a `u32` storage.
+    /// * `CudaError::InternalError` when `t` and `f` do not hold the same dtype.
+    /// * Any cudarc failure from the copy, kernel lookup, allocation or launch.
+    pub(crate) fn where_cond(
+        &self,
+        shape: &Shape,
+        stride: &[usize],
+        t: &Self,
+        stride_t: &[usize],
+        f: &Self,
+        stride_f: &[usize],
+    ) -> Result<Self> {
+        let ids = match &self.slice {
+            CudaStorageSlice::U32(slice) => slice,
+            _ => Err(CudaError::UnexpectedDType {
+                msg: "where conditions should be u32",
+                expected: DType::U32,
+                got: self.dtype(),
+            })?,
+        };
+        let dims = shape.dims();
+        let el = shape.elem_count();
+        let cfg = LaunchConfig::for_num_elems(el as u32);
+        let dev = self.device();
+        let ds = dev.htod_copy([dims, stride, stride_t, stride_f].concat())?;
+        // Shared launch sequence; only the kernel name and element type change per dtype.
+        macro_rules! select {
+            ($variant:ident, $ty:ty, $kernel:literal, $on_true:expr, $on_false:expr) => {{
+                let func = dev.get_or_load_func($kernel, kernels::TERNARY)?;
+                // SAFETY: Set later by running the kernel.
+                let out = unsafe { dev.alloc::<$ty>(el) }?;
+                let params = (el, dims.len(), &ds, ids, $on_true, $on_false, &out);
+                // SAFETY: ffi
+                unsafe { func.launch(cfg, params) }?;
+                CudaStorageSlice::$variant(out)
+            }};
+        }
+        let slice = match (&t.slice, &f.slice) {
+            (CudaStorageSlice::BF16(t), CudaStorageSlice::BF16(f)) => {
+                select!(BF16, bf16, "where_bf16", t, f)
+            }
+            (CudaStorageSlice::F16(t), CudaStorageSlice::F16(f)) => {
+                select!(F16, f16, "where_f16", t, f)
+            }
+            (CudaStorageSlice::F32(t), CudaStorageSlice::F32(f)) => {
+                select!(F32, f32, "where_f32", t, f)
+            }
+            (CudaStorageSlice::F64(t), CudaStorageSlice::F64(f)) => {
+                select!(F64, f64, "where_f64", t, f)
+            }
+            (CudaStorageSlice::U32(t), CudaStorageSlice::U32(f)) => {
+                select!(U32, u32, "where_u32", t, f)
+            }
+            // The dtypes should have been checked at this point so this is an internal error.
+            _ => return Err(CudaError::InternalError("dtype mismatch in where_cond op")),
+        };
+        let device = dev.clone();
+        Ok(Self { slice, device })
+    }
+
+    /// Embedding lookup: `self` holds the `u32` ids and `rhs` the table, processed by the
+    /// `emb_<dtype>` kernel from `kernels::EMBEDDINGS`.
+    ///
+    /// `shape` and `stride` describe the layout of the ids. The output holds
+    /// `shape.elem_count() * h_size` elements of the dtype of `rhs`, on the device of `self`.
+    /// `h_size` is the hidden size and `v_size` the vocab size; both are forwarded to the
+    /// kernel.
+    ///
+    /// # Errors
+    /// Returns `CudaError::UnexpectedDType` when `self` is not a `u32` storage and propagates
+    /// any cudarc failure.
+    pub(crate) fn embedding_impl(
+        &self,
+        shape: &Shape,
+        stride: &[usize],
+        rhs: &Self,
+        h_size: usize, // hidden size
+        v_size: usize, // vocab size
+    ) -> Result<Self> {
+        let ids = match &self.slice {
+            CudaStorageSlice::U32(slice) => slice,
+            _ => Err(CudaError::UnexpectedDType {
+                msg: "embedding ids should be u32",
+                expected: DType::U32,
+                got: self.dtype(),
+            })?,
+        };
+        let dims = shape.dims();
+        let n_ids = shape.elem_count();
+        let launch_cfg = LaunchConfig::for_num_elems(n_ids as u32);
+        let dev = self.device();
+        let layout = dev.htod_copy([dims, stride].concat())?;
+        // Shared launch sequence; only the kernel name and element type change per dtype.
+        macro_rules! embed {
+            ($variant:ident, $ty:ty, $kernel:literal, $table:expr) => {{
+                let func = dev.get_or_load_func($kernel, kernels::EMBEDDINGS)?;
+                // SAFETY: Set later by running the kernel.
+                let dst = unsafe { dev.alloc::<$ty>(n_ids * h_size) }?;
+                let params = (n_ids, dims.len(), &layout, ids, $table, &dst, h_size, v_size);
+                // SAFETY: ffi.
+                unsafe { func.launch(launch_cfg, params) }?;
+                CudaStorageSlice::$variant(dst)
+            }};
+        }
+        // The kernels below assume that rhs is contiguous.
+        let slice = match &rhs.slice {
+            CudaStorageSlice::U32(table) => embed!(U32, u32, "emb_u32", table),
+            CudaStorageSlice::BF16(table) => embed!(BF16, bf16, "emb_bf16", table),
+            CudaStorageSlice::F16(table) => embed!(F16, f16, "emb_f16", table),
+            CudaStorageSlice::F32(table) => embed!(F32, f32, "emb_f32", table),
+            CudaStorageSlice::F64(table) => embed!(F64, f64, "emb_f64", table),
+        };
+        Ok(Self {
+            slice,
+            device: dev.clone(),
+        })
+    }
+
+    /// Batched matrix multiplication through the device's BLAS handle.
+    ///
+    /// `(b, m, n, k)` are forwarded to `gemm_config` together with both strides; the output
+    /// holds `b * m * n` elements on the device of `self`. The operands are handed to
+    /// `gemm_strided_batched` in swapped order (`rhs` first, then `lhs`).
+    /// NOTE(review): presumably this accounts for row-major vs column-major layout — confirm
+    /// against `gemm_config`.
+    ///
+    /// # Errors
+    /// Returns `CudaError::InternalError` for any dtype pairing other than matching
+    /// f16/f32/f64 (or bf16, see below) and propagates `gemm_config` / cudarc failures.
+    ///
+    /// # Panics
+    /// Panics (via `todo!`) for bf16 operands, which are not supported yet.
+    pub(crate) fn matmul_impl(
+        &self,
+        rhs: &Self,
+        (b, m, n, k): (usize, usize, usize, usize),
+        lhs_stride: &[usize],
+        rhs_stride: &[usize],
+    ) -> Result<Self> {
+        let out_len = b * m * n;
+        let dev = &self.device;
+        // Shared gemm sequence; only the element type and the alpha/beta constants differ.
+        macro_rules! gemm {
+            ($variant:ident, $ty:ty, $alpha:expr, $beta:expr, $a:expr, $b:expr) => {{
+                let cfg = gemm_config($alpha, $beta, (b, m, n, k), lhs_stride, rhs_stride)?;
+                let mut dst = unsafe { dev.alloc::<$ty>(out_len) }?;
+                unsafe { dev.blas.gemm_strided_batched(cfg, $b, $a, &mut dst) }?;
+                CudaStorageSlice::$variant(dst)
+            }};
+        }
+        let slice = match (&self.slice, &rhs.slice) {
+            (CudaStorageSlice::BF16(_lhs), CudaStorageSlice::BF16(_rhs)) => todo!("bf16"),
+            (CudaStorageSlice::F16(x), CudaStorageSlice::F16(y)) => {
+                gemm!(F16, f16, f16::ONE, f16::ZERO, x, y)
+            }
+            (CudaStorageSlice::F32(x), CudaStorageSlice::F32(y)) => {
+                gemm!(F32, f32, 1., 0., x, y)
+            }
+            (CudaStorageSlice::F64(x), CudaStorageSlice::F64(y)) => {
+                gemm!(F64, f64, 1., 0., x, y)
+            }
+            _ => return Err(CudaError::InternalError("dtype mismatch in matmul op")),
+        };
+        Ok(Self {
+            slice,
+            device: dev.clone(),
+        })
+    }
+
+    /// Copies the elements of `self` described by `src_shape` / `src_stride`, starting at
+    /// `src_offset`, into `dst` starting at `dst_offset`.
+    ///
+    /// When the source layout is contiguous a plain device-to-device copy is used; otherwise
+    /// the `ucopy_<dtype>` kernel from `kernels::UNARY` walks the strided source.
+    ///
+    /// # Errors
+    /// Returns `CudaError::InternalError` when `self` and `dst` hold different dtypes and
+    /// propagates any cudarc failure.
+    ///
+    /// # Panics
+    /// Panics when the rank of `src_shape` differs from the length of `src_stride`.
+    pub(crate) fn copy_strided_src(
+        &self,
+        dst: &mut Self,
+        dst_offset: usize,
+        src_shape: &Shape,
+        src_stride: &[usize],
+        src_offset: usize,
+    ) -> Result<()> {
+        if src_shape.rank() != src_stride.len() {
+            panic!("incoherent shape and strides {src_shape:?} {src_stride:?}")
+        }
+        let dims = src_shape.dims();
+        let el_count = src_shape.elem_count();
+        let cfg = LaunchConfig::for_num_elems(el_count as u32);
+        let dev = &self.device;
+        let ds = dev.htod_copy([dims, src_stride].concat())?;
+        // Shared copy sequence; only the kernel name changes per dtype.
+        macro_rules! copy {
+            ($src:expr, $dst:expr, $kernel:literal) => {{
+                let src = $src.slice(src_offset..);
+                let mut dst = $dst.slice_mut(dst_offset..);
+                if src_shape.is_contiguous(src_stride) {
+                    dev.dtod_copy(&src, &mut dst)?
+                } else {
+                    let func = dev.get_or_load_func($kernel, kernels::UNARY)?;
+                    let params = (el_count, dims.len(), &ds, &src, &mut dst);
+                    // SAFETY: ffi.
+                    unsafe { func.launch(cfg, params) }?
+                }
+            }};
+        }
+        match (&self.slice, &mut dst.slice) {
+            (CudaStorageSlice::BF16(src), CudaStorageSlice::BF16(dst)) => {
+                copy!(src, dst, "ucopy_bf16")
+            }
+            (CudaStorageSlice::F16(src), CudaStorageSlice::F16(dst)) => {
+                copy!(src, dst, "ucopy_f16")
+            }
+            (CudaStorageSlice::F32(src), CudaStorageSlice::F32(dst)) => {
+                copy!(src, dst, "ucopy_f32")
+            }
+            (CudaStorageSlice::U32(src), CudaStorageSlice::U32(dst)) => {
+                copy!(src, dst, "ucopy_u32")
+            }
+            (CudaStorageSlice::F64(src), CudaStorageSlice::F64(dst)) => {
+                // Kernel names follow `<op>_<dtype>`; the f64 one was misspelled "ucopy_64".
+                copy!(src, dst, "ucopy_f64")
+            }
+            _ => {
+                return Err(CudaError::InternalError(
+                    "dtype mismatch in copy_strided op",
+                ))
+            }
+        }
+        Ok(())
+    }
+}
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@ -0,0 +1,159 @@
+use crate::{CpuStorage, DType, Result, Shape, Storage, WithDType};
+
+/// A `DeviceLocation` represents a physical device whereas multiple `Device`
+/// can live on the same location (typically for cuda devices).
+///
+/// Unlike `Device`, this type is `Copy` and hashable, so it can be compared and used as a
+/// map key.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub enum DeviceLocation {
+    /// The location reported by `Device::Cpu`.
+    Cpu,
+    /// A cuda device; `gpu_id` is the value of `CudaDevice::ordinal()` as filled in by
+    /// `Device::location`.
+    Cuda { gpu_id: usize },
+}
+
+/// Handle selecting where storage is created: on the cpu or on a cuda device.
+#[derive(Debug, Clone)]
+pub enum Device {
+    /// Storage is created as `CpuStorage`.
+    Cpu,
+    /// Storage is created through the wrapped `CudaDevice`.
+    Cuda(crate::CudaDevice),
+}
+
+// TODO: Should we back the cpu implementation using the NdArray crate or similar?
+/// Host-side values that can be turned into storage: scalars, slices and nested
+/// fixed-size arrays of a `WithDType` element type.
+pub trait NdArray {
+    /// Shape of the value, e.g. `()` for a scalar or `(M, N)` for a 2D array.
+    fn shape(&self) -> Result<Shape>;
+
+    /// Copies the elements into a new `CpuStorage` of the matching dtype.
+    fn to_cpu_storage(&self) -> CpuStorage;
+}
+
+/// A bare scalar is treated as a rank-0 array holding a single element.
+impl<S: WithDType> NdArray for S {
+    fn shape(&self) -> Result<Shape> {
+        let scalar_shape = Shape::from(());
+        Ok(scalar_shape)
+    }
+
+    fn to_cpu_storage(&self) -> CpuStorage {
+        let single = [*self];
+        <S as WithDType>::to_cpu_storage(&single)
+    }
+}
+
+/// A reference to a fixed-size array is a 1D array of length `N`.
+impl<S: WithDType, const N: usize> NdArray for &[S; N] {
+    fn shape(&self) -> Result<Shape> {
+        let len: usize = self.len();
+        Ok(Shape::from(len))
+    }
+
+    fn to_cpu_storage(&self) -> CpuStorage {
+        let data: &[S] = self.as_slice();
+        <S as WithDType>::to_cpu_storage(data)
+    }
+}
+
+/// A slice is a 1D array whose length is only known at runtime.
+impl<S: WithDType> NdArray for &[S] {
+    fn shape(&self) -> Result<Shape> {
+        let len: usize = self.len();
+        Ok(Shape::from(len))
+    }
+
+    fn to_cpu_storage(&self) -> CpuStorage {
+        let data: &[S] = self;
+        <S as WithDType>::to_cpu_storage(data)
+    }
+}
+
+/// A reference to an array of `M` rows of `N` elements is a 2D array of shape `(M, N)`.
+impl<S: WithDType, const N: usize, const M: usize> NdArray for &[[S; N]; M] {
+    fn shape(&self) -> Result<Shape> {
+        let dims = (M, N);
+        Ok(Shape::from(dims))
+    }
+
+    fn to_cpu_storage(&self) -> CpuStorage {
+        // Rows are laid out one after the other.
+        let flat: Vec<S> = self.concat();
+        <S as WithDType>::to_cpu_storage_owned(flat)
+    }
+}
+
+/// A reference to a triply nested array is a 3D array of shape `(N1, N2, N3)`.
+impl<S: WithDType, const N1: usize, const N2: usize, const N3: usize> NdArray
+    for &[[[S; N3]; N2]; N1]
+{
+    fn shape(&self) -> Result<Shape> {
+        let dims = (N1, N2, N3);
+        Ok(Shape::from(dims))
+    }
+
+    fn to_cpu_storage(&self) -> CpuStorage {
+        // Flatten with the innermost rows appended in order, outer index varying slowest.
+        let mut flat: Vec<S> = Vec::with_capacity(N1 * N2 * N3);
+        for plane in self.iter() {
+            for row in plane.iter() {
+                flat.extend_from_slice(row);
+            }
+        }
+        <S as WithDType>::to_cpu_storage_owned(flat)
+    }
+}
+
+impl Device {
+    /// Creates a cuda device for the given ordinal, failing if `CudaDevice::new` fails.
+    pub fn new_cuda(ordinal: usize) -> Result<Self> {
+        let device = crate::CudaDevice::new(ordinal)?;
+        Ok(Self::Cuda(device))
+    }
+
+    /// Returns true when both handles are the cpu, or when both are cuda devices that
+    /// `CudaDevice::same_id` considers identical. Mixed cpu/cuda pairs never match.
+    pub fn same_id(&self, rhs: &Self) -> bool {
+        match (self, rhs) {
+            (Self::Cuda(a), Self::Cuda(b)) => a.same_id(b),
+            (Self::Cpu, Self::Cpu) => true,
+            (Self::Cpu, Self::Cuda(_)) | (Self::Cuda(_), Self::Cpu) => false,
+        }
+    }
+
+    /// Returns the location of this device; for cuda the gpu id is the device ordinal.
+    pub fn location(&self) -> DeviceLocation {
+        if let Self::Cuda(device) = self {
+            DeviceLocation::Cuda {
+                gpu_id: device.ordinal(),
+            }
+        } else {
+            DeviceLocation::Cpu
+        }
+    }
+
+    /// Returns true for the `Cuda` variant.
+    pub fn is_cuda(&self) -> bool {
+        matches!(self, Self::Cuda(_))
+    }
+
+    /// Builds storage of the given shape and dtype using the backend's `ones_impl`.
+    pub(crate) fn ones(&self, shape: &Shape, dtype: DType) -> Result<Storage> {
+        let storage = match self {
+            Device::Cpu => Storage::Cpu(CpuStorage::ones_impl(shape, dtype)),
+            Device::Cuda(device) => Storage::Cuda(device.ones_impl(shape, dtype)?),
+        };
+        Ok(storage)
+    }
+
+    /// Builds storage of the given shape and dtype using the backend's `zeros_impl`.
+    pub(crate) fn zeros(&self, shape: &Shape, dtype: DType) -> Result<Storage> {
+        let storage = match self {
+            Device::Cpu => Storage::Cpu(CpuStorage::zeros_impl(shape, dtype)),
+            Device::Cuda(device) => Storage::Cuda(device.zeros_impl(shape, dtype)?),
+        };
+        Ok(storage)
+    }
+
+    /// Builds storage from a host array. The data always goes through a `CpuStorage`
+    /// first; for cuda devices it is then handed to `cuda_from_cpu_storage`.
+    pub(crate) fn storage<A: NdArray>(&self, array: A) -> Result<Storage> {
+        let host = array.to_cpu_storage();
+        let storage = match self {
+            Device::Cpu => Storage::Cpu(host),
+            Device::Cuda(device) => Storage::Cuda(device.cuda_from_cpu_storage(&host)?),
+        };
+        Ok(storage)
+    }
+
+    /// Same as `storage` but takes ownership of a flat vector, so the cpu path does not
+    /// need to copy the data.
+    pub(crate) fn storage_owned<S: WithDType>(&self, data: Vec<S>) -> Result<Storage> {
+        let host = <S as WithDType>::to_cpu_storage_owned(data);
+        let storage = match self {
+            Device::Cpu => Storage::Cpu(host),
+            Device::Cuda(device) => Storage::Cuda(device.cuda_from_cpu_storage(&host)?),
+        };
+        Ok(storage)
+    }
+}
--- a/candle-core/src/dtype.rs
+++ b/candle-core/src/dtype.rs
@ -0,0 +1,96 @@
+use crate::{CpuStorage, Error, Result};
+
+/// Element types that tensor storage can hold.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub enum DType {
+    /// Backed by `u32`.
+    U32,
+    /// Backed by `half::bf16`.
+    BF16,
+    /// Backed by `half::f16`.
+    F16,
+    /// Backed by `f32`.
+    F32,
+    /// Backed by `f64`.
+    F64,
+}
+
+impl DType {
+    /// Returns the lowercase name of this dtype, e.g. `"f32"` for [`DType::F32`].
+    pub fn as_str(&self) -> &'static str {
+        match *self {
+            DType::U32 => "u32",
+            DType::BF16 => "bf16",
+            DType::F16 => "f16",
+            DType::F32 => "f32",
+            DType::F64 => "f64",
+        }
+    }
+
+    /// Returns the size in bytes of a single element of this dtype.
+    pub fn size_in_bytes(&self) -> usize {
+        match *self {
+            DType::BF16 | DType::F16 => 2,
+            DType::U32 | DType::F32 => 4,
+            DType::F64 => 8,
+        }
+    }
+}
+
+/// Rust scalar types that map onto a [`DType`] and a matching `CpuStorage` variant.
+pub trait WithDType: Sized + Copy {
+    /// The dtype tag associated with this scalar type.
+    const DTYPE: DType;
+
+    /// Wraps an owned vector into the matching `CpuStorage` variant.
+    fn to_cpu_storage_owned(data: Vec<Self>) -> CpuStorage;
+
+    /// Copies `data` into a new vector and wraps it via `to_cpu_storage_owned`.
+    fn to_cpu_storage(data: &[Self]) -> CpuStorage {
+        Self::to_cpu_storage_owned(data.to_vec())
+    }
+
+    /// Borrows the storage as a slice of `Self`; the implementations generated
+    /// in this file return an error when the storage holds a different dtype.
+    fn cpu_storage_as_slice(s: &CpuStorage) -> Result<&[Self]>;
+    /// Mutable counterpart of `cpu_storage_as_slice`.
+    fn cpu_storage_as_mut_slice(s: &mut CpuStorage) -> Result<&mut [Self]>;
+    /// Consumes the storage and returns its underlying vector of `Self`.
+    fn cpu_storage_data(s: CpuStorage) -> Result<Vec<Self>>;
+}
+
+// Implements `WithDType` for the scalar type `$ty`, backed by the
+// `CpuStorage::$dtype` variant and tagged with `DType::$dtype`. Each accessor
+// fails with `Error::UnexpectedDType` when the storage holds another variant.
+macro_rules! with_dtype {
+    ($ty:ty, $dtype:ident) => {
+        impl WithDType for $ty {
+            const DTYPE: DType = DType::$dtype;
+
+            fn to_cpu_storage_owned(data: Vec<Self>) -> CpuStorage {
+                CpuStorage::$dtype(data)
+            }
+
+            fn cpu_storage_as_slice(s: &CpuStorage) -> Result<&[Self]> {
+                match s {
+                    CpuStorage::$dtype(values) => Ok(values),
+                    other => Err(Error::UnexpectedDType {
+                        msg: "unexpected dtype",
+                        expected: DType::$dtype,
+                        got: other.dtype(),
+                    }),
+                }
+            }
+
+            fn cpu_storage_as_mut_slice(s: &mut CpuStorage) -> Result<&mut [Self]> {
+                match s {
+                    CpuStorage::$dtype(values) => Ok(values),
+                    other => Err(Error::UnexpectedDType {
+                        msg: "unexpected dtype",
+                        expected: DType::$dtype,
+                        got: other.dtype(),
+                    }),
+                }
+            }
+
+            fn cpu_storage_data(s: CpuStorage) -> Result<Vec<Self>> {
+                match s {
+                    CpuStorage::$dtype(values) => Ok(values),
+                    other => Err(Error::UnexpectedDType {
+                        msg: "unexpected dtype",
+                        expected: DType::$dtype,
+                        got: other.dtype(),
+                    }),
+                }
+            }
+        }
+    };
+}
+with_dtype!(u32, U32);
+with_dtype!(half::f16, F16);
+with_dtype!(half::bf16, BF16);
+with_dtype!(f32, F32);
+with_dtype!(f64, F64);
--- a/candle-core/src/dummy_cuda_backend.rs
+++ b/candle-core/src/dummy_cuda_backend.rs
@ -0,0 +1,136 @@
+#![allow(dead_code)]
+use crate::{CpuStorage, DType, Error, Result, Shape};
+
+/// Error type with no variants, so a value of it can never be constructed.
+#[derive(thiserror::Error, Debug)]
+pub enum DummyError {}
+/// Name under which the placeholder error is re-exported when the `cuda`
+/// feature is disabled.
+pub type CudaError = DummyError;
+
+/// Placeholder for the CUDA device type, used when the `cuda` feature is disabled.
+#[derive(Debug, Clone)]
+pub struct CudaDevice;
+
+// Panics via `unimplemented!`; used by the stub methods whose return type
+// cannot carry an error.
+macro_rules! fail {
+    () => {
+        unimplemented!("cuda support has not been enabled")
+    };
+}
+
+/// Stub API of the CUDA device: fallible methods return
+/// `Error::NotCompiledWithCudaSupport`, `ordinal` panics through `fail!`.
+impl CudaDevice {
+    /// Always fails: no device can be created without CUDA support.
+    pub(crate) fn new(_: usize) -> Result<Self> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    /// Always reports the two devices as identical.
+    pub(crate) fn same_id(&self, _: &Self) -> bool {
+        true
+    }
+
+    /// Panics: there is no ordinal to report.
+    pub(crate) fn ordinal(&self) -> usize {
+        fail!()
+    }
+
+    /// Always fails with `Error::NotCompiledWithCudaSupport`.
+    pub(crate) fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<CudaStorage> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    /// Always fails with `Error::NotCompiledWithCudaSupport`.
+    pub(crate) fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<CudaStorage> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    /// Always fails with `Error::NotCompiledWithCudaSupport`.
+    pub(crate) fn cuda_from_cpu_storage(&self, _: &CpuStorage) -> Result<CudaStorage> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+}
+
+/// Placeholder for the CUDA storage type, used when the `cuda` feature is disabled.
+#[derive(Debug)]
+pub struct CudaStorage;
+
+/// Stub API of the CUDA storage. Every method returning a `Result` fails with
+/// `Error::NotCompiledWithCudaSupport`; `dtype` and `device`, which cannot
+/// report an error, panic through `fail!`.
+impl CudaStorage {
+    /// Always fails with `Error::NotCompiledWithCudaSupport`.
+    pub fn try_clone(&self) -> Result<Self> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    /// Panics: the placeholder storage has no dtype.
+    pub fn dtype(&self) -> DType {
+        fail!()
+    }
+
+    /// Panics: the placeholder storage has no device.
+    pub fn device(&self) -> &CudaDevice {
+        fail!()
+    }
+
+    // All remaining methods ignore their arguments and return the same error.
+    pub(crate) fn to_cpu_storage(&self) -> Result<CpuStorage> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub(crate) fn affine_impl(&self, _: &Shape, _: &[usize], _: f64, _: f64) -> Result<Self> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub(crate) fn sum(&self, _: &Shape, _: &[usize], _: &[usize]) -> Result<Self> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub(crate) fn to_dtype(&self, _: &Shape, _: &[usize], _: DType) -> Result<Self> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub(crate) fn unary_impl<B: crate::op::UnaryOp>(&self, _: &Shape, _: &[usize]) -> Result<Self> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub(crate) fn binary_impl<B: crate::op::BinaryOp>(
+        &self,
+        _: &Self,
+        _: &Shape,
+        _: &[usize],
+        _: &[usize],
+    ) -> Result<Self> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub(crate) fn where_cond(
+        &self,
+        _: &Shape,
+        _: &[usize],
+        _: &Self,
+        _: &[usize],
+        _: &Self,
+        _: &[usize],
+    ) -> Result<Self> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub(crate) fn embedding_impl(
+        &self,
+        _: &Shape,
+        _: &[usize],
+        _: &Self,
+        _: usize,
+        _: usize,
+    ) -> Result<Self> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub(crate) fn matmul_impl(
+        &self,
+        _: &Self,
+        _: (usize, usize, usize, usize),
+        _: &[usize],
+        _: &[usize],
+    ) -> Result<Self> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub(crate) fn copy_strided_src(
+        &self,
+        _: &mut Self,
+        _: usize,
+        _: &Shape,
+        _: &[usize],
+        _: usize,
+    ) -> Result<()> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+}
--- a/candle-core/src/error.rs
+++ b/candle-core/src/error.rs
@ -0,0 +1,102 @@
+use crate::{DType, DeviceLocation, Shape};
+
+/// Main library error type.
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    /// A storage or tensor held a different dtype than the one requested.
+    #[error("{msg}, expected: {expected:?}, got: {got:?}")]
+    UnexpectedDType {
+        msg: &'static str,
+        expected: DType,
+        got: DType,
+    },
+
+    /// The named operation was given a non-contiguous tensor.
+    #[error("{op} only supports contiguous tensors")]
+    RequiresContiguous { op: &'static str },
+
+    /// The named operation was given an empty list of tensors.
+    #[error("{op} expects at least one tensor")]
+    OpRequiresAtLeastOneTensor { op: &'static str },
+
+    /// Gradient propagation is not implemented for the named operation.
+    #[error("backward is not supported for {op}")]
+    BackwardNotSupported { op: &'static str },
+
+    /// An index was invalid for the reported vocabulary size.
+    #[error("{op} invalid index {index} with vocab {vocab_size}")]
+    InvalidIndex {
+        op: &'static str,
+        index: usize,
+        vocab_size: usize,
+    },
+
+    /// A CUDA code path was reached in a build without the `cuda` feature.
+    #[error("the candle crate has not been built with cuda support")]
+    NotCompiledWithCudaSupport,
+
+    /// The number of elements in a buffer does not fit the requested shape.
+    #[error(
+        "Shape mismatch, got buffer of size {buffer_size} which is incompatible with shape {shape:?}"
+    )]
+    ShapeMismatch { buffer_size: usize, shape: Shape },
+
+    /// The operands of a binary operation have incompatible shapes.
+    #[error("shape mismatch in {op}, lhs: {lhs:?}, rhs: {rhs:?}")]
+    ShapeMismatchBinaryOp {
+        lhs: Shape,
+        rhs: Shape,
+        op: &'static str,
+    },
+
+    /// The `n`-th argument of a concatenation disagrees with the first one.
+    #[error("shape mismatch in cat for dim {dim}, shape for arg 1: {first_shape:?} shape for arg {n}: {nth_shape:?}")]
+    ShapeMismatchCat {
+        dim: usize,
+        first_shape: Shape,
+        n: usize,
+        nth_shape: Shape,
+    },
+
+    /// The operands of a binary operation live on different devices.
+    #[error("device mismatch in {op}, lhs: {lhs:?}, rhs: {rhs:?}")]
+    DeviceMismatchBinaryOp {
+        lhs: DeviceLocation,
+        rhs: DeviceLocation,
+        op: &'static str,
+    },
+
+    /// The operands of a binary operation have different dtypes.
+    #[error("dtype mismatch in {op}, lhs: {lhs:?}, rhs: {rhs:?}")]
+    DTypeMismatchBinaryOp {
+        lhs: DType,
+        rhs: DType,
+        op: &'static str,
+    },
+
+    /// A shape did not have the expected number of dimensions.
+    #[error("unexpected rank, expected: {expected}, got: {got} ({shape:?})")]
+    UnexpectedNumberOfDims {
+        expected: usize,
+        got: usize,
+        shape: Shape,
+    },
+
+    // TODO: this is temporary, until arbitrary striding is supported in matmul.
+    #[error("temporary error where matmul doesn't support arbitrary striding")]
+    UnexpectedStriding,
+
+    /// Error coming from the CUDA backend.
+    #[error(transparent)]
+    Cuda(#[from] crate::CudaError),
+
+    /// Integer conversion error.
+    #[error(transparent)]
+    TryFromIntError(#[from] core::num::TryFromIntError),
+
+    /// Error while reading or writing the npy/npz formats.
+    #[error("npy/npz error {0}")]
+    Npy(String),
+
+    /// Zip file format error.
+    #[error(transparent)]
+    Zip(#[from] zip::result::ZipError),
+
+    /// Integer parse error.
+    #[error(transparent)]
+    ParseInt(#[from] std::num::ParseIntError),
+
+    /// I/O error.
+    #[error(transparent)]
+    Io(#[from] std::io::Error),
+
+    /// The source shape cannot be broadcast to the destination shape.
+    #[error("cannot broadcast {src_shape:?} to {dst_shape:?}")]
+    BroadcastIncompatibleShapes { src_shape: Shape, dst_shape: Shape },
+}
+
+/// Shorthand for a `std::result::Result` whose error type is the crate's [`Error`].
+pub type Result<T> = std::result::Result<T, Error>;
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@ -0,0 +1,29 @@
+mod backprop;
+mod cpu_backend;
+#[cfg(feature = "cuda")]
+mod cuda_backend;
+mod device;
+mod dtype;
+mod dummy_cuda_backend;
+mod error;
+mod npy;
+mod op;
+mod shape;
+mod storage;
+mod strided_index;
+mod tensor;
+
+pub use cpu_backend::CpuStorage;
+pub use device::{Device, DeviceLocation};
+pub use dtype::{DType, WithDType};
+pub use error::{Error, Result};
+pub use shape::Shape;
+pub use storage::Storage;
+use strided_index::StridedIndex;
+pub use tensor::{Tensor, TensorId};
+
+#[cfg(feature = "cuda")]
+pub use cuda_backend::{CudaDevice, CudaError, CudaStorage};
+
+#[cfg(not(feature = "cuda"))]
+pub use dummy_cuda_backend::{CudaDevice, CudaError, CudaStorage};
--- a/candle-core/src/npy.rs
+++ b/candle-core/src/npy.rs
@ -0,0 +1,401 @@
+//! Numpy support for literals.
+//!
+//! The spec for the npy format can be found in
+//! [npy-format](https://docs.scipy.org/doc/numpy-1.14.2/neps/npy-format.html).
+//! The functions from this module can be used to read literals from npy/npz files
+//! or write literals to these files. A npy file contains a single literal (unnamed)
+//! whereas a npz file can contain multiple named literals. npz files are zip archives and
+//! may also be compressed.
+//!
+//! These two formats are easy to use in Python using the numpy library.
+//!
+//! ```python
+//! import numpy as np
+//! x = np.arange(10)
+//!
+//! # Write a npy file.
+//! np.save("test.npy", x)
+//!
+//! # Read a value from the npy file.
+//! x = np.load("test.npy")
+//!
+//! # Write multiple values to a npz file.
+//! values = { "x": x, "x_plus_one": x + 1 }
+//! np.savez("test.npz", **values)
+//!
+//! # Load multiple values from a npz file.
+//! values = np.load("test.npz")
+//! ```
+use crate::{DType, Device, Error, Result, Shape, Tensor};
+use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
+use half::{bf16, f16, slice::HalfFloatSliceExt};
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{BufReader, Read, Write};
+use std::path::Path;
+
+const NPY_MAGIC_STRING: &[u8] = b"\x93NUMPY";
+const NPY_SUFFIX: &str = ".npy";
+
+/// Reads the npy preamble from `reader` and returns the header text.
+///
+/// The preamble is the magic string, two version bytes, and a little-endian
+/// header length stored on 2 bytes for major version 1 and on 4 bytes for
+/// major version 2. Any other major version is rejected. Invalid UTF-8 in
+/// the header is replaced lossily.
+fn read_header<R: Read>(reader: &mut R) -> Result<String> {
+    let mut magic = vec![0u8; NPY_MAGIC_STRING.len()];
+    reader.read_exact(&mut magic)?;
+    if magic.as_slice() != NPY_MAGIC_STRING {
+        return Err(Error::Npy("magic string mismatch".to_string()));
+    }
+
+    let mut version = [0u8; 2];
+    reader.read_exact(&mut version)?;
+    let [major, _minor] = version;
+    let header_len = match major {
+        1 => usize::from(reader.read_u16::<LittleEndian>()?),
+        2 => reader.read_u32::<LittleEndian>()? as usize,
+        otherwise => return Err(Error::Npy(format!("unsupported version {otherwise}"))),
+    };
+
+    let mut raw_header = vec![0u8; header_len];
+    reader.read_exact(&mut raw_header)?;
+    Ok(String::from_utf8_lossy(&raw_header).into_owned())
+}
+
+/// Parsed content of an npy header dictionary.
+#[derive(Debug, PartialEq)]
+struct Header {
+    /// Element type taken from the `descr` entry.
+    descr: DType,
+    /// Value of the `fortran_order` entry; the readers in this file reject `true`.
+    fortran_order: bool,
+    /// Dimensions taken from the `shape` entry.
+    shape: Vec<usize>,
+}
+
+impl Header {
+    /// Returns the header's dimensions as a [`Shape`].
+    fn shape(&self) -> Shape {
+        Shape::from(self.shape.as_slice())
+    }
+
+    /// Serializes the header as the Python dict literal stored in npy files.
+    ///
+    /// The descr is always written with the `<` (little-endian) prefix, and
+    /// every dimension is followed by a comma, so a rank-1 shape is a valid
+    /// Python tuple.
+    ///
+    /// # Errors
+    /// Returns `Error::Npy` for `DType::BF16`, which has no npy descr here.
+    fn to_string(&self) -> Result<String> {
+        let fortran_order = if self.fortran_order { "True" } else { "False" };
+        let mut shape = self
+            .shape
+            .iter()
+            .map(|x| x.to_string())
+            .collect::<Vec<_>>()
+            .join(",");
+        let descr = match self.descr {
+            DType::BF16 => Err(Error::Npy("bf16 is not supported".into()))?,
+            DType::F16 => "f2",
+            DType::F32 => "f4",
+            DType::F64 => "f8",
+            DType::U32 => "u4",
+        };
+        if !shape.is_empty() {
+            shape.push(',')
+        }
+        Ok(format!(
+            "{{'descr': '<{descr}', 'fortran_order': {fortran_order}, 'shape': ({shape}), }}"
+        ))
+    }
+
+    // Hacky parser for the npy header, a typical example would be:
+    // {'descr': '<f8', 'fortran_order': False, 'shape': (128,), }
+    /// Parses the textual npy header into a [`Header`].
+    ///
+    /// `fortran_order` defaults to `false` when absent; `descr` and `shape`
+    /// are mandatory.
+    ///
+    /// # Errors
+    /// Returns `Error::Npy` for malformed entries, big-endian or unsupported
+    /// descrs, and `Error::ParseInt` when a dimension is not a valid `usize`.
+    fn parse(header: &str) -> Result<Header> {
+        let header =
+            header.trim_matches(|c: char| c == '{' || c == '}' || c == ',' || c.is_whitespace());
+
+        // Split on the commas that are not nested inside parentheses, so the
+        // shape tuple stays in one piece.
+        let mut parts: Vec<String> = vec![];
+        let mut start_index = 0usize;
+        let mut cnt_parenthesis = 0i64;
+        // `char_indices` yields byte offsets, which is what string slicing needs;
+        // counting chars would mis-slice (or panic) on non-ASCII headers.
+        for (index, c) in header.char_indices() {
+            match c {
+                '(' => cnt_parenthesis += 1,
+                ')' => cnt_parenthesis -= 1,
+                ',' => {
+                    if cnt_parenthesis == 0 {
+                        parts.push(header[start_index..index].to_owned());
+                        // ',' is a single byte, so the next part starts right after it.
+                        start_index = index + 1;
+                    }
+                }
+                _ => {}
+            }
+        }
+        parts.push(header[start_index..].to_owned());
+        let mut part_map: HashMap<String, String> = HashMap::new();
+        for part in parts.iter() {
+            let part = part.trim();
+            if !part.is_empty() {
+                match part.split(':').collect::<Vec<_>>().as_slice() {
+                    [key, value] => {
+                        let key = key.trim_matches(|c: char| c == '\'' || c.is_whitespace());
+                        let value = value.trim_matches(|c: char| c == '\'' || c.is_whitespace());
+                        let _ = part_map.insert(key.to_owned(), value.to_owned());
+                    }
+                    _ => return Err(Error::Npy(format!("unable to parse header {header}"))),
+                }
+            }
+        }
+        let fortran_order = match part_map.get("fortran_order") {
+            None => false,
+            Some(fortran_order) => match fortran_order.as_ref() {
+                "False" => false,
+                "True" => true,
+                _ => return Err(Error::Npy(format!("unknown fortran_order {fortran_order}"))),
+            },
+        };
+        let descr = match part_map.get("descr") {
+            None => return Err(Error::Npy("no descr in header".to_string())),
+            Some(descr) => {
+                if descr.is_empty() {
+                    return Err(Error::Npy("empty descr".to_string()));
+                }
+                // '>' marks big-endian data, which the readers do not handle.
+                if descr.starts_with('>') {
+                    return Err(Error::Npy(format!("big-endian descr {descr}")));
+                }
+                // Only the descrs that map onto a `DType` variant are accepted;
+                // the commented-out entries are numpy types without a matching
+                // variant.
+                match descr.trim_matches(|c: char| c == '=' || c == '<' || c == '|') {
+                    "e" | "f2" => DType::F16,
+                    "f" | "f4" => DType::F32,
+                    "d" | "f8" => DType::F64,
+                    // "i" | "i4" => DType::S32,
+                    // "q" | "i8" => DType::S64,
+                    // "h" | "i2" => DType::S16,
+                    // "b" | "i1" => DType::S8,
+                    // "B" | "u1" => DType::U8,
+                    "I" | "u4" => DType::U32,
+                    // "?" | "b1" => DType::Pred,
+                    // "F" | "F4" => DType::C64,
+                    // "D" | "F8" => DType::C128,
+                    descr => return Err(Error::Npy(format!("unrecognized descr {descr}"))),
+                }
+            }
+        };
+        let shape = match part_map.get("shape") {
+            None => return Err(Error::Npy("no shape in header".to_string())),
+            Some(shape) => {
+                let shape = shape.trim_matches(|c: char| c == '(' || c == ')' || c == ',');
+                if shape.is_empty() {
+                    vec![]
+                } else {
+                    shape
+                        .split(',')
+                        .map(|v| v.trim().parse::<usize>())
+                        .collect::<std::result::Result<Vec<_>, _>>()?
+                }
+            }
+        };
+        Ok(Header {
+            descr,
+            fortran_order,
+            shape,
+        })
+    }
+}
+
+impl Tensor {
+    // TODO: Add the possibility to read directly to a device?
+    /// Reads `shape.elem_count()` little-endian values of type `dtype` from
+    /// `reader` and builds a CPU tensor of the given shape from them.
+    fn from_reader<R: std::io::Read>(shape: Shape, dtype: DType, reader: &mut R) -> Result<Self> {
+        let elem_count = shape.elem_count();
+        match dtype {
+            DType::BF16 => {
+                let mut data_t = vec![bf16::ZERO; elem_count];
+                reader.read_u16_into::<LittleEndian>(data_t.reinterpret_cast_mut())?;
+                Tensor::from_vec(data_t, shape, &Device::Cpu)
+            }
+            DType::F16 => {
+                let mut data_t = vec![f16::ZERO; elem_count];
+                reader.read_u16_into::<LittleEndian>(data_t.reinterpret_cast_mut())?;
+                Tensor::from_vec(data_t, shape, &Device::Cpu)
+            }
+            DType::F32 => {
+                let mut data_t = vec![0f32; elem_count];
+                reader.read_f32_into::<LittleEndian>(&mut data_t)?;
+                Tensor::from_vec(data_t, shape, &Device::Cpu)
+            }
+            DType::F64 => {
+                let mut data_t = vec![0f64; elem_count];
+                reader.read_f64_into::<LittleEndian>(&mut data_t)?;
+                Tensor::from_vec(data_t, shape, &Device::Cpu)
+            }
+            DType::U32 => {
+                let mut data_t = vec![0u32; elem_count];
+                reader.read_u32_into::<LittleEndian>(&mut data_t)?;
+                Tensor::from_vec(data_t, shape, &Device::Cpu)
+            }
+        }
+    }
+
+    /// Reads one complete npy stream (header followed by payload) from `reader`.
+    ///
+    /// # Errors
+    /// Fails on malformed headers, on Fortran-ordered arrays, and on I/O errors.
+    fn from_npy_reader<R: Read>(reader: &mut R) -> Result<Self> {
+        let header = read_header(reader)?;
+        let header = Header::parse(&header)?;
+        if header.fortran_order {
+            return Err(Error::Npy("fortran order not supported".to_string()));
+        }
+        Self::from_reader(header.shape(), header.descr, reader)
+    }
+
+    /// Reads a npy file and returns the stored multi-dimensional array as a tensor.
+    pub fn read_npy<T: AsRef<Path>>(path: T) -> Result<Self> {
+        // The payload is read straight from the reader, positioned right
+        // after the header; it must not be consumed beforehand.
+        let mut reader = BufReader::new(File::open(path.as_ref())?);
+        Self::from_npy_reader(&mut reader)
+    }
+
+    /// Reads a npz file and returns the stored multi-dimensional arrays together with their names.
+    ///
+    /// Names are the archive entry names with a trailing `.npy` removed when present.
+    pub fn read_npz<T: AsRef<Path>>(path: T) -> Result<Vec<(String, Self)>> {
+        let zip_reader = BufReader::new(File::open(path.as_ref())?);
+        let mut zip = zip::ZipArchive::new(zip_reader)?;
+        let mut result = Vec::with_capacity(zip.len());
+        for i in 0..zip.len() {
+            // A corrupted entry is reported as an error instead of panicking.
+            let mut reader = zip.by_index(i)?;
+            let name = {
+                let name = reader.name();
+                name.strip_suffix(NPY_SUFFIX).unwrap_or(name).to_owned()
+            };
+            let s = Self::from_npy_reader(&mut reader)?;
+            result.push((name, s))
+        }
+        Ok(result)
+    }
+
+    /// Reads a npz file and returns the stored multi-dimensional arrays for some specified names.
+    ///
+    /// The tensors are returned in the same order as `names`.
+    ///
+    /// # Errors
+    /// Returns `Error::Npy` when the archive has no readable `<name>.npy` entry.
+    pub fn read_npz_by_name<T: AsRef<Path>>(path: T, names: &[&str]) -> Result<Vec<Self>> {
+        let zip_reader = BufReader::new(File::open(path.as_ref())?);
+        let mut zip = zip::ZipArchive::new(zip_reader)?;
+        let mut result = Vec::with_capacity(names.len());
+        for name in names.iter() {
+            let mut reader = zip
+                .by_name(&format!("{name}{NPY_SUFFIX}"))
+                .map_err(|_| Error::Npy(format!("no array for {name} in {:?}", path.as_ref())))?;
+            let s = Self::from_npy_reader(&mut reader)?;
+            result.push(s)
+        }
+        Ok(result)
+    }
+
+    /// Serializes the tensor to `f` in the npy format, version 1.0.
+    ///
+    /// # Errors
+    /// Fails for `DType::BF16` (no npy descr), when the header does not fit
+    /// the 2-byte length field of version 1.0, and on I/O errors.
+    fn write<T: Write>(&self, f: &mut T) -> Result<()> {
+        // Build the header first, so an unsupported dtype is reported before
+        // anything has been written.
+        let header = Header {
+            descr: self.dtype(),
+            fortran_order: false,
+            shape: self.dims().to_vec(),
+        };
+        let mut header = header.to_string()?;
+        // Pad with spaces so that magic + version (2 bytes) + length (2 bytes)
+        // + header + trailing newline is a multiple of 16 bytes.
+        let pad = 16 - (NPY_MAGIC_STRING.len() + 5 + header.len()) % 16;
+        for _ in 0..pad % 16 {
+            header.push(' ')
+        }
+        header.push('\n');
+        // Version 1.0 stores the header length on two bytes; refuse to truncate.
+        let header_len = u16::try_from(header.len())?;
+
+        f.write_all(NPY_MAGIC_STRING)?;
+        f.write_all(&[1u8, 0u8])?;
+        f.write_u16::<LittleEndian>(header_len)?;
+        f.write_all(header.as_bytes())?;
+        let elem_count = self.elem_count();
+        match self.dtype() {
+            DType::BF16 => {
+                let vs = self.reshape(elem_count)?.to_vec1::<bf16>()?;
+                for &v in vs.reinterpret_cast() {
+                    f.write_u16::<LittleEndian>(v)?
+                }
+            }
+            DType::F16 => {
+                let vs = self.reshape(elem_count)?.to_vec1::<f16>()?;
+                for &v in vs.reinterpret_cast() {
+                    f.write_u16::<LittleEndian>(v)?
+                }
+            }
+            DType::F32 => {
+                // TODO: Avoid using a buffer when data is already on the CPU.
+                for v in self.reshape(elem_count)?.to_vec1::<f32>()? {
+                    f.write_f32::<LittleEndian>(v)?
+                }
+            }
+            DType::F64 => {
+                for v in self.reshape(elem_count)?.to_vec1::<f64>()? {
+                    f.write_f64::<LittleEndian>(v)?
+                }
+            }
+            DType::U32 => {
+                for v in self.reshape(elem_count)?.to_vec1::<u32>()? {
+                    f.write_u32::<LittleEndian>(v)?
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Writes a multi-dimensional array in the npy format.
+    pub fn write_npy<T: AsRef<Path>>(&self, path: T) -> Result<()> {
+        let mut f = File::create(path.as_ref())?;
+        self.write(&mut f)
+    }
+
+    /// Writes multiple multi-dimensional arrays using the npz format.
+    ///
+    /// Each tensor is stored uncompressed in an entry named `<name>.npy`.
+    pub fn write_npz<S: AsRef<str>, T: AsRef<Tensor>, P: AsRef<Path>>(
+        ts: &[(S, T)],
+        path: P,
+    ) -> Result<()> {
+        let mut zip = zip::ZipWriter::new(File::create(path.as_ref())?);
+        let options =
+            zip::write::FileOptions::default().compression_method(zip::CompressionMethod::Stored);
+
+        for (name, tensor) in ts.iter() {
+            zip.start_file(format!("{}{NPY_SUFFIX}", name.as_ref()), options)?;
+            tensor.as_ref().write(&mut zip)?
+        }
+        // Finish explicitly: relying on `Drop` would hide any error raised
+        // while the central directory is written.
+        zip.finish()?;
+        Ok(())
+    }
+}
+
+// Unit tests for the npy header parser and serializer.
+#[cfg(test)]
+mod tests {
+    use super::Header;
+
+    #[test]
+    fn parse() {
+        // Rank-1 header, C order.
+        let h = "{'descr': '<f8', 'fortran_order': False, 'shape': (128,), }";
+        assert_eq!(
+            Header::parse(h).unwrap(),
+            Header {
+                descr: crate::DType::F64,
+                fortran_order: false,
+                shape: vec![128]
+            }
+        );
+        // Rank-3 header, Fortran order; parsing accepts it even though the
+        // tensor readers reject it later.
+        let h = "{'descr': '<f4', 'fortran_order': True, 'shape': (256,1,128), }";
+        let h = Header::parse(h).unwrap();
+        assert_eq!(
+            h,
+            Header {
+                descr: crate::DType::F32,
+                fortran_order: true,
+                shape: vec![256, 1, 128]
+            }
+        );
+        // Serialization adds a trailing comma after the last dimension.
+        assert_eq!(
+            h.to_string().unwrap(),
+            "{'descr': '<f4', 'fortran_order': True, 'shape': (256,1,128,), }"
+        );
+
+        // A scalar (empty shape) serializes as an empty tuple.
+        let h = Header {
+            descr: crate::DType::U32,
+            fortran_order: false,
+            shape: vec![],
+        };
+        assert_eq!(
+            h.to_string().unwrap(),
+            "{'descr': '<u4', 'fortran_order': False, 'shape': (), }"
+        );
+    }
+}
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@ -0,0 +1,197 @@
+use crate::Tensor;
+use half::{bf16, f16};
+use num_traits::float::Float;
+
+/// Operation descriptor: each variant stores the tensor(s) an operation was
+/// applied to, plus any extra parameters of that operation.
+#[derive(Clone)]
+pub(crate) enum Op {
+    Add(Tensor, Tensor),
+    Mul(Tensor, Tensor),
+    Sub(Tensor, Tensor),
+    Div(Tensor, Tensor),
+    Matmul(Tensor, Tensor),
+    Embedding(Tensor, Tensor),
+    WhereCond(Tensor, Tensor, Tensor),
+
+    // Input tensors plus a `usize` parameter (presumably the concatenation
+    // dimension; confirm against the code that builds this variant).
+    Cat(Vec<Tensor>, usize),
+
+    #[allow(dead_code)] // add is currently unused.
+    Affine {
+        arg: Tensor,
+        mul: f64,
+        add: f64,
+    },
+    Sum(Tensor, Vec<usize>),
+    ToDType(Tensor),
+    Broadcast(Tensor),
+    Exp(Tensor),
+    Log(Tensor),
+    Sin(Tensor),
+    Cos(Tensor),
+    Abs(Tensor),
+    Narrow(Tensor, usize, usize, usize),
+    Neg(Tensor),
+    Reshape(Tensor),
+    Softmax(Tensor, usize),
+    Sqr(Tensor),
+    Sqrt(Tensor),
+    ToDevice(Tensor),
+    Transpose(Tensor, usize, usize),
+    Gelu(Tensor),
+    // TODO: Support for custom ops.
+}
+
+/// Element-wise operation on a single value, with one scalar implementation
+/// and one kernel-name constant per supported dtype.
+pub(crate) trait UnaryOp {
+    /// Short name of the operation, e.g. `"exp"`.
+    const NAME: &'static str;
+    // Kernel names, one per dtype.
+    const KERNEL_BF16: &'static str;
+    const KERNEL_F16: &'static str;
+    const KERNEL_F32: &'static str;
+    const KERNEL_F64: &'static str;
+    const KERNEL_U32: &'static str;
+    // Scalar implementations, one per dtype.
+    fn bf16(v1: bf16) -> bf16;
+    fn f16(v1: f16) -> f16;
+    fn f32(v1: f32) -> f32;
+    fn f64(v1: f64) -> f64;
+    fn u32(v1: u32) -> u32;
+}
+
+/// Element-wise operation on a pair of values, with one scalar implementation
+/// and one kernel-name constant per supported dtype.
+pub(crate) trait BinaryOp {
+    /// Short name of the operation, e.g. `"add"`.
+    const NAME: &'static str;
+    // Kernel names, one per dtype.
+    const KERNEL_BF16: &'static str;
+    const KERNEL_F16: &'static str;
+    const KERNEL_F32: &'static str;
+    const KERNEL_F64: &'static str;
+    const KERNEL_U32: &'static str;
+    // Scalar implementations, one per dtype.
+    fn bf16(v1: bf16, v2: bf16) -> bf16;
+    fn f16(v1: f16, v2: f16) -> f16;
+    fn f32(v1: f32, v2: f32) -> f32;
+    fn f64(v1: f64, v2: f64) -> f64;
+    fn u32(v1: u32, v2: u32) -> u32;
+}
+
+// Marker types that carry the `BinaryOp` implementations.
+pub(crate) struct Add;
+pub(crate) struct Div;
+pub(crate) struct Mul;
+pub(crate) struct Sub;
+// Marker types that carry the `UnaryOp` implementations.
+pub(crate) struct Exp;
+pub(crate) struct Log;
+pub(crate) struct Sin;
+pub(crate) struct Cos;
+pub(crate) struct Abs;
+pub(crate) struct Neg;
+pub(crate) struct Sqr;
+pub(crate) struct Sqrt;
+pub(crate) struct Gelu;
+
+// Implements `BinaryOp` for the marker type `$op`. `$name` is the operation
+// name; the kernel constants are built as "b" + name + "_" + dtype suffix
+// (e.g. "badd_f32"). `$e` is a two-argument closure applied for every dtype.
+macro_rules! bin_op {
+    ($op:ident, $name: literal, $e: expr) => {
+        impl BinaryOp for $op {
+            const NAME: &'static str = $name;
+            const KERNEL_BF16: &'static str = concat!("b", $name, "_bf16");
+            const KERNEL_F16: &'static str = concat!("b", $name, "_f16");
+            const KERNEL_F32: &'static str = concat!("b", $name, "_f32");
+            const KERNEL_F64: &'static str = concat!("b", $name, "_f64");
+            const KERNEL_U32: &'static str = concat!("b", $name, "_u32");
+            fn bf16(v1: bf16, v2: bf16) -> bf16 {
+                $e(v1, v2)
+            }
+            fn f16(v1: f16, v2: f16) -> f16 {
+                $e(v1, v2)
+            }
+            fn f32(v1: f32, v2: f32) -> f32 {
+                $e(v1, v2)
+            }
+            fn f64(v1: f64, v2: f64) -> f64 {
+                $e(v1, v2)
+            }
+            fn u32(v1: u32, v2: u32) -> u32 {
+                $e(v1, v2)
+            }
+        }
+    };
+}
+
+bin_op!(Add, "add", |v1, v2| v1 + v2);
+bin_op!(Sub, "sub", |v1, v2| v1 - v2);
+bin_op!(Mul, "mul", |v1, v2| v1 * v2);
+bin_op!(Div, "div", |v1, v2| v1 / v2);
+
+// Implements `UnaryOp` for the marker type `$op`. `$name` is the operation
+// name; the kernel constants are built as "u" + name + "_" + dtype suffix
+// (e.g. "uexp_f32"). `$a` names the argument and `$e` is the expression
+// evaluated on it for each float dtype. The `u32` variant is left as `todo!`
+// and panics when called.
+macro_rules! unary_op {
+    ($op: ident, $name: literal, $a: ident, $e: expr) => {
+        impl UnaryOp for $op {
+            const NAME: &'static str = $name;
+            const KERNEL_BF16: &'static str = concat!("u", $name, "_bf16");
+            const KERNEL_F16: &'static str = concat!("u", $name, "_f16");
+            const KERNEL_F32: &'static str = concat!("u", $name, "_f32");
+            const KERNEL_F64: &'static str = concat!("u", $name, "_f64");
+            const KERNEL_U32: &'static str = concat!("u", $name, "_u32");
+            fn bf16($a: bf16) -> bf16 {
+                $e
+            }
+            fn f16($a: f16) -> f16 {
+                $e
+            }
+            fn f32($a: f32) -> f32 {
+                $e
+            }
+            fn f64($a: f64) -> f64 {
+                $e
+            }
+            fn u32(_: u32) -> u32 {
+                todo!("no unary function for u32")
+            }
+        }
+    };
+}
+
+unary_op!(Exp, "exp", v, v.exp());
+unary_op!(Log, "log", v, v.ln());
+unary_op!(Sin, "sin", v, v.sin());
+unary_op!(Cos, "cos", v, v.cos());
+unary_op!(Abs, "abs", v, v.abs());
+unary_op!(Neg, "neg", v, -v);
+unary_op!(Sqr, "sqr", v, v * v);
+unary_op!(Sqrt, "sqrt", v, v.sqrt());
+
+/// `gelu` operation
+/// <https://en.wikipedia.org/wiki/Activation_function#Comparison_of_activation_functions>
+///
+/// Each float variant evaluates
+/// `0.5 * v * (1 + tanh(sqrt(2 / pi) * v * (1 + 0.044715 * v * v)))`
+/// in its own precision; the `u32` variant always returns 0.
+impl UnaryOp for Gelu {
+    const NAME: &'static str = "gelu";
+    const KERNEL_BF16: &'static str = "gelu_bf16";
+    const KERNEL_F16: &'static str = "gelu_f16";
+    const KERNEL_F32: &'static str = "gelu_f32";
+    const KERNEL_F64: &'static str = "gelu_f64";
+    const KERNEL_U32: &'static str = "gelu_u32";
+
+    fn bf16(v: bf16) -> bf16 {
+        let tanh_arg = (bf16::from_f32_const(2.0) / bf16::PI).sqrt()
+            * v
+            * (bf16::ONE + bf16::from_f32_const(0.044715) * v * v);
+        bf16::from_f32_const(0.5) * v * (bf16::ONE + bf16::tanh(tanh_arg))
+    }
+
+    fn f16(v: f16) -> f16 {
+        let tanh_arg = (f16::from_f32_const(2.0) / f16::PI).sqrt()
+            * v
+            * (f16::ONE + f16::from_f32_const(0.044715) * v * v);
+        f16::from_f32_const(0.5) * v * (f16::ONE + f16::tanh(tanh_arg))
+    }
+
+    fn f32(v: f32) -> f32 {
+        let tanh_arg = (2.0f32 / std::f32::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v);
+        0.5 * v * (1.0 + f32::tanh(tanh_arg))
+    }
+
+    fn f64(v: f64) -> f64 {
+        let tanh_arg = (2.0f64 / std::f64::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v);
+        0.5 * v * (1.0 + f64::tanh(tanh_arg))
+    }
+
+    fn u32(_: u32) -> u32 {
+        0
+    }
+}
--- a/candle-core/src/shape.rs
+++ b/candle-core/src/shape.rs
@ -0,0 +1,199 @@
+use crate::{Error, Result};
+
+/// An ordered list of dimension sizes.
+///
+/// The empty list is a valid shape: it has rank 0 and an element count of 1.
+#[derive(Clone, PartialEq, Eq)]
+pub struct Shape(Vec<usize>);
+
+impl std::fmt::Debug for Shape {
+    /// Prints the shape as its list of dimensions, e.g. `[2, 3]`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let dims: &[usize] = self.dims();
+        f.write_fmt(format_args!("{:?}", dims))
+    }
+}
+
+/// Builds a shape from a reference to a fixed-size array of dimensions.
+impl<const C: usize> From<&[usize; C]> for Shape {
+    fn from(dims: &[usize; C]) -> Self {
+        // The array reference coerces to a slice, which is then copied.
+        Self::from_dims(dims)
+    }
+}
+
+/// Builds a shape by copying a slice of dimensions.
+impl From<&[usize]> for Shape {
+    fn from(dims: &[usize]) -> Self {
+        Self(Vec::from(dims))
+    }
+}
+
+/// Builds a shape by copying the dimensions of a borrowed shape.
+impl From<&Shape> for Shape {
+    fn from(shape: &Shape) -> Self {
+        shape.clone()
+    }
+}
+
+/// The unit value converts to the shape without any dimension.
+impl From<()> for Shape {
+    fn from(_: ()) -> Self {
+        Self(Vec::new())
+    }
+}
+
+/// A single size converts to a one-dimensional shape.
+impl From<usize> for Shape {
+    fn from(d1: usize) -> Self {
+        Self::from_dims(&[d1])
+    }
+}
+
+/// A pair of sizes converts to a two-dimensional shape, in the same order.
+impl From<(usize, usize)> for Shape {
+    fn from((d1, d2): (usize, usize)) -> Self {
+        Self(vec![d1, d2])
+    }
+}
+
+/// A triple of sizes converts to a three-dimensional shape, in the same order.
+impl From<(usize, usize, usize)> for Shape {
+    fn from((d1, d2, d3): (usize, usize, usize)) -> Self {
+        Self(vec![d1, d2, d3])
+    }
+}
+
+/// Four sizes convert to a four-dimensional shape, in the same order.
+impl From<(usize, usize, usize, usize)> for Shape {
+    fn from((d1, d2, d3, d4): (usize, usize, usize, usize)) -> Self {
+        Self(vec![d1, d2, d3, d4])
+    }
+}
+
+/// Five sizes convert to a five-dimensional shape, in the same order.
+impl From<(usize, usize, usize, usize, usize)> for Shape {
+    fn from((d1, d2, d3, d4, d5): (usize, usize, usize, usize, usize)) -> Self {
+        Self(vec![d1, d2, d3, d4, d5])
+    }
+}
+
+/// Wraps an owned vector of dimensions without copying it.
+impl From<Vec<usize>> for Shape {
+    fn from(dims: Vec<usize>) -> Self {
+        Self(dims)
+    }
+}
+
+// Generates a method `$fn_name(&self) -> Result<$out_type>` that succeeds only
+// when the shape has exactly `$cnt` dimensions. On success the dimensions are
+// passed to `$dims`, which converts them to `$out_type`; otherwise an
+// `Error::UnexpectedNumberOfDims` carrying a copy of the shape is returned.
+macro_rules! extract_dims {
+    ($fn_name:ident, $cnt:tt, $dims:expr, $out_type:ty) => {
+        pub fn $fn_name(&self) -> Result<$out_type> {
+            let rank = self.0.len();
+            if rank == $cnt {
+                return Ok($dims(&self.0));
+            }
+            Err(Error::UnexpectedNumberOfDims {
+                expected: $cnt,
+                got: rank,
+                shape: self.clone(),
+            })
+        }
+    };
+}
+
+impl Shape {
+    /// Builds a shape by copying the given dimension sizes.
+    pub fn from_dims(dims: &[usize]) -> Self {
+        Self(Vec::from(dims))
+    }
+
+    /// The number of dimensions.
+    pub fn rank(&self) -> usize {
+        self.dims().len()
+    }
+
+    /// Consumes the shape and returns the underlying vector of dimensions.
+    pub fn into_dims(self) -> Vec<usize> {
+        let Self(dims) = self;
+        dims
+    }
+
+    /// The dimension sizes as a slice.
+    pub fn dims(&self) -> &[usize] {
+        self.0.as_slice()
+    }
+
+    /// The product of all the dimension sizes, this is 1 when there is no
+    /// dimension.
+    pub fn elem_count(&self) -> usize {
+        self.0.iter().copied().product()
+    }
+
+    // `rN` returns the dimensions as a tuple when the shape has exactly N
+    // dimensions and an error otherwise.
+    extract_dims!(r0, 0, |_: &[usize]| (), ());
+    extract_dims!(r1, 1, |d: &[usize]| d[0], usize);
+    extract_dims!(r2, 2, |d: &[usize]| (d[0], d[1]), (usize, usize));
+    extract_dims!(
+        r3,
+        3,
+        |d: &[usize]| (d[0], d[1], d[2]),
+        (usize, usize, usize)
+    );
+    extract_dims!(
+        r4,
+        4,
+        |d: &[usize]| (d[0], d[1], d[2], d[3]),
+        (usize, usize, usize, usize)
+    );
+    extract_dims!(
+        r5,
+        5,
+        |d: &[usize]| (d[0], d[1], d[2], d[3], d[4]),
+        (usize, usize, usize, usize, usize)
+    );
+
+    /// The strides, in number of elements, of a contiguous n-dimensional array
+    /// with this shape: the last dimension has stride 1 and each other
+    /// dimension has the stride of the next one multiplied by its size.
+    pub(crate) fn stride_contiguous(&self) -> Vec<usize> {
+        let mut stride = vec![0usize; self.0.len()];
+        let mut running_product = 1usize;
+        // Walk from the last dimension to the first one, storing the product
+        // of the sizes seen so far before including the current size.
+        for (slot, &dim) in stride.iter_mut().zip(self.0.iter()).rev() {
+            *slot = running_product;
+            running_product *= dim;
+        }
+        stride
+    }
+
+    /// Returns true if the strides are C contiguous (aka row major), i.e. if
+    /// they are exactly the ones that `stride_contiguous` produces.
+    pub fn is_contiguous(&self, stride: &[usize]) -> bool {
+        if stride.len() != self.0.len() {
+            return false;
+        }
+        let mut expected = 1;
+        stride
+            .iter()
+            .zip(self.0.iter())
+            .rev()
+            .all(|(&stride, &dim)| {
+                if stride != expected {
+                    false
+                } else {
+                    expected *= dim;
+                    true
+                }
+            })
+    }
+
+    /// Returns true if the strides are Fortran contiguous (aka column major):
+    /// same check as `is_contiguous` but starting from the first dimension.
+    pub fn is_fortran_contiguous(&self, stride: &[usize]) -> bool {
+        if stride.len() != self.0.len() {
+            return false;
+        }
+        let mut expected = 1;
+        stride.iter().zip(self.0.iter()).all(|(&stride, &dim)| {
+            if stride != expected {
+                false
+            } else {
+                expected *= dim;
+                true
+            }
+        })
+    }
+
+    /// Returns this shape with `additional_dims` appended after the existing
+    /// dimensions.
+    pub fn extend(mut self, additional_dims: &[usize]) -> Self {
+        self.0.extend_from_slice(additional_dims);
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Checks the contiguous strides for shapes of rank 0 to 3: the last
+    // dimension always has stride 1 and the rank 0 shape has no stride.
+    #[test]
+    fn stride() {
+        let shape = Shape::from(());
+        assert_eq!(shape.stride_contiguous(), Vec::<usize>::new());
+        let shape = Shape::from(42);
+        assert_eq!(shape.stride_contiguous(), [1]);
+        let shape = Shape::from((42, 1337));
+        assert_eq!(shape.stride_contiguous(), [1337, 1]);
+        let shape = Shape::from((299, 792, 458));
+        assert_eq!(shape.stride_contiguous(), [458 * 792, 458, 1]);
+    }
+}
--- a/candle-core/src/storage.rs
+++ b/candle-core/src/storage.rs
@ -0,0 +1,261 @@
+use crate::{op, CpuStorage, CudaStorage, DType, Device, Error, Result, Shape};
+
+/// The data backing a tensor, held by one of the supported backends.
+// We do not want to implement Clone on Storage as cloning may fail because of
+// out of memory. Instead try_clone should be used.
+#[derive(Debug)]
+pub enum Storage {
+    /// Data held by a `CpuStorage`.
+    Cpu(CpuStorage),
+    /// Data held by a `CudaStorage`.
+    Cuda(CudaStorage),
+}
+
+// Every operation below dispatches to the method with the same name on the
+// backend specific storage and wraps the result in the matching variant.
+impl Storage {
+    /// Returns a copy of this storage on the same backend.
+    ///
+    /// The cpu storage is cloned directly whereas the cuda storage goes through
+    /// its fallible `try_clone`, the error of which is propagated.
+    pub fn try_clone(&self) -> Result<Self> {
+        match self {
+            Self::Cpu(storage) => Ok(Self::Cpu(storage.clone())),
+            Self::Cuda(storage) => {
+                let storage = storage.try_clone()?;
+                Ok(Self::Cuda(storage))
+            }
+        }
+    }
+
+    /// The device on which this storage lives.
+    pub fn device(&self) -> Device {
+        match self {
+            Self::Cpu(_) => Device::Cpu,
+            Self::Cuda(storage) => Device::Cuda(storage.device().clone()),
+        }
+    }
+
+    /// The dtype reported by the underlying backend storage.
+    pub fn dtype(&self) -> DType {
+        match self {
+            Self::Cpu(storage) => storage.dtype(),
+            Self::Cuda(storage) => storage.dtype(),
+        }
+    }
+
+    /// Checks that `self` and `rhs` have the same device location.
+    ///
+    /// Returns `Error::DeviceMismatchBinaryOp` otherwise, `op` being the name
+    /// of the operation reported in the error.
+    pub(crate) fn same_device(&self, rhs: &Self, op: &'static str) -> Result<()> {
+        let lhs = self.device().location();
+        let rhs = rhs.device().location();
+        if lhs != rhs {
+            Err(Error::DeviceMismatchBinaryOp { lhs, rhs, op })
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Checks that `self` and `rhs` have the same dtype.
+    ///
+    /// Returns `Error::DTypeMismatchBinaryOp` otherwise, `op` being the name
+    /// of the operation reported in the error.
+    pub(crate) fn same_dtype(&self, rhs: &Self, op: &'static str) -> Result<()> {
+        let lhs = self.dtype();
+        let rhs = rhs.dtype();
+        if lhs != rhs {
+            Err(Error::DTypeMismatchBinaryOp { lhs, rhs, op })
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Dispatches `affine_impl` with the `mul` and `add` coefficients to the
+    /// backend storage.
+    pub(crate) fn affine_impl(
+        &self,
+        shape: &Shape,
+        stride: &[usize],
+        mul: f64,
+        add: f64,
+    ) -> Result<Self> {
+        match self {
+            Self::Cpu(storage) => {
+                let storage = storage.affine_impl(shape, stride, mul, add)?;
+                Ok(Self::Cpu(storage))
+            }
+            Self::Cuda(storage) => {
+                let storage = storage.affine_impl(shape, stride, mul, add)?;
+                Ok(Self::Cuda(storage))
+            }
+        }
+    }
+
+    /// Dispatches `sum` to the backend storage.
+    // NOTE(review): `s` presumably lists the dimensions to sum over - confirm
+    // against the backend implementations.
+    pub(crate) fn sum(&self, shape: &Shape, stride: &[usize], s: &[usize]) -> Result<Self> {
+        match self {
+            Self::Cpu(storage) => {
+                let storage = storage.sum(shape, stride, s)?;
+                Ok(Self::Cpu(storage))
+            }
+            Self::Cuda(storage) => {
+                let storage = storage.sum(shape, stride, s)?;
+                Ok(Self::Cuda(storage))
+            }
+        }
+    }
+
+    /// Dispatches `divide_by_sum_over_dim` to the backend storage, which is
+    /// modified in place.
+    pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) -> Result<()> {
+        match self {
+            Self::Cpu(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
+            Self::Cuda(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
+        }
+        Ok(())
+    }
+
+    /// Dispatches the conversion to `dtype` to the backend storage.
+    pub(crate) fn to_dtype(&self, shape: &Shape, stride: &[usize], dtype: DType) -> Result<Self> {
+        match self {
+            Self::Cpu(storage) => {
+                let storage = storage.to_dtype(shape, stride, dtype)?;
+                Ok(Self::Cpu(storage))
+            }
+            Self::Cuda(storage) => {
+                let storage = storage.to_dtype(shape, stride, dtype)?;
+                Ok(Self::Cuda(storage))
+            }
+        }
+    }
+
+    /// Dispatches the unary operation `B` to the backend storage.
+    pub(crate) fn unary_impl<B: op::UnaryOp>(
+        &self,
+        shape: &Shape,
+        stride: &[usize],
+    ) -> Result<Self> {
+        // TODO: Different code path for the contiguous case?
+        match self {
+            Self::Cpu(storage) => {
+                let storage = storage.unary_impl::<B>(shape, stride)?;
+                Ok(Self::Cpu(storage))
+            }
+            Self::Cuda(storage) => {
+                let storage = storage.unary_impl::<B>(shape, stride)?;
+                Ok(Self::Cuda(storage))
+            }
+        }
+    }
+
+    /// Dispatches the binary operation `B` to the backend storage.
+    ///
+    /// Both operands have to be on the same device and to share the same
+    /// dtype, otherwise an error is returned before any computation.
+    pub(crate) fn binary_impl<B: op::BinaryOp>(
+        &self,
+        rhs: &Self,
+        shape: &Shape,
+        lhs_stride: &[usize],
+        rhs_stride: &[usize],
+    ) -> Result<Self> {
+        self.same_device(rhs, B::NAME)?;
+        self.same_dtype(rhs, B::NAME)?;
+        match (self, rhs) {
+            (Self::Cpu(lhs), Self::Cpu(rhs)) => {
+                let storage = lhs.binary_impl::<B>(rhs, shape, lhs_stride, rhs_stride)?;
+                Ok(Self::Cpu(storage))
+            }
+            (Self::Cuda(lhs), Self::Cuda(rhs)) => {
+                let storage = lhs.binary_impl::<B>(rhs, shape, lhs_stride, rhs_stride)?;
+                Ok(Self::Cuda(storage))
+            }
+            (lhs, rhs) => {
+                // Should not happen because of the same device check above but we're defensive
+                // anyway.
+                Err(Error::DeviceMismatchBinaryOp {
+                    lhs: lhs.device().location(),
+                    rhs: rhs.device().location(),
+                    op: B::NAME,
+                })
+            }
+        }
+    }
+
+    /// Dispatches `where_cond` to the backend storage, `self` being the
+    /// condition and `t` and `f` the two value storages.
+    ///
+    /// The three storages have to be on the same device and `t` and `f` have
+    /// to share the same dtype. The dtype of the condition is not checked here.
+    pub(crate) fn where_cond(
+        &self,
+        shape: &Shape,
+        stride: &[usize],
+        t: &Self,
+        stride_t: &[usize],
+        f: &Self,
+        stride_f: &[usize],
+    ) -> Result<Self> {
+        self.same_device(t, "where")?;
+        self.same_device(f, "where")?;
+        t.same_dtype(f, "where")?;
+        match (self, t, f) {
+            (Self::Cpu(cond), Self::Cpu(t), Self::Cpu(f)) => {
+                let storage = cond.where_cond(shape, stride, t, stride_t, f, stride_f)?;
+                Ok(Self::Cpu(storage))
+            }
+            (Self::Cuda(cond), Self::Cuda(t), Self::Cuda(f)) => {
+                let storage = cond.where_cond(shape, stride, t, stride_t, f, stride_f)?;
+                Ok(Self::Cuda(storage))
+            }
+            // Defensive only, mixed devices are rejected by the checks above.
+            // The op is reported as "where", consistently with these checks.
+            (_, lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
+                lhs: lhs.device().location(),
+                rhs: rhs.device().location(),
+                op: "where",
+            }),
+        }
+    }
+
+    /// Dispatches `embedding_impl` to the backend storage.
+    ///
+    /// Both storages have to be on the same device, no dtype check is made
+    /// here.
+    // NOTE(review): `self` presumably holds the ids and `rhs` the embedding
+    // table of size `vocab_size` x `hidden_size` - confirm against the backends.
+    pub(crate) fn embedding_impl(
+        &self,
+        shape: &Shape,
+        stride: &[usize],
+        rhs: &Self,
+        hidden_size: usize,
+        vocab_size: usize,
+    ) -> Result<Self> {
+        self.same_device(rhs, "embedding")?;
+        match (self, rhs) {
+            (Self::Cpu(lhs), Self::Cpu(rhs)) => {
+                let storage = lhs.embedding_impl(shape, stride, rhs, hidden_size, vocab_size)?;
+                Ok(Self::Cpu(storage))
+            }
+            (Self::Cuda(lhs), Self::Cuda(rhs)) => {
+                let storage = lhs.embedding_impl(shape, stride, rhs, hidden_size, vocab_size)?;
+                Ok(Self::Cuda(storage))
+            }
+            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
+                lhs: lhs.device().location(),
+                rhs: rhs.device().location(),
+                op: "embedding",
+            }),
+        }
+    }
+
+    /// Dispatches the matrix multiplication to the backend storage.
+    ///
+    /// Both operands have to be on the same device and to share the same
+    /// dtype.
+    // NOTE(review): `bmnk` presumably packs the batch size and the m, n, k
+    // matrix sizes - confirm against the backends.
+    pub(crate) fn matmul_impl(
+        &self,
+        rhs: &Self,
+        bmnk: (usize, usize, usize, usize),
+        lhs_stride: &[usize],
+        rhs_stride: &[usize],
+    ) -> Result<Self> {
+        self.same_device(rhs, "matmul")?;
+        self.same_dtype(rhs, "matmul")?;
+        match (self, rhs) {
+            (Self::Cpu(lhs), Self::Cpu(rhs)) => {
+                let storage = lhs.matmul_impl(rhs, bmnk, lhs_stride, rhs_stride)?;
+                Ok(Self::Cpu(storage))
+            }
+            (Self::Cuda(lhs), Self::Cuda(rhs)) => {
+                let storage = lhs.matmul_impl(rhs, bmnk, lhs_stride, rhs_stride)?;
+                Ok(Self::Cuda(storage))
+            }
+            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
+                lhs: lhs.device().location(),
+                rhs: rhs.device().location(),
+                op: "matmul",
+            }),
+        }
+    }
+
+    /// Copies the data of `self` into `dst`, both storages having to use the
+    /// same backend.
+    // self, the source can be strided whereas dst is contiguous.
+    pub(crate) fn copy_strided_src(
+        &self,
+        dst: &mut Self,
+        dst_offset: usize,
+        src_shape: &Shape,
+        src_stride: &[usize],
+        src_offset: usize,
+    ) -> Result<()> {
+        match (self, dst) {
+            (Self::Cpu(src), Self::Cpu(dst)) => {
+                src.copy_strided_src(dst, dst_offset, src_shape, src_stride, src_offset)
+            }
+            (Self::Cuda(src), Self::Cuda(dst)) => {
+                Ok(src.copy_strided_src(dst, dst_offset, src_shape, src_stride, src_offset)?)
+            }
+            (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
+                lhs: lhs.device().location(),
+                rhs: rhs.device().location(),
+                op: "copy",
+            }),
+        }
+    }
+}
--- a/candle-core/src/strided_index.rs
+++ b/candle-core/src/strided_index.rs
@ -0,0 +1,61 @@
+/// An iterator over the offsets, in a flat buffer, of the items of an
+/// N-dimensional array described by its dimensions and its strides.
+///
+/// The offsets are produced with the last dimension varying the fastest.
+#[derive(Debug)]
+pub(crate) struct StridedIndex<'a> {
+    next_storage_index: Option<usize>,
+    multi_index: Vec<usize>,
+    dims: &'a [usize],
+    stride: &'a [usize],
+}
+
+impl<'a> StridedIndex<'a> {
+    /// Creates an iterator positioned on the first item, i.e. on offset 0.
+    pub(crate) fn new(dims: &'a [usize], stride: &'a [usize]) -> Self {
+        // Without any dimension the product is 1: the scalar case yields the
+        // single offset 0. A dimension of size 0 yields nothing at all.
+        let next_storage_index = match dims.iter().product::<usize>() {
+            0 => None,
+            _ => Some(0),
+        };
+        let multi_index = vec![0; dims.len()];
+        StridedIndex {
+            next_storage_index,
+            multi_index,
+            dims,
+            stride,
+        }
+    }
+}
+
+impl<'a> Iterator for StridedIndex<'a> {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let current = self.next_storage_index?;
+        // Increment the multi-index starting from the last dimension: a
+        // dimension that reached its size is reset to 0 and the carry moves to
+        // the previous one. `any` stops on the first successful increment.
+        let advanced = self
+            .multi_index
+            .iter_mut()
+            .zip(self.dims.iter())
+            .rev()
+            .any(|(index, &size)| {
+                let candidate = *index + 1;
+                if candidate < size {
+                    *index = candidate;
+                    true
+                } else {
+                    *index = 0;
+                    false
+                }
+            });
+        self.next_storage_index = if advanced {
+            // The offset is the dot product of the multi-index and the strides.
+            let offset = self
+                .multi_index
+                .iter()
+                .zip(self.stride.iter())
+                .map(|(&index, &stride)| index * stride)
+                .sum::<usize>();
+            Some(offset)
+        } else {
+            // Every dimension wrapped around: the iteration is over.
+            None
+        };
+        Some(current)
+    }
+}
--- a/candle-core/src/tensor.rs
+++ b/candle-core/src/tensor.rs
--- a/candle-core/tests/grad_tests.rs
+++ b/candle-core/tests/grad_tests.rs
@ -0,0 +1,40 @@
+use anyhow::{Context, Result};
+use candle::{Device, Shape, Tensor};
+
+// Backpropagates through y = x^2 + 5.x + 4 and checks both the values of y and
+// the gradient with respect to x.
+#[test]
+fn simple_grad() -> Result<()> {
+    let x = Tensor::var(&[3f32, 1., 4.], &Device::Cpu)?;
+    let y = (((&x * &x)? + &x * 5f64)? + 4f64)?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    // x itself must be left untouched by the computation.
+    assert_eq!(x.to_vec1::<f32>()?, [3., 1., 4.]);
+    // y = x^2 + 5.x + 4
+    assert_eq!(y.to_vec1::<f32>()?, [28., 10., 40.]);
+    // dy/dx = 2.x + 5
+    assert_eq!(grad_x.to_vec1::<f32>()?, [11., 7., 13.]);
+    Ok(())
+}
+
+// Backpropagates through a batched matmul of a (2, 2, 3) tensor with a
+// (2, 3, 2) tensor, both filled with 0..12, and checks the shapes and the
+// values of the two gradients.
+#[test]
+fn matmul_grad() -> Result<()> {
+    let data: Vec<_> = (0..12).map(|i| i as f32).collect();
+    let x = Tensor::var_from_slice(&data, (2, 2, 3), &Device::Cpu)?;
+    let data: Vec<_> = (0..12).map(|i| i as f32).collect();
+    let y = Tensor::var_from_slice(&data, (2, 3, 2), &Device::Cpu)?;
+
+    let c = x.matmul(&y)?;
+    let grads = c.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    let grad_y = grads.get(&y).context("no grad for y")?;
+    // Each gradient has the shape of the variable it relates to.
+    assert_eq!(grad_x.shape(), &Shape::from((2, 2, 3)));
+    assert_eq!(grad_y.shape(), &Shape::from((2, 3, 2)));
+    // The expected values are the sums over the rows of y for each batch, e.g.
+    // 0 + 1, 2 + 3, 4 + 5.
+    // NOTE(review): this matches a backward pass seeded with ones - confirm.
+    assert_eq!(
+        &*grad_x.storage_data::<f32>()?,
+        &[1., 5., 9., 1., 5., 9., 13., 17., 21., 13., 17., 21.]
+    );
+    // The expected values are the sums over the columns of x for each batch,
+    // e.g. 0 + 3, 1 + 4, 2 + 5.
+    assert_eq!(
+        &*grad_y.storage_data::<f32>()?,
+        &[3., 3., 5., 5., 7., 7., 15., 15., 17., 17., 19., 19.]
+    );
+    Ok(())
+}
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@ -0,0 +1,228 @@
+// TODO: Also test the cuda backend.
+use candle::{DType, Device, Result, Tensor};
+
+// A tensor created with `zeros` reports the shape it was created with.
+#[test]
+fn zeros() -> Result<()> {
+    let tensor = Tensor::zeros((5, 2), DType::F32, &Device::Cpu)?;
+    let (dim1, dim2) = tensor.shape().r2()?;
+    assert_eq!(dim1, 5);
+    assert_eq!(dim2, 2);
+    Ok(())
+}
+
+// Element-wise addition then multiplication of a 1D tensor with itself.
+#[test]
+fn add_mul() -> Result<()> {
+    let tensor = Tensor::new(&[3f32, 1., 4.], &Device::Cpu)?;
+    let dim1 = tensor.shape().r1()?;
+    assert_eq!(dim1, 3);
+    let content: Vec<f32> = tensor.to_vec1()?;
+    assert_eq!(content, [3., 1., 4.]);
+    // x + x
+    let tensor = Tensor::add(&tensor, &tensor)?;
+    let content: Vec<f32> = tensor.to_vec1()?;
+    assert_eq!(content, [6., 2., 8.]);
+    // (x + x) * (x + x)
+    let tensor = Tensor::mul(&tensor, &tensor)?;
+    let content: Vec<f32> = tensor.to_vec1()?;
+    assert_eq!(content, [36., 4., 64.]);
+    Ok(())
+}
+
+// A tensor built from a 2D array has the matching shape and gives back the
+// same rows through `to_vec2`.
+#[test]
+fn tensor_2d() -> Result<()> {
+    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
+    let tensor = Tensor::new(data, &Device::Cpu)?;
+    let dims = tensor.shape().r2()?;
+    assert_eq!(dims, (2, 5));
+    let content: Vec<Vec<f32>> = tensor.to_vec2()?;
+    assert_eq!(content, data);
+    Ok(())
+}
+
+// Combines the operator overloads for add, mul, div and sub on 2D tensors.
+#[test]
+fn binary_op() -> Result<()> {
+    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
+    let tensor = Tensor::new(data, &Device::Cpu)?;
+    let data2 = &[[5f32, 5., 5., 5., 5.], [2., 1., 7., 8., 2.]];
+    let tensor2 = Tensor::new(data2, &Device::Cpu)?;
+    // x + x * x / (x + x2)
+    let tensor = (&tensor + (&tensor * &tensor)? / (&tensor + &tensor2))?;
+    let dims = tensor.shape().r2()?;
+    assert_eq!(dims, (2, 5));
+    let content: Vec<Vec<f32>> = tensor.to_vec2()?;
+    assert_eq!(content[0], [4.125, 1.1666666, 5.7777777, 1.1666666, 7.5]);
+    // On the second row x2 is equal to x so the result is 1.5 * x.
+    assert_eq!(content[1], [3.0, 1.5, 10.5, 12.0, 3.0]);
+    // Subtracting a tensor from itself gives zeros.
+    let tensor = (&tensor - &tensor)?;
+    let content: Vec<Vec<f32>> = tensor.to_vec2()?;
+    assert_eq!(content[0], [0., 0., 0., 0., 0.]);
+    Ok(())
+}
+
+// Transposing a 2D tensor swaps its dimensions, transposing twice gives back
+// the original data, with or without going through `contiguous` or through
+// some arithmetic on the transposed tensor.
+#[test]
+fn tensor_2d_transpose() -> Result<()> {
+    let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
+    let tensor = Tensor::new(data, &Device::Cpu)?.t()?;
+    let dims = tensor.shape().r2()?;
+    assert_eq!(dims, (5, 2));
+    assert_eq!(
+        tensor.to_vec2::<f32>()?,
+        &[[3f32, 2.], [1., 1.], [4., 7.], [1., 8.], [5., 2.]]
+    );
+    assert_eq!(tensor.t()?.to_vec2::<f32>()?, data);
+    assert_eq!(tensor.contiguous()?.t()?.to_vec2::<f32>()?, data);
+    assert_eq!(((tensor + 1.)?.t()? - 1.)?.to_vec2::<f32>()?, data);
+    Ok(())
+}
+
+// The softmax is applied to the log of the data, so each expected value is the
+// original value divided by the sum of the values along the softmax dimension.
+#[test]
+fn softmax() -> Result<()> {
+    let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
+    let tensor = Tensor::new(data, &Device::Cpu)?;
+    let t0 = tensor.log()?.softmax(0)?;
+    let t1 = tensor.log()?.softmax(1)?;
+    let t2 = tensor.log()?.softmax(2)?;
+    assert_eq!(
+        t0.to_vec3::<f32>()?,
+        &[
+            // 3/5, 1/2, 4/11
+            [[0.6, 0.5, 0.36363637], [0.11111111, 0.71428573, 0.5294118]],
+            // 2/5, 1/2, 7/11
+            [[0.4, 0.5, 0.63636357], [0.8888889, 0.2857143, 0.47058824]]
+        ]
+    );
+    assert_eq!(
+        t1.to_vec3::<f32>()?,
+        &[
+            // 3/4, 1/6, 4/13
+            [[0.75, 0.16666667, 0.30769232], [0.25, 0.8333333, 0.6923077]],
+            // 2/10, 1/3, 7/15
+            [[0.2, 0.33333334, 0.46666664], [0.8, 0.6666667, 0.53333336]]
+        ]
+    );
+    assert_eq!(
+        t2.to_vec3::<f32>()?,
+        &[
+            // (3, 1, 4) / 8, (1, 5, 9) / 15
+            [[0.375, 0.125, 0.5], [0.06666667, 0.33333334, 0.6]],
+            // (2, 1, 7) / 10, (8, 2, 8) / 18
+            [[0.2, 0.1, 0.6999999], [0.44444445, 0.11111111, 0.44444445]]
+        ]
+    );
+    Ok(())
+}
+
+// Sums a (2, 2, 3) u32 tensor over various sets of dimensions. The results are
+// read back with `to_vec3`, the summed dimensions being kept with a size of 1.
+#[test]
+fn sum() -> Result<()> {
+    let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
+    let tensor = Tensor::new(data, &Device::Cpu)?;
+    assert_eq!(
+        tensor.sum(&[2])?.to_vec3::<u32>()?,
+        &[[[8], [15]], [[10], [18]]]
+    );
+    assert_eq!(
+        tensor.sum(&[0])?.to_vec3::<u32>()?,
+        &[[[5, 2, 11], [9, 7, 17]]],
+    );
+    // Summing over all the dimensions, the order in which they are given does
+    // not matter.
+    assert_eq!(tensor.sum(&[0, 2, 1])?.to_vec3::<u32>()?, &[[[51]]],);
+    // Same as summing over the last dimension but on a transposed, hence
+    // strided, tensor.
+    assert_eq!(
+        tensor.t()?.sum(&[1])?.t()?.to_vec3::<u32>()?,
+        &[[[8], [15]], [[10], [18]]]
+    );
+    assert_eq!(
+        tensor.sum(&[2, 1])?.to_vec3::<u32>()?,
+        &[[[8 + 15]], [[10 + 18]]]
+    );
+    Ok(())
+}
+
+// `narrow(dim, start, len)` keeps `len` indexes starting at `start` along
+// `dim`, as can be read from the expected values below.
+#[test]
+fn narrow() -> Result<()> {
+    let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
+    let tensor = Tensor::new(data, &Device::Cpu)?;
+    assert_eq!(
+        tensor.narrow(2, 1, 2)?.to_vec3::<f32>()?,
+        &[[[1.0, 4.0], [5.0, 9.0]], [[1.0, 7.0], [2.0, 8.0]]],
+    );
+    assert_eq!(
+        tensor.narrow(1, 1, 1)?.to_vec3::<f32>()?,
+        &[[[1.0, 5.0, 9.0]], [[8.0, 2.0, 8.0]]],
+    );
+    assert_eq!(
+        tensor.narrow(0, 0, 1)?.to_vec3::<f32>()?,
+        &[[[3.0, 1.0, 4.0], [1.0, 5.0, 9.0]]],
+    );
+    assert_eq!(
+        tensor.narrow(0, 1, 1)?.to_vec3::<f32>()?,
+        &[[[2.0, 1.0, 7.0], [8.0, 2.0, 8.0]]],
+    );
+    // The following has been checked against PyTorch via:
+    //   import torch
+    //   t = torch.tensor([[[3., 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]])
+    //   t.transpose(-1, -2).narrow(1, 1, 2)
+    assert_eq!(
+        tensor.t()?.narrow(1, 1, 2)?.to_vec3::<f32>()?,
+        &[[[1.0, 5.0], [4.0, 9.0]], [[1.0, 2.0], [7.0, 8.0]]],
+    );
+    Ok(())
+}
+
+// `broadcast_left((3, 1))` on a tensor with 3 elements results in a (3, 1, 3)
+// tensor in which the original data is repeated 3 times.
+#[test]
+fn broadcast() -> Result<()> {
+    let data = &[3f32, 1., 4.];
+    let tensor = Tensor::new(data, &Device::Cpu)?;
+    assert_eq!(
+        tensor.broadcast_left((3, 1))?.to_vec3::<f32>()?,
+        &[[[3.0, 1.0, 4.0]], [[3.0, 1.0, 4.0]], [[3.0, 1.0, 4.0]]]
+    );
+    Ok(())
+}
+
+// Concatenates 1D and 2D tensors along the first and the second dimensions,
+// including a case where the inputs are transposed, hence strided.
+#[test]
+fn cat() -> Result<()> {
+    // 1D
+    let t1 = Tensor::new(&[3f32, 1., 4.], &Device::Cpu)?;
+    let t2 = Tensor::new(&[1f32, 5., 9., 2.], &Device::Cpu)?;
+    let t3 = Tensor::new(&[6f32, 5., 3., 5., 8., 9.], &Device::Cpu)?;
+    assert_eq!(Tensor::cat(&[&t1], 0)?.to_vec1::<f32>()?, [3f32, 1., 4.],);
+    assert_eq!(
+        Tensor::cat(&[&t1, &t2], 0)?.to_vec1::<f32>()?,
+        [3f32, 1., 4., 1., 5., 9., 2.],
+    );
+    assert_eq!(
+        Tensor::cat(&[&t1, &t2, &t3], 0)?.to_vec1::<f32>()?,
+        [3f32, 1., 4., 1., 5., 9., 2., 6., 5., 3., 5., 8., 9.],
+    );
+
+    // 2D
+    let data = &[[3f32, 1., 4., 1., 5.], [2., 7., 1., 8., 2.]];
+    let t1 = Tensor::new(data, &Device::Cpu)?;
+    let data2 = &[[5f32, 5., 5., 5., 5.], [2., 7., 1., 8., 2.]];
+    let t2 = Tensor::new(data2, &Device::Cpu)?;
+    assert_eq!(
+        Tensor::cat(&[&t1, &t2], 0)?.to_vec2::<f32>()?,
+        [
+            [3.0, 1.0, 4.0, 1.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0],
+            [5.0, 5.0, 5.0, 5.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0]
+        ]
+    );
+    // TODO: This is not the expected answer, to be fixed!
+    // NOTE(review): the rows asserted below are the rows of t1 followed by the
+    // rows of t2, which is what concatenating the transposes along dim 1 and
+    // transposing back should give - the TODO above looks stale, confirm.
+    assert_eq!(
+        Tensor::cat(&[&t1.t()?, &t2.t()?], 1)?
+            .t()?
+            .to_vec2::<f32>()?,
+        [
+            [3.0, 1.0, 4.0, 1.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0],
+            [5.0, 5.0, 5.0, 5.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0]
+        ]
+    );
+    // TODO: This is not the expected answer, to be fixed!
+    // NOTE(review): each row asserted below is a row of t1 followed by the
+    // matching row of t2, which is what a concatenation along dim 1 should
+    // give - the TODO above looks stale, confirm.
+    assert_eq!(
+        Tensor::cat(&[&t1, &t2], 1)?.to_vec2::<f32>()?,
+        [
+            [3.0, 1.0, 4.0, 1.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0],
+            [2.0, 7.0, 1.0, 8.0, 2.0, 2.0, 7.0, 1.0, 8.0, 2.0]
+        ]
+    );
+    Ok(())
+}