Compare commits


108 Commits

Author SHA1 Message Date
e2bf0adc2a [WIP] Bf16 support. 2024-02-13 22:44:11 +01:00
14010a8498 Update our cuda runner. (#1705)
* Update our cuda runner.

* Fix install rust.

* Simplify.

* Docker in docker.

* Install curl

* Install curl

* No sudo.

* devel

* Put curl again.

* Add missing deps.

* pkg-config.

* Cleanup.
2024-02-13 19:06:15 +01:00
0de0795220 Qmetal tweaks (#1704)
* Add the dummy qmetal backend.

* Fix the metal compilation.
2024-02-13 18:11:17 +01:00
c1b418586c Fixing quantized llama demo on metal. (#1703) 2024-02-13 16:28:56 +01:00
ad73e93da2 Detach the tensors on batch-norm eval. (#1702)
* Detach the tensors on batch-norm eval.

* Fix pyo3 bindings.

* Black tweak.

* Formatting.

* Also update the pyo3-onnx formatting.

* Apply black.
2024-02-13 14:26:32 +01:00
13c67226e6 feat: support microphone whisper streaming (#1678)
* feat: support microphone whisper streaming

* fix: cleanup print stmts and adjust how input is read

* fix: remove incorrect comment

* feat: split into new example and simplify

* fix: feature flag example file

* fix: fmt fixes

* feat: simplify and remove redundant files
2024-02-12 18:01:21 +01:00
d0aa197b07 ConvTranspose1d cuda support. (#1697)
* ConvTranspose1d cuda support.

* Add the conv-transpose1d kernel.

* Remove some unused variables.
2024-02-12 15:03:18 +01:00
274bf11633 Support defaultdict in PyTorch checkpoints. (#1696)
* Support defaultdict in PyTorch checkpoints.

* Fix clippy lint.
2024-02-12 10:26:56 +01:00
1e26d539d9 Improved mamba model optimized for inference (#1694)
* Sketch the mamba model for inference.

* Complete the forward pass.

* Add the mamba example.

* Optimize the selective-scan part.

* Fix a couple shape mismatches and get inference to work.

* Tweak the readmes.

* More readme tweaks.
2024-02-11 17:04:57 +01:00
74497e6bf7 Fixing the qwen tokenizer location. (#1693)
Using the chatglm one causes a bug where the "<|endoftext|>" token is not found.
2024-02-11 08:52:36 +01:00
8ab384e63d docs: add trocr examples (#1692) 2024-02-10 16:14:50 +01:00
27ffd644a9 Mention TrOCR in the readmes. (#1691) 2024-02-10 15:49:38 +01:00
bf20cc854c Support sinusoidal embeddings in trocr. (#1690)
* Support sinusoidal embeddings in trocr.

* Support tie-word-embeddings.
2024-02-10 15:17:51 +01:00
42ce593ec6 Use the repo config for trocr rather than hardcoding it + small tweaks. (#1689)
* Use the repo config for trocr rather than hardcoding it + small tweaks.

* Add support for the printed models.

* Fail with an appropriate error message on missing position embeddings.
2024-02-10 13:15:03 +01:00
67589791d2 Remove the unused pragma in vit + handle the final layernorm. (#1688) 2024-02-10 11:08:50 +01:00
1c8d61f051 ChatGLM custom tokenizer. (#1687) 2024-02-10 10:47:04 +01:00
90447bc993 Add the custom tokenizer. (#1686) 2024-02-09 17:36:50 +01:00
40ce16001b Use the proper endoftext token for qwen. (#1685) 2024-02-09 17:02:03 +01:00
5657e596cd Add the Qwen2 model (#1684)
* Initial check-in for the qwen2 model.

* More qwen2 inference.

* Polish the qwen example.

* Fix the rope basis.

* Get the inference to work.

* Support different model sizes.
2024-02-09 15:02:49 +01:00
0dee8ea19b Add the ChatGLM model. (#1237)
* Add the ChatGLM model.

* Rotary embeddings.

* Add to the forward pass.

* Add to the forward pass.

* Add the rotary embeddings.

* Add the KV cache.

* Add the chatglm example.

* Bugfix.

* More glm fixes.

* Fix some shape issues.

* Get the inference to work.
2024-02-09 11:51:38 +01:00
9cadd4e644 feat: support multithread spectrogram and small perf tweaks (#1674)
* feat: support multithread spectrogram and small perf tweaks

* feat: clippy improvement for loop variable

* fix: add back speed up scale down logic

* fix: readd mirroring logic

* feat: prefer scoped thread and simplify/improve logic/traits
2024-02-08 21:54:12 +01:00
020a979de2 Fix clippy lints for 1.76. (#1682) 2024-02-08 16:48:47 +01:00
cdc3823d8f Pickle support: dig within the _rebuild_parameter calls. (#1681) 2024-02-08 13:09:49 +01:00
e5eb9602d0 Add support for loading Fortran contiguous tensors (#1672)
* Add support for loading Fortran contiguous tensors

This commit introduces the ability to handle Fortran contiguous tensors in the tensor loading process. Previously, the code only supported loading tensors that were contiguous in memory, failing with an error for non-contiguous tensors. With this update, tensors identified as Fortran contiguous (column-major order) are now correctly handled by reversing their dimensions after loading. This enhancement ensures broader compatibility with different tensor layouts, improving the robustness of tensor loading operations.

- Check if a tensor is Fortran contiguous using the `is_fortran_contiguous` flag.
- For Fortran contiguous tensors, reverse the dimensions after loading to correctly represent their layout in memory.
- Continue to bail out with an error for tensors that are neither C contiguous nor Fortran contiguous, maintaining the previous behavior for non-contiguous tensors without explicit support.

This change addresses the issue of loading Fortran contiguous tensors, which was previously unsupported, thereby extending the functionality of the tensor loading mechanism to accommodate a wider variety of tensor layouts.

* Add reshape step to handle fortran contiguous case

* Skip fortran contiguous fix if rank is < 2

* Fail on rank 0, 1 if contiguous
2024-02-07 21:49:59 +01:00
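As an illustration of the idea described in this commit message (a minimal sketch rather than the commit's actual loader code, assuming only the public candle_core tensor API): for a rank-2 Fortran contiguous buffer, building the tensor with reversed dimensions and then transposing recovers the logical row-major view.

use candle_core::{Device, Result, Tensor};

// Column-major (Fortran) data for the logical 2x3 matrix
// [[1, 2, 3],
//  [4, 5, 6]]
// stored column by column: 1, 4, 2, 5, 3, 6.
fn load_fortran_contiguous(data: &[f32], rows: usize, cols: usize) -> Result<Tensor> {
    // Interpret the flat buffer with the dimensions reversed...
    let t = Tensor::from_slice(data, (cols, rows), &Device::Cpu)?;
    // ...then transpose to obtain the logical (rows, cols) layout.
    t.t()
}

fn main() -> Result<()> {
    let data = [1f32, 4., 2., 5., 3., 6.];
    println!("{}", load_fortran_contiguous(&data, 2, 3)?);
    Ok(())
}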
b75e8945bc Enhance pickle to retrieve state_dict with a given key (#1671) 2024-02-06 21:17:33 +01:00
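Going by the updated signature shown in the pickle diff further down (read_pth_tensor_info now takes an optional key for selecting the state_dict), usage would look roughly like the following hypothetical snippet; the file name is illustrative.

let infos = candle_core::pickle::read_pth_tensor_info("model.pth", /* verbose */ false, Some("state_dict"))?;
for info in infos.iter() {
    println!("{}: {:?} {:?}", info.name, info.dtype, info.layout);
}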
a90fc5ca5a Add VarBuilder::from_backend (#1670)
`candle-nn` already exposes a trait to define custom backends. However,
it's not possible to actually construct a `VarBuilder` with a custom
backend because the constructor is not exposed.

This change makes the constructor public and renames it from `new` to
`from_backend` so that it is not mistaken for the primary constructor
(which could be confusing to users).
2024-02-06 15:26:11 +01:00
adfae2460a Fix rustfmt. (#1669) 2024-02-06 12:06:06 +01:00
678f64dd27 Fix token generation in bilingual models (non-English outputs) (#1668)
Co-authored-by: Guoqing Bao <guoqing.bao@enflame-tech.com>
2024-02-06 12:03:53 +01:00
b545f54a19 Fix clippy lints. (#1667) 2024-02-06 09:03:36 +01:00
1ba11f22d6 Fix: pth files don't load on Windows (#1661)
* Don't treat zip path as OS path

* Add a test case

* Add code to generate test pth data
2024-02-06 08:50:55 +01:00
982722019b add roll function to tensor (#1666) 2024-02-06 08:49:45 +01:00
a83ca2ece0 Bump the crate version to 0.4.0. (#1658) 2024-02-04 19:08:01 +01:00
153c940a9c Update docs to reflect current usage of example (#1610)
modified:   candle-examples/examples/onnx/README.md
2024-02-04 11:59:47 +01:00
50be8a98ba Quantized support for stable-lm2. (#1654)
* Quantized support for stable-lm2.

* Quantized support for v2-zephyr.
2024-02-04 11:57:05 +01:00
58cc896e69 make llama derive clone (#1648)
Co-authored-by: danielclough <danielclough@users.noreply.github.com>
2024-02-04 11:56:03 +01:00
5cdd84e0f6 onnx: add the Flatten operator. (#1638)
* onnx: add the Flatten operator.

* onnx flatten: merge axis condition

---------

Co-authored-by: 王泽龙 <wangzelong@shenqishen.com>
2024-02-03 16:28:47 +01:00
a510ddec4e Mention the new models in the readme. (#1651) 2024-02-03 15:19:57 +01:00
d32abbce53 Add StableLM-2, StableLM Code and Zephyr variants (#1650)
* Add StableLM Code and Zephyr variants

* Add V2 models

* Update README
2024-02-03 14:58:41 +01:00
dfab45e1c8 Supports more audio formats (#1628)
* Supports more audio formats

* Simplify the handling of the different buffer types.

* Check the sample rate.

---------

Co-authored-by: laurent <laurent.mazare@gmail.com>
2024-02-03 14:26:04 +01:00
96bc704d17 Update mixformer.rs (#1601)
Update the source reference for configuration_mixformer_sequential.py: the file has been removed upstream, but it is still available at revision d38e6f954ec29b96fe2cf033937dad64e279b5d9.
2024-02-03 13:42:16 +01:00
a52d407ae6 Add ConvNeXt model. (#1604) 2024-02-03 13:34:28 +01:00
9e824ec810 Explicit version for packages that are not in the workspace. (#1642) 2024-01-31 18:57:38 +01:00
beadb1b434 Explicit candle version so that cargo publish can be used easily. (#1641) 2024-01-31 18:42:22 +01:00
6d83d42efb Merge pull request #1606 from FL33TW00D/feature/larger-batches
fix: larger batches
2024-01-29 15:31:10 +00:00
b6afb46601 chore: final 2024-01-22 15:15:19 +00:00
fd7c856564 Merge pull request #1533 from huggingface/ivarflakstad/metal-prng 2024-01-22 07:30:20 +01:00
73d79e6092 chore: actual fix 2024-01-19 09:35:42 +00:00
b1879f17f6 chore: switch to buffer 2024-01-19 08:57:49 +00:00
4f79f5df8a fix: larger batches 2024-01-18 14:30:14 +00:00
1cf34368b7 Merge pull request #1602 from mimiquate/fix-metal-kernel-type
Metal: Use uint8_t as output type in int64_t binary op kernel
2024-01-18 08:40:34 +01:00
17e6e2d7ee Fixes metal kernel u8 type 2024-01-17 15:47:08 -03:00
80b1c689f9 Revert public EncoderParam 2024-01-17 18:09:28 +01:00
db923517b3 Merge branch 'main' into ivarflakstad/metal-prng 2024-01-17 18:03:57 +01:00
403680f17d Quantized GGUF style (#1523)
* Metal quantized modifications proposal.

- Add a device param, wherever needed.
- Create new QMetal storage thing that implements QuantizedType.
- Update everywhere needed.

Fix Python.

Fixing examples.

Fix: fmt + clippy + stub.

Moving everything around.

Only missing the actual implems.

Fixing everything + adding dequantized kernels.

More work.

Fixing matmul.

Fmt + Clippy

Some clippy fixes.

Working state.

Q2K Metal -> Bugged (also present in GGML).
Q4K CPU -> Bugged (present previously, new test catch it).
Q5K CPU -> Bugged (present previously).
Q8_1 Both -> Never really implemented it seems
Q8K metal -> Never implemented in metal

Fixing Q2K bug (present in ggml).

* Cleanup.

* Fix the rebase.

* Removing the fences speeds everything up and *is* correct this time...

* Cleanup the fence.

* After rebase.

* Bad code removal.

* Rebase after phi2 merge + fix replit default to CPU.

* Making the CI happy.

* More happy tests.

---------

Co-authored-by: Nicolas Patry <nicolas@Nicolass-MacBook-Pro.local>
2024-01-17 10:27:58 +01:00
86a8e58897 Update metal random kernel and set_seed method
* set_seed via buffer content pointer copy + did_modify_range

* ensure random.metal kernel does not write outside of buffer range when tid==0
2024-01-17 09:12:44 +01:00
5270224f40 Add MobileOne model. (#1595)
* Add MobileOne model.

* Clippy fixes

* Remove a comment.

---------

Co-authored-by: laurent <laurent.mazare@gmail.com>
2024-01-16 06:34:16 +01:00
7e3349d7c3 Update parquet requirement from 45.0.0 to 50.0.0 (#1592)
Updates the requirements on [parquet](https://github.com/apache/arrow-rs) to permit the latest version.
- [Changelog](https://github.com/apache/arrow-rs/blob/master/CHANGELOG-old.md)
- [Commits](https://github.com/apache/arrow-rs/compare/45.0.0...45.0.0)

---
updated-dependencies:
- dependency-name: parquet
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-01-15 22:35:01 +01:00
1257fc6719 Update safetensors requirement from 0.3.1 to 0.4.1 (#1591)
Updates the requirements on [safetensors](https://github.com/huggingface/safetensors) to permit the latest version.
- [Release notes](https://github.com/huggingface/safetensors/releases)
- [Changelog](https://github.com/huggingface/safetensors/blob/main/RELEASE.md)
- [Commits](https://github.com/huggingface/safetensors/compare/v0.3.1...v0.3.3)

---
updated-dependencies:
- dependency-name: safetensors
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-01-15 22:34:40 +01:00
ea36f3b11f Use the new phi model by default. (#1589) 2024-01-15 12:30:27 +01:00
79478ff5a1 Seed should be updated by random kernel result. 2024-01-15 11:58:25 +01:00
86b7c01b30 Update gemm to the latest version. (#1587) 2024-01-15 09:44:51 +01:00
bdd8107fda Expose the ndarray trait. (#1586) 2024-01-14 20:09:49 +01:00
ecf88a6d38 Merge branch 'main' into ivarflakstad/metal-prng 2024-01-14 17:10:54 +01:00
e6d86b0819 Add the pow operator. (#1583)
* Add the pow operator.

* Support the pow operation in onnx.
2024-01-13 20:24:06 +01:00
88618255cb Fix the rotary embeddings for the new phi implementation. (#1582)
* Fix the rotary embeddings for the new phi implementation.

* Match the activation.

* KV cache fix.

* Use the config activation function.
2024-01-13 19:44:41 +01:00
539ead927a Update the Phi model to use the updated architecture. (#1580)
* Update the Phi model to use the updated architecture.

* Add more of the phi model.

* Repeat KV + caching.

* Apply the rotary embeddings.

* Add support for the new phi model in the phi example.

* Fix a couple glitches.

* Fix a couple more glitches.
2024-01-13 17:38:27 +01:00
a46864bd56 Fix "Minimal Mamba" link in README. (#1577) 2024-01-12 17:47:07 +01:00
bafe95b660 Fix format. (#1576) 2024-01-12 14:23:17 +01:00
a3d92ab226 Metal: Activate bfloat affine and add benchmark (#1543)
* Use cfg to separate benchmark results based on features

* Add bfloat affine and benchmarks

* Fix flops calculation

* Remove allow pragma

* Avoid some unnecessary returns.

* Improve benchmarks layout

---------

Co-authored-by: Laurent <laurent.mazare@gmail.com>
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
2024-01-12 11:19:49 +01:00
e90bcdcc7c Metal: f16 and bf16 where_cond + benchmark (#1545)
* Use cfg to separate benchmark results based on features

* Add metal where_cond for f16 and bf16. Add benchmark

* Remove allow pragma

* Avoid some unnecessary returns.

* Improve benchmarks layout

* Updated feature separated benchmarks

---------

Co-authored-by: Laurent <laurent.mazare@gmail.com>
2024-01-12 11:18:11 +01:00
8e06bfb4fd Mention VGG in the readme. (#1573) 2024-01-12 09:59:29 +01:00
6242276c09 Pin the revision used for phi-v2 + make it the default. (#1572)
* Pin the revision used for phi-v2 + make it the default.

* Tweak the custom-ops build.
2024-01-12 09:19:30 +01:00
e06e8d0dbe fmt 2024-01-12 07:26:42 +01:00
e63bb8661b Merge branch 'main' into ivarflakstad/metal-prng 2024-01-12 07:19:58 +01:00
41915184bb Bugfix for dequantizing q5k layers. (#1569) 2024-01-11 23:15:11 +01:00
c1876b8041 Merge pull request #1567 from bayedieng/close-ifdef 2024-01-11 22:14:38 +01:00
85e5680277 remove metal version check 2024-01-11 21:02:03 +00:00
1327419776 close ifdef 2024-01-11 17:14:12 +00:00
402349d120 feat(bf16): add cast support + tests for cast + bin ops (#1524) 2024-01-11 15:49:13 +01:00
9f0c99f0c1 Separate benchmarks by enabled features (#1538)
* Use cfg to separate benchmark results based on features

* Remove allow pragma

* Avoid some unnecessary returns.

* Improve benchmarks layout

* Derive bench_name from actual device

* Run CPU benchmarks even when GPU feature is enabled

---------

Co-authored-by: Laurent <laurent.mazare@gmail.com>
2024-01-11 15:35:38 +01:00
0fc95c9f0c Add a dequantize command to tensor-tools. (#1565)
* Add a dequantize command to tensor-tools.

* Clippy fixes.
2024-01-11 11:21:01 +01:00
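Based on the new Dequantize subcommand visible in the tensor-tools diff below (a positional gguf input and an --out-file flag), an invocation would look something like the following; the file names, and the assumption that the tool is built as the tensor-tools example, are illustrative:

cargo run --example tensor-tools --release -- dequantize model-q4k.gguf --out-file model-f32.safetensors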
2480c5dbdd Add RepVGG model. (#1561)
* Add RepVGG model.

* Add RepVGG README

* Extract var to top level

* Replace hashmap with a match

* Add a variant for the model kind + avoid some unnecessary config cloning.

---------

Co-authored-by: Laurent <laurent.mazare@gmail.com>
2024-01-11 07:07:40 +01:00
63944714f2 Use candle_nn::embedding instead of local copies in a few models. (#1562) 2024-01-10 21:36:27 +01:00
d3bdd788cf Use __HAVE_BFLOAT__ to check for bfloat support instead of metal version check (#1540) 2024-01-10 18:50:30 +01:00
ae06cb74bb Add relu kernel for metal (#1488)
* Add relu kernel for metal

* Copy error messages proposed in #1491

* Revert non relu changes

* Fix name changes

* Fix the last of us (:

* Fix copy and paste mistakes

* Fix typo

* Revert order changes

* Revert order change

* Add deleted functions back

* Run rustfmt
2024-01-10 18:27:17 +01:00
a897fda74e Update memmap2 requirement from 0.7.1 to 0.9.3 (#1556)
Updates the requirements on [memmap2](https://github.com/RazrFalcon/memmap2-rs) to permit the latest version.
- [Changelog](https://github.com/RazrFalcon/memmap2-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/RazrFalcon/memmap2-rs/compare/v0.7.1...v0.7.1)

---
updated-dependencies:
- dependency-name: memmap2
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-01-10 16:27:59 +01:00
1f1179913a Update gloo requirement from 0.8 to 0.11 (#1558)
Updates the requirements on [gloo](https://github.com/rustwasm/gloo) to permit the latest version.
- [Release notes](https://github.com/rustwasm/gloo/releases)
- [Changelog](https://github.com/rustwasm/gloo/blob/master/CHANGELOG.md)
- [Commits](https://github.com/rustwasm/gloo/commits)

---
updated-dependencies:
- dependency-name: gloo
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-01-10 16:27:20 +01:00
6e98cf2a92 Update cudarc requirement from 0.9.14 to 0.10.0 (#1559)
Updates the requirements on [cudarc](https://github.com/coreylowman/cudarc) to permit the latest version.
- [Release notes](https://github.com/coreylowman/cudarc/releases)
- [Commits](https://github.com/coreylowman/cudarc/compare/v0.9.14...v0.9.15)

---
updated-dependencies:
- dependency-name: cudarc
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-01-10 16:27:05 +01:00
2cc1247999 Update tokenizers requirement from 0.13.4 to 0.15.0 (#1555)
Updates the requirements on [tokenizers](https://github.com/huggingface/tokenizers) to permit the latest version.
- [Release notes](https://github.com/huggingface/tokenizers/releases)
- [Changelog](https://github.com/huggingface/tokenizers/blob/main/RELEASE.md)
- [Commits](https://github.com/huggingface/tokenizers/commits)

---
updated-dependencies:
- dependency-name: tokenizers
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-01-10 16:26:53 +01:00
edf3fcd1c4 fix: deprecated option field (open-pull-requests-limit-per-dependency) (#1554) 2024-01-10 15:12:46 +01:00
53e4755015 feat: add dependabot to the project (#1553)
* feat: add dependabot to the project

* feat: add let's accept patches/fix from other libs

* Revert "feat: add let's accept patches/fix from other libs"

This reverts commit d31a956f81.
2024-01-10 14:57:20 +01:00
87efb5d8eb Updated feature separated benchmarks 2024-01-09 19:04:31 +01:00
ad181f9cdc Merge branch 'ivarflakstad/seperate-benchmarks-by-feature' into ivarflakstad/metal-prng 2024-01-09 18:55:40 +01:00
88945f2c22 Improve benchmarks layout 2024-01-09 18:31:28 +01:00
12b2a337f3 Handle start-offset when loading a tensor from a pickle file. (#1546) 2024-01-08 09:20:48 +01:00
fb05af4c42 Avoid some unnecessary returns. 2024-01-08 07:19:59 +01:00
ad075a5f7e Remove allow pragma 2024-01-08 06:48:33 +01:00
0eb90ed783 Simpler repro for the neon optimization issue + bugfix (#1544)
* Simpler repro for the neon optimization issue.

* Bugfix for q4k.

* Improve the fix, share the dot-prod bit.

* Clippy fixes.

* Fix for q6k.

* Also fix for q2k.

* Use the new shared dotprod.

* Add more testing.
2024-01-07 20:21:49 +01:00
89b5a06858 Use bindgen-cuda for the custom-kernel example. (#1536)
* Use bindgen-cuda for the custom-kernel example.

* Only depend on the kernels when cuda is enabled.

* Skip rustfmt.
2024-01-07 17:18:46 +01:00
3f04a79ada Use cfg to separate benchmark results based on features 2024-01-07 14:40:15 +01:00
30313c3081 Moving to a proper build crate bindgen_cuda. (#1531)
* Moving to a proper build crate `bindgen_cuda`.

* Fmt.
2024-01-07 12:29:24 +01:00
e72d52b1a2 Unpin more of the workplace relative dependencies. (#1535) 2024-01-07 12:26:20 +01:00
b4cb982e49 Simplifying our internal cargo dependencies. (#1529) 2024-01-07 12:04:14 +01:00
6ebe043273 Merge branch 'main' into ivarflakstad/metal-prng 2024-01-07 11:52:03 +01:00
6bf52b9fdf Gaussian normal distribution of PRNG via Box-Muller transform 2024-01-07 11:39:46 +01:00
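For reference, the Box-Muller transform maps two independent uniform samples u1, u2 in (0, 1] to two independent standard normal samples:

z0 = sqrt(-2 ln u1) * cos(2π u2)
z1 = sqrt(-2 ln u1) * sin(2π u2)

Scaling with mean + stddev * z then yields samples from N(mean, stddev²); how the kernel batches this over the output buffer is specific to the Metal implementation.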
84250bf52f fix index_pos bug when kv cache is disabled. (#1517)
* fix index_pos bug when kv cache is disabled

* Tweak the fix.

---------

Co-authored-by: laurent <laurent.mazare@gmail.com>
2024-01-06 11:43:01 +01:00
8d1a57c9a0 chore: update flash attention kernels (#1518)
* chore: update flash attention kernels

* fmt

* remove unused kernels

* force f32

* correct stride
2024-01-05 18:28:55 +01:00
955e63c803 Implement hybrid Tausworthe + LCG pseudo random number generator in metal 2024-01-05 13:27:59 +01:00
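The "hybrid Tausworthe + LCG" scheme is the classic combined generator from GPU Gems 3 (ch. 37): three Tausworthe steps xor-ed with a linear congruential step. The sketch below is a CPU-side Rust illustration of that algorithm, not the repository's Metal kernel; the constants are the standard published ones, and the first seed value only mirrors the 299792458 default that appears elsewhere in this diff.

struct HybridTaus {
    // Seeds: z1, z2, z3 should be larger than 1, 7 and 15 respectively.
    z1: u32,
    z2: u32,
    z3: u32,
    z4: u32,
}

impl HybridTaus {
    fn taus_step(z: &mut u32, s1: u32, s2: u32, s3: u32, m: u32) -> u32 {
        let b = ((*z << s1) ^ *z) >> s2;
        *z = ((*z & m) << s3) ^ b;
        *z
    }

    fn lcg_step(z: &mut u32) -> u32 {
        *z = z.wrapping_mul(1664525).wrapping_add(1013904223);
        *z
    }

    /// Uniform sample in [0, 1).
    fn next_f32(&mut self) -> f32 {
        let x = Self::taus_step(&mut self.z1, 13, 19, 12, 4294967294)
            ^ Self::taus_step(&mut self.z2, 2, 25, 4, 4294967288)
            ^ Self::taus_step(&mut self.z3, 3, 11, 17, 4294967280)
            ^ Self::lcg_step(&mut self.z4);
        x as f32 * 2.328_306_4e-10 // 1 / 2^32
    }
}

fn main() {
    let mut rng = HybridTaus { z1: 299792458, z2: 12345, z3: 678910, z4: 111213 };
    for _ in 0..4 {
        println!("{}", rng.next_f32());
    }
}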
180 changed files with 11174 additions and 2186 deletions

.github/dependabot.yml (new file, +7 lines)

@ -0,0 +1,7 @@
version: 2
updates:
- package-ecosystem: "cargo"
directory: "/"
schedule:
interval: "weekly"
open-pull-requests-limit: 5


@ -5,49 +5,15 @@ on:
pull_request:
jobs:
start-runner:
name: Start self-hosted EC2 runner
runs-on: ubuntu-latest
# Don't run on forks, they won't have access to secrets anyway.
if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }}
env:
AWS_REGION: us-east-1
EC2_AMI_ID: ami-03cfed9ea28f4b002
EC2_INSTANCE_TYPE: g5.xlarge
EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc
EC2_SECURITY_GROUP: sg-030175c435ac141d6
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Start EC2 runner
id: start-ec2-runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ${{ env.EC2_AMI_ID }}
ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
subnet-id: ${{ env.EC2_SUBNET_ID }}
security-group-id: ${{ env.EC2_SECURITY_GROUP }}
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-tgi-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
test-cuda:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs: start-runner # required to start the main job when the runner is ready
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: [single-gpu, nvidia-gpu, t4, ci]
container:
image: nvidia/cuda:12.3.1-devel-ubuntu22.04
options: --gpus 0
if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }}
permissions:
contents: write
packages: write
@ -58,32 +24,10 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Install dependencies
run: apt-get update && apt install curl build-essential libssl-dev protobuf-compiler pkg-config -y
- name: Install Rust Stable
run: curl https://sh.rustup.rs -sSf | sh -s -- -y
uses: actions-rust-lang/setup-rust-toolchain@v1
- uses: Swatinem/rust-cache@v2
- run: apt-get update -y && apt-get install libssl-dev protobuf-compiler -y
- name: Test (cuda)
run: PATH=$PATH:/usr/local/cuda-11.8/bin/ /root/.cargo/bin/cargo test --features cuda
stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner
- test-cuda
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
if: ${{ (success() || failure()) && github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Stop EC2 runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
run: cargo test --features cuda


@ -19,7 +19,7 @@ exclude = [
resolver = "2"
[workspace.package]
version = "0.3.3"
version = "0.4.0"
edition = "2021"
description = "Minimalist ML framework."
repository = "https://github.com/huggingface/candle"
@ -31,10 +31,18 @@ license = "MIT OR Apache-2.0"
accelerate-src = { version = "0.3.2" }
anyhow = { version = "1", features = ["backtrace"] }
byteorder = "1.4.3"
candle = { path = "./candle-core", package = "candle-core", version = "0.4.0" }
candle-datasets = { path = "./candle-datasets", version = "0.4.0" }
candle-flash-attn = { path = "./candle-flash-attn", version = "0.4.0" }
candle-kernels = { path = "./candle-kernels", version = "0.4.0" }
candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.4.0" }
candle-nn = { path = "./candle-nn", version = "0.4.0" }
candle-onnx = { path = "./candle-onnx", version = "0.4.0" }
candle-transformers = { path = "./candle-transformers", version = "0.4.0" }
clap = { version = "4.2.4", features = ["derive"] }
criterion = { version = "0.5.1", default-features=false }
cudarc = { version = "0.9.14", features = ["f16"] }
gemm = { version = "0.16.6", features = ["wasm-simd128-enable"] }
cudarc = { version = "0.10.0", features = ["f16"] }
gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] }
hf-hub = "0.3.0"
half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
image = { version = "0.24.7", default-features = false, features = ["jpeg", "png"] }
@ -42,20 +50,20 @@ imageproc = { version = "0.23.0", default-features = false }
intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
libc = { version = "0.2.147" }
log = "0.4"
memmap2 = { version = "0.7.1", features = ["stable_deref_trait"] }
memmap2 = { version = "0.9.3", features = ["stable_deref_trait"] }
num_cpus = "1.15.0"
num-traits = "0.2.15"
parquet = { version = "45.0.0" }
parquet = { version = "50.0.0" }
rand = "0.8.5"
rand_distr = "0.4.3"
rayon = "1.7.0"
rusttype = { version = "0.9", default-features = false }
safetensors = "0.3.1"
safetensors = "0.4.1"
serde = { version = "1.0.171", features = ["derive"] }
serde_plain = "1.0.2"
serde_json = "1.0.99"
thiserror = "1"
tokenizers = { version = "0.13.4", default-features = false }
tokenizers = { version = "0.15.0", default-features = false }
tracing = "0.1.37"
tracing-chrome = "0.7.1"
tracing-subscriber = "0.3.7"


@ -65,8 +65,9 @@ We also provide a some command line based examples using state of the art models
- [Falcon](./candle-examples/examples/falcon/): general LLM.
- [Phi-1, Phi-1.5, and Phi-2](./candle-examples/examples/phi/): 1.3b and 2.7b general LLMs with performance on par with LLaMA-v2 7b.
- [StableLM-3B-4E1T](./candle-examples/examples/stable-lm/): a 3b general LLM
pre-trained on 1T tokens of English and code datasets.
- [Minimal Mamba](./candle-examples/examples/minimal-mamba/): a minimal
pre-trained on 1T tokens of English and code datasets. Also supports
StableLM-2, a 1.6b LLM trained on 2T tokens, as well as the code variants.
- [Mamba](./candle-examples/examples/mamba/): an inference only
implementation of the Mamba state space model.
- [Mistral7b-v0.1](./candle-examples/examples/mistral/): a 7b general LLM with
better performance than all publicly available 13b models as of 2023-09-28.
@ -109,8 +110,12 @@ We also provide a some command line based examples using state of the art models
- [DINOv2](./candle-examples/examples/dinov2/): computer vision model trained
using self-supervision (can be used for imagenet classification, depth
evaluation, segmentation).
- [VGG](./candle-examples/examples/vgg/),
[RepVGG](./candle-examples/examples/repvgg): computer vision models.
- [BLIP](./candle-examples/examples/blip/): image to text model, can be used to
generate captions for an image.
- [TrOCR](./candle-examples/examples/trocr/): a transformer OCR model, with
dedicated submodels for hand-writing and printed recognition.
- [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation
model, generates the translated text from the input text.
@ -181,10 +186,10 @@ If you have an addition to this list, please submit a pull request.
- Falcon.
- StarCoder.
- Phi 1, 1.5, and 2.
- Minimal Mamba
- Mamba, Minimal Mamba
- Mistral 7b v0.1.
- Mixtral 8x7b v0.1.
- StableLM-3B-4E1T.
- StableLM-3B-4E1T, StableLM-2-1.6B, Stable-Code-3B.
- Replit-code-v1.5-3B.
- Bert.
- Yi-6B and Yi-34B.
@ -203,8 +208,9 @@ If you have an addition to this list, please submit a pull request.
- Wurstchen v2.
- Image to text.
- BLIP.
- TrOCR.
- Computer Vision Models.
- DINOv2, ConvMixer, EfficientNet, ResNet, ViT.
- DINOv2, ConvMixer, EfficientNet, ResNet, ViT, VGG, RepVGG, ConvNeXT.
- yolo-v3, yolo-v8.
- Segment-Anything Model (SAM).
- File formats: load models from safetensors, npz, ggml, or PyTorch files.

View File

@ -11,11 +11,11 @@ readme = "README.md"
[dependencies]
accelerate-src = { workspace = true, optional = true }
candle = { path = "../candle-core", version = "0.3.3", package = "candle-core" }
candle-datasets = { path = "../candle-datasets", version = "0.3.3" }
candle-nn = { path = "../candle-nn", version = "0.3.3" }
candle-transformers = { path = "../candle-transformers", version = "0.3.3" }
candle-flash-attn = { path = "../candle-flash-attn", version = "0.3.3", optional = true }
candle = { workspace = true }
candle-datasets = { workspace = true }
candle-nn = { workspace = true }
candle-transformers = { workspace = true }
candle-flash-attn = { workspace = true, optional = true }
safetensors = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }


@ -12,8 +12,8 @@ readme = "README.md"
[dependencies]
accelerate-src = { workspace = true, optional = true }
byteorder = { workspace = true }
candle-kernels = { path = "../candle-kernels", version = "0.3.3", optional = true }
candle-metal-kernels = { path = "../candle-metal-kernels", version = "0.3.3", optional = true }
candle-kernels = { workspace = true, optional = true }
candle-metal-kernels = { workspace = true, optional = true }
metal = { workspace = true, optional = true}
cudarc = { workspace = true, optional = true }
gemm = { workspace = true }
@ -46,6 +46,5 @@ accelerate = ["dep:libc", "dep:accelerate-src"]
metal = ["dep:metal", "dep:candle-metal-kernels"]
[[bench]]
name = "matmul"
name = "bench_main"
harness = false


@ -0,0 +1,9 @@
mod benchmarks;
use criterion::criterion_main;
criterion_main!(
benchmarks::affine::benches,
benchmarks::matmul::benches,
benchmarks::random::benches,
benchmarks::where_cond::benches
);


@ -0,0 +1,43 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use std::time::Instant;
fn run(a: &Tensor) {
a.affine(12.34, 56.78).unwrap();
}
fn run_affine_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
let b = 1;
let m = 1024;
let k = 1024;
let tensor = Tensor::zeros((b, m, k), dtype, &device).unwrap();
let flops = b * m * k * dtype.size_in_bytes();
let mut group = c.benchmark_group(device.bench_name(name));
group.throughput(Throughput::Bytes(flops as u64));
group.bench_function("iter", move |b| {
b.iter_custom(|iters| {
let start = Instant::now();
for _i in 0..iters {
run(black_box(&tensor));
}
device.sync().unwrap();
start.elapsed()
})
});
group.finish();
}
fn criterion_benchmark(c: &mut Criterion) {
let handler = BenchDeviceHandler::new().unwrap();
for device in handler.devices {
run_affine_benchmark(c, &device, DType::F32, "affine_f32");
run_affine_benchmark(c, &device, DType::F16, "affine_f16");
run_affine_benchmark(c, &device, DType::BF16, "affine_bf16");
}
}
criterion_group!(benches, criterion_benchmark);


@ -1,25 +1,25 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use std::time::Instant;
fn run(a: &Tensor, b: &Tensor) {
a.matmul(&b.t().unwrap()).unwrap();
}
fn criterion_benchmark(c: &mut Criterion) {
fn run_bench(c: &mut Criterion, device: &Device) {
let b = 1;
let m = 1;
let n = 2048;
let k = 2048;
let device = Device::new_metal(0).unwrap();
let dtype = DType::F32;
let lhs = Tensor::zeros((b, m, k), dtype, &device).unwrap();
let rhs = Tensor::zeros((b, n, k), dtype, &device).unwrap();
let lhs = Tensor::zeros((b, m, k), dtype, device).unwrap();
let rhs = Tensor::zeros((b, n, k), dtype, device).unwrap();
let flops = b * m * n * k;
let mut group = c.benchmark_group("matmul_metal");
let mut group = c.benchmark_group(device.bench_name("matmul"));
group.throughput(Throughput::Bytes(flops as u64));
group.bench_function("iter", move |b| {
b.iter_custom(|iters| {
@ -27,16 +27,18 @@ fn criterion_benchmark(c: &mut Criterion) {
for _i in 0..iters {
run(black_box(&lhs), black_box(&rhs));
}
if let Device::Metal(device) = &device {
device.wait_until_completed().unwrap();
} else {
panic!("Expected metal device");
}
device.sync().unwrap();
start.elapsed()
})
});
group.finish();
}
fn criterion_benchmark(c: &mut Criterion) {
let handler = BenchDeviceHandler::new().unwrap();
for device in handler.devices {
run_bench(c, &device);
}
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);


@ -0,0 +1,66 @@
pub(crate) mod affine;
pub(crate) mod matmul;
pub(crate) mod random;
pub(crate) mod where_cond;
use candle_core::{Device, Result};
pub(crate) trait BenchDevice {
fn sync(&self) -> Result<()>;
fn bench_name<S: Into<String>>(&self, name: S) -> String;
}
impl BenchDevice for Device {
fn sync(&self) -> Result<()> {
match self {
Device::Cpu => Ok(()),
Device::Cuda(device) => {
#[cfg(feature = "cuda")]
return Ok(device.synchronize()?);
#[cfg(not(feature = "cuda"))]
panic!("Cuda device without cuda feature enabled: {:?}", device)
}
Device::Metal(device) => {
#[cfg(feature = "metal")]
return Ok(device.wait_until_completed()?);
#[cfg(not(feature = "metal"))]
panic!("Metal device without metal feature enabled: {:?}", device)
}
}
}
fn bench_name<S: Into<String>>(&self, name: S) -> String {
match self {
Device::Cpu => {
let cpu_type = if cfg!(feature = "accelerate") {
"accelerate"
} else if cfg!(feature = "mkl") {
"mkl"
} else {
"cpu"
};
format!("{}_{}", cpu_type, name.into())
}
Device::Cuda(_) => format!("cuda_{}", name.into()),
Device::Metal(_) => format!("metal_{}", name.into()),
}
}
}
struct BenchDeviceHandler {
devices: Vec<Device>,
}
impl BenchDeviceHandler {
pub fn new() -> Result<Self> {
let mut devices = Vec::new();
if cfg!(feature = "metal") {
devices.push(Device::new_metal(0)?);
} else if cfg!(feature = "cuda") {
devices.push(Device::new_cuda(0)?);
}
devices.push(Device::Cpu);
Ok(Self { devices })
}
}


@ -0,0 +1,63 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use std::time::Instant;
fn rand_uniform(a: &Tensor) {
a.rand_like(-1.0, 123.0).unwrap();
}
fn rand_normal(a: &Tensor) {
a.randn_like(100.0, 15.0).unwrap();
}
fn run_random_bench(c: &mut Criterion, device: &Device) {
let b = 1;
let rows = 2048;
let cols = 2048;
let dtype = DType::F32;
let tensor = Tensor::zeros((b, rows, cols), dtype, device).unwrap();
let flops = b * rows * cols * dtype.size_in_bytes();
let mut group = c.benchmark_group(device.bench_name("random_uniform"));
group.throughput(Throughput::Bytes(flops as u64));
group.bench_function("iter", move |benches| {
benches.iter_custom(|iters| {
let start = Instant::now();
for _i in 0..iters {
rand_uniform(black_box(&tensor));
}
device.sync().unwrap();
start.elapsed()
})
});
group.finish();
let tensor = Tensor::zeros((b, rows, cols), dtype, device).unwrap();
let mut group = c.benchmark_group(device.bench_name("random_normal"));
group.throughput(Throughput::Bytes(flops as u64));
group.bench_function("iter", move |benches| {
benches.iter_custom(|iters| {
let start = Instant::now();
for _i in 0..iters {
rand_normal(black_box(&tensor));
}
device.sync().unwrap();
start.elapsed()
})
});
group.finish();
}
fn criterion_benchmark(c: &mut Criterion) {
let handler = BenchDeviceHandler::new().unwrap();
for device in handler.devices {
run_random_bench(c, &device);
}
}
criterion_group!(benches, criterion_benchmark);


@ -0,0 +1,64 @@
use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
use candle_core::{DType, Device, Tensor};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use std::time::Instant;
fn run(a: &Tensor, b: &Tensor, c: &Tensor) {
a.where_cond(b, c).unwrap();
}
const fn create_cond_arr<const N: usize>() -> [u8; N] {
let mut arr = [0u8; N];
let mut i = 0;
while i < N {
arr[i] = (i % 2) as u8;
i += 1;
}
arr
}
const B: usize = 1;
const M: usize = 1024;
const K: usize = 1024;
const SIZE: usize = B * M * K;
const DATA: [u8; SIZE] = create_cond_arr::<SIZE>();
fn run_where_cond_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
let tensor = Tensor::from_slice(DATA.as_slice(), (B, M, K), &device).unwrap();
let on_true = Tensor::ones((B, M, K), dtype, &device).unwrap();
let on_false = Tensor::zeros((B, M, K), dtype, &device).unwrap();
let elements = B * M * K;
// E.g. 2 f32 tensors + 1 u8 tensor
let flops = (2 * elements * dtype.size_in_bytes()) + elements;
let mut group = c.benchmark_group(device.bench_name(name));
group.throughput(Throughput::Bytes(flops as u64));
group.bench_function("iter", move |b| {
b.iter_custom(|iters| {
let start = Instant::now();
for _i in 0..iters {
run(
black_box(&tensor),
black_box(&on_true),
black_box(&on_false),
);
}
device.sync().unwrap();
start.elapsed()
})
});
group.finish();
}
fn criterion_benchmark(c: &mut Criterion) {
let device = BenchDeviceHandler::new().unwrap();
for d in device.devices {
run_where_cond_benchmark(c, &d, DType::F32, "where_cond_f32");
run_where_cond_benchmark(c, &d, DType::BF16, "where_cond_bf16");
run_where_cond_benchmark(c, &d, DType::F16, "where_cond_f16");
}
}
criterion_group!(benches, criterion_benchmark);


@ -118,7 +118,7 @@ enum Command {
},
Quantize {
/// The input file, in gguf format.
/// The input file(s), in safetensors format.
in_file: Vec<std::path::PathBuf>,
/// The output file, in gguf format.
@ -133,6 +133,15 @@ enum Command {
#[arg(long, value_enum, default_value_t = QuantizationMode::Llama)]
mode: QuantizationMode,
},
Dequantize {
/// The input file, in gguf format.
in_file: std::path::PathBuf,
/// The output file, in safetensors format.
#[arg(long)]
out_file: std::path::PathBuf,
},
}
#[derive(Parser, Debug, Clone)]
@ -187,7 +196,7 @@ fn run_ls(
}
}
Format::Pth => {
let mut tensors = candle_core::pickle::read_pth_tensor_info(file, verbose)?;
let mut tensors = candle_core::pickle::read_pth_tensor_info(file, verbose, None)?;
tensors.sort_by(|a, b| a.name.cmp(&b.name));
for tensor_info in tensors.iter() {
println!(
@ -277,6 +286,23 @@ fn run_quantize_safetensors(
Ok(())
}
fn run_dequantize(
in_file: std::path::PathBuf,
out_file: std::path::PathBuf,
device: &Device,
) -> Result<()> {
let mut in_file = std::fs::File::open(in_file)?;
let content = gguf_file::Content::read(&mut in_file)?;
let mut tensors = std::collections::HashMap::new();
for (tensor_name, _) in content.tensor_infos.iter() {
let tensor = content.tensor(&mut in_file, tensor_name, device)?;
let tensor = tensor.dequantize(device)?;
tensors.insert(tensor_name.to_string(), tensor);
}
candle_core::safetensors::save(&tensors, out_file)?;
Ok(())
}
fn run_quantize(
in_files: &[std::path::PathBuf],
out_file: std::path::PathBuf,
@ -357,6 +383,7 @@ fn main() -> anyhow::Result<()> {
quantization,
mode,
} => run_quantize(&in_file, out_file, quantization, mode, &device)?,
Command::Dequantize { in_file, out_file } => run_dequantize(in_file, out_file, &device)?,
}
Ok(())
}


@ -175,7 +175,7 @@ impl Tensor {
// the backprop graph of the backprop itself. This would be an issue for second order
// derivatives but these are out of scope at the moment.
let do_not_detach = CANDLE_GRAD_DO_NOT_DETACH.with(|b| *b);
let grad = if do_not_detach { grad } else { grad.detach()? };
let grad = if do_not_detach { grad } else { grad.detach() };
if let Some(op) = node.op() {
match op {
Op::Binary(lhs, rhs, BinaryOp::Add) => {


@ -1149,6 +1149,55 @@ impl<'a> Map2 for Conv2D<'a> {
}
}
struct ConvTranspose1D<'a>(&'a crate::conv::ParamsConvTranspose1D);
impl<'a> Map2 for ConvTranspose1D<'a> {
fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
&self,
inp: &CudaSlice<T>,
inp_l: &Layout,
k: &CudaSlice<T>,
k_l: &Layout,
dev: &CudaDevice,
) -> Result<CudaSlice<T>> {
// Kernel shape: (c_in_k, c_out, l_k)
// Input shape: (b_size, c_in, l_in)
let p = &self.0;
let l_out = p.l_out();
let dst_el = p.c_out * l_out * p.b_size;
let inp = &inp.slice(inp_l.start_offset()..);
let k = &k.slice(k_l.start_offset()..);
let shape = inp_l.shape();
let dims = shape.dims();
let el = shape.elem_count();
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<T>(dst_el) }.w()?;
let cfg = LaunchConfig::for_num_elems(dst_el as u32);
let func = dev.get_or_load_func(&kernel_name::<T>("conv_transpose1d"), kernels::CONV)?;
let ds = if dims.len() == 3 {
[dims, inp_l.stride(), k_l.dims(), k_l.stride()].concat()
} else {
crate::bail!("unexpected input shape for conv_transpose1d {dims:?}")
};
let ds = dev.htod_copy(ds).w()?;
let params = (
el,
l_out,
p.stride,
p.padding,
p.output_padding,
p.dilation,
&ds,
inp,
k,
&out,
);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }.w()?;
Ok(out)
}
}
struct ConvTranspose2D<'a>(&'a crate::conv::ParamsConvTranspose2D);
impl<'a> Map2 for ConvTranspose2D<'a> {
fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
@ -1810,12 +1859,15 @@ impl BackendStorage for CudaStorage {
fn conv_transpose1d(
&self,
_: &Layout,
_: &Self,
_: &Layout,
_: &crate::conv::ParamsConvTranspose1D,
l: &Layout,
kernel: &Self,
kernel_l: &Layout,
params: &crate::conv::ParamsConvTranspose1D,
) -> Result<Self> {
todo!()
let device = self.device().clone();
let slice =
ConvTranspose1D(params).map(&self.slice, l, &kernel.slice, kernel_l, &device)?;
Ok(Self { slice, device })
}
#[cfg(not(feature = "cudnn"))]


@ -72,7 +72,7 @@ pub mod utils;
mod variable;
pub use cpu_backend::CpuStorage;
pub use device::{Device, DeviceLocation};
pub use device::{Device, DeviceLocation, NdArray};
pub use dtype::{DType, FloatDType, IntDType, WithDType};
pub use error::{Error, Result};
pub use indexer::IndexOp;


@ -7,8 +7,9 @@ use candle_metal_kernels::Kernels;
use metal;
use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger};
use std::collections::HashMap;
use std::ffi::c_void;
use std::path::Path;
use std::sync::{Arc, RwLock, TryLockError};
use std::sync::{Arc, Mutex, RwLock, TryLockError};
/// Simple way to catch lock error without
/// depending on T
@ -84,13 +85,8 @@ pub struct MetalDevice {
command_buffer_index: Arc<RwLock<usize>>,
/// The maximum amount of [compute command encoder](https://developer.apple.com/documentation/metal/mtlcomputecommandencoder?language=objc) per [command buffer](https://developer.apple.com/documentation/metal/mtlcommandbuffer?language=objc)
compute_per_buffer: usize,
/// Every compute command encoder (and blit encoders) are defended with this Fence, forcing the
/// execution order to be linear.
/// It could be relaxed in some circumstances, by managing ourselves the dependencies in the
/// compute graph.
// fence: metal::Fence,
/// Simple keeper struct to keep track of the already compiled kernels so we can reuse them.
/// Heavily used by [`candle_metal_kernels`], both fences need to match
/// Heavily used by [`candle_metal_kernels`]
kernels: Arc<candle_metal_kernels::Kernels>,
/// Simple allocator struct.
/// The buffers are stored in size buckets since ML tends to use similar shapes over and over.
@ -106,6 +102,8 @@ pub struct MetalDevice {
/// Whenever we actually allocate a new buffer, we make a full sweep to cleanup unused buffers
/// (strong_count = 1).
buffers: AllocatedBuffers,
/// Seed for random number generation.
seed: Arc<Mutex<Buffer>>,
}
impl std::fmt::Debug for MetalDevice {
@ -131,10 +129,6 @@ impl MetalDevice {
&self.device
}
// pub(crate) fn fence(&self) -> &metal::Fence {
// &self.fence
// }
pub fn command_queue(&self) -> &CommandQueue {
&self.command_queue
}
@ -225,10 +219,8 @@ impl MetalDevice {
let command_buffer = self.command_buffer()?;
command_buffer.set_label("with_data");
let blit = command_buffer.new_blit_command_encoder();
// blit.wait_for_fence(&self.fence);
blit.set_label("with_data_blit");
blit.copy_from_buffer(&tmp, 0, &real, 0, tmp.length());
// blit.update_fence(&self.fence);
blit.end_encoding();
// This is necessary, for mmaped safetensors
@ -236,7 +228,7 @@ impl MetalDevice {
// The slice might not live long enough for metal
// To actually fill the GPU buffer.
// Putting this wait forces the GPU buffer to be filled
// with the actual data allowing the CPU storage todo
// with the actual data allowing the CPU storage to do
// deallocate properly.
self.wait_until_completed()?;
Ok(real)
@ -251,7 +243,6 @@ impl MetalDevice {
let command_buffer = self.command_buffer()?;
command_buffer.set_label("zeros");
let blit = command_buffer.new_blit_command_encoder();
// blit.wait_for_fence(&self.fence);
blit.fill_buffer(
&buffer,
metal::NSRange {
@ -260,7 +251,6 @@ impl MetalDevice {
},
0,
);
// blit.update_fence(&self.fence);
blit.end_encoding();
Ok(buffer)
}
@ -359,6 +349,7 @@ impl BackendStorage for MetalStorage {
let name = match self.dtype {
DType::F32 => "affine_f32",
DType::F16 => "affine_f16",
DType::BF16 => "affine_bf16",
dtype => crate::bail!("Metal contiguous affine {dtype:?} not implemented"),
};
candle_metal_kernels::call_affine(
@ -377,6 +368,7 @@ impl BackendStorage for MetalStorage {
let name = match self.dtype {
DType::F32 => "affine_f32_strided",
DType::F16 => "affine_f16_strided",
DType::BF16 => "affine_bf16_strided",
dtype => crate::bail!("Metal strided affine {dtype:?} not implemented"),
};
candle_metal_kernels::call_affine_strided(
@ -596,14 +588,26 @@ impl BackendStorage for MetalStorage {
(DType::U32, DType::F32) => "cast_u32_f32",
(DType::U32, DType::U8) => "cast_u32_u8",
(DType::U32, DType::I64) => "cast_u32_i64",
(DType::U32, DType::BF16) => "cast_u32_bf16",
(DType::U8, DType::U32) => "cast_u8_u32",
(DType::U8, DType::F32) => "cast_u8_f32",
(DType::U8, DType::I64) => "cast_u8_i64",
(DType::U8, DType::BF16) => "cast_u8_bf16",
(DType::F32, DType::F16) => "cast_f32_f16",
(DType::F16, DType::F32) => "cast_f16_f32",
(DType::I64, DType::F32) => "cast_i64_f32",
(DType::F32, DType::BF16) => "cast_f32_bf16",
(DType::I64, DType::F32) => "cast_i64_f32",
(DType::F16, DType::BF16) => "cast_f16_bf16",
(DType::F16, DType::F32) => "cast_f16_f32",
(DType::BF16, DType::U8) => "cast_bf16_u8",
(DType::BF16, DType::U32) => "cast_bf16_u32",
(DType::BF16, DType::F16) => "cast_bf16_f16",
(DType::BF16, DType::F32) => "cast_bf16_f32",
(left, right) => {
crate::bail!("Metal contiguous to_dtype {left:?} {right:?} not implemented")
}
@ -681,6 +685,7 @@ impl BackendStorage for MetalStorage {
("uround", DType::F32) => contiguous::round::FLOAT,
("urecip", DType::F32) => contiguous::recip::FLOAT,
("utanh", DType::F32) => contiguous::tanh::FLOAT,
("urelu", DType::F32) => contiguous::relu::FLOAT,
("ucos", DType::F16) => contiguous::cos::HALF,
("usin", DType::F16) => contiguous::sin::HALF,
("usqr", DType::F16) => contiguous::sqr::HALF,
@ -697,6 +702,7 @@ impl BackendStorage for MetalStorage {
("uround", DType::F16) => contiguous::round::HALF,
("urecip", DType::F16) => contiguous::recip::HALF,
("utanh", DType::F16) => contiguous::tanh::HALF,
("urelu", DType::F16) => contiguous::relu::HALF,
(name, dtype) => {
crate::bail!("Metal contiguous unary {name} {dtype:?} not implemented")
}
@ -727,6 +733,7 @@ impl BackendStorage for MetalStorage {
("uabs", DType::F32) => strided::abs::FLOAT,
("uceil", DType::F32) => strided::ceil::FLOAT,
("ufloor", DType::F32) => strided::floor::FLOAT,
("urelu", DType::F32) => strided::relu::FLOAT,
("uround", DType::F32) => strided::round::FLOAT,
("ucos", DType::F16) => strided::cos::HALF,
("usin", DType::F16) => strided::sin::HALF,
@ -741,6 +748,7 @@ impl BackendStorage for MetalStorage {
("uabs", DType::F16) => strided::abs::HALF,
("uceil", DType::F16) => strided::ceil::HALF,
("ufloor", DType::F16) => strided::floor::HALF,
("urelu", DType::F16) => strided::relu::HALF,
("uround", DType::F16) => strided::round::HALF,
(name, dtype) => {
crate::bail!("Metal strided unary {name} {dtype:?} not implemented")
@ -796,6 +804,7 @@ impl BackendStorage for MetalStorage {
}
let name = match (self.dtype, t.dtype()) {
(DType::U8, DType::F32) => "where_u8_f32",
(DType::U8, DType::BF16) => "where_u8_bf16",
(DType::U8, DType::F16) => "where_u8_f16",
(DType::U8, DType::I64) => "where_u8_i64",
(DType::U8, DType::U32) => "where_u8_u32",
@ -1133,8 +1142,12 @@ impl BackendStorage for MetalStorage {
let device = self.device();
let buffer = device.new_buffer(dst_el, dtype, "index_select")?;
let name = match (ids.dtype, self.dtype) {
(DType::U8, DType::BF16) => "is_u8_bf16",
(DType::U32, DType::F32) => "is_u32_f32",
(DType::U32, DType::F16) => "is_u32_f16",
(DType::U32, DType::BF16) => "is_u32_bf16",
(left, right) => {
crate::bail!("Metal contiguous index_select {left:?} {right:?} not implemented")
}
@ -1324,6 +1337,7 @@ impl MetalStorage {
("lt", DType::F32) => (contiguous::lt::FLOAT, DType::U8),
("ge", DType::F32) => (contiguous::ge::FLOAT, DType::U8),
("gt", DType::F32) => (contiguous::gt::FLOAT, DType::U8),
("add", DType::F16) => (contiguous::add::HALF, self.dtype),
("sub", DType::F16) => (contiguous::sub::HALF, self.dtype),
("mul", DType::F16) => (contiguous::mul::HALF, self.dtype),
@ -1334,6 +1348,18 @@ impl MetalStorage {
("lt", DType::F16) => (contiguous::lt::HALF, DType::U8),
("ge", DType::F16) => (contiguous::ge::HALF, DType::U8),
("gt", DType::F16) => (contiguous::gt::HALF, DType::U8),
("add", DType::BF16) => (contiguous::add::BFLOAT, self.dtype),
("sub", DType::BF16) => (contiguous::sub::BFLOAT, self.dtype),
("mul", DType::BF16) => (contiguous::mul::BFLOAT, self.dtype),
("div", DType::BF16) => (contiguous::div::BFLOAT, self.dtype),
("eq", DType::BF16) => (contiguous::eq::BFLOAT, DType::U8),
("ne", DType::BF16) => (contiguous::ne::BFLOAT, DType::U8),
("le", DType::BF16) => (contiguous::le::BFLOAT, DType::U8),
("lt", DType::BF16) => (contiguous::lt::BFLOAT, DType::U8),
("ge", DType::BF16) => (contiguous::ge::BFLOAT, DType::U8),
("gt", DType::BF16) => (contiguous::gt::BFLOAT, DType::U8),
("add", DType::I64) => (contiguous::add::I64, self.dtype),
("sub", DType::I64) => (contiguous::sub::I64, self.dtype),
("mul", DType::I64) => (contiguous::mul::I64, self.dtype),
@ -1344,6 +1370,7 @@ impl MetalStorage {
("lt", DType::I64) => (contiguous::lt::I64, DType::U8),
("ge", DType::I64) => (contiguous::ge::I64, DType::U8),
("gt", DType::I64) => (contiguous::gt::I64, DType::U8),
("add", DType::U32) => (contiguous::add::U32, self.dtype),
("sub", DType::U32) => (contiguous::sub::U32, self.dtype),
("mul", DType::U32) => (contiguous::mul::U32, self.dtype),
@ -1354,6 +1381,7 @@ impl MetalStorage {
("lt", DType::U32) => (contiguous::lt::U32, DType::U8),
("ge", DType::U32) => (contiguous::ge::U32, DType::U8),
("gt", DType::U32) => (contiguous::gt::U32, DType::U8),
("add", DType::U8) => (contiguous::add::U8, self.dtype),
("sub", DType::U8) => (contiguous::sub::U8, self.dtype),
("mul", DType::U8) => (contiguous::mul::U8, self.dtype),
@ -1364,6 +1392,7 @@ impl MetalStorage {
("lt", DType::U8) => (contiguous::lt::U8, DType::U8),
("ge", DType::U8) => (contiguous::ge::U8, DType::U8),
("gt", DType::U8) => (contiguous::gt::U8, DType::U8),
(name, dtype) => {
crate::bail!("Metal contiguous binary {name} {dtype:?} not implemented")
}
@ -1397,6 +1426,7 @@ impl MetalStorage {
("lt", DType::F32) => (strided::lt::FLOAT, DType::U8),
("ge", DType::F32) => (strided::ge::FLOAT, DType::U8),
("gt", DType::F32) => (strided::gt::FLOAT, DType::U8),
("badd", DType::F16) => (strided::add::HALF, self.dtype),
("bsub", DType::F16) => (strided::sub::HALF, self.dtype),
("bmul", DType::F16) => (strided::mul::HALF, self.dtype),
@ -1409,6 +1439,20 @@ impl MetalStorage {
("lt", DType::F16) => (strided::lt::HALF, DType::U8),
("ge", DType::F16) => (strided::ge::HALF, DType::U8),
("gt", DType::F16) => (strided::gt::HALF, DType::U8),
("badd", DType::BF16) => (strided::add::BFLOAT, self.dtype),
("bsub", DType::BF16) => (strided::sub::BFLOAT, self.dtype),
("bmul", DType::BF16) => (strided::mul::BFLOAT, self.dtype),
("bdiv", DType::BF16) => (strided::div::BFLOAT, self.dtype),
("bminimum", DType::BF16) => (strided::min::BFLOAT, self.dtype),
("bmaximum", DType::BF16) => (strided::max::BFLOAT, self.dtype),
("eq", DType::BF16) => (strided::eq::BFLOAT, DType::U8),
("ne", DType::BF16) => (strided::ne::BFLOAT, DType::U8),
("le", DType::BF16) => (strided::le::BFLOAT, DType::U8),
("lt", DType::BF16) => (strided::lt::BFLOAT, DType::U8),
("ge", DType::BF16) => (strided::ge::BFLOAT, DType::U8),
("gt", DType::BF16) => (strided::gt::BFLOAT, DType::U8),
("badd", DType::I64) => (strided::add::I64, self.dtype),
("bsub", DType::I64) => (strided::sub::I64, self.dtype),
("bmul", DType::I64) => (strided::mul::I64, self.dtype),
@ -1421,6 +1465,7 @@ impl MetalStorage {
("lt", DType::I64) => (strided::lt::I64, DType::U8),
("ge", DType::I64) => (strided::ge::I64, DType::U8),
("gt", DType::I64) => (strided::gt::I64, DType::U8),
("badd", DType::U32) => (strided::add::U32, self.dtype),
("bsub", DType::U32) => (strided::sub::U32, self.dtype),
("bmul", DType::U32) => (strided::mul::U32, self.dtype),
@ -1433,6 +1478,7 @@ impl MetalStorage {
("lt", DType::U32) => (strided::lt::U32, DType::U8),
("ge", DType::U32) => (strided::ge::U32, DType::U8),
("gt", DType::U32) => (strided::gt::U32, DType::U8),
("badd", DType::U8) => (strided::add::U8, self.dtype),
("bsub", DType::U8) => (strided::sub::U8, self.dtype),
("bmul", DType::U8) => (strided::mul::U8, self.dtype),
@ -1445,6 +1491,7 @@ impl MetalStorage {
("lt", DType::U8) => (strided::lt::U8, DType::U8),
("ge", DType::U8) => (strided::ge::U8, DType::U8),
("gt", DType::U8) => (strided::gt::U8, DType::U8),
(name, dtype) => {
crate::bail!("Metal strided binary {name} {dtype:?} not implemented")
}
@ -1486,9 +1533,7 @@ impl MetalStorage {
command_buffer.set_label("to_cpu");
let blit = command_buffer.new_blit_command_encoder();
blit.set_label("blit_to_cpu");
// blit.wait_for_fence(&self.device.fence);
blit.copy_from_buffer(&self.buffer, 0, &buffer, 0, self.buffer.length());
// blit.update_fence(&self.device.fence);
blit.end_encoding();
}
self.device.wait_until_completed()?;
@ -1506,29 +1551,29 @@ impl BackendDevice for MetalDevice {
command_buffer.enqueue();
let command_buffer = Arc::new(RwLock::new(command_buffer));
let command_buffer_index = Arc::new(RwLock::new(0));
// let fence = device.new_fence();
let kernels = Arc::new(Kernels::new());
let buffers = Arc::new(RwLock::new(HashMap::new()));
let compute_per_buffer = match std::env::var("CANDLE_METAL_COMPUTE_PER_BUFFER") {
Ok(val) => val.parse()?,
_ => 10,
};
let seed = Arc::new(Mutex::new(device.new_buffer_with_data(
[299792458].as_ptr() as *const c_void,
4,
MTLResourceOptions::StorageModeManaged,
)));
Ok(Self {
device,
// fence,
command_queue,
command_buffer,
command_buffer_index,
compute_per_buffer,
buffers,
kernels,
seed,
})
}
fn set_seed(&self, _seed: u64) -> Result<()> {
crate::bail!("Metal set_seed not implemented")
}
fn location(&self) -> crate::DeviceLocation {
crate::DeviceLocation::Metal {
gpu_id: self.registry_id() as usize,
@ -1568,12 +1613,31 @@ impl BackendDevice for MetalDevice {
&self,
shape: &Shape,
dtype: DType,
mean: f64,
stddev: f64,
min: f64,
max: f64,
) -> Result<Self::Storage> {
// TODO is there a better way ?
let cpu_storage = crate::cpu_backend::CpuDevice.rand_uniform(shape, dtype, mean, stddev)?;
self.storage_from_cpu_storage(&cpu_storage)
let name = match dtype {
DType::F32 => "rand_uniform_f32",
DType::F16 => "rand_uniform_f16",
DType::BF16 => "rand_uniform_bf16",
dtype => crate::bail!("rand_uniform not implemented for {dtype:?}"),
};
let buffer = self.new_buffer(shape.elem_count(), dtype, "rand_uniform")?;
let command_buffer = self.command_buffer()?;
candle_metal_kernels::call_random_uniform(
&self.device,
&command_buffer,
&self.kernels,
name,
min as f32,
max as f32,
shape.elem_count(),
&*self.seed.lock().unwrap(),
&buffer,
)
.map_err(MetalError::from)?;
Ok(Self::Storage::new(buffer, self.clone(), dtype))
}
fn rand_normal(
@ -1583,9 +1647,43 @@ impl BackendDevice for MetalDevice {
mean: f64,
stddev: f64,
) -> Result<Self::Storage> {
// TODO is there a better way ?
let cpu_storage = crate::cpu_backend::CpuDevice.rand_normal(shape, dtype, mean, stddev)?;
self.storage_from_cpu_storage(&cpu_storage)
let name = match dtype {
DType::F32 => "rand_normal_f32",
DType::F16 => "rand_normal_f16",
DType::BF16 => "rand_normal_bf16",
dtype => crate::bail!("rand_uniform not implemented for {dtype:?}"),
};
let buffer = self.new_buffer(shape.elem_count(), dtype, "rand_normal")?;
let command_buffer = self.command_buffer()?;
candle_metal_kernels::call_random_normal(
&self.device,
&command_buffer,
&self.kernels,
name,
mean as f32,
stddev as f32,
shape.elem_count(),
&*self.seed.lock().unwrap(),
&buffer,
)
.map_err(MetalError::from)?;
Ok(Self::Storage::new(buffer, self.clone(), dtype))
}
fn set_seed(&self, seed: u64) -> Result<()> {
let seed: u32 = seed.try_into().map_err(|_| {
MetalError::Message("Metal seed must be less than or equal to u32::MAX".to_string())
})?;
let seed_buffer = self.seed.try_lock().map_err(MetalError::from)?;
let contents = seed_buffer.contents();
unsafe {
// Copy a single u32 (4 bytes) into the seed buffer.
std::ptr::copy([seed].as_ptr(), contents as *mut u32, 1);
}
seed_buffer.did_modify_range(metal::NSRange::new(0, 4));
Ok(())
}
}
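A minimal usage sketch for the device-side RNG added above, assuming a Metal-enabled build; `Tensor::rand` and `Tensor::randn` are the public entry points that are presumably routed to `rand_uniform` / `rand_normal`:

use candle_core::{Device, Tensor};

fn sample_on_metal() -> candle_core::Result<()> {
    let device = Device::new_metal(0)?;
    // Uniform samples in [0, 1), generated directly on the GPU.
    let uniform = Tensor::rand(0f32, 1f32, (2, 3), &device)?;
    // Normal samples with mean 0 and standard deviation 1.
    let normal = Tensor::randn(0f32, 1f32, (2, 3), &device)?;
    println!("{uniform}\n{normal}");
    Ok(())
}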

View File

@ -217,6 +217,13 @@ impl Object {
let args = args.remove(1);
(callable, args)
}
Object::Class {
module_name,
class_name,
} if module_name == "torch._utils" && class_name == "_rebuild_parameter" => {
let mut args = args.tuple()?;
args.remove(0).reduce()?
}
_ => (callable, args),
};
match callable {
@ -227,13 +234,11 @@ impl Object {
_ => return Ok(None),
};
let (layout, dtype, file_path, storage_size) = rebuild_args(args)?;
let mut path = dir_name.to_path_buf();
path.push(file_path);
Ok(Some(TensorInfo {
name,
dtype,
layout,
path: path.to_string_lossy().into_owned(),
path: format!("{}/{}", dir_name.to_string_lossy(), file_path),
storage_size,
}))
}
@ -345,8 +350,10 @@ impl Stack {
module_name,
class_name,
} => {
if module_name == "collections" && class_name == "OrderedDict" {
// TODO: have a separate ordered dict.
if module_name == "collections"
&& (class_name == "OrderedDict" || class_name == "defaultdict")
{
// TODO: have a separate ordered dict and a separate default dict.
Some(Object::Dict(vec![]))
} else {
None
@ -627,9 +634,16 @@ pub struct TensorInfo {
pub storage_size: usize,
}
/// Read the tensor info from a .pth file.
///
/// # Arguments
/// * `file` - The path to the .pth file.
/// * `verbose` - Whether to print debug information.
/// * `key` - Optional key to retrieve `state_dict` from the pth file.
pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
file: P,
verbose: bool,
key: Option<&str>,
) -> Result<Vec<TensorInfo>> {
let file = std::fs::File::open(file)?;
let zip_reader = std::io::BufReader::new(file);
@ -651,8 +665,9 @@ pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
stack.read_loop(&mut reader)?;
let obj = stack.finalize()?;
if VERBOSE || verbose {
println!("{obj:?}");
println!("{obj:#?}");
}
let obj = match obj {
Object::Build { callable, args } => match *callable {
Object::Reduce { callable, args: _ } => match *callable {
@ -666,6 +681,24 @@ pub fn read_pth_tensor_info<P: AsRef<std::path::Path>>(
},
obj => obj,
};
// If key is provided, then we need to extract the state_dict from the object.
let obj = if let Some(key) = key {
if let Object::Dict(key_values) = obj {
key_values
.into_iter()
.find(|(k, _)| *k == Object::Unicode(key.to_owned()))
.map(|(_, v)| v)
.ok_or_else(|| E::Msg(format!("key {key} not found")))?
} else {
obj
}
} else {
obj
};
// If the object is a dict, then we can extract the tensor info from it.
// NOTE: we are assuming that `obj` is a state_dict by this stage.
if let Object::Dict(key_values) = obj {
for (name, value) in key_values.into_iter() {
match value.into_tensor_info(name, &dir_name) {
@ -688,8 +721,8 @@ pub struct PthTensors {
}
impl PthTensors {
pub fn new<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
let tensor_infos = read_pth_tensor_info(path.as_ref(), false)?;
pub fn new<P: AsRef<std::path::Path>>(path: P, key: Option<&str>) -> Result<Self> {
let tensor_infos = read_pth_tensor_info(path.as_ref(), false, key)?;
let tensor_infos = tensor_infos
.into_iter()
.map(|ti| (ti.name.to_string(), ti))
@ -703,6 +736,7 @@ impl PthTensors {
}
pub fn get(&self, name: &str) -> Result<Option<Tensor>> {
use std::io::Read;
let tensor_info = match self.tensor_infos.get(name) {
None => return Ok(None),
Some(tensor_info) => tensor_info,
@ -711,27 +745,56 @@ impl PthTensors {
let zip_reader = std::io::BufReader::new(std::fs::File::open(&self.path)?);
let mut zip = zip::ZipArchive::new(zip_reader)?;
let mut reader = zip.by_name(&tensor_info.path)?;
let is_fortran_contiguous = tensor_info.layout.is_fortran_contiguous();
let rank = tensor_info.layout.shape().rank();
// Reading the data is a bit tricky as it can be strided, use an offset, etc.
// For now only support the basic case.
if tensor_info.layout.start_offset() != 0 || !tensor_info.layout.is_contiguous() {
// Reading the data is a bit tricky as it can be strided, for now only support the basic
// case and when the tensor is fortran contiguous.
if !tensor_info.layout.is_contiguous() && !is_fortran_contiguous {
crate::bail!(
"cannot retrieve non-contiguous tensors {:?}",
tensor_info.layout
)
}
let start_offset = tensor_info.layout.start_offset();
if start_offset > 0 {
std::io::copy(
&mut reader.by_ref().take(start_offset as u64),
&mut std::io::sink(),
)?;
}
let tensor = Tensor::from_reader(
tensor_info.layout.shape().clone(),
tensor_info.dtype,
&mut reader,
)?;
Ok(Some(tensor))
if rank > 1 && is_fortran_contiguous {
// Reverse the shape, e.g. Shape(2, 3, 4) -> Shape(4, 3, 2)
let shape_reversed: Vec<_> = tensor_info.layout.dims().iter().rev().cloned().collect();
let tensor = tensor.reshape(shape_reversed)?;
// Permute (transpose) the dimensions, e.g. Shape(4, 3, 2) -> Shape(2, 3, 4)
let dim_indices_reversed: Vec<_> = (0..rank).rev().collect();
let tensor = tensor.permute(dim_indices_reversed)?;
Ok(Some(tensor))
} else {
Ok(Some(tensor))
}
}
}
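The reshape-then-permute trick above can be checked in isolation; the snippet below is only a sketch of the same bookkeeping (shapes are illustrative), not part of the change:

use candle_core::{Device, Tensor};

fn fortran_roundtrip() -> candle_core::Result<()> {
    // Pretend the raw data of a 2x3x4 Fortran-ordered tensor was read in
    // linearly: interpret it with the reversed shape first...
    let raw = Tensor::arange(0f32, 24f32, &Device::Cpu)?;
    let as_read = raw.reshape((4, 3, 2))?;
    // ...then permute the axes back so the logical shape is (2, 3, 4) again.
    let restored = as_read.permute((2, 1, 0))?;
    assert_eq!(restored.dims(), [2, 3, 4]);
    Ok(())
}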
/// Read all the tensors from a PyTorch pth file.
pub fn read_all<P: AsRef<std::path::Path>>(path: P) -> Result<Vec<(String, Tensor)>> {
let pth = PthTensors::new(path)?;
/// Read all the tensors from a PyTorch pth file with a given key.
///
/// # Arguments
/// * `path` - Path to the pth file.
/// * `key` - Optional key to retrieve `state_dict` from the pth file. Sometimes the pth file
/// contains multiple objects and the state_dict is the one we are interested in.
pub fn read_all_with_key<P: AsRef<std::path::Path>>(
path: P,
key: Option<&str>,
) -> Result<Vec<(String, Tensor)>> {
let pth = PthTensors::new(path, key)?;
let tensor_names = pth.tensor_infos.keys();
let mut tensors = Vec::with_capacity(tensor_names.len());
for name in tensor_names {
@ -741,3 +804,11 @@ pub fn read_all<P: AsRef<std::path::Path>>(path: P) -> Result<Vec<(String, Tenso
}
Ok(tensors)
}
/// Read all the tensors from a PyTorch pth file.
///
/// # Arguments
/// * `path` - Path to the pth file.
pub fn read_all<P: AsRef<std::path::Path>>(path: P) -> Result<Vec<(String, Tensor)>> {
read_all_with_key(path, None)
}
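A usage sketch of the new key-aware loaders; file names and the dict key are illustrative only:

use candle_core::pickle;

fn load_checkpoints() -> candle_core::Result<()> {
    // Checkpoint that is directly a state_dict.
    let plain = pickle::read_all("model.pt")?;
    // Checkpoint of the form {"model_state_dict": {...}}: pass the key so the
    // inner state_dict gets extracted before the tensors are listed.
    let nested = pickle::read_all_with_key("checkpoint.pt", Some("model_state_dict"))?;
    println!("{} tensors / {} tensors", plain.len(), nested.len());
    Ok(())
}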

View File

@ -0,0 +1,43 @@
#![allow(unused)]
use super::GgmlDType;
use crate::{Error, MetalDevice, MetalStorage, Result};
pub struct QMetalStorage {
dtype: GgmlDType,
device: MetalDevice,
}
impl QMetalStorage {
pub fn zeros(_: &MetalDevice, _: usize, _: GgmlDType) -> Result<Self> {
Err(Error::NotCompiledWithMetalSupport)
}
pub fn dtype(&self) -> GgmlDType {
self.dtype
}
pub fn device(&self) -> &MetalDevice {
&self.device
}
pub fn dequantize(&self, _elem_count: usize) -> Result<MetalStorage> {
Err(Error::NotCompiledWithMetalSupport)
}
pub fn quantize(&mut self, _src: &MetalStorage) -> Result<()> {
Err(Error::NotCompiledWithMetalSupport)
}
pub fn storage_size_in_bytes(&self) -> usize {
0
}
pub fn fwd(
&self,
_self_shape: &crate::Shape,
_storage: &MetalStorage,
_layout: &crate::Layout,
) -> Result<(MetalStorage, crate::Shape)> {
Err(Error::NotCompiledWithMetalSupport)
}
}

View File

@ -233,6 +233,7 @@ pub struct Content {
pub hparams: HParams,
pub vocab: Vocab,
pub tensors: HashMap<String, super::QTensor>,
pub device: Device,
}
impl Content {
@ -252,11 +253,13 @@ impl Content {
let (name, tensor) = read_one_tensor(reader, magic, device)?;
tensors.insert(name, tensor);
}
let device = device.clone();
Ok(Self {
magic,
hparams,
vocab,
tensors,
device,
})
}

View File

@ -1545,13 +1545,13 @@ impl GgmlType for BlockQ5K {
let d2 = d * sc as f32;
let m2 = min * m as f32;
for (ql, qh) in ql.iter().zip(qh) {
let to_add = if qh & u1 != 0 { 16 } else { 1 };
y[ys_index] = d1 * ((ql & 0xF) + to_add) as f32 - m1;
let to_add = if qh & u1 != 0 { 16f32 } else { 0f32 };
y[ys_index] = d1 * ((ql & 0xF) as f32 + to_add) - m1;
ys_index += 1;
}
for (ql, qh) in ql.iter().zip(qh) {
let to_add = if qh & u2 != 0 { 16 } else { 1 };
y[ys_index] = d2 * ((ql >> 4) + to_add) as f32 - m2;
let to_add = if qh & u2 != 0 { 16f32 } else { 0f32 };
y[ys_index] = d2 * ((ql >> 4) as f32 + to_add) - m2;
ys_index += 1;
}
is += 2;
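In isolation, the corrected per-value dequantization looks like this (a sketch with hypothetical names, mirroring the fix above):

// One Q5_K value: the low nibble gives q in [0, 15], the extra high bit adds
// 16, and the result is scaled by d and shifted by the block minimum m.
fn dequant_q5(ql: u8, high_bit_set: bool, d: f32, m: f32) -> f32 {
    let q = (ql & 0xF) as f32 + if high_bit_set { 16f32 } else { 0f32 };
    d * q - m
}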

View File

@ -1,5 +1,6 @@
use super::{GgmlDType, QStorage};
use crate::{DType, MetalDevice, MetalStorage, Result};
use crate::backend::BackendStorage;
use crate::{DType, MetalDevice, MetalStorage, Result, Shape};
use metal::Buffer;
use std::sync::Arc;
@ -10,20 +11,26 @@ pub struct QMetalStorage {
}
impl QMetalStorage {
pub fn zeros(device: &MetalDevice, elem_count: usize, dtype: GgmlDType) -> Result<Self> {
let size = elem_count * dtype.type_size() / dtype.block_size();
let buffer = device.allocate_zeros(size)?;
Ok(Self {
buffer,
device: device.clone(),
dtype,
})
}
pub fn dtype(&self) -> GgmlDType {
self.dtype
}
pub fn buffer(&self) -> &Buffer {
&self.buffer
pub fn device(&self) -> &MetalDevice {
&self.device
}
pub fn new(buffer: Arc<Buffer>, device: MetalDevice, dtype: GgmlDType) -> Self {
Self {
device,
buffer,
dtype,
}
pub fn buffer(&self) -> &Buffer {
&self.buffer
}
pub fn dequantize(&self, elem_count: usize) -> Result<MetalStorage> {
@ -32,9 +39,7 @@ impl QMetalStorage {
command_buffer.set_label("to_cpu");
let blit = command_buffer.new_blit_command_encoder();
blit.set_label("blit_to_cpu");
// blit.wait_for_fence(&self.device.fence());
blit.copy_from_buffer(&self.buffer, 0, &buffer, 0, self.buffer.length());
// blit.update_fence(&self.device.fence());
blit.end_encoding();
self.device.wait_until_completed()?;
let mut out = vec![0.0; elem_count];
@ -132,6 +137,59 @@ impl QMetalStorage {
self.buffer = buffer;
Ok(())
}
pub fn storage_size_in_bytes(&self) -> usize {
self.buffer.length() as usize
}
pub fn fwd(
&self,
self_shape: &Shape,
storage: &MetalStorage,
layout: &crate::Layout,
) -> Result<(MetalStorage, Shape)> {
use crate::MetalError;
if !layout.is_contiguous() {
crate::bail!("input tensor is not contiguous {layout:?}")
}
let src_shape = layout.shape();
// self is transposed so n is first then k.
if src_shape.rank() < 2 {
crate::bail!("input tensor has only one dimension {layout:?}")
}
let (n, k) = self_shape.dims2()?;
let mut dst_shape = src_shape.dims().to_vec();
let (b, m) = match dst_shape.len() {
3 => (dst_shape[0], dst_shape[1]),
2 => (1, dst_shape[0]),
n => crate::bail!("Invalid rank {n} for quantized matmul metal"),
};
let last_k = dst_shape.pop().unwrap();
if last_k != k {
crate::bail!("input tensor {layout:?} incompatible with {:?}", self_shape)
}
dst_shape.push(n);
let dst_shape = Shape::from(dst_shape);
let device = storage.device().clone();
let dst = device.new_buffer(dst_shape.elem_count(), DType::F32, "qmatmul")?;
let command_buffer = device.command_buffer()?;
candle_metal_kernels::call_quantized_matmul_t(
device.device(),
&command_buffer,
device.kernels(),
self.dtype.into(),
(b, m, n, k),
storage.buffer(),
layout.start_offset() * storage.dtype().size_in_bytes(),
&self.buffer,
&dst,
)
.map_err(MetalError::from)?;
let dst_storage = crate::MetalStorage::new(dst, device, DType::F32);
Ok((dst_storage, dst_shape))
}
}
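Shape-wise, the kernel call above takes a quantized weight stored transposed as (n, k) and an activation of shape (b, m, k) or (m, k), producing (b, m, n) / (m, n). A tiny helper sketch of that bookkeeping (hypothetical, not part of the change):

fn qmatmul_out_shape(weight_nk: (usize, usize), input_dims: &[usize]) -> Vec<usize> {
    let (n, k) = weight_nk;
    assert_eq!(*input_dims.last().unwrap(), k, "inner dimensions must match");
    let mut out = input_dims.to_vec();
    *out.last_mut().unwrap() = n;
    out
}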
pub fn load_quantized_metal<T: super::GgmlType + Send + Sync + 'static>(
@ -153,3 +211,24 @@ fn read_to_vec<T: Clone>(buffer: &Buffer, n: usize) -> Vec<T> {
let slice = unsafe { std::slice::from_raw_parts(ptr, n) };
slice.to_vec()
}
impl From<GgmlDType> for candle_metal_kernels::GgmlDType {
fn from(value: GgmlDType) -> Self {
match value {
GgmlDType::Q4_0 => candle_metal_kernels::GgmlDType::Q4_0,
GgmlDType::Q4_1 => candle_metal_kernels::GgmlDType::Q4_1,
GgmlDType::Q5_0 => candle_metal_kernels::GgmlDType::Q5_0,
GgmlDType::Q5_1 => candle_metal_kernels::GgmlDType::Q5_1,
GgmlDType::Q8_0 => candle_metal_kernels::GgmlDType::Q8_0,
GgmlDType::Q8_1 => candle_metal_kernels::GgmlDType::Q8_1,
GgmlDType::Q2K => candle_metal_kernels::GgmlDType::Q2K,
GgmlDType::Q3K => candle_metal_kernels::GgmlDType::Q3K,
GgmlDType::Q4K => candle_metal_kernels::GgmlDType::Q4K,
GgmlDType::Q5K => candle_metal_kernels::GgmlDType::Q5K,
GgmlDType::Q6K => candle_metal_kernels::GgmlDType::Q6K,
GgmlDType::Q8K => candle_metal_kernels::GgmlDType::Q8K,
GgmlDType::F16 => candle_metal_kernels::GgmlDType::F16,
GgmlDType::F32 => candle_metal_kernels::GgmlDType::F32,
}
}
}

View File

@ -1,16 +1,19 @@
#[cfg(feature = "metal")]
use crate::{backend::BackendStorage, DType};
use crate::{CpuStorage, Device, Result, Shape, Storage, Tensor};
use k_quants::*;
use std::borrow::Cow;
#[cfg(target_feature = "avx")]
pub mod avx;
mod dummy_metal;
pub mod ggml_file;
pub mod gguf_file;
pub mod k_quants;
#[cfg(feature = "metal")]
pub mod metal;
#[cfg(not(feature = "metal"))]
mod metal {
pub use super::dummy_metal::*;
}
#[cfg(target_feature = "neon")]
pub mod neon;
#[cfg(target_feature = "simd128")]
@ -32,19 +35,9 @@ impl Device {
let storage = dtype.cpu_zeros(elem_count);
Ok(QStorage::Cpu(storage))
}
#[cfg(feature = "metal")]
Device::Metal(metal) => {
let size = elem_count * dtype.type_size() / dtype.block_size();
let buffer = metal.allocate_zeros(size)?;
Ok(QStorage::Metal(metal::QMetalStorage::new(
buffer,
metal.clone(),
dtype,
)))
}
#[cfg(not(feature = "metal"))]
Device::Metal(_metal) => {
crate::bail!("Metal feature not activated");
let storage = metal::QMetalStorage::zeros(metal, elem_count, dtype)?;
Ok(QStorage::Metal(storage))
}
Device::Cuda(_cuda) => {
crate::bail!("Cuda ggml quantization not supported");
@ -55,7 +48,6 @@ impl Device {
pub enum QStorage {
Cpu(Box<dyn QuantizedType>),
#[cfg(feature = "metal")]
Metal(metal::QMetalStorage),
}
@ -63,7 +55,6 @@ impl QStorage {
fn block_size(&self) -> usize {
match self {
QStorage::Cpu(storage) => storage.block_size(),
#[cfg(feature = "metal")]
QStorage::Metal(storage) => storage.dtype().block_size(),
}
}
@ -71,16 +62,21 @@ impl QStorage {
fn dtype(&self) -> GgmlDType {
match self {
QStorage::Cpu(storage) => storage.dtype(),
#[cfg(feature = "metal")]
QStorage::Metal(storage) => storage.dtype(),
}
}
fn device(&self) -> Device {
match self {
QStorage::Cpu(_storage) => Device::Cpu,
QStorage::Metal(storage) => Device::Metal(storage.device().clone()),
}
}
fn size_in_bytes(&self) -> usize {
match self {
QStorage::Cpu(storage) => storage.storage_size_in_bytes(),
#[cfg(feature = "metal")]
QStorage::Metal(storage) => storage.buffer().length() as usize,
QStorage::Metal(storage) => storage.storage_size_in_bytes(),
}
}
@ -89,7 +85,6 @@ impl QStorage {
(QStorage::Cpu(storage), Storage::Cpu(src)) => {
storage.from_float(src.as_slice::<f32>()?)?;
}
#[cfg(feature = "metal")]
(QStorage::Metal(storage), Storage::Metal(src)) => storage.quantize(src)?,
_ => crate::bail!("Invalid dequantize storage locations do not match"),
}
@ -99,7 +94,6 @@ impl QStorage {
fn dequantize(&self, elem_count: usize) -> Result<Storage> {
match self {
QStorage::Cpu(storage) => Ok(Storage::Cpu(storage.dequantize(elem_count)?)),
#[cfg(feature = "metal")]
QStorage::Metal(storage) => Ok(Storage::Metal(storage.dequantize(elem_count)?)),
}
}
@ -112,7 +106,6 @@ impl QStorage {
let data = unsafe { std::slice::from_raw_parts(data_ptr, size_in_bytes) };
Ok(Cow::from(data))
}
#[cfg(feature = "metal")]
QStorage::Metal(_storage) => {
crate::bail!("not implemented");
}
@ -336,6 +329,10 @@ impl QTensor {
self.storage.dtype()
}
pub fn device(&self) -> Device {
self.storage.device()
}
pub fn rank(&self) -> usize {
self.shape.rank()
}
@ -427,8 +424,7 @@ impl crate::CustomOp1 for QTensor {
#[allow(clippy::infallible_destructuring_match)]
let self_storage = match &self.storage {
QStorage::Cpu(storage) => storage,
#[cfg(feature = "metal")]
_ => crate::bail!("Invalid storage"),
QStorage::Metal(_) => crate::bail!("Invalid storage"),
};
let slice = storage.as_slice::<f32>()?;
let slice = &slice[layout.start_offset()..layout.start_offset() + src_shape.elem_count()];
@ -437,79 +433,16 @@ impl crate::CustomOp1 for QTensor {
Ok((crate::CpuStorage::F32(dst_storage), dst_shape))
}
#[cfg(feature = "metal")]
fn metal_fwd(
&self,
storage: &crate::MetalStorage,
layout: &crate::Layout,
) -> Result<(crate::MetalStorage, Shape)> {
use crate::MetalError;
if !layout.is_contiguous() {
crate::bail!("input tensor is not contiguous {layout:?}")
}
let src_shape = layout.shape();
// self is transposed so n is first then k.
if src_shape.rank() < 2 {
crate::bail!("input tensor has only one dimension {layout:?}")
}
let (n, k) = self.shape.dims2()?;
let mut dst_shape = src_shape.dims().to_vec();
let (b, m) = match dst_shape.len() {
3 => (dst_shape[0], dst_shape[1]),
2 => (1, dst_shape[0]),
n => crate::bail!("Invalid rank {n} for quantized matmul metal"),
};
let last_k = dst_shape.pop().unwrap();
if last_k != k {
crate::bail!("input tensor {layout:?} incompatible with {:?}", self.shape)
}
dst_shape.push(n);
let dst_shape = Shape::from(dst_shape);
let device = storage.device().clone();
let dst = device.new_buffer(dst_shape.elem_count(), DType::F32, "qmatmul")?;
let (buffer, dtype) = match &self.storage {
QStorage::Metal(metal) => (metal.buffer(), metal.dtype()),
let self_storage = match &self.storage {
QStorage::Metal(metal) => metal,
_ => unreachable!("Cannot call metal matmul on non metal QTensor"),
};
let command_buffer = device.command_buffer()?;
candle_metal_kernels::call_quantized_matmul_t(
device.device(),
&command_buffer,
device.kernels(),
dtype.into(),
(b, m, n, k),
storage.buffer(),
layout.start_offset() * storage.dtype().size_in_bytes(),
buffer,
&dst,
)
.map_err(MetalError::from)?;
let dst_storage = crate::MetalStorage::new(dst, device, DType::F32);
Ok((dst_storage, dst_shape))
}
}
#[cfg(feature = "metal")]
impl From<GgmlDType> for candle_metal_kernels::GgmlDType {
fn from(value: GgmlDType) -> Self {
match value {
GgmlDType::Q4_0 => candle_metal_kernels::GgmlDType::Q4_0,
GgmlDType::Q4_1 => candle_metal_kernels::GgmlDType::Q4_1,
GgmlDType::Q5_0 => candle_metal_kernels::GgmlDType::Q5_0,
GgmlDType::Q5_1 => candle_metal_kernels::GgmlDType::Q5_1,
GgmlDType::Q8_0 => candle_metal_kernels::GgmlDType::Q8_0,
GgmlDType::Q8_1 => candle_metal_kernels::GgmlDType::Q8_1,
GgmlDType::Q2K => candle_metal_kernels::GgmlDType::Q2K,
GgmlDType::Q3K => candle_metal_kernels::GgmlDType::Q3K,
GgmlDType::Q4K => candle_metal_kernels::GgmlDType::Q4K,
GgmlDType::Q5K => candle_metal_kernels::GgmlDType::Q5K,
GgmlDType::Q6K => candle_metal_kernels::GgmlDType::Q6K,
GgmlDType::Q8K => candle_metal_kernels::GgmlDType::Q8K,
GgmlDType::F16 => candle_metal_kernels::GgmlDType::F16,
GgmlDType::F32 => candle_metal_kernels::GgmlDType::F32,
}
self_storage.fwd(&self.shape, storage, layout)
}
}

View File

@ -12,6 +12,14 @@ use core::arch::arm::*;
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
#[inline(always)]
unsafe fn vdotq_s32(a: int8x16_t, b: int8x16_t) -> int32x4_t {
// TODO: dotprod
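// Portable fallback while the Arm dotprod intrinsics are unavailable: widen the
// i8 lanes with vmull_s8, then pairwise-add the i16 products into i32 lanes.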
let p0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
let p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))
}
#[inline(always)]
pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) -> Result<f32> {
let qk = QK8_0;
@ -43,15 +51,8 @@ pub(crate) fn vec_dot_q4_0_q8_0(n: usize, xs: &[BlockQ4_0], ys: &[BlockQ8_0]) ->
let v1_0l = vld1q_s8(y0.qs.as_ptr());
let v1_0h = vld1q_s8(y0.qs.as_ptr().add(16));
// TODO: Support dotprod when it's available outside of nightly.
let pl0l = vmull_s8(vget_low_s8(v0_0ls), vget_low_s8(v1_0l));
let pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
let ph0l = vmull_s8(vget_low_s8(v0_0hs), vget_low_s8(v1_0h));
let ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
let pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
let ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
let pl0 = vdotq_s32(v0_0ls, v1_0l);
let ph0 = vdotq_s32(v0_0hs, v1_0h);
sumv0 = vmlaq_n_f32(
sumv0,
vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
@ -82,14 +83,8 @@ pub(crate) fn vec_dot_q8_0_q8_0(n: usize, xs: &[BlockQ8_0], ys: &[BlockQ8_0]) ->
let y0_0 = vld1q_s8(y0.qs.as_ptr());
let y0_1 = vld1q_s8(y0.qs.as_ptr().add(16));
// TODO dotprod once this is the intrinsics are.
let p0_0 = vmull_s8(vget_low_s8(x0_0), vget_low_s8(y0_0));
let p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
let p0_2 = vmull_s8(vget_low_s8(x0_1), vget_low_s8(y0_1));
let p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
let p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
let p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
let p0 = vdotq_s32(x0_0, y0_0);
let p1 = vdotq_s32(x0_1, y0_1);
sumv0 = vmlaq_n_f32(
sumv0,
@ -118,10 +113,7 @@ pub(crate) fn vec_dot_q8k_q8k(n: usize, xs: &[BlockQ8K], ys: &[BlockQ8K]) -> Res
for i in (0..QK_K).step_by(16) {
let xs = vld1q_s8(xs.add(i));
let ys = vld1q_s8(ys.add(i));
let xy_lo = vmull_s8(vget_low_s8(xs), vget_low_s8(ys));
let xy_up = vmull_s8(vget_high_s8(xs), vget_high_s8(ys));
let xy = vaddq_s32(vpaddlq_s16(xy_lo), vpaddlq_s16(xy_up));
let xy = vdotq_s32(xs, ys);
sum_i = vaddq_s32(sum_i, xy)
}
sumf += vaddvq_s32(sum_i) as f32 * scale
@ -191,30 +183,16 @@ pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Res
let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.2, m4b), q6h_2));
let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.3, m4b), q6h_3));
// TODO: dotprod
let p0 = vaddq_s16(
vmull_s8(vget_low_s8(q6bytes_0), vget_low_s8(q8bytes.0)),
vmull_s8(vget_high_s8(q6bytes_0), vget_high_s8(q8bytes.0)),
);
let p1 = vaddq_s16(
vmull_s8(vget_low_s8(q6bytes_1), vget_low_s8(q8bytes.1)),
vmull_s8(vget_high_s8(q6bytes_1), vget_high_s8(q8bytes.1)),
);
let p0 = vdotq_s32(q6bytes_0, q8bytes.0);
let p1 = vdotq_s32(q6bytes_1, q8bytes.1);
let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
isum += vaddvq_s16(p0) as i32 * scale0 + vaddvq_s16(p1) as i32 * scale1;
isum += vaddvq_s32(p0) * scale0 + vaddvq_s32(p1) * scale1;
scale = scale.add(2);
let p2 = vaddq_s16(
vmull_s8(vget_low_s8(q6bytes_2), vget_low_s8(q8bytes.2)),
vmull_s8(vget_high_s8(q6bytes_2), vget_high_s8(q8bytes.2)),
);
let p3 = vaddq_s16(
vmull_s8(vget_low_s8(q6bytes_3), vget_low_s8(q8bytes.3)),
vmull_s8(vget_high_s8(q6bytes_3), vget_high_s8(q8bytes.3)),
);
let p2 = vdotq_s32(q6bytes_2, q8bytes.2);
let p3 = vdotq_s32(q6bytes_3, q8bytes.3);
let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
isum += vaddvq_s16(p2) as i32 * scale0 + vaddvq_s16(p3) as i32 * scale1;
isum += vaddvq_s32(p2) * scale0 + vaddvq_s32(p3) * scale1;
scale = scale.add(2);
let q8bytes = vld1q_s8_x4(q8);
@ -234,29 +212,16 @@ pub(crate) fn vec_dot_q6k_q8k(n: usize, xs: &[BlockQ6K], ys: &[BlockQ8K]) -> Res
let q6bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.2, 4), q6h_2));
let q6bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.3, 4), q6h_3));
// TODO: dotprod case.
let p0 = vaddq_s16(
vmull_s8(vget_low_s8(q6bytes_0), vget_low_s8(q8bytes.0)),
vmull_s8(vget_high_s8(q6bytes_0), vget_high_s8(q8bytes.0)),
);
let p1 = vaddq_s16(
vmull_s8(vget_low_s8(q6bytes_1), vget_low_s8(q8bytes.1)),
vmull_s8(vget_high_s8(q6bytes_1), vget_high_s8(q8bytes.1)),
);
let p0 = vdotq_s32(q6bytes_0, q8bytes.0);
let p1 = vdotq_s32(q6bytes_1, q8bytes.1);
let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
isum += vaddvq_s16(p0) as i32 * scale0 + vaddvq_s16(p1) as i32 * scale1;
isum += vaddvq_s32(p0) * scale0 + vaddvq_s32(p1) * scale1;
scale = scale.add(2);
let p2 = vaddq_s16(
vmull_s8(vget_low_s8(q6bytes_2), vget_low_s8(q8bytes.2)),
vmull_s8(vget_high_s8(q6bytes_2), vget_high_s8(q8bytes.2)),
);
let p3 = vaddq_s16(
vmull_s8(vget_low_s8(q6bytes_3), vget_low_s8(q8bytes.3)),
vmull_s8(vget_high_s8(q6bytes_3), vget_high_s8(q8bytes.3)),
);
let p2 = vdotq_s32(q6bytes_2, q8bytes.2);
let p3 = vdotq_s32(q6bytes_3, q8bytes.3);
let (scale0, scale1) = (*scale as i32, *scale.add(1) as i32);
isum += vaddvq_s16(p2) as i32 * scale0 + vaddvq_s16(p3) as i32 * scale1;
isum += vaddvq_s32(p2) * scale0 + vaddvq_s32(p3) * scale1;
scale = scale.add(2);
}
sum += d_all * y.d * ((isum - 32 * isum_mins) as f32);
@ -333,28 +298,14 @@ pub(crate) fn vec_dot_q5k_q8k(n: usize, xs: &[BlockQ5K], ys: &[BlockQ8K]) -> Res
let q5bytes_2 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.0, 4), q5h_2));
let q5bytes_3 = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.1, 4), q5h_3));
// TODO: dotprod
let p0 = vaddq_s16(
vmull_s8(vget_low_s8(q5bytes_0), vget_low_s8(q8bytes.0)),
vmull_s8(vget_high_s8(q5bytes_0), vget_high_s8(q8bytes.0)),
);
let p1 = vaddq_s16(
vmull_s8(vget_low_s8(q5bytes_1), vget_low_s8(q8bytes.1)),
vmull_s8(vget_high_s8(q5bytes_1), vget_high_s8(q8bytes.1)),
);
sumi += vaddvq_s16(vaddq_s16(p0, p1)) as i32 * *scales as i32;
let p0 = vdotq_s32(q5bytes_0, q8bytes.0);
let p1 = vdotq_s32(q5bytes_1, q8bytes.1);
sumi += vaddvq_s32(vaddq_s32(p0, p1)) * *scales as i32;
scales = scales.add(1);
let p2 = vaddq_s16(
vmull_s8(vget_low_s8(q5bytes_2), vget_low_s8(q8bytes.2)),
vmull_s8(vget_high_s8(q5bytes_2), vget_high_s8(q8bytes.2)),
);
let p3 = vaddq_s16(
vmull_s8(vget_low_s8(q5bytes_3), vget_low_s8(q8bytes.3)),
vmull_s8(vget_high_s8(q5bytes_3), vget_high_s8(q8bytes.3)),
);
sumi += vaddvq_s16(vaddq_s16(p2, p3)) as i32 * *scales as i32;
let p2 = vdotq_s32(q5bytes_2, q8bytes.2);
let p3 = vdotq_s32(q5bytes_3, q8bytes.3);
sumi += vaddvq_s32(vaddq_s32(p2, p3)) * *scales as i32;
scales = scales.add(1);
}
sumf += d * sumi as f32 - dmin * sumi_mins as f32;
@ -417,22 +368,15 @@ pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Res
for j in 0..QK_K / 64 {
let q4bits = vld1q_u8_x2(q4);
q4 = q4.add(32);
// TODO: dotprod
let q8bytes = vld1q_s8_x2(q8);
q8 = q8.add(32);
let q4bytes = int8x16x2_t(
vreinterpretq_s8_u8(vandq_u8(q4bits.0, m4b)),
vreinterpretq_s8_u8(vandq_u8(q4bits.1, m4b)),
);
let p0 = vaddq_s16(
vmull_s8(vget_low_s8(q4bytes.0), vget_low_s8(q8bytes.0)),
vmull_s8(vget_high_s8(q4bytes.0), vget_high_s8(q8bytes.0)),
);
let p1 = vaddq_s16(
vmull_s8(vget_low_s8(q4bytes.1), vget_low_s8(q8bytes.1)),
vmull_s8(vget_high_s8(q4bytes.1), vget_high_s8(q8bytes.1)),
);
sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) as i32 * scales[2 * j] as i32;
let p0 = vdotq_s32(q4bytes.0, q8bytes.0);
let p1 = vdotq_s32(q4bytes.1, q8bytes.1);
sumi1 += vaddvq_s32(vaddq_s32(p0, p1)) * scales[2 * j] as i32;
let q8bytes = vld1q_s8_x2(q8);
q8 = q8.add(32);
@ -440,15 +384,9 @@ pub(crate) fn vec_dot_q4k_q8k(n: usize, xs: &[BlockQ4K], ys: &[BlockQ8K]) -> Res
vreinterpretq_s8_u8(vshrq_n_u8(q4bits.0, 4)),
vreinterpretq_s8_u8(vshrq_n_u8(q4bits.1, 4)),
);
let p2 = vaddq_s16(
vmull_s8(vget_low_s8(q4bytes.0), vget_low_s8(q8bytes.0)),
vmull_s8(vget_high_s8(q4bytes.0), vget_high_s8(q8bytes.0)),
);
let p3 = vaddq_s16(
vmull_s8(vget_low_s8(q4bytes.1), vget_low_s8(q8bytes.1)),
vmull_s8(vget_high_s8(q4bytes.1), vget_high_s8(q8bytes.1)),
);
sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) as i32 * scales[2 * j + 1] as i32;
let p2 = vdotq_s32(q4bytes.0, q8bytes.0);
let p3 = vdotq_s32(q4bytes.1, q8bytes.1);
sumi2 += vaddvq_s32(vaddq_s32(p2, p3)) * scales[2 * j + 1] as i32;
}
sumf += d * (sumi1 + sumi2) as f32;
}
@ -526,27 +464,14 @@ pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Res
vreinterpretq_s8_u8(q3h_3),
);
// TODO: dotprod
let p0 = vaddq_s16(
vmull_s8(vget_low_s8(q3bytes_0), vget_low_s8(q8bytes_1.0)),
vmull_s8(vget_high_s8(q3bytes_0), vget_high_s8(q8bytes_1.0)),
);
let p1 = vaddq_s16(
vmull_s8(vget_low_s8(q3bytes_1), vget_low_s8(q8bytes_1.1)),
vmull_s8(vget_high_s8(q3bytes_1), vget_high_s8(q8bytes_1.1)),
);
let p2 = vaddq_s16(
vmull_s8(vget_low_s8(q3bytes_2), vget_low_s8(q8bytes_1.2)),
vmull_s8(vget_high_s8(q3bytes_2), vget_high_s8(q8bytes_1.2)),
);
let p3 = vaddq_s16(
vmull_s8(vget_low_s8(q3bytes_3), vget_low_s8(q8bytes_1.3)),
vmull_s8(vget_high_s8(q3bytes_3), vget_high_s8(q8bytes_1.3)),
);
isum += vaddvq_s16(p0) as i32 * *scale as i32
+ vaddvq_s16(p1) as i32 * *scale.add(1) as i32
+ vaddvq_s16(p2) as i32 * *scale.add(2) as i32
+ vaddvq_s16(p3) as i32 * *scale.add(3) as i32;
let p0 = vdotq_s32(q3bytes_0, q8bytes_1.0);
let p1 = vdotq_s32(q3bytes_1, q8bytes_1.1);
let p2 = vdotq_s32(q3bytes_2, q8bytes_1.2);
let p3 = vdotq_s32(q3bytes_3, q8bytes_1.3);
isum += vaddvq_s32(p0) * *scale as i32
+ vaddvq_s32(p1) * *scale.add(1) as i32
+ vaddvq_s32(p2) * *scale.add(2) as i32
+ vaddvq_s32(p3) * *scale.add(3) as i32;
scale = scale.add(4);
let q3h_0 = vbicq_u8(m2, qhbits.0);
@ -571,27 +496,14 @@ pub(crate) fn vec_dot_q3k_q8k(n: usize, xs: &[BlockQ3K], ys: &[BlockQ8K]) -> Res
vreinterpretq_s8_u8(q3h_3),
);
// TODO: dotprod
let p0 = vaddq_s16(
vmull_s8(vget_low_s8(q3bytes_0), vget_low_s8(q8bytes_2.0)),
vmull_s8(vget_high_s8(q3bytes_0), vget_high_s8(q8bytes_2.0)),
);
let p1 = vaddq_s16(
vmull_s8(vget_low_s8(q3bytes_1), vget_low_s8(q8bytes_2.1)),
vmull_s8(vget_high_s8(q3bytes_1), vget_high_s8(q8bytes_2.1)),
);
let p2 = vaddq_s16(
vmull_s8(vget_low_s8(q3bytes_2), vget_low_s8(q8bytes_2.2)),
vmull_s8(vget_high_s8(q3bytes_2), vget_high_s8(q8bytes_2.2)),
);
let p3 = vaddq_s16(
vmull_s8(vget_low_s8(q3bytes_3), vget_low_s8(q8bytes_2.3)),
vmull_s8(vget_high_s8(q3bytes_3), vget_high_s8(q8bytes_2.3)),
);
isum += vaddvq_s16(p0) as i32 * *scale as i32
+ vaddvq_s16(p1) as i32 * *scale.add(1) as i32
+ vaddvq_s16(p2) as i32 * *scale.add(2) as i32
+ vaddvq_s16(p3) as i32 * *scale.add(3) as i32;
let p0 = vdotq_s32(q3bytes_0, q8bytes_2.0);
let p1 = vdotq_s32(q3bytes_1, q8bytes_2.1);
let p2 = vdotq_s32(q3bytes_2, q8bytes_2.2);
let p3 = vdotq_s32(q3bytes_3, q8bytes_2.3);
isum += vaddvq_s32(p0) * *scale as i32
+ vaddvq_s32(p1) * *scale.add(1) as i32
+ vaddvq_s32(p2) * *scale.add(2) as i32
+ vaddvq_s32(p3) * *scale.add(3) as i32;
scale = scale.add(4);
if j == 0 {
@ -649,7 +561,6 @@ pub(crate) fn vec_dot_q2k_q8k(n: usize, xs: &[BlockQ2K], ys: &[BlockQ8K]) -> Res
let mut is = 0usize;
// TODO: dotprod
for _j in 0..QK_K / 128 {
let q2bits = vld1q_u8_x2(q2);
q2 = q2.add(32);
@ -696,14 +607,7 @@ unsafe fn multiply_accum_with_scale(
q2bytes: int8x16x2_t,
q8bytes: int8x16x2_t,
) -> i32 {
let p1 = vaddq_s16(
vmull_s8(vget_low_s8(q2bytes.0), vget_low_s8(q8bytes.0)),
vmull_s8(vget_high_s8(q2bytes.0), vget_high_s8(q8bytes.0)),
);
let p2 = vaddq_s16(
vmull_s8(vget_low_s8(q2bytes.1), vget_low_s8(q8bytes.1)),
vmull_s8(vget_high_s8(q2bytes.1), vget_high_s8(q8bytes.1)),
);
vaddvq_s16(p1) as i32 * aux[is + index] as i32
+ vaddvq_s16(p2) as i32 * aux[is + 1 + index] as i32
let p1 = vdotq_s32(q2bytes.0, q8bytes.0);
let p2 = vdotq_s32(q2bytes.1, q8bytes.1);
vaddvq_s32(p1) * aux[is + index] as i32 + vaddvq_s32(p2) * aux[is + 1 + index] as i32
}

View File

@ -426,9 +426,7 @@ impl Tensor {
if buffer_size != shape.elem_count() {
return Err(Error::ShapeMismatch { buffer_size, shape }.bt());
}
// println!("from vec {buffer_size}");
let storage = device.storage_owned(data)?;
// println!("Created storage");
let none = BackpropOp::none();
Ok(from_storage(storage, shape, none, is_variable))
}
@ -806,6 +804,35 @@ impl Tensor {
}
}
/// Roll the tensor input along the given dimension.
/// Elements that are shifted beyond the last position are re-introduced at the first position.
///
/// ```rust
/// # use candle_core::{Tensor, Device};
/// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
/// let tensor = tensor.roll(1, 0)?;
/// assert_eq!(tensor.to_vec2::<f32>()?, &[[4., 5.], [0., 1.], [2., 3.]]);
/// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
/// let tensor = tensor.roll(-1, 0)?;
/// assert_eq!(tensor.to_vec2::<f32>()?, &[[2., 3.], [4., 5.], [0., 1.]]);
/// # Ok::<(), candle_core::Error>(())
/// ```
pub fn roll<D>(&self, shift: i32, dim: D) -> Result<Self>
where
D: Dim + Clone,
{
let dim = dim.to_index(self.shape(), "roll")?;
let dim_size = self.dim(dim)?;
let shift = shift.rem_euclid(dim_size as i32) as usize;
if shift == 0 {
Ok(self.clone())
} else {
let a = self.narrow(dim, 0, dim_size - shift)?;
let b = self.narrow(dim, dim_size - shift, shift)?;
Tensor::cat(&[&b, &a], dim)
}
}
/// Returns the sum of all elements in the input tensor. The sum is performed over all the
/// input dimensions.
///
@ -1855,9 +1882,9 @@ impl Tensor {
/// this new node. The storage of this tensor is shared with the initial tensor.
///
/// If the tensor is already detached from the computation graph, the same tensor is returned.
pub fn detach(&self) -> Result<Tensor> {
pub fn detach(&self) -> Tensor {
if self.op.is_none() && !self.is_variable {
Ok(self.clone())
self.clone()
} else {
let tensor_ = Tensor_ {
id: TensorId::new(),
@ -1868,7 +1895,7 @@ impl Tensor {
dtype: self.dtype,
device: self.device.clone(),
};
Ok(Tensor(Arc::new(tensor_)))
Tensor(Arc::new(tensor_))
}
}
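A minimal sketch of the updated call shape, assuming a caller that previously used the `?` operator:

use candle_core::Tensor;

fn freeze(t: &Tensor) -> Tensor {
    // `detach` is now infallible, so the old `t.detach()?` becomes a plain call.
    t.detach()
}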
@ -2580,11 +2607,21 @@ impl Tensor {
}
/// Returns log(sum(exp(tensor), dim)).
pub fn logsumexp<D: Dims>(&self, sum_dims: D) -> Result<Self> {
pub fn log_sum_exp<D: Dims>(&self, sum_dims: D) -> Result<Self> {
let exp = self.exp()?;
let sum = exp.sum(sum_dims)?;
sum.log()
}
/// Pointwise pow operation.
pub fn pow(&self, rhs: &Tensor) -> Result<Self> {
rhs.mul(&self.log()?)?.exp()
}
/// Broadcasting version of `pow`.
pub fn broadcast_pow(&self, rhs: &Tensor) -> Result<Self> {
rhs.broadcast_mul(&self.log()?)?.exp()
}
}
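Since `pow` is computed as exp(rhs * ln(self)), the base has to be strictly positive; a short sketch with illustrative values:

use candle_core::{Device, Tensor};

fn pow_example() -> candle_core::Result<()> {
    let base = Tensor::new(&[[1f32, 2., 3.], [4., 5., 6.]], &Device::Cpu)?;
    let exponent = (&base - 2.)?;
    // Element-wise base^exponent; zero or negative bases would go through
    // ln() and yield -inf / NaN rather than an error.
    let powered = base.pow(&exponent)?;
    println!("{powered}");
    Ok(())
}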
macro_rules! bin_trait {

View File

@ -107,6 +107,10 @@ impl Var {
Ok(Self(inner))
}
pub fn as_detached_tensor(&self) -> Tensor {
self.0.detach()
}
pub fn as_tensor(&self) -> &Tensor {
&self.0
}

View File

@ -50,17 +50,15 @@ fn conv1d(dev: &Device) -> Result<()> {
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352]
);
if dev.is_cpu() {
let res = t.conv_transpose1d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
assert_eq!(res.dims(), [1, 2, 7]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[
0.0699, -1.2899, 8.3018, 5.5873, 2.4572, -2.6143, -0.0706, 1.8765, 4.8318, 1.1538,
4.7076, -5.9745, -0.8276, 1.621
],
);
}
let res = t.conv_transpose1d(&w.transpose(0, 1)?, 0, 0, 1, 1)?;
assert_eq!(res.dims(), [1, 2, 7]);
assert_eq!(
test_utils::to_vec1_round(&res.flatten_all()?, 4)?,
[
0.0699, -1.2899, 8.3018, 5.5873, 2.4572, -2.6143, -0.0706, 1.8765, 4.8318, 1.1538,
4.7076, -5.9745, -0.8276, 1.621
],
);
Ok(())
}

Binary file not shown.

candle-core/tests/pth.py Normal file
View File

@ -0,0 +1,37 @@
import torch
from collections import OrderedDict
# Write a trivial tensor to a pt file
a = torch.tensor([[1,2,3,4], [5,6,7,8]])
o = OrderedDict()
o["test"] = a
# Write a trivial tensor to a pt file
torch.save(o, "test.pt")
############################################################################################################
# Write a trivial tensor to a pt file with a key
torch.save({"model_state_dict": o}, "test_with_key.pt")
############################################################################################################
# Create a tensor with fortran contiguous memory layout
import numpy as np
# Step 1: Create a 3D NumPy array with Fortran order using a range of numbers
# For example, creating a 2x3x4 array
array_fortran = np.asfortranarray(np.arange(1, 2*3*4 + 1).reshape(2, 3, 4))
# Verify the memory order
print("Is Fortran contiguous (F order):", array_fortran.flags['F_CONTIGUOUS']) # Should be True
print("Is C contiguous (C order):", array_fortran.flags['C_CONTIGUOUS']) # Should be False
# Step 2: Convert the NumPy array to a PyTorch tensor
tensor_fortran = torch.from_numpy(array_fortran)
# Verify the tensor layout
print("Tensor stride:", tensor_fortran.stride()) # Stride will reflect the Fortran memory layout
# Step 3: Save the PyTorch tensor to a .pth file
torch.save({"tensor_fortran": tensor_fortran}, 'fortran_tensor_3d.pth')
print("3D Tensor saved with Fortran layout.")

View File

@ -0,0 +1,31 @@
/// Regression test for pth files not loading on Windows.
#[test]
fn test_pth() {
let tensors = candle_core::pickle::PthTensors::new("tests/test.pt", None).unwrap();
tensors.get("test").unwrap().unwrap();
}
#[test]
fn test_pth_with_key() {
let tensors =
candle_core::pickle::PthTensors::new("tests/test_with_key.pt", Some("model_state_dict"))
.unwrap();
tensors.get("test").unwrap().unwrap();
}
#[test]
fn test_pth_fortran_contiguous() {
let tensors =
candle_core::pickle::PthTensors::new("tests/fortran_tensor_3d.pth", None).unwrap();
let tensor = tensors.get("tensor_fortran").unwrap().unwrap();
assert_eq!(tensor.dims3().unwrap(), (2, 3, 4));
assert_eq!(
tensor.to_vec3::<i64>().unwrap(),
[
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]
]
);
}

View File

@ -1,4 +1,5 @@
use candle_core::{
bail,
quantized::{self, GgmlDType},
test_device,
test_utils::to_vec2_round,
@ -46,6 +47,10 @@ fn test_matmul(
}
fn quantized_matmul(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let (m, k, n) = (3, 64, 4);
let lhs = (0..(m * k)).map(|v| v as f32).collect::<Vec<_>>();
let tensor_lhs = Tensor::from_slice(&lhs, (m, k), device)?;
@ -100,6 +105,10 @@ fn quantized_matmul(device: &Device) -> Result<()> {
}
fn quantized_matmul_neg(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let (m, k, n) = (3, 64, 4);
let lhs = (0..(m * k))
.map(|v| v as f32 - (m * k) as f32 / 2.0)
@ -169,6 +178,10 @@ test_device!(
);
fn quantize_q4_0(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
let src = Tensor::from_slice(&src, (32 * 4,), device)?;
@ -196,6 +209,10 @@ fn quantize_q4_0(device: &Device) -> Result<()> {
}
fn quantize_q4_1(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
let src = Tensor::from_slice(&src, (32 * 4,), device)?;
let quant = quantized::QTensor::quantize(&src, GgmlDType::Q4_1)?;
@ -222,6 +239,10 @@ fn quantize_q4_1(device: &Device) -> Result<()> {
}
fn quantize_q5_0(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
let src = Tensor::from_slice(&src, (32 * 4,), device)?;
let quant = quantized::QTensor::quantize(&src, GgmlDType::Q5_0)?;
@ -248,6 +269,10 @@ fn quantize_q5_0(device: &Device) -> Result<()> {
}
fn quantize_q5_1(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
let src = Tensor::from_slice(&src, (32 * 4,), device)?;
let quant = quantized::QTensor::quantize(&src, GgmlDType::Q5_1)?;
@ -309,7 +334,8 @@ fn compare_with_error(values: &[f32], expected: &[f32], tolerance: f32) {
}
}
/// Creates a vector simillarly to the one used in GGML unit tests: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L26-L30
/// Creates a vector similar to the ones used in GGML unit tests:
/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L26-L30
fn create_ggml_like_vector(offset: f32) -> Vec<f32> {
(0..GGML_TEST_SIZE)
.map(|i| 0.1 + 2.0 * (i as f32 + offset).cos())
@ -328,7 +354,8 @@ fn calculate_rmse(a: &[f32], b: &[f32]) -> f32 {
sum / a.len() as f32
}
/// Mirrores the GGML quanitzation unit test: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L43-L50
/// Similar to the GGML quantization unit test:
/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L43-L50
fn ggml_quantization_error_test(dtype: GgmlDType, device: &Device, max_error: f32) -> Result<()> {
let src = create_ggml_like_vector(0.0);
let src = Tensor::from_slice(&src, (GGML_TEST_SIZE,), device)?;
@ -336,7 +363,7 @@ fn ggml_quantization_error_test(dtype: GgmlDType, device: &Device, max_error: f3
let dst = quant.dequantize(device)?;
let error = calculate_rmse(&src.to_vec1::<f32>()?, &dst.to_vec1::<f32>()?);
if error > max_error {
candle_core::bail!(
bail!(
"Quantization error {} exceeds max error {}",
error,
max_error
@ -346,6 +373,10 @@ fn ggml_quantization_error_test(dtype: GgmlDType, device: &Device, max_error: f3
}
fn quantize_q2k(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let dtype = GgmlDType::Q2K;
let src = get_test_vector2(0.5, 1024, device)?;
@ -380,6 +411,10 @@ fn quantize_q2k(device: &Device) -> Result<()> {
}
fn quantize_q3k(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let dtype = GgmlDType::Q3K;
let src = get_test_vector2(0.5, 1024, device)?;
let quant = quantized::QTensor::quantize(&src, dtype)?;
@ -413,6 +448,10 @@ fn quantize_q3k(device: &Device) -> Result<()> {
}
fn quantize_q4k(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let dtype = GgmlDType::Q4K;
let src = get_test_vector2(0.5, 1024, device)?;
let quant = quantized::QTensor::quantize(&src, dtype)?;
@ -446,6 +485,10 @@ fn quantize_q4k(device: &Device) -> Result<()> {
}
fn quantize_q5k(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let dtype = GgmlDType::Q5K;
let src = get_test_vector2(0.5, 1024, device)?;
let quant = quantized::QTensor::quantize(&src, dtype)?;
@ -463,7 +506,7 @@ fn quantize_q5k(device: &Device) -> Result<()> {
let dst = round_vector(&dst);
assert_eq!(
[dst[0], dst[128], dst[256], dst[512], dst[800], dst[1023]],
[-0.499, -0.372, -0.249, 0.001, 0.279, 0.499]
[-0.5, -0.373, -0.25, 0.0, 0.279, 0.499]
);
let src_big = get_test_vector2(128.0, 1024, device)?;
@ -479,6 +522,10 @@ fn quantize_q5k(device: &Device) -> Result<()> {
}
fn quantize_q6k(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let dtype = GgmlDType::Q6K;
let src = get_test_vector2(0.5, 1024, device)?;
let quant = quantized::QTensor::quantize(&src, dtype)?;
@ -512,6 +559,10 @@ fn quantize_q6k(device: &Device) -> Result<()> {
}
fn quantize_q8k(device: &Device) -> Result<()> {
// TODO Enable this later when we enable cuda.
if device.is_cuda() {
return Ok(());
}
let dtype = GgmlDType::Q8K;
let src = get_test_vector2(0.5, 1024, device)?;
let quant = quantized::QTensor::quantize(&src, dtype)?;
@ -620,54 +671,66 @@ fn ggml_reference_matmul_error(dtype: GgmlDType) -> Result<f32> {
GgmlDType::Q5K => 0.000740,
GgmlDType::Q6K => 0.000952,
GgmlDType::Q4_0 => 0.001143,
GgmlDType::Q4_1 => 0.007784,
GgmlDType::Q4_1 => 0.008,
GgmlDType::Q5_0 => 0.001353,
GgmlDType::Q5_1 => 0.001363,
GgmlDType::Q5_1 => 0.00149,
GgmlDType::Q8_0 => 0.000092,
// Not from the ggml repo.
GgmlDType::Q8K => 0.00065,
_ => candle_core::bail!("No GGML results for quantization type {dtype:?}",),
_ => bail!("No GGML results for quantization type {dtype:?}",),
};
Ok(err)
}
/// Mirrores the GGML matmul unit test: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L76-L91
/// Similar to the GGML matmul unit test:
/// https://github.com/ggerganov/llama.cpp/blob/master/tests/test-quantize-fns.cpp#L76-L91
fn ggml_matmul_error_test<T: GgmlType>() -> Result<()> {
let a = create_ggml_like_vector(0.0);
let b = create_ggml_like_vector(1.0);
ggml_matmul_error_test_::<T>(a.as_slice(), b.as_slice(), 1.0)?;
// Another example that is more likely to trigger the overflow reported in #1526
let a = (0..GGML_TEST_SIZE)
.map(|i| i as f32 / GGML_TEST_SIZE as f32)
.collect::<Vec<_>>();
let b = (0..GGML_TEST_SIZE)
.map(|i| i as f32 / GGML_TEST_SIZE as f32)
.collect::<Vec<_>>();
ggml_matmul_error_test_::<T>(a.as_slice(), b.as_slice(), 2.0)?;
Ok(())
}
fn ggml_matmul_error_test_<T: GgmlType>(a: &[f32], b: &[f32], err_m: f32) -> Result<()> {
let length = a.len();
let mut a_quant = vec![T::zeros(); length / T::BLCK_SIZE];
let mut b_quant = vec![T::VecDotType::zeros(); length / T::VecDotType::BLCK_SIZE];
T::from_float(&a, &mut a_quant)?;
T::VecDotType::from_float(&b, &mut b_quant)?;
T::from_float(a, &mut a_quant)?;
T::VecDotType::from_float(b, &mut b_quant)?;
let result = T::vec_dot(length, &a_quant, &b_quant)?;
let result_unopt = T::vec_dot_unopt(length, &a_quant, &b_quant)?;
let reference_result = vec_dot_reference(&a, &b);
let reference_result = vec_dot_reference(a, b);
if (result - result_unopt).abs() / length as f32 > 1e-6 {
candle_core::bail!(
bail!(
"the opt and unopt vec-dot returned different values, opt {result}, unopt {result_unopt}"
)
}
let error = (result - reference_result).abs() / length as f32;
let ggml_error = ggml_reference_matmul_error(T::DTYPE)?;
let ggml_error = ggml_reference_matmul_error(T::DTYPE)? * err_m;
if !error.is_finite() || error > GGML_MAX_DOT_PRODUCT_ERROR {
candle_core::bail!(
"Dot product error {error} exceeds max error {GGML_MAX_DOT_PRODUCT_ERROR}",
);
bail!("Dot product error {error} exceeds max error {GGML_MAX_DOT_PRODUCT_ERROR}",);
}
// We diverge slightly due to different rounding behavior / f16 to f32 conversions in GGML
// => we use a slightly higher error threshold
const ERROR_LENIENCY: f32 = 0.00001;
if error - ERROR_LENIENCY > ggml_error {
candle_core::bail!(
bail!(
"Dot product error {} exceeds ggml reference error {}",
error,
ggml_error
@ -676,6 +739,16 @@ fn ggml_matmul_error_test<T: GgmlType>() -> Result<()> {
Ok(())
}
#[test]
fn quantized_mm() -> Result<()> {
ggml_matmul_error_test::<k_quants::BlockQ4_0>()?;
ggml_matmul_error_test::<k_quants::BlockQ4_1>()?;
ggml_matmul_error_test::<k_quants::BlockQ5_0>()?;
ggml_matmul_error_test::<k_quants::BlockQ5_1>()?;
ggml_matmul_error_test::<k_quants::BlockQ8_0>()?;
Ok(())
}
/// generates random tensors of size `m x k` and `n x k` and calculates their expected matrix multiplication result.
fn get_random_tensors(
m: usize,
@ -705,6 +778,10 @@ macro_rules! quantized_matmul {
// stable. https://github.com/rust-lang/rust/issues/29599
($fn_name: ident, $fn_name_cpu: ident, $fn_name_cuda: ident, $fn_name_metal: ident, $dtype: expr) => {
fn $fn_name(device: &Device) -> Result<()> {
if device.is_cuda() {
// TODO: Enable CUDA GGML matmul at some point.
return Ok(());
}
test_matmul(device, (1, 3, 4, 256), $dtype)?;
Ok(())
}

View File

@ -1245,11 +1245,23 @@ fn assert_close(a: &Tensor, b: &Tensor, epsilon: f64) -> Result<()> {
}
#[test]
fn logsumexp() -> Result<()> {
fn log_sum_exp() -> Result<()> {
let input = Tensor::new(&[[1f64, 2., 3.], [4., 5., 6.]], &Device::Cpu)?;
let output = input.logsumexp(D::Minus1)?;
let output = input.log_sum_exp(D::Minus1)?;
// The expectations obtained from pytorch.
let expected = Tensor::new(&[3.4076, 6.4076], &Device::Cpu)?;
assert_close(&output, &expected, 0.00001)?;
Ok(())
}
#[test]
fn pow() -> Result<()> {
let lhs = Tensor::new(&[[1f32, 2., 3.], [4., 5., 6.]], &Device::Cpu)?;
let rhs = (&lhs - 2.)?;
let res = lhs.pow(&rhs)?;
assert_eq!(
test_utils::to_vec2_round(&res, 4)?,
[[1.0, 1.0, 3.0], [16.0, 125.0, 1296.0001]]
);
Ok(())
}

candle-core/tests/test.pt Normal file

Binary file not shown.

Binary file not shown.

View File

@ -11,8 +11,8 @@ readme = "README.md"
[dependencies]
byteorder = { workspace = true }
candle = { path = "../candle-core", version = "0.3.3", package = "candle-core" }
candle-nn = { path = "../candle-nn", version = "0.3.3" }
candle = { workspace = true }
candle-nn = { workspace = true }
hf-hub = { workspace = true}
intel-mkl-src = { workspace = true, optional = true }
memmap2 = { workspace = true }

View File

@ -11,17 +11,17 @@ readme = "README.md"
[dependencies]
accelerate-src = { workspace = true, optional = true }
candle = { path = "../candle-core", version = "0.3.3", package = "candle-core" }
candle-datasets = { path = "../candle-datasets", version = "0.3.3" }
candle-nn = { path = "../candle-nn", version = "0.3.3" }
candle-transformers = { path = "../candle-transformers", version = "0.3.3" }
candle-flash-attn = { path = "../candle-flash-attn", version = "0.3.3", optional = true }
candle-onnx = { path = "../candle-onnx", version = "0.3.3", optional = true }
candle = { workspace = true }
candle-datasets = { workspace = true }
candle-nn = { workspace = true }
candle-transformers = { workspace = true }
candle-flash-attn = { workspace = true, optional = true }
candle-onnx = { workspace = true, optional = true }
csv = "1.3.0"
cudarc = { workspace = true, optional = true }
half = { workspace = true, optional = true }
hf-hub = { workspace = true, features=["tokio"]}
hf-hub = { workspace = true, features = ["tokio"] }
image = { workspace = true }
intel-mkl-src = { workspace = true, optional = true }
num-traits = { workspace = true }
@ -30,7 +30,9 @@ rayon = { workspace = true }
safetensors = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
symphonia = { version = "0.5.3", features = ["all"] }
tokenizers = { workspace = true, features = ["onig"] }
cpal= { version = "0.15.2", optional = true }
[dev-dependencies]
anyhow = { workspace = true }
@ -43,23 +45,24 @@ rusttype = { workspace = true }
tracing = { workspace = true }
tracing-chrome = { workspace = true }
tracing-subscriber = { workspace = true }
wav = { workspace = true }
# Necessary to disambiguate from the tokio version used in the wasm examples, which is 1.28.1
tokio = "1.29.1"
[build-dependencies]
anyhow = { workspace = true }
bindgen_cuda = { version = "0.1.1", optional = true }
[features]
default = []
accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda", "dep:bindgen_cuda"]
cudnn = ["candle/cudnn"]
flash-attn = ["cuda", "candle-transformers/flash-attn", "dep:candle-flash-attn"]
mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl", "candle-transformers/mkl"]
nccl = ["cuda", "cudarc/nccl", "dep:half"]
onnx = ["candle-onnx"]
metal = ["candle/metal", "candle-nn/metal"]
microphone = ["cpal"]
[[example]]
name = "llama_multiprocess"
@ -76,3 +79,7 @@ required-features = ["onnx"]
[[example]]
name = "onnx_basics"
required-features = ["onnx"]
[[example]]
name = "whisper-microphone"
required-features = ["microphone"]

View File

@ -4,251 +4,28 @@ use std::io::Write;
use std::path::PathBuf;
struct KernelDirectories {
kernel_dir: &'static str,
kernel_glob: &'static str,
rust_target: &'static str,
include_dirs: &'static [&'static str],
}
const DIRS: [KernelDirectories; 1] = [KernelDirectories {
kernel_dir: "examples/custom-ops/kernels/",
const KERNEL_DIRS: [KernelDirectories; 1] = [KernelDirectories {
kernel_glob: "examples/custom-ops/kernels/*.cu",
rust_target: "examples/custom-ops/cuda_kernels.rs",
include_dirs: &[],
}];
impl KernelDirectories {
fn maybe_build_ptx(
&self,
cu_file: &std::path::Path,
ptx_file: &std::path::Path,
compute_cap: usize,
) -> Result<()> {
let should_compile = if ptx_file.exists() {
let ptx_modified = ptx_file.metadata()?.modified()?;
let cu_modified = cu_file.metadata()?.modified()?;
cu_modified.duration_since(ptx_modified).is_ok()
} else {
true
};
if should_compile {
#[cfg(feature = "cuda")]
{
let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
let mut command = std::process::Command::new("nvcc");
let out_dir = ptx_file.parent().context("no parent for ptx file")?;
let include_dirs: Vec<String> =
self.include_dirs.iter().map(|c| format!("-I{c}")).collect();
command
.arg(format!("--gpu-architecture=sm_{compute_cap}"))
.arg("--ptx")
.args(["--default-stream", "per-thread"])
.args(["--output-directory", out_dir.to_str().unwrap()])
.arg(format!("-I/{}", self.kernel_dir))
.args(include_dirs)
.arg(cu_file);
if let Ok(ccbin_path) = &ccbin_env {
command
.arg("-allow-unsupported-compiler")
.args(["-ccbin", ccbin_path]);
}
let output = command
.spawn()
.context("failed spawning nvcc")?
.wait_with_output()?;
if !output.status.success() {
anyhow::bail!(
"nvcc error while compiling {cu_file:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
)
}
}
#[cfg(not(feature = "cuda"))]
std::fs::OpenOptions::new()
.create(true)
.write(true)
.open(ptx_file)?;
}
Ok(())
}
fn process(&self, out_dir: &std::path::Path, compute_cap: usize) -> Result<()> {
println!("cargo:rerun-if-changed={}", self.kernel_dir);
let kernel_dir = PathBuf::from(self.kernel_dir);
let out_dir = out_dir.join(self.kernel_dir);
if !out_dir.exists() {
std::fs::create_dir_all(&out_dir)?;
}
let mut cu_files = vec![];
let mut cuh_files = vec![];
for file in std::fs::read_dir(kernel_dir)?.flatten() {
let file = file.path();
match file.extension().and_then(|v| v.to_str()) {
Some("cu") => cu_files.push(file),
Some("cuh") => cuh_files.push(file),
_ => {}
}
}
let mut ptx_paths = vec![];
for cu_file in cu_files.iter() {
let file_stem = cu_file
.file_stem()
.with_context(|| format!("no stem {cu_file:?}"))?;
let file_stem = file_stem.to_string_lossy().into_owned();
let ptx_file = out_dir.join(&format!("{file_stem}.ptx"));
self.maybe_build_ptx(cu_file, &ptx_file, compute_cap)?;
ptx_paths.push(ptx_file);
}
let regenerate_rs_file = true;
if regenerate_rs_file {
let mut file = std::fs::File::create(self.rust_target)?;
for ptx_path in ptx_paths {
let name = ptx_path
.file_stem()
.context("empty stem")?
.to_string_lossy();
file.write_all(b"#[rustfmt::skip]\n")?;
let const_definition = format!(
r#"pub const {}: &str = include_str!(concat!(env!("OUT_DIR"), "/{}/{name}.ptx"));"#,
name.to_uppercase().replace('.', "_"),
self.kernel_dir,
);
file.write_all(const_definition.as_bytes())?;
file.write_all(b"\n")?;
}
}
Ok(())
}
}
fn main() -> Result<()> {
println!("cargo:rerun-if-changed=build.rs");
let out_dir = std::env::var("OUT_DIR").context("OUT_DIR not set")?;
let out_dir = PathBuf::from(out_dir);
#[cfg(feature = "cuda")]
set_cuda_include_dir()?;
#[cfg(feature = "cuda")]
let compute_cap = compute_cap()?;
#[cfg(not(feature = "cuda"))]
let compute_cap = 0;
for d in DIRS {
d.process(&out_dir, compute_cap)?
{
for kdir in KERNEL_DIRS.iter() {
let builder = bindgen_cuda::Builder::default().kernel_paths_glob(kdir.kernel_glob);
println!("cargo:info={builder:?}");
let bindings = builder.build_ptx().unwrap();
bindings.write(kdir.rust_target).unwrap()
}
}
Ok(())
}
fn set_cuda_include_dir() -> Result<()> {
// NOTE: copied from cudarc build.rs.
let env_vars = [
"CUDA_PATH",
"CUDA_ROOT",
"CUDA_TOOLKIT_ROOT_DIR",
"CUDNN_LIB",
];
let env_vars = env_vars
.into_iter()
.map(std::env::var)
.filter_map(Result::ok)
.map(Into::<PathBuf>::into);
let roots = [
"/usr",
"/usr/local/cuda",
"/opt/cuda",
"/usr/lib/cuda",
"C:/Program Files/NVIDIA GPU Computing Toolkit",
"C:/CUDA",
];
let roots = roots.into_iter().map(Into::<PathBuf>::into);
let root = env_vars
.chain(roots)
.find(|path| path.join("include").join("cuda.h").is_file())
.context("cannot find include/cuda.h")?;
println!(
"cargo:rustc-env=CUDA_INCLUDE_DIR={}",
root.join("include").display()
);
Ok(())
}
#[allow(unused)]
fn compute_cap() -> Result<usize> {
println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
// Try to parse compute cap from env
let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
compute_cap_str
.parse::<usize>()
.context("Could not parse code")?
} else {
// Grab compute cap from nvidia-smi
let out = std::process::Command::new("nvidia-smi")
.arg("--query-gpu=compute_cap")
.arg("--format=csv")
.output()
.context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
let mut lines = out.lines();
assert_eq!(
lines.next().context("missing line in stdout")?,
"compute_cap"
);
let cap = lines
.next()
.context("missing line in stdout")?
.replace('.', "");
println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
cap.parse::<usize>()
.with_context(|| format!("cannot parse as int {cap}"))?
};
// Grab available GPU codes from nvcc and select the highest one
let max_nvcc_code = {
let out = std::process::Command::new("nvcc")
.arg("--list-gpu-code")
.output()
.expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
let out = std::str::from_utf8(&out.stdout).unwrap();
let out = out.lines().collect::<Vec<&str>>();
let mut codes = Vec::with_capacity(out.len());
for code in out {
let code = code.split('_').collect::<Vec<&str>>();
if !code.is_empty() && code.contains(&"sm") {
if let Ok(num) = code[1].parse::<usize>() {
codes.push(num);
}
}
}
codes.sort();
if !codes.contains(&compute_cap) {
anyhow::bail!(
"nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}."
);
}
*codes.last().unwrap()
};
// If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
// then choose the highest gpu code in nvcc
if compute_cap > max_nvcc_code {
println!(
"cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
);
compute_cap = max_nvcc_code;
}
println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
compute_cap = compute_cap_str
.parse::<usize>()
.with_context(|| format!("cannot parse as usize '{compute_cap_str}'"))?;
println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
}
println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
Ok(compute_cap)
}

View File

@ -0,0 +1,237 @@
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use anyhow::{Error as E, Result};
use clap::Parser;
use candle_transformers::models::chatglm::{Config, Model};
use candle::{DType, Device, Tensor};
use candle_nn::VarBuilder;
use candle_transformers::generation::LogitsProcessor;
use hf_hub::{api::sync::Api, Repo, RepoType};
use tokenizers::Tokenizer;
struct TextGeneration {
model: Model,
device: Device,
tokenizer: Tokenizer,
logits_processor: LogitsProcessor,
repeat_penalty: f32,
repeat_last_n: usize,
verbose_prompt: bool,
}
impl TextGeneration {
#[allow(clippy::too_many_arguments)]
fn new(
model: Model,
tokenizer: Tokenizer,
seed: u64,
temp: Option<f64>,
top_p: Option<f64>,
repeat_penalty: f32,
repeat_last_n: usize,
verbose_prompt: bool,
device: &Device,
) -> Self {
let logits_processor = LogitsProcessor::new(seed, temp, top_p);
Self {
model,
tokenizer,
logits_processor,
repeat_penalty,
repeat_last_n,
verbose_prompt,
device: device.clone(),
}
}
fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
use std::io::Write;
println!("starting the inference loop");
let tokens = self.tokenizer.encode(prompt, true).map_err(E::msg)?;
if tokens.is_empty() {
anyhow::bail!("Empty prompts are not supported in the chatglm model.")
}
if self.verbose_prompt {
for (token, id) in tokens.get_tokens().iter().zip(tokens.get_ids().iter()) {
let token = token.replace('▁', " ").replace("<0x0A>", "\n");
println!("{id:7} -> '{token}'");
}
}
let mut tokens = tokens.get_ids().to_vec();
let mut generated_tokens = 0usize;
let eos_token = match self.tokenizer.get_vocab(true).get("</s>") {
Some(token) => *token,
None => anyhow::bail!("cannot find the endoftext token"),
};
print!("{prompt}");
std::io::stdout().flush()?;
let start_gen = std::time::Instant::now();
for index in 0..sample_len {
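// The first step feeds the whole prompt; once the KV cache is populated, only the
// last sampled token needs to be passed to the model.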
let context_size = if index > 0 { 1 } else { tokens.len() };
let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
let logits = self.model.forward(&input)?;
let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
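// Down-weight tokens seen in the recent context to discourage repetition.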
let logits = if self.repeat_penalty == 1. {
logits
} else {
let start_at = tokens.len().saturating_sub(self.repeat_last_n);
candle_transformers::utils::apply_repeat_penalty(
&logits,
self.repeat_penalty,
&tokens[start_at..],
)?
};
let next_token = self.logits_processor.sample(&logits)?;
tokens.push(next_token);
generated_tokens += 1;
if next_token == eos_token {
break;
}
let token = self.tokenizer.decode(&[next_token], true).map_err(E::msg)?;
print!("{token}");
std::io::stdout().flush()?;
}
let dt = start_gen.elapsed();
println!(
"\n{generated_tokens} tokens generated ({:.2} token/s)",
generated_tokens as f64 / dt.as_secs_f64(),
);
Ok(())
}
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
/// Enable tracing (generates a trace-timestamp.json file).
#[arg(long)]
tracing: bool,
/// Display the token for the specified prompt.
#[arg(long)]
verbose_prompt: bool,
#[arg(long)]
prompt: String,
/// The temperature used to generate samples.
#[arg(long)]
temperature: Option<f64>,
/// Nucleus sampling probability cutoff.
#[arg(long)]
top_p: Option<f64>,
/// The seed to use when generating random samples.
#[arg(long, default_value_t = 299792458)]
seed: u64,
/// The length of the sample to generate (in tokens).
#[arg(long, short = 'n', default_value_t = 5000)]
sample_len: usize,
#[arg(long)]
model_id: Option<String>,
#[arg(long)]
revision: Option<String>,
#[arg(long)]
weight_file: Option<String>,
#[arg(long)]
tokenizer: Option<String>,
/// Penalty to be applied for repeating tokens, 1. means no penalty.
#[arg(long, default_value_t = 1.1)]
repeat_penalty: f32,
/// The context size to consider for the repeat penalty.
#[arg(long, default_value_t = 64)]
repeat_last_n: usize,
}
fn main() -> Result<()> {
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;
let args = Args::parse();
let _guard = if args.tracing {
let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
tracing_subscriber::registry().with(chrome_layer).init();
Some(guard)
} else {
None
};
println!(
"avx: {}, neon: {}, simd128: {}, f16c: {}",
candle::utils::with_avx(),
candle::utils::with_neon(),
candle::utils::with_simd128(),
candle::utils::with_f16c()
);
println!(
"temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
args.temperature.unwrap_or(0.),
args.repeat_penalty,
args.repeat_last_n
);
let start = std::time::Instant::now();
let api = Api::new()?;
let model_id = match args.model_id {
Some(model_id) => model_id.to_string(),
None => "THUDM/chatglm3-6b".to_string(),
};
let revision = match args.revision {
Some(rev) => rev.to_string(),
None => "main".to_string(),
};
let repo = api.repo(Repo::with_revision(model_id, RepoType::Model, revision));
let tokenizer_filename = match args.tokenizer {
Some(file) => std::path::PathBuf::from(file),
None => api
.model("lmz/candle-chatglm".to_string())
.get("chatglm-tokenizer.json")?,
};
let filenames = match args.weight_file {
Some(weight_file) => vec![std::path::PathBuf::from(weight_file)],
None => candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?,
};
println!("retrieved the files in {:?}", start.elapsed());
let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
let start = std::time::Instant::now();
let config = Config::glm3_6b();
let device = candle_examples::device(args.cpu)?;
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, DType::F32, &device)? };
let model = Model::new(&config, vb)?;
println!("loaded the model in {:?}", start.elapsed());
let mut pipeline = TextGeneration::new(
model,
tokenizer,
args.seed,
args.temperature,
args.top_p,
args.repeat_penalty,
args.repeat_last_n,
args.verbose_prompt,
&device,
);
pipeline.run(&args.prompt, args.sample_len)?;
Ok(())
}

View File

@ -0,0 +1,22 @@
# candle-convnext
[A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545).
This candle implementation uses a pre-trained ConvNeXt network for inference. The
classification head has been trained on the ImageNet dataset and returns the
probabilities for the top-5 classes.
## Running an example
```
$ cargo run --example convnext --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg --which tiny
loaded image Tensor[dims 3, 224, 224; f32]
model built
mountain bike, all-terrain bike, off-roader: 84.09%
bicycle-built-for-two, tandem bicycle, tandem: 4.15%
maillot : 0.74%
crash helmet : 0.54%
unicycle, monocycle : 0.44%
```

View File

@ -0,0 +1,102 @@
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use clap::{Parser, ValueEnum};
use candle::{DType, IndexOp, D};
use candle_nn::{Module, VarBuilder};
use candle_transformers::models::convnext;
#[derive(Clone, Copy, Debug, ValueEnum)]
enum Which {
Tiny,
Small,
Base,
Large,
XLarge,
}
impl Which {
fn model_filename(&self) -> String {
let name = match self {
Self::Tiny => "tiny",
Self::Small => "small",
Self::Base => "base",
Self::Large => "large",
Self::XLarge => "xlarge",
};
// The XLarge model only has an ImageNet-22K variant
let variant = match self {
Self::XLarge => "fb_in22k_ft_in1k",
_ => "fb_in1k",
};
format!("timm/convnext_{name}.{variant}")
}
fn config(&self) -> convnext::Config {
match self {
Self::Tiny => convnext::Config::tiny(),
Self::Small => convnext::Config::small(),
Self::Base => convnext::Config::base(),
Self::Large => convnext::Config::large(),
Self::XLarge => convnext::Config::xlarge(),
}
}
}
#[derive(Parser)]
struct Args {
#[arg(long)]
model: Option<String>,
#[arg(long)]
image: String,
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
#[arg(value_enum, long, default_value_t=Which::Tiny)]
which: Which,
}
pub fn main() -> anyhow::Result<()> {
let args = Args::parse();
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
println!("loaded image {image:?}");
let model_file = match args.model {
None => {
let model_name = args.which.model_filename();
let api = hf_hub::api::sync::Api::new()?;
let api = api.model(model_name);
api.get("model.safetensors")?
}
Some(model) => model.into(),
};
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
let model = convnext::convnext(&args.which.config(), 1000, vb)?;
println!("model built");
let logits = model.forward(&image.unsqueeze(0)?)?;
let prs = candle_nn::ops::softmax(&logits, D::Minus1)?
.i(0)?
.to_vec1::<f32>()?;
let mut prs = prs.iter().enumerate().collect::<Vec<_>>();
prs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
for &(category_idx, pr) in prs.iter().take(5) {
println!(
"{:24}: {:.2}%",
candle_examples::imagenet::CLASSES[category_idx],
100. * pr
);
}
Ok(())
}

View File

@ -1,2 +0,0 @@
#[rustfmt::skip]
pub const LAYERNORM_KERNELS: &str = include_str!(concat!(env!("OUT_DIR"), "/examples/custom-ops/kernels//layernorm_kernels.ptx"));

View File

@ -6,7 +6,8 @@
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
#[allow(unused)]
#[rustfmt::skip]
#[cfg(feature = "cuda")]
mod cuda_kernels;
use clap::Parser;

View File

@ -165,14 +165,14 @@ fn main() -> Result<()> {
let mut index_pos = 0;
let mut token_generated = 0;
for index in 0..args.sample_len {
let context_size = if cache.use_kv_cache && index > 0 {
1
let (context_size, context_index) = if cache.use_kv_cache && index > 0 {
(1, index_pos)
} else {
tokens.len()
(tokens.len(), 0)
};
let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
let input = Tensor::new(ctxt, &device)?.unsqueeze(0)?;
let logits = llama.forward(&input, index_pos)?;
let logits = llama.forward(&input, context_index)?;
let logits = logits.squeeze(0)?;
let logits = if args.repeat_penalty == 1. {
logits

View File

@ -2,6 +2,9 @@
This is based on [mamba-minimal](https://github.com/johnma2006/mamba-minimal).
Compared to the mamba example, this version can handle training but is much
slower.
## Running the example
```bash

View File

@ -0,0 +1,17 @@
# candle-mamba: Mamba implementation
Candle implementation of *Mamba* [1], inference only. Mamba is an alternative to
the transformer architecture. It leverages State Space Models (SSMs) with the
goal of being computationally efficient on long sequences. The implementation is
based on [mamba.rs](https://github.com/LaurentMazare/mamba.rs).
- [1]. [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752).
Compared to the mamba-minimal example, this version is far more efficient but
would only work for inference.
## Running the example
```bash
$ cargo run --example mamba --release -- --prompt "Mamba is the"
```

View File

@ -0,0 +1,299 @@
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use anyhow::{Error as E, Result};
use clap::{Parser, ValueEnum};
use candle_transformers::models::mamba::{Config, Model, State};
use candle::{DType, Device, Tensor};
use candle_examples::token_output_stream::TokenOutputStream;
use candle_nn::VarBuilder;
use candle_transformers::generation::LogitsProcessor;
use hf_hub::{api::sync::Api, Repo, RepoType};
use tokenizers::Tokenizer;
struct TextGeneration {
model: Model,
config: Config,
device: Device,
tokenizer: TokenOutputStream,
logits_processor: LogitsProcessor,
repeat_penalty: f32,
repeat_last_n: usize,
}
impl TextGeneration {
#[allow(clippy::too_many_arguments)]
fn new(
model: Model,
config: Config,
tokenizer: Tokenizer,
seed: u64,
temp: Option<f64>,
top_p: Option<f64>,
repeat_penalty: f32,
repeat_last_n: usize,
device: &Device,
) -> Self {
let logits_processor = LogitsProcessor::new(seed, temp, top_p);
Self {
model,
config,
tokenizer: TokenOutputStream::new(tokenizer),
logits_processor,
repeat_penalty,
repeat_last_n,
device: device.clone(),
}
}
fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
use std::io::Write;
self.tokenizer.clear();
let mut tokens = self
.tokenizer
.tokenizer()
.encode(prompt, true)
.map_err(E::msg)?
.get_ids()
.to_vec();
let mut generated_tokens = 0usize;
let eos_token = match self.tokenizer.get_token("<|endoftext|>") {
Some(token) => token,
None => anyhow::bail!("cannot find the </s> token"),
};
let mut state = State::new(1, &self.config, &self.device)?;
let mut next_logits = None;
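// Feed the prompt one token at a time to build up the recurrent state; the logits
// from the last prompt token seed the sampling loop below.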
for &t in tokens.iter() {
let input = Tensor::new(&[t], &self.device)?;
let logits = self.model.forward(&input, &mut state)?;
next_logits = Some(logits);
if let Some(t) = self.tokenizer.next_token(t)? {
print!("{t}")
}
}
std::io::stdout().flush()?;
let start_gen = std::time::Instant::now();
for _ in 0..sample_len {
let logits = match next_logits.as_ref() {
Some(logits) => logits,
None => anyhow::bail!("cannot work on an empty prompt"),
};
let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
let logits = if self.repeat_penalty == 1. {
logits
} else {
let start_at = tokens.len().saturating_sub(self.repeat_last_n);
candle_transformers::utils::apply_repeat_penalty(
&logits,
self.repeat_penalty,
&tokens[start_at..],
)?
};
let next_token = self.logits_processor.sample(&logits)?;
tokens.push(next_token);
generated_tokens += 1;
if next_token == eos_token {
break;
}
if let Some(t) = self.tokenizer.next_token(next_token)? {
print!("{t}");
std::io::stdout().flush()?;
}
let input = Tensor::new(&[next_token], &self.device)?;
next_logits = Some(self.model.forward(&input, &mut state)?)
}
let dt = start_gen.elapsed();
if let Some(rest) = self.tokenizer.decode_rest().map_err(E::msg)? {
print!("{rest}");
}
std::io::stdout().flush()?;
println!(
"\n{generated_tokens} tokens generated ({:.2} token/s)",
generated_tokens as f64 / dt.as_secs_f64(),
);
Ok(())
}
}
#[derive(Parser, ValueEnum, Clone, Copy, PartialEq, Eq, Debug)]
enum Which {
Mamba130m,
Mamba370m,
Mamba790m,
Mamba1_4b,
Mamba2_8b,
Mamba2_8bSlimPj,
}
impl std::fmt::Display for Which {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", self)
}
}
impl Which {
fn model_id(&self) -> &'static str {
match self {
Self::Mamba130m => "state-spaces/mamba-130m",
Self::Mamba370m => "state-spaces/mamba-370m",
Self::Mamba790m => "state-spaces/mamba-790m",
Self::Mamba1_4b => "state-spaces/mamba-1.4b",
Self::Mamba2_8b => "state-spaces/mamba-2.8b",
Self::Mamba2_8bSlimPj => "state-spaces/mamba-2.8b-slimpj'",
}
}
fn revision(&self) -> &'static str {
match self {
Self::Mamba130m
| Self::Mamba370m
| Self::Mamba790m
| Self::Mamba1_4b
| Self::Mamba2_8bSlimPj => "refs/pr/1",
Self::Mamba2_8b => "refs/pr/4",
}
}
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
/// Enable tracing (generates a trace-timestamp.json file).
#[arg(long)]
tracing: bool,
#[arg(long)]
prompt: String,
/// The temperature used to generate samples.
#[arg(long)]
temperature: Option<f64>,
/// Nucleus sampling probability cutoff.
#[arg(long)]
top_p: Option<f64>,
/// The seed to use when generating random samples.
#[arg(long, default_value_t = 299792458)]
seed: u64,
/// The length of the sample to generate (in tokens).
#[arg(long, short = 'n', default_value_t = 5000)]
sample_len: usize,
#[arg(long, default_value = "mamba130m")]
which: Which,
#[arg(long)]
model_id: Option<String>,
#[arg(long)]
revision: Option<String>,
#[arg(long)]
tokenizer_file: Option<String>,
#[arg(long)]
weight_files: Option<String>,
#[arg(long)]
config_file: Option<String>,
/// Penalty to be applied for repeating tokens, 1. means no penalty.
#[arg(long, default_value_t = 1.1)]
repeat_penalty: f32,
/// The context size to consider for the repeat penalty.
#[arg(long, default_value_t = 64)]
repeat_last_n: usize,
}
fn main() -> Result<()> {
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;
let args = Args::parse();
let _guard = if args.tracing {
let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
tracing_subscriber::registry().with(chrome_layer).init();
Some(guard)
} else {
None
};
println!(
"avx: {}, neon: {}, simd128: {}, f16c: {}",
candle::utils::with_avx(),
candle::utils::with_neon(),
candle::utils::with_simd128(),
candle::utils::with_f16c()
);
println!(
"temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
args.temperature.unwrap_or(0.),
args.repeat_penalty,
args.repeat_last_n
);
let start = std::time::Instant::now();
let api = Api::new()?;
let repo = api.repo(Repo::with_revision(
args.model_id
.unwrap_or_else(|| args.which.model_id().to_string()),
RepoType::Model,
args.revision
.unwrap_or_else(|| args.which.revision().to_string()),
));
let tokenizer_filename = match args.tokenizer_file {
Some(file) => std::path::PathBuf::from(file),
None => api
.model("EleutherAI/gpt-neox-20b".to_string())
.get("tokenizer.json")?,
};
let config_filename = match args.config_file {
Some(file) => std::path::PathBuf::from(file),
None => repo.get("config.json")?,
};
let filenames = match args.weight_files {
Some(files) => files
.split(',')
.map(std::path::PathBuf::from)
.collect::<Vec<_>>(),
None => {
vec![repo.get("model.safetensors")?]
}
};
println!("retrieved the files in {:?}", start.elapsed());
let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
let start = std::time::Instant::now();
let config: Config = serde_json::from_slice(&std::fs::read(config_filename)?)?;
let device = candle_examples::device(args.cpu)?;
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, DType::F32, &device)? };
let model = Model::new(&config, vb.pp("backbone"))?;
println!("loaded the model in {:?}", start.elapsed());
let mut pipeline = TextGeneration::new(
model,
config,
tokenizer,
args.seed,
args.temperature,
args.top_p,
args.repeat_penalty,
args.repeat_last_n,
&device,
);
pipeline.run(&args.prompt, args.sample_len)?;
Ok(())
}

View File

@ -0,0 +1,22 @@
# candle-mobileone
[MobileOne: An Improved One millisecond Mobile Backbone](https://arxiv.org/abs/2206.04040).
This candle implementation uses a pre-trained MobileOne network for inference. The
classification head has been trained on the ImageNet dataset and returns the
probabilities for the top-5 classes.
## Running an example
```
$ cargo run --example mobileone --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg --which s2
loaded image Tensor[dims 3, 224, 224; f32]
model built
mountain bike, all-terrain bike, off-roader: 79.33%
bicycle-built-for-two, tandem bicycle, tandem: 15.32%
crash helmet : 2.58%
unicycle, monocycle : 1.70%
alp : 0.21%
```

View File

@ -0,0 +1,96 @@
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use clap::{Parser, ValueEnum};
use candle::{DType, IndexOp, D};
use candle_nn::{Module, VarBuilder};
use candle_transformers::models::mobileone;
#[derive(Clone, Copy, Debug, ValueEnum)]
enum Which {
S0,
S1,
S2,
S3,
S4,
}
impl Which {
fn model_filename(&self) -> String {
let name = match self {
Self::S0 => "s0",
Self::S1 => "s1",
Self::S2 => "s2",
Self::S3 => "s3",
Self::S4 => "s4",
};
format!("timm/mobileone_{}.apple_in1k", name)
}
fn config(&self) -> mobileone::Config {
match self {
Self::S0 => mobileone::Config::s0(),
Self::S1 => mobileone::Config::s1(),
Self::S2 => mobileone::Config::s2(),
Self::S3 => mobileone::Config::s3(),
Self::S4 => mobileone::Config::s4(),
}
}
}
#[derive(Parser)]
struct Args {
#[arg(long)]
model: Option<String>,
#[arg(long)]
image: String,
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
#[arg(value_enum, long, default_value_t=Which::S0)]
which: Which,
}
pub fn main() -> anyhow::Result<()> {
let args = Args::parse();
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
println!("loaded image {image:?}");
let model_file = match args.model {
None => {
let model_name = args.which.model_filename();
let api = hf_hub::api::sync::Api::new()?;
let api = api.model(model_name);
api.get("model.safetensors")?
}
Some(model) => model.into(),
};
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
let model = mobileone::mobileone(&args.which.config(), 1000, vb)?;
println!("model built");
let logits = model.forward(&image.unsqueeze(0)?)?;
let prs = candle_nn::ops::softmax(&logits, D::Minus1)?
.i(0)?
.to_vec1::<f32>()?;
let mut prs = prs.iter().enumerate().collect::<Vec<_>>();
prs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
for &(category_idx, pr) in prs.iter().take(5) {
println!(
"{:24}: {:.2}%",
candle_examples::imagenet::CLASSES[category_idx],
100. * pr
);
}
Ok(())
}

View File

@ -1,10 +1,39 @@
## Using ONNX models in Candle
This example demonstrates how to run ONNX based models in Candle; the model
being used here is a small squeezenet variant.
This example demonstrates how to run [ONNX](https://github.com/onnx/onnx) based models in Candle.
You can run the example with the following command:
It contains small variants of two models, [SqueezeNet](https://arxiv.org/pdf/1602.07360.pdf) (default) and [EfficientNet](https://arxiv.org/pdf/1905.11946.pdf).
You can run the examples with the following commands:
```bash
cargo run --example squeezenet-onnx --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg
cargo run --example onnx --features=onnx --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg
```
Use the `--which` flag to specify explicitly which network to use, i.e.
```bash
$ cargo run --example onnx --features=onnx --release -- --which squeeze-net --image candle-examples/examples/yolo-v8/assets/bike.jpg
Finished release [optimized] target(s) in 0.21s
Running `target/release/examples/onnx --which squeeze-net --image candle-examples/examples/yolo-v8/assets/bike.jpg`
loaded image Tensor[dims 3, 224, 224; f32]
unicycle, monocycle : 83.23%
ballplayer, baseball player : 3.68%
bearskin, busby, shako : 1.54%
military uniform : 0.78%
cowboy hat, ten-gallon hat : 0.76%
```
```bash
$ cargo run --example onnx --features=onnx --release -- --which efficient-net --image candle-examples/examples/yolo-v8/assets/bike.jpg
Finished release [optimized] target(s) in 0.20s
Running `target/release/examples/onnx --which efficient-net --image candle-examples/examples/yolo-v8/assets/bike.jpg`
loaded image Tensor[dims 224, 224, 3; f32]
bicycle-built-for-two, tandem bicycle, tandem : 99.16%
mountain bike, all-terrain bike, off-roader : 0.60%
unicycle, monocycle : 0.17%
crash helmet : 0.02%
alp : 0.02%
```
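For reference, here is a minimal sketch of the loading/evaluation flow the example wraps,
assuming the `candle_onnx` helpers (`read_file`, `simple_eval`) it relies on; the helper
name `run_onnx` and the error handling are illustrative, not part of the example itself.
```rust
use std::collections::HashMap;

fn run_onnx(model_path: &str, image: candle::Tensor) -> anyhow::Result<()> {
    // Parse the ONNX protobuf file into a model description.
    let model = candle_onnx::read_file(model_path)?;
    let graph = model.graph.as_ref().expect("no graph in the onnx file");
    // Feed the (1, 3, 224, 224) image under the graph's declared input name.
    let mut inputs = HashMap::new();
    inputs.insert(graph.input[0].name.clone(), image.unsqueeze(0)?);
    // Evaluate the graph and fetch the tensor bound to its first output.
    let mut outputs = candle_onnx::simple_eval(&model, inputs)?;
    let logits = outputs
        .remove(&graph.output[0].name)
        .expect("missing output tensor");
    println!("logits: {logits:?}");
    Ok(())
}
```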

View File

@ -8,6 +8,7 @@ use anyhow::{Error as E, Result};
use clap::{Parser, ValueEnum};
use candle_transformers::models::mixformer::{Config, MixFormerSequentialForCausalLM as MixFormer};
use candle_transformers::models::phi::{Config as PhiConfig, Model as Phi};
use candle_transformers::models::quantized_mixformer::MixFormerSequentialForCausalLM as QMixFormer;
use candle::{DType, Device, Tensor};
@ -18,6 +19,7 @@ use tokenizers::Tokenizer;
enum Model {
MixFormer(MixFormer),
Phi(Phi),
Quantized(QMixFormer),
}
@ -84,6 +86,7 @@ impl TextGeneration {
let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
let logits = match &mut self.model {
Model::MixFormer(m) => m.forward(&input)?,
Model::Phi(m) => m.forward(&input)?,
Model::Quantized(m) => m.forward(&input)?,
};
let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
@ -117,7 +120,7 @@ impl TextGeneration {
}
}
#[derive(Clone, Copy, Debug, ValueEnum)]
#[derive(Clone, Copy, Debug, ValueEnum, PartialEq, Eq)]
enum WhichModel {
#[value(name = "1")]
V1,
@ -125,6 +128,8 @@ enum WhichModel {
V1_5,
#[value(name = "2")]
V2,
#[value(name = "2-old")]
V2Old,
PuffinPhiV2,
PhiHermes,
}
@ -169,7 +174,7 @@ struct Args {
#[arg(long)]
model_id: Option<String>,
#[arg(long, default_value = "1.5")]
#[arg(long, default_value = "2")]
model: WhichModel,
#[arg(long)]
@ -230,7 +235,7 @@ fn main() -> Result<()> {
match args.model {
WhichModel::V1 => "microsoft/phi-1".to_string(),
WhichModel::V1_5 => "microsoft/phi-1_5".to_string(),
WhichModel::V2 => "microsoft/phi-2".to_string(),
WhichModel::V2 | WhichModel::V2Old => "microsoft/phi-2".to_string(),
WhichModel::PuffinPhiV2 | WhichModel::PhiHermes => {
"lmz/candle-quantized-phi".to_string()
}
@ -245,8 +250,9 @@ fn main() -> Result<()> {
"main".to_string()
} else {
match args.model {
WhichModel::V1 => "refs/pr/2".to_string(),
WhichModel::V1_5 => "refs/pr/18".to_string(),
WhichModel::V1 => "refs/pr/8".to_string(),
WhichModel::V1_5 => "refs/pr/73".to_string(),
WhichModel::V2Old => "834565c23f9b28b96ccbeabe614dd906b6db551a".to_string(),
WhichModel::V2 | WhichModel::PuffinPhiV2 | WhichModel::PhiHermes => {
"main".to_string()
}
@ -258,7 +264,9 @@ fn main() -> Result<()> {
let tokenizer_filename = match args.tokenizer {
Some(file) => std::path::PathBuf::from(file),
None => match args.model {
WhichModel::V1 | WhichModel::V1_5 | WhichModel::V2 => repo.get("tokenizer.json")?,
WhichModel::V1 | WhichModel::V1_5 | WhichModel::V2 | WhichModel::V2Old => {
repo.get("tokenizer.json")?
}
WhichModel::PuffinPhiV2 | WhichModel::PhiHermes => {
repo.get("tokenizer-puffin-phi-v2.json")?
}
@ -271,14 +279,14 @@ fn main() -> Result<()> {
match args.model {
WhichModel::V1 => vec![repo.get("model-v1-q4k.gguf")?],
WhichModel::V1_5 => vec![repo.get("model-q4k.gguf")?],
WhichModel::V2 => vec![repo.get("model-v2-q4k.gguf")?],
WhichModel::V2 | WhichModel::V2Old => vec![repo.get("model-v2-q4k.gguf")?],
WhichModel::PuffinPhiV2 => vec![repo.get("model-puffin-phi-v2-q4k.gguf")?],
WhichModel::PhiHermes => vec![repo.get("model-phi-hermes-1_3B-q4k.gguf")?],
}
} else {
match args.model {
WhichModel::V1 | WhichModel::V1_5 => vec![repo.get("model.safetensors")?],
WhichModel::V2 => candle_examples::hub_load_safetensors(
WhichModel::V2 | WhichModel::V2Old => candle_examples::hub_load_safetensors(
&repo,
"model.safetensors.index.json",
)?,
@ -292,33 +300,44 @@ fn main() -> Result<()> {
let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
let start = std::time::Instant::now();
let config = match args.model {
let config = || match args.model {
WhichModel::V1 => Config::v1(),
WhichModel::V1_5 => Config::v1_5(),
WhichModel::V2 => Config::v2(),
WhichModel::V2 | WhichModel::V2Old => Config::v2(),
WhichModel::PuffinPhiV2 => Config::puffin_phi_v2(),
WhichModel::PhiHermes => Config::phi_hermes_1_3b(),
};
let device = candle_examples::device(args.cpu)?;
let model = if args.quantized {
let config = config();
let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
&filenames[0],
&device,
)?;
println!("Loaded vb");
let model = match args.model {
WhichModel::V2 => QMixFormer::new_v2(&config, vb)?,
WhichModel::V2 | WhichModel::V2Old => QMixFormer::new_v2(&config, vb)?,
_ => QMixFormer::new(&config, vb)?,
};
println!("Loaded model");
Model::Quantized(model)
} else {
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, DType::F32, &device)? };
let model = match args.model {
WhichModel::V2 => MixFormer::new_v2(&config, vb)?,
_ => MixFormer::new(&config, vb)?,
};
Model::MixFormer(model)
match args.model {
WhichModel::V1 | WhichModel::V1_5 | WhichModel::V2 => {
let config_filename = repo.get("config.json")?;
let config = std::fs::read_to_string(config_filename)?;
let config: PhiConfig = serde_json::from_str(&config)?;
let phi = Phi::new(&config, vb)?;
Model::Phi(phi)
}
WhichModel::V2Old => {
let config = config();
Model::MixFormer(MixFormer::new_v2(&config, vb)?)
}
WhichModel::PhiHermes | WhichModel::PuffinPhiV2 => {
let config = config();
Model::MixFormer(MixFormer::new(&config, vb)?)
}
}
};
println!("loaded the model in {:?}", start.elapsed());
@ -398,6 +417,10 @@ fn mmlu<P: AsRef<std::path::Path>>(
m.clear_kv_cache();
m.forward(&input)?
}
Model::Phi(m) => {
m.clear_kv_cache();
m.forward(&input)?
}
Model::Quantized(m) => {
m.clear_kv_cache();
m.forward(&input)?

View File

@ -0,0 +1,281 @@
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use anyhow::{Error as E, Result};
use clap::Parser;
use candle_transformers::models::qwen2::{Config, Model};
use candle::{DType, Device, Tensor};
use candle_examples::token_output_stream::TokenOutputStream;
use candle_nn::VarBuilder;
use candle_transformers::generation::LogitsProcessor;
use hf_hub::{api::sync::Api, Repo, RepoType};
use tokenizers::Tokenizer;
struct TextGeneration {
model: Model,
device: Device,
tokenizer: TokenOutputStream,
logits_processor: LogitsProcessor,
repeat_penalty: f32,
repeat_last_n: usize,
}
impl TextGeneration {
#[allow(clippy::too_many_arguments)]
fn new(
model: Model,
tokenizer: Tokenizer,
seed: u64,
temp: Option<f64>,
top_p: Option<f64>,
repeat_penalty: f32,
repeat_last_n: usize,
device: &Device,
) -> Self {
let logits_processor = LogitsProcessor::new(seed, temp, top_p);
Self {
model,
tokenizer: TokenOutputStream::new(tokenizer),
logits_processor,
repeat_penalty,
repeat_last_n,
device: device.clone(),
}
}
fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
use std::io::Write;
self.tokenizer.clear();
let mut tokens = self
.tokenizer
.tokenizer()
.encode(prompt, true)
.map_err(E::msg)?
.get_ids()
.to_vec();
for &t in tokens.iter() {
if let Some(t) = self.tokenizer.next_token(t)? {
print!("{t}")
}
}
std::io::stdout().flush()?;
let mut generated_tokens = 0usize;
let eos_token = match self.tokenizer.get_token("<|endoftext|>") {
Some(token) => token,
None => anyhow::bail!("cannot find the <|endoftext|> token"),
};
let start_gen = std::time::Instant::now();
for index in 0..sample_len {
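// After the first step the KV cache already holds the prompt, so only the newest
// token is fed to the model together with its position offset.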
let context_size = if index > 0 { 1 } else { tokens.len() };
let start_pos = tokens.len().saturating_sub(context_size);
let ctxt = &tokens[start_pos..];
let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
let logits = self.model.forward(&input, start_pos)?;
let logits = logits.squeeze(0)?.squeeze(0)?.to_dtype(DType::F32)?;
let logits = if self.repeat_penalty == 1. {
logits
} else {
let start_at = tokens.len().saturating_sub(self.repeat_last_n);
candle_transformers::utils::apply_repeat_penalty(
&logits,
self.repeat_penalty,
&tokens[start_at..],
)?
};
let next_token = self.logits_processor.sample(&logits)?;
tokens.push(next_token);
generated_tokens += 1;
if next_token == eos_token {
break;
}
if let Some(t) = self.tokenizer.next_token(next_token)? {
print!("{t}");
std::io::stdout().flush()?;
}
}
let dt = start_gen.elapsed();
if let Some(rest) = self.tokenizer.decode_rest().map_err(E::msg)? {
print!("{rest}");
}
std::io::stdout().flush()?;
println!(
"\n{generated_tokens} tokens generated ({:.2} token/s)",
generated_tokens as f64 / dt.as_secs_f64(),
);
Ok(())
}
}
#[derive(Clone, Copy, Debug, clap::ValueEnum, PartialEq, Eq)]
enum WhichModel {
#[value(name = "0.5b")]
W0_5b,
#[value(name = "1.8b")]
W1_8b,
#[value(name = "4b")]
W4b,
#[value(name = "7b")]
W7b,
#[value(name = "14b")]
W14b,
#[value(name = "72b")]
W72b,
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
/// Enable tracing (generates a trace-timestamp.json file).
#[arg(long)]
tracing: bool,
#[arg(long)]
use_flash_attn: bool,
#[arg(long)]
prompt: String,
/// The temperature used to generate samples.
#[arg(long)]
temperature: Option<f64>,
/// Nucleus sampling probability cutoff.
#[arg(long)]
top_p: Option<f64>,
/// The seed to use when generating random samples.
#[arg(long, default_value_t = 299792458)]
seed: u64,
/// The length of the sample to generate (in tokens).
#[arg(long, short = 'n', default_value_t = 10000)]
sample_len: usize,
#[arg(long)]
model_id: Option<String>,
#[arg(long, default_value = "main")]
revision: String,
#[arg(long)]
tokenizer_file: Option<String>,
#[arg(long)]
weight_files: Option<String>,
/// Penalty to be applied for repeating tokens, 1. means no penalty.
#[arg(long, default_value_t = 1.1)]
repeat_penalty: f32,
/// The context size to consider for the repeat penalty.
#[arg(long, default_value_t = 64)]
repeat_last_n: usize,
#[arg(long, default_value = "0.5b")]
model: WhichModel,
}
fn main() -> Result<()> {
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;
let args = Args::parse();
let _guard = if args.tracing {
let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
tracing_subscriber::registry().with(chrome_layer).init();
Some(guard)
} else {
None
};
println!(
"avx: {}, neon: {}, simd128: {}, f16c: {}",
candle::utils::with_avx(),
candle::utils::with_neon(),
candle::utils::with_simd128(),
candle::utils::with_f16c()
);
println!(
"temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
args.temperature.unwrap_or(0.),
args.repeat_penalty,
args.repeat_last_n
);
let start = std::time::Instant::now();
let api = Api::new()?;
let model_id = match args.model_id {
Some(model_id) => model_id,
None => {
let size = match args.model {
WhichModel::W0_5b => "0.5B",
WhichModel::W1_8b => "1.8B",
WhichModel::W4b => "4B",
WhichModel::W7b => "7B",
WhichModel::W14b => "14B",
WhichModel::W72b => "72B",
};
format!("Qwen/Qwen1.5-{size}")
}
};
let repo = api.repo(Repo::with_revision(
model_id,
RepoType::Model,
args.revision,
));
let tokenizer_filename = match args.tokenizer_file {
Some(file) => std::path::PathBuf::from(file),
None => repo.get("tokenizer.json")?,
};
let filenames = match args.weight_files {
Some(files) => files
.split(',')
.map(std::path::PathBuf::from)
.collect::<Vec<_>>(),
None => match args.model {
WhichModel::W0_5b | WhichModel::W1_8b => vec![repo.get("model.safetensors")?],
WhichModel::W4b | WhichModel::W7b | WhichModel::W14b | WhichModel::W72b => {
candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?
}
},
};
println!("retrieved the files in {:?}", start.elapsed());
let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
let start = std::time::Instant::now();
let config_file = repo.get("config.json")?;
let config: Config = serde_json::from_slice(&std::fs::read(config_file)?)?;
let device = candle_examples::device(args.cpu)?;
let dtype = if device.is_cuda() {
DType::BF16
} else {
DType::F32
};
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
let model = Model::new(&config, vb)?;
println!("loaded the model in {:?}", start.elapsed());
let mut pipeline = TextGeneration::new(
model,
tokenizer,
args.seed,
args.temperature,
args.top_p,
args.repeat_penalty,
args.repeat_last_n,
&device,
);
pipeline.run(&args.prompt, args.sample_len)?;
Ok(())
}

View File

@ -411,7 +411,7 @@ impl DDPG<'_> {
pub fn actions(&mut self, state: &Tensor) -> Result<f32> {
let actions = self
.actor
.forward(&state.detach()?.unsqueeze(0)?)?
.forward(&state.detach().unsqueeze(0)?)?
.squeeze(0)?;
let actions = if self.train {
(actions + self.ou_noise.sample()?)?

View File

@ -74,7 +74,7 @@ pub fn run() -> Result<()> {
loop {
let action = {
let action_probs: Vec<f32> =
softmax(&model.forward(&state.detach()?.unsqueeze(0)?)?, 1)?
softmax(&model.forward(&state.detach().unsqueeze(0)?)?, 1)?
.squeeze(0)?
.to_vec1()?;
weighted_sample(action_probs, &mut rng)? as i64
@ -109,7 +109,7 @@ pub fn run() -> Result<()> {
let rewards = Tensor::from_vec(accumulate_rewards(&steps), batch_size, &Device::Cpu)?
.to_dtype(DType::F32)?
.detach()?;
.detach();
let actions_mask = {
let actions: Vec<i64> = steps.iter().map(|s| s.action).collect();
@ -126,12 +126,12 @@ pub fn run() -> Result<()> {
.unwrap()
})
.collect();
Tensor::stack(&actions_mask, 0)?.detach()?
Tensor::stack(&actions_mask, 0)?.detach()
};
let states = {
let states: Vec<Tensor> = steps.into_iter().map(|s| s.state).collect();
Tensor::stack(&states, 0)?.detach()?
Tensor::stack(&states, 0)?.detach()
};
let log_probs = actions_mask

View File

@ -236,18 +236,15 @@ fn main() -> Result<()> {
let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
let start = std::time::Instant::now();
let device = Device::Cpu;
let device = candle_examples::device(args.cpu)?;
let config = Config::replit_code_v1_5_3b();
let (model, device) = if args.quantized {
let model = if args.quantized {
let vb =
candle_transformers::quantized_var_builder::VarBuilder::from_gguf(&filename, &device)?;
let model = Model::Q(Q::new(&config, vb.pp("transformer"))?);
(model, Device::Cpu)
Model::Q(Q::new(&config, vb.pp("transformer"))?)
} else {
let device = candle_examples::device(args.cpu)?;
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[filename], DType::F32, &device)? };
let model = Model::M(M::new(&config, vb.pp("transformer"))?);
(model, device)
Model::M(M::new(&config, vb.pp("transformer"))?)
};
println!("loaded the model in {:?}", start.elapsed());

View File

@ -0,0 +1,22 @@
# candle-repvgg
[RepVGG: Making VGG-style ConvNets Great Again](https://arxiv.org/abs/2101.03697).
This candle implementation uses a pre-trained RepVGG network for inference. The
classification head has been trained on the ImageNet dataset and returns the
probabilities for the top-5 classes.
## Running an example
```
$ cargo run --example repvgg --release -- --image candle-examples/examples/yolo-v8/assets/bike.jpg
loaded image Tensor[dims 3, 224, 224; f32]
model built
mountain bike, all-terrain bike, off-roader: 61.70%
bicycle-built-for-two, tandem bicycle, tandem: 33.14%
unicycle, monocycle : 4.88%
crash helmet : 0.15%
moped : 0.04%
```

View File

@ -0,0 +1,111 @@
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
use clap::{Parser, ValueEnum};
use candle::{DType, IndexOp, D};
use candle_nn::{Module, VarBuilder};
use candle_transformers::models::repvgg;
#[derive(Clone, Copy, Debug, ValueEnum)]
enum Which {
A0,
A1,
A2,
B0,
B1,
B2,
B3,
B1G4,
B2G4,
B3G4,
}
impl Which {
fn model_filename(&self) -> String {
let name = match self {
Self::A0 => "a0",
Self::A1 => "a1",
Self::A2 => "a2",
Self::B0 => "b0",
Self::B1 => "b1",
Self::B2 => "b2",
Self::B3 => "b3",
Self::B1G4 => "b1g4",
Self::B2G4 => "b2g4",
Self::B3G4 => "b3g4",
};
format!("timm/repvgg_{}.rvgg_in1k", name)
}
fn config(&self) -> repvgg::Config {
match self {
Self::A0 => repvgg::Config::a0(),
Self::A1 => repvgg::Config::a1(),
Self::A2 => repvgg::Config::a2(),
Self::B0 => repvgg::Config::b0(),
Self::B1 => repvgg::Config::b1(),
Self::B2 => repvgg::Config::b2(),
Self::B3 => repvgg::Config::b3(),
Self::B1G4 => repvgg::Config::b1g4(),
Self::B2G4 => repvgg::Config::b2g4(),
Self::B3G4 => repvgg::Config::b3g4(),
}
}
}
#[derive(Parser)]
struct Args {
#[arg(long)]
model: Option<String>,
#[arg(long)]
image: String,
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
#[arg(value_enum, long, default_value_t=Which::A0)]
which: Which,
}
pub fn main() -> anyhow::Result<()> {
let args = Args::parse();
let device = candle_examples::device(args.cpu)?;
let image = candle_examples::imagenet::load_image224(args.image)?;
println!("loaded image {image:?}");
let model_file = match args.model {
None => {
let model_name = args.which.model_filename();
let api = hf_hub::api::sync::Api::new()?;
let api = api.model(model_name);
api.get("model.safetensors")?
}
Some(model) => model.into(),
};
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
let model = repvgg::repvgg(&args.which.config(), 1000, vb)?;
println!("model built");
let logits = model.forward(&image.unsqueeze(0)?)?;
let prs = candle_nn::ops::softmax(&logits, D::Minus1)?
.i(0)?
.to_vec1::<f32>()?;
let mut prs = prs.iter().enumerate().collect::<Vec<_>>();
prs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
for &(category_idx, pr) in prs.iter().take(5) {
println!(
"{:24}: {:.2}%",
candle_examples::imagenet::CLASSES[category_idx],
100. * pr
);
}
Ok(())
}

View File

@ -8,6 +8,13 @@ Card](https://huggingface.co/stabilityai/stablelm-3b-4e1t).
Note that this model is gated so you will have to request access on the Hub in
order to be able to use it.
Other available models are Stable-Code-3B, StableLM-2 and Zephyr variants.
StableLM-2 uses a Tiktoken-based GPT-3.5/GPT-4 tokenizer not supported by
Candle, so to run it you can download a somewhat compatible
[tokenizer.json](https://huggingface.co/Xenova/gpt-4/resolve/main/tokenizer.json?download=true)
and pass it via the --tokenizer-file argument.
## Running some examples
```bash

View File

@ -5,7 +5,7 @@ extern crate intel_mkl_src;
extern crate accelerate_src;
use anyhow::{Error as E, Result};
use clap::Parser;
use clap::{Parser, ValueEnum};
use candle_transformers::models::quantized_stable_lm::Model as QStableLM;
use candle_transformers::models::stable_lm::{Config, Model as StableLM};
@ -122,6 +122,16 @@ impl TextGeneration {
}
}
#[derive(Clone, Copy, Debug, ValueEnum, PartialEq, Eq)]
enum Which {
V1Orig,
V1,
V1Zephyr,
V2,
V2Zephyr,
Code,
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
@ -152,15 +162,18 @@ struct Args {
seed: u64,
/// The length of the sample to generate (in tokens).
#[arg(long, short = 'n', default_value_t = 100)]
#[arg(long, short = 'n', default_value_t = 1000)]
sample_len: usize,
#[arg(long, default_value = "lmz/candle-stablelm-3b-4e1t")]
model_id: String,
#[arg(long)]
model_id: Option<String>,
#[arg(long, default_value = "main")]
revision: String,
#[arg(long, default_value = "v2")]
which: Which,
#[arg(long)]
tokenizer_file: Option<String>,
@ -207,33 +220,80 @@ fn main() -> Result<()> {
let start = std::time::Instant::now();
let api = Api::new()?;
let model_id = match args.model_id {
Some(model_id) => model_id,
None => match args.which {
Which::V1Orig => "lmz/candle-stablelm-3b-4e1t".to_string(),
Which::V1 => "stabilityai/stablelm-3b-4e1t".to_string(),
Which::V1Zephyr => "stabilityai/stablelm-zephyr-3b".to_string(),
Which::Code => "stabilityai/stable-code-3b".to_string(),
Which::V2 => "stabilityai/stablelm-2-1_6b".to_string(),
Which::V2Zephyr => "stabilityai/stablelm-2-zephyr-1_6b".to_string(),
},
};
let repo = api.repo(Repo::with_revision(
args.model_id,
model_id,
RepoType::Model,
args.revision,
));
let tokenizer_filename = match args.tokenizer_file {
Some(file) => std::path::PathBuf::from(file),
None => repo.get("tokenizer.json")?,
None => match args.which {
Which::V1Orig | Which::V1 | Which::V1Zephyr | Which::Code => {
repo.get("tokenizer.json")?
}
Which::V2 | Which::V2Zephyr => api
.model("lmz/candle-stablelm".to_string())
.get("tokenizer-gpt4.json")?,
},
};
let filenames = match args.weight_files {
Some(files) => files
.split(',')
.map(std::path::PathBuf::from)
.collect::<Vec<_>>(),
None => {
if args.quantized {
vec![repo.get("model-q4k.gguf")?]
} else {
None => match (args.which, args.quantized) {
(Which::V1Orig | Which::V1, true) => vec![repo.get("model-q4k.gguf")?],
(Which::V2, true) => {
let gguf = api
.model("lmz/candle-stablelm".to_string())
.get("stablelm-2-1_6b-q4k.gguf")?;
vec![gguf]
}
(Which::V2Zephyr, true) => {
let gguf = api
.model("lmz/candle-stablelm".to_string())
.get("stablelm-2-zephyr-1_6b-q4k.gguf")?;
vec![gguf]
}
(Which::V1Zephyr | Which::Code, true) => {
anyhow::bail!("Quantized {:?} variant not supported.", args.which)
}
(Which::V1Orig | Which::V1 | Which::V1Zephyr | Which::V2 | Which::V2Zephyr, false) => {
vec![repo.get("model.safetensors")?]
}
}
(Which::Code, false) => {
candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?
}
},
};
println!("retrieved the files in {:?}", start.elapsed());
let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
let start = std::time::Instant::now();
let config = Config::stablelm_3b_4e1t(args.use_flash_attn);
let config = match args.which {
Which::V1Orig => Config::stablelm_3b_4e1t(args.use_flash_attn),
Which::V1 | Which::V1Zephyr | Which::V2 | Which::V2Zephyr | Which::Code => {
let config_filename = repo.get("config.json")?;
let config = std::fs::read_to_string(config_filename)?;
let mut config: Config = serde_json::from_str(&config)?;
config.set_use_flash_attn(args.use_flash_attn);
config
}
};
let device = candle_examples::device(args.cpu)?;
let (model, device) = if args.quantized {
let filename = &filenames[0];

Binary file not shown (new image asset, 7.5 KiB).

View File

@ -10,15 +10,36 @@ use clap::{Parser, ValueEnum};
use candle::{DType, Tensor};
use candle_examples::token_output_stream::TokenOutputStream;
use candle_nn::VarBuilder;
use candle_transformers::models::trocr;
use candle_transformers::models::{trocr, vit};
use tokenizers::Tokenizer;
mod image_processor;
#[derive(Clone, Debug, Copy, ValueEnum)]
enum Which {
Base,
Large,
#[value(name = "base")]
BaseHandwritten,
#[value(name = "large")]
LargeHandwritten,
BasePrinted,
LargePrinted,
}
impl Which {
fn repo_and_branch_name(&self) -> (&str, &str) {
match self {
Self::BaseHandwritten => ("microsoft/trocr-base-handwritten", "refs/pr/3"),
Self::LargeHandwritten => ("microsoft/trocr-large-handwritten", "refs/pr/6"),
Self::BasePrinted => ("microsoft/trocr-base-printed", "refs/pr/7"),
Self::LargePrinted => ("microsoft/trocr-large-printed", "main"),
}
}
}
#[derive(Debug, Clone, serde::Deserialize)]
struct Config {
encoder: vit::Config,
decoder: trocr::TrOCRConfig,
}
#[derive(Parser, Debug)]
@ -34,63 +55,64 @@ struct Args {
#[arg(long)]
cpu: bool,
/// Text to be translated
/// The image file to be processed.
#[arg(long)]
image: String,
/// Tokenization config.
#[arg(long)]
tokenizer: Option<String>,
}
pub fn main() -> anyhow::Result<()> {
use hf_hub::api::sync::Api;
let args = Args::parse();
let api = hf_hub::api::sync::Api::new()?;
let tokenizer_dec = {
let tokenizer = Api::new()?
.model(String::from("ToluClassics/candle-trocr-tokenizer"))
.get("tokenizer.json")?;
Tokenizer::from_file(&tokenizer).map_err(E::msg)?
let mut tokenizer_dec = {
let tokenizer_file = match args.tokenizer {
None => api
.model(String::from("ToluClassics/candle-trocr-tokenizer"))
.get("tokenizer.json")?,
Some(tokenizer) => std::path::PathBuf::from(tokenizer),
};
let tokenizer = Tokenizer::from_file(&tokenizer_file).map_err(E::msg)?;
TokenOutputStream::new(tokenizer)
};
let mut tokenizer_dec = TokenOutputStream::new(tokenizer_dec);
let device = candle_examples::device(args.cpu)?;
let vb = {
let model = match args.model {
Some(model) => std::path::PathBuf::from(model),
None => match args.which {
Which::Base => Api::new()?
.repo(hf_hub::Repo::with_revision(
"microsoft/trocr-base-handwritten".to_string(),
hf_hub::RepoType::Model,
"refs/pr/3".to_string(),
))
.get("model.safetensors")?,
Which::Large => Api::new()?
.repo(hf_hub::Repo::with_revision(
"microsoft/trocr-large-handwritten".to_string(),
hf_hub::RepoType::Model,
"refs/pr/6".to_string(),
))
.get("model.safetensors")?,
},
None => {
let (repo, branch) = args.which.repo_and_branch_name();
api.repo(hf_hub::Repo::with_revision(
repo.to_string(),
hf_hub::RepoType::Model,
branch.to_string(),
))
.get("model.safetensors")?
}
};
println!("model: {:?}", model);
unsafe { VarBuilder::from_mmaped_safetensors(&[model], DType::F32, &device)? }
};
let encoder_config = match args.which {
Which::Base => candle_transformers::models::vit::Config::microsoft_trocr_base_handwritten(),
Which::Large => {
candle_transformers::models::vit::Config::microsoft_trocr_base_handwritten()
}
let (encoder_config, decoder_config) = {
let (repo, branch) = args.which.repo_and_branch_name();
let config_filename = api
.repo(hf_hub::Repo::with_revision(
repo.to_string(),
hf_hub::RepoType::Model,
branch.to_string(),
))
.get("config.json")?;
let config: Config = serde_json::from_reader(std::fs::File::open(config_filename)?)?;
(config.encoder, config.decoder)
};
let decoder_config = trocr::TrOCRConfig::default();
let mut model = trocr::TrOCRModel::new(&encoder_config, &decoder_config, vb)?;
let config = image_processor::ProcessorConfig::default();
let processor = image_processor::ViTImageProcessor::new(&config);
let processor_config = image_processor::ProcessorConfig::default();
let processor = image_processor::ViTImageProcessor::new(&processor_config);
let image = vec![args.image.as_str()];
let image = processor.preprocess(image)?;

View File

@ -5,12 +5,27 @@ transcribe image text. See the associated [model
card](https://huggingface.co/microsoft/trocr-base-printed) for details on
the model itself.
Supported models include:
- `--which base`: small handwritten OCR model.
- `--which large`: large handwritten OCR model.
- `--which base-printed`: small printed OCR model.
- `--which large-printed`: large printed OCR model.
## Running an example
```bash
cargo run --example trocr --release -- --which base --cpu --image candle-examples/examples/trocr/assets/trocr.png
cargo run --example trocr --release -- --image candle-examples/examples/trocr/assets/trocr.png
cargo run --example trocr --release -- --which large --image candle-examples/examples/trocr/assets/trocr.png
cargo run --example trocr --release -- --which base-printed --image candle-examples/examples/trocr/assets/noto.png
cargo run --example trocr --release -- --which large-printed --image candle-examples/examples/trocr/assets/noto.png
```
### Outputs
```
<s> industry , Mr. Brown commented icily . " Let us have a</s>
industry , Mr. Brown commented icily . " Let us have a
industry , " Mr. Brown commented icily . " Let us have a
THE QUICK BROWN FOR JUMPS OVER THE LAY DOG
THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG
```

View File

@ -0,0 +1,673 @@
#[cfg(feature = "accelerate")]
extern crate accelerate_src;
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;
use anyhow::{Error as E, Result};
use candle::{Device, IndexOp, Tensor};
use candle_nn::{ops::softmax, VarBuilder};
use clap::{Parser, ValueEnum};
use hf_hub::{api::sync::Api, Repo, RepoType};
use rand::{distributions::Distribution, SeedableRng};
use std::iter;
use tokenizers::Tokenizer;
mod multilingual;
use candle_transformers::models::whisper::{self as m, audio, Config};
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use std::sync::{Arc, Mutex};
pub enum Model {
Normal(m::model::Whisper),
Quantized(m::quantized_model::Whisper),
}
// Maybe we should use some traits rather than doing the dispatch for all these.
impl Model {
pub fn config(&self) -> &Config {
match self {
Self::Normal(m) => &m.config,
Self::Quantized(m) => &m.config,
}
}
pub fn encoder_forward(&mut self, x: &Tensor, flush: bool) -> candle::Result<Tensor> {
match self {
Self::Normal(m) => m.encoder.forward(x, flush),
Self::Quantized(m) => m.encoder.forward(x, flush),
}
}
pub fn decoder_forward(
&mut self,
x: &Tensor,
xa: &Tensor,
flush: bool,
) -> candle::Result<Tensor> {
match self {
Self::Normal(m) => m.decoder.forward(x, xa, flush),
Self::Quantized(m) => m.decoder.forward(x, xa, flush),
}
}
pub fn decoder_final_linear(&self, x: &Tensor) -> candle::Result<Tensor> {
match self {
Self::Normal(m) => m.decoder.final_linear(x),
Self::Quantized(m) => m.decoder.final_linear(x),
}
}
}
#[allow(dead_code)]
#[derive(Debug, Clone)]
struct DecodingResult {
tokens: Vec<u32>,
text: String,
avg_logprob: f64,
no_speech_prob: f64,
temperature: f64,
compression_ratio: f64,
}
#[allow(dead_code)]
#[derive(Debug, Clone)]
struct Segment {
start: f64,
duration: f64,
dr: DecodingResult,
}
struct Decoder {
model: Model,
rng: rand::rngs::StdRng,
task: Option<Task>,
timestamps: bool,
verbose: bool,
tokenizer: Tokenizer,
suppress_tokens: Tensor,
sot_token: u32,
transcribe_token: u32,
translate_token: u32,
eot_token: u32,
no_speech_token: u32,
no_timestamps_token: u32,
language_token: Option<u32>,
}
impl Decoder {
#[allow(clippy::too_many_arguments)]
fn new(
model: Model,
tokenizer: Tokenizer,
seed: u64,
device: &Device,
language_token: Option<u32>,
task: Option<Task>,
timestamps: bool,
verbose: bool,
) -> Result<Self> {
let no_timestamps_token = token_id(&tokenizer, m::NO_TIMESTAMPS_TOKEN)?;
// Suppress the notimestamps token when in timestamps mode.
// https://github.com/openai/whisper/blob/e8622f9afc4eba139bf796c210f5c01081000472/whisper/decoding.py#L452
let suppress_tokens: Vec<f32> = (0..model.config().vocab_size as u32)
.map(|i| {
if model.config().suppress_tokens.contains(&i)
|| timestamps && i == no_timestamps_token
{
f32::NEG_INFINITY
} else {
0f32
}
})
.collect();
let suppress_tokens = Tensor::new(suppress_tokens.as_slice(), device)?;
let sot_token = token_id(&tokenizer, m::SOT_TOKEN)?;
let transcribe_token = token_id(&tokenizer, m::TRANSCRIBE_TOKEN)?;
let translate_token = token_id(&tokenizer, m::TRANSLATE_TOKEN)?;
let eot_token = token_id(&tokenizer, m::EOT_TOKEN)?;
let no_speech_token = m::NO_SPEECH_TOKENS
.iter()
.find_map(|token| token_id(&tokenizer, token).ok());
let no_speech_token = match no_speech_token {
None => anyhow::bail!("unable to find any non-speech token"),
Some(n) => n,
};
Ok(Self {
model,
rng: rand::rngs::StdRng::seed_from_u64(seed),
tokenizer,
task,
timestamps,
verbose,
suppress_tokens,
sot_token,
transcribe_token,
translate_token,
eot_token,
no_speech_token,
language_token,
no_timestamps_token,
})
}
fn decode(&mut self, mel: &Tensor, t: f64) -> Result<DecodingResult> {
let model = &mut self.model;
let audio_features = model.encoder_forward(mel, true)?;
if self.verbose {
println!("audio features: {:?}", audio_features.dims());
}
let sample_len = model.config().max_target_positions / 2;
let mut sum_logprob = 0f64;
let mut no_speech_prob = f64::NAN;
let mut tokens = vec![self.sot_token];
if let Some(language_token) = self.language_token {
tokens.push(language_token);
}
match self.task {
None | Some(Task::Transcribe) => tokens.push(self.transcribe_token),
Some(Task::Translate) => tokens.push(self.translate_token),
}
if !self.timestamps {
tokens.push(self.no_timestamps_token);
}
for i in 0..sample_len {
let tokens_t = Tensor::new(tokens.as_slice(), mel.device())?;
// The model expects a batch dim but this inference loop does not handle
// it so we add it at this point.
let tokens_t = tokens_t.unsqueeze(0)?;
let ys = model.decoder_forward(&tokens_t, &audio_features, i == 0)?;
// Extract the no speech probability on the first iteration by looking at the first
// token logits and the probability for the according token.
if i == 0 {
let logits = model.decoder_final_linear(&ys.i(..1)?)?.i(0)?.i(0)?;
no_speech_prob = softmax(&logits, 0)?
.i(self.no_speech_token as usize)?
.to_scalar::<f32>()? as f64;
}
let (_, seq_len, _) = ys.dims3()?;
let logits = model
.decoder_final_linear(&ys.i((..1, seq_len - 1..))?)?
.i(0)?
.i(0)?;
// TODO: Besides suppress tokens, we should apply the heuristics from
// ApplyTimestampRules, i.e.:
// - Timestamps come in pairs, except before EOT.
// - Timestamps should be non-decreasing.
// - If the sum of the probabilities of timestamps is higher than any other tokens,
// only consider timestamps when sampling.
// https://github.com/openai/whisper/blob/e8622f9afc4eba139bf796c210f5c01081000472/whisper/decoding.py#L439
let logits = logits.broadcast_add(&self.suppress_tokens)?;
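// With a positive temperature, sample from the softmax distribution; at t == 0 fall
// back to greedy argmax decoding.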
let next_token = if t > 0f64 {
let prs = softmax(&(&logits / t)?, 0)?;
let logits_v: Vec<f32> = prs.to_vec1()?;
let distr = rand::distributions::WeightedIndex::new(&logits_v)?;
distr.sample(&mut self.rng) as u32
} else {
let logits_v: Vec<f32> = logits.to_vec1()?;
logits_v
.iter()
.enumerate()
.max_by(|(_, u), (_, v)| u.total_cmp(v))
.map(|(i, _)| i as u32)
.unwrap()
};
tokens.push(next_token);
let prob = softmax(&logits, candle::D::Minus1)?
.i(next_token as usize)?
.to_scalar::<f32>()? as f64;
if next_token == self.eot_token || tokens.len() > model.config().max_target_positions {
break;
}
sum_logprob += prob.ln();
}
let text = self.tokenizer.decode(&tokens, true).map_err(E::msg)?;
let avg_logprob = sum_logprob / tokens.len() as f64;
Ok(DecodingResult {
tokens,
text,
avg_logprob,
no_speech_prob,
temperature: t,
compression_ratio: f64::NAN,
})
}
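// The following helper is not part of the original example; it is a minimal sketch of one of
// the ApplyTimestampRules heuristics mentioned in the TODO above (timestamps must be
// non-decreasing), assuming the timestamp tokens directly follow `no_timestamps_token` in the
// vocabulary. The function name and the `last_timestamp_token` parameter are hypothetical.
#[allow(dead_code)]
fn suppress_earlier_timestamps(
logits: &Tensor,
no_timestamps_token: u32,
last_timestamp_token: Option<u32>,
) -> Result<Tensor> {
let vocab_size = logits.dims1()?;
let first_timestamp_token = no_timestamps_token + 1;
let min_allowed = last_timestamp_token.unwrap_or(first_timestamp_token);
let mask: Vec<f32> = (0..vocab_size as u32)
.map(|i| {
// Mask out timestamp tokens that are strictly below the last emitted timestamp.
if i >= first_timestamp_token && i < min_allowed {
f32::NEG_INFINITY
} else {
0f32
}
})
.collect();
let mask = Tensor::new(mask.as_slice(), logits.device())?;
Ok(logits.broadcast_add(&mask)?)
}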
fn decode_with_fallback(&mut self, segment: &Tensor) -> Result<DecodingResult> {
for (i, &t) in m::TEMPERATURES.iter().enumerate() {
let dr: Result<DecodingResult> = self.decode(segment, t);
if i == m::TEMPERATURES.len() - 1 {
return dr;
}
// On errors, we try again with a different temperature.
match dr {
Ok(dr) => {
let needs_fallback = dr.compression_ratio > m::COMPRESSION_RATIO_THRESHOLD
|| dr.avg_logprob < m::LOGPROB_THRESHOLD;
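// Note: compression_ratio is currently left as NaN by `decode`, so in practice only the
// avg_logprob check can trigger a fallback here.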
if !needs_fallback || dr.no_speech_prob > m::NO_SPEECH_THRESHOLD {
return Ok(dr);
}
}
Err(err) => {
println!("Error running at {t}: {err}")
}
}
}
unreachable!()
}
fn run(&mut self, mel: &Tensor, times: Option<(f64, f64)>) -> Result<Vec<Segment>> {
let (_, _, content_frames) = mel.dims3()?;
let mut seek = 0;
let mut segments = vec![];
while seek < content_frames {
let start = std::time::Instant::now();
let time_offset = (seek * m::HOP_LENGTH) as f64 / m::SAMPLE_RATE as f64;
let segment_size = usize::min(content_frames - seek, m::N_FRAMES);
let mel_segment = mel.narrow(2, seek, segment_size)?;
let segment_duration = (segment_size * m::HOP_LENGTH) as f64 / m::SAMPLE_RATE as f64;
let dr = self.decode_with_fallback(&mel_segment)?;
seek += segment_size;
if dr.no_speech_prob > m::NO_SPEECH_THRESHOLD && dr.avg_logprob < m::LOGPROB_THRESHOLD {
println!("no speech detected, skipping {seek} {dr:?}");
continue;
}
let segment = Segment {
start: time_offset,
duration: segment_duration,
dr,
};
if self.timestamps {
println!(
"{:.1}s -- {:.1}s",
segment.start,
segment.start + segment.duration,
);
let mut tokens_to_decode = vec![];
let mut prev_timestamp_s = 0f32;
for &token in segment.dr.tokens.iter() {
if token == self.sot_token || token == self.eot_token {
continue;
}
// The no_timestamps_token is the last token before the timestamp tokens.
if token > self.no_timestamps_token {
let timestamp_s = (token - self.no_timestamps_token + 1) as f32 / 50.;
if !tokens_to_decode.is_empty() {
let text = self
.tokenizer
.decode(&tokens_to_decode, true)
.map_err(E::msg)?;
println!(" {:.1}s-{:.1}s: {}", prev_timestamp_s, timestamp_s, text);
tokens_to_decode.clear()
}
prev_timestamp_s = timestamp_s;
} else {
tokens_to_decode.push(token)
}
}
if !tokens_to_decode.is_empty() {
let text = self
.tokenizer
.decode(&tokens_to_decode, true)
.map_err(E::msg)?;
if !text.is_empty() {
println!(" {:.1}s-...: {}", prev_timestamp_s, text);
}
tokens_to_decode.clear()
}
} else {
match times {
Some((start, end)) => {
println!("{:.1}s -- {:.1}s: {}", start, end, segment.dr.text)
}
None => {
println!(
"{:.1}s -- {:.1}s: {}",
segment.start,
segment.start + segment.duration,
segment.dr.text,
)
}
}
}
if self.verbose {
println!("{seek}: {segment:?}, in {:?}", start.elapsed());
}
segments.push(segment)
}
Ok(segments)
}
fn set_language_token(&mut self, language_token: Option<u32>) {
self.language_token = language_token;
}
#[allow(dead_code)]
fn reset_kv_cache(&mut self) {
match &mut self.model {
Model::Normal(m) => m.reset_kv_cache(),
Model::Quantized(m) => m.reset_kv_cache(),
}
}
fn model(&mut self) -> &mut Model {
&mut self.model
}
}
pub fn token_id(tokenizer: &Tokenizer, token: &str) -> candle::Result<u32> {
match tokenizer.token_to_id(token) {
None => candle::bail!("no token-id for {token}"),
Some(id) => Ok(id),
}
}
#[derive(Clone, Copy, Debug, ValueEnum)]
enum Task {
Transcribe,
Translate,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, ValueEnum)]
enum WhichModel {
Tiny,
#[value(name = "tiny.en")]
TinyEn,
Base,
#[value(name = "base.en")]
BaseEn,
Small,
#[value(name = "small.en")]
SmallEn,
Medium,
#[value(name = "medium.en")]
MediumEn,
Large,
LargeV2,
LargeV3,
#[value(name = "distil-medium.en")]
DistilMediumEn,
#[value(name = "distil-large-v2")]
DistilLargeV2,
}
impl WhichModel {
fn is_multilingual(&self) -> bool {
match self {
Self::Tiny
| Self::Base
| Self::Small
| Self::Medium
| Self::Large
| Self::LargeV2
| Self::LargeV3
| Self::DistilLargeV2 => true,
Self::TinyEn | Self::BaseEn | Self::SmallEn | Self::MediumEn | Self::DistilMediumEn => {
false
}
}
}
fn model_and_revision(&self) -> (&'static str, &'static str) {
match self {
Self::Tiny => ("openai/whisper-tiny", "main"),
Self::TinyEn => ("openai/whisper-tiny.en", "refs/pr/15"),
Self::Base => ("openai/whisper-base", "refs/pr/22"),
Self::BaseEn => ("openai/whisper-base.en", "refs/pr/13"),
Self::Small => ("openai/whisper-small", "main"),
Self::SmallEn => ("openai/whisper-small.en", "refs/pr/10"),
Self::Medium => ("openai/whisper-medium", "main"),
Self::MediumEn => ("openai/whisper-medium.en", "main"),
Self::Large => ("openai/whisper-large", "refs/pr/36"),
Self::LargeV2 => ("openai/whisper-large-v2", "refs/pr/57"),
Self::LargeV3 => ("openai/whisper-large-v3", "main"),
Self::DistilMediumEn => ("distil-whisper/distil-medium.en", "main"),
Self::DistilLargeV2 => ("distil-whisper/distil-large-v2", "main"),
}
}
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
/// The model id to use; check out the available models at
/// https://huggingface.co/models?search=whisper
#[arg(long)]
model_id: Option<String>,
/// The model revision to use.
#[arg(long)]
revision: Option<String>,
/// The model size to use: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en,
/// large, large-v2, large-v3, distil-medium.en or distil-large-v2.
#[arg(long, default_value = "tiny.en")]
model: WhichModel,
/// The seed to use when generating random samples.
#[arg(long, default_value_t = 299792458)]
seed: u64,
/// Enable tracing (generates a trace-timestamp.json file).
#[arg(long)]
tracing: bool,
#[arg(long)]
quantized: bool,
/// Language.
#[arg(long)]
language: Option<String>,
/// The task to run. When no task is specified, the input tokens only contain the SOT token,
/// which can improve the results in no-timestamp mode.
#[arg(long)]
task: Option<Task>,
/// Enable timestamps mode (this is not fully implemented yet).
#[arg(long)]
timestamps: bool,
/// Print the full DecodingResult structure rather than just the text.
#[arg(long)]
verbose: bool,
}
pub fn main() -> Result<()> {
use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;
let args = Args::parse();
let _guard = if args.tracing {
let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
tracing_subscriber::registry().with(chrome_layer).init();
Some(guard)
} else {
None
};
let device = candle_examples::device(args.cpu)?;
let (default_model, default_revision) = if args.quantized {
("lmz/candle-whisper", "main")
} else {
args.model.model_and_revision()
};
let default_model = default_model.to_string();
let default_revision = default_revision.to_string();
let (model_id, revision) = match (args.model_id, args.revision) {
(Some(model_id), Some(revision)) => (model_id, revision),
(Some(model_id), None) => (model_id, "main".to_string()),
(None, Some(revision)) => (default_model, revision),
(None, None) => (default_model, default_revision),
};
let (config_filename, tokenizer_filename, weights_filename) = {
let api = Api::new()?;
let repo = api.repo(Repo::with_revision(model_id, RepoType::Model, revision));
let (config, tokenizer, model) = if args.quantized {
let ext = match args.model {
WhichModel::TinyEn => "tiny-en",
WhichModel::Tiny => "tiny",
_ => unimplemented!("no quantized support for {:?}", args.model),
};
(
repo.get(&format!("config-{ext}.json"))?,
repo.get(&format!("tokenizer-{ext}.json"))?,
repo.get(&format!("model-{ext}-q80.gguf"))?,
)
} else {
let config = repo.get("config.json")?;
let tokenizer = repo.get("tokenizer.json")?;
let model = repo.get("model.safetensors")?;
(config, tokenizer, model)
};
(config, tokenizer, model)
};
let config: Config = serde_json::from_str(&std::fs::read_to_string(config_filename)?)?;
let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
let model = if args.quantized {
let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
&weights_filename,
&device,
)?;
Model::Quantized(m::quantized_model::Whisper::load(&vb, config.clone())?)
} else {
let vb =
unsafe { VarBuilder::from_mmaped_safetensors(&[weights_filename], m::DTYPE, &device)? };
Model::Normal(m::model::Whisper::load(&vb, config.clone())?)
};
let language_token = None;
let mut dc = Decoder::new(
model,
tokenizer.clone(),
args.seed,
&device,
language_token,
args.task,
args.timestamps,
args.verbose,
)?;
let mel_bytes = match config.num_mel_bins {
80 => include_bytes!("../whisper/melfilters.bytes").as_slice(),
128 => include_bytes!("../whisper/melfilters128.bytes").as_slice(),
nmel => anyhow::bail!("unexpected num_mel_bins {nmel}"),
};
let mut mel_filters = vec![0f32; mel_bytes.len() / 4];
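// Each mel filterbank coefficient is stored as a little-endian f32 (4 bytes), hence the division by 4 above.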
<byteorder::LittleEndian as byteorder::ByteOrder>::read_f32_into(mel_bytes, &mut mel_filters);
// Set up the input device and stream with the default input config.
let host = cpal::default_host();
let device_name = "default";
let audio_device = if device_name == "default" {
host.default_input_device()
} else {
host.input_devices()?
.find(|x| x.name().map(|y| y == device_name).unwrap_or(false))
}
.expect("failed to find input device");
let audio_config = audio_device
.default_input_config()
.expect("Failed to get default input config");
let channel_count = audio_config.channels() as usize;
let audio_ring_buffer = Arc::new(Mutex::new(Vec::new()));
let audio_ring_buffer_2 = audio_ring_buffer.clone();
std::thread::spawn(move || loop {
let data = record_audio(&audio_device, &audio_config, 300).unwrap();
audio_ring_buffer.lock().unwrap().extend_from_slice(&data);
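// Keep at most 16 of the ~300 ms chunks (roughly 5 s of audio); older samples are dropped below.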
let max_len = data.len() * 16;
let data_len = data.len();
let len = audio_ring_buffer.lock().unwrap().len();
if len > max_len {
let mut data = audio_ring_buffer.lock().unwrap();
let new_data = data[data_len..].to_vec();
*data = new_data;
}
});
// loop to process the audio data forever (until the user stops the program)
println!("Transcribing audio...");
for (i, _) in iter::repeat(()).enumerate() {
std::thread::sleep(std::time::Duration::from_millis(1000));
let data = audio_ring_buffer_2.lock().unwrap().clone();
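// Note: this keeps the first len / channel_count interleaved samples rather than
// de-interleaving a single channel; for mono input both are equivalent.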
let pcm_data: Vec<_> = data[..data.len() / channel_count as usize]
.iter()
.map(|v| *v as f32 / 32768.)
.collect();
let mel = audio::pcm_to_mel(&config, &pcm_data, &mel_filters);
let mel_len = mel.len();
let mel = Tensor::from_vec(
mel,
(1, config.num_mel_bins, mel_len / config.num_mel_bins),
&device,
)?;
// on the first iteration, we detect the language and set the language token.
if i == 0 {
let language_token = match (args.model.is_multilingual(), args.language.clone()) {
(true, None) => Some(multilingual::detect_language(dc.model(), &tokenizer, &mel)?),
(false, None) => None,
(true, Some(language)) => match token_id(&tokenizer, &format!("<|{language}|>")) {
Ok(token_id) => Some(token_id),
Err(_) => anyhow::bail!("language {language} is not supported"),
},
(false, Some(_)) => {
anyhow::bail!("a language cannot be set for non-multilingual models")
}
};
println!("language_token: {:?}", language_token);
dc.set_language_token(language_token);
}
dc.run(
&mel,
Some((
i as f64,
i as f64 + data.len() as f64 / m::SAMPLE_RATE as f64,
)),
)?;
dc.reset_kv_cache();
}
Ok(())
}
fn record_audio(
device: &cpal::Device,
config: &cpal::SupportedStreamConfig,
milliseconds: u64,
) -> Result<Vec<i16>> {
let writer = Arc::new(Mutex::new(Vec::new()));
let writer_2 = writer.clone();
let stream = device.build_input_stream(
&config.config(),
move |data: &[f32], _: &cpal::InputCallbackInfo| {
let processed = data
.iter()
.map(|v| (v * 32768.0) as i16)
.collect::<Vec<i16>>();
writer_2.lock().unwrap().extend_from_slice(&processed);
},
move |err| {
eprintln!("an error occurred on stream: {}", err);
},
None,
)?;
stream.play()?;
std::thread::sleep(std::time::Duration::from_millis(milliseconds));
drop(stream);
let data = writer.lock().unwrap().clone();
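// Down-sample by keeping every third sample, assuming the default input stream runs at 48 kHz,
// so the result is close to the 16 kHz expected by Whisper.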
let step = 3;
let data: Vec<i16> = data.iter().step_by(step).copied().collect();
Ok(data)
}

View File

@ -0,0 +1,137 @@
use crate::{token_id, Model};
use candle::{IndexOp, Result, Tensor, D};
use candle_transformers::models::whisper::{self as m};
use tokenizers::Tokenizer;
const LANGUAGES: [(&str, &str); 99] = [
("en", "english"),
("zh", "chinese"),
("de", "german"),
("es", "spanish"),
("ru", "russian"),
("ko", "korean"),
("fr", "french"),
("ja", "japanese"),
("pt", "portuguese"),
("tr", "turkish"),
("pl", "polish"),
("ca", "catalan"),
("nl", "dutch"),
("ar", "arabic"),
("sv", "swedish"),
("it", "italian"),
("id", "indonesian"),
("hi", "hindi"),
("fi", "finnish"),
("vi", "vietnamese"),
("he", "hebrew"),
("uk", "ukrainian"),
("el", "greek"),
("ms", "malay"),
("cs", "czech"),
("ro", "romanian"),
("da", "danish"),
("hu", "hungarian"),
("ta", "tamil"),
("no", "norwegian"),
("th", "thai"),
("ur", "urdu"),
("hr", "croatian"),
("bg", "bulgarian"),
("lt", "lithuanian"),
("la", "latin"),
("mi", "maori"),
("ml", "malayalam"),
("cy", "welsh"),
("sk", "slovak"),
("te", "telugu"),
("fa", "persian"),
("lv", "latvian"),
("bn", "bengali"),
("sr", "serbian"),
("az", "azerbaijani"),
("sl", "slovenian"),
("kn", "kannada"),
("et", "estonian"),
("mk", "macedonian"),
("br", "breton"),
("eu", "basque"),
("is", "icelandic"),
("hy", "armenian"),
("ne", "nepali"),
("mn", "mongolian"),
("bs", "bosnian"),
("kk", "kazakh"),
("sq", "albanian"),
("sw", "swahili"),
("gl", "galician"),
("mr", "marathi"),
("pa", "punjabi"),
("si", "sinhala"),
("km", "khmer"),
("sn", "shona"),
("yo", "yoruba"),
("so", "somali"),
("af", "afrikaans"),
("oc", "occitan"),
("ka", "georgian"),
("be", "belarusian"),
("tg", "tajik"),
("sd", "sindhi"),
("gu", "gujarati"),
("am", "amharic"),
("yi", "yiddish"),
("lo", "lao"),
("uz", "uzbek"),
("fo", "faroese"),
("ht", "haitian creole"),
("ps", "pashto"),
("tk", "turkmen"),
("nn", "nynorsk"),
("mt", "maltese"),
("sa", "sanskrit"),
("lb", "luxembourgish"),
("my", "myanmar"),
("bo", "tibetan"),
("tl", "tagalog"),
("mg", "malagasy"),
("as", "assamese"),
("tt", "tatar"),
("haw", "hawaiian"),
("ln", "lingala"),
("ha", "hausa"),
("ba", "bashkir"),
("jw", "javanese"),
("su", "sundanese"),
];
/// Returns the token id for the selected language.
pub fn detect_language(model: &mut Model, tokenizer: &Tokenizer, mel: &Tensor) -> Result<u32> {
let (_bsize, _, seq_len) = mel.dims3()?;
let mel = mel.narrow(
2,
0,
usize::min(seq_len, model.config().max_source_positions),
)?;
let device = mel.device();
let language_token_ids = LANGUAGES
.iter()
.map(|(t, _)| token_id(tokenizer, &format!("<|{t}|>")))
.collect::<Result<Vec<_>>>()?;
let sot_token = token_id(tokenizer, m::SOT_TOKEN)?;
let audio_features = model.encoder_forward(&mel, true)?;
let tokens = Tensor::new(&[[sot_token]], device)?;
let language_token_ids = Tensor::new(language_token_ids.as_slice(), device)?;
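// Run a single decoding step on the start-of-transcript token and compare the probabilities of the language tokens.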
let ys = model.decoder_forward(&tokens, &audio_features, true)?;
let logits = model.decoder_final_linear(&ys.i(..1)?)?.i(0)?.i(0)?;
let logits = logits.index_select(&language_token_ids, 0)?;
let probs = candle_nn::ops::softmax(&logits, D::Minus1)?;
let probs = probs.to_vec1::<f32>()?;
let mut probs = LANGUAGES.iter().zip(probs.iter()).collect::<Vec<_>>();
probs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
for ((_, language), p) in probs.iter().take(5) {
println!("{language}: {p}")
}
let language = token_id(tokenizer, &format!("<|{}|>", probs[0].0 .0))?;
Ok(language)
}

View File

@ -18,6 +18,8 @@ use rand::{distributions::Distribution, SeedableRng};
use tokenizers::Tokenizer;
mod multilingual;
mod pcm_decode;
use candle_transformers::models::whisper::{self as m, audio, Config};
pub enum Model {
@ -535,17 +537,10 @@ fn main() -> Result<()> {
let mut mel_filters = vec![0f32; mel_bytes.len() / 4];
<byteorder::LittleEndian as byteorder::ByteOrder>::read_f32_into(mel_bytes, &mut mel_filters);
let mut input = std::fs::File::open(input)?;
let (header, data) = wav::read(&mut input)?;
println!("loaded wav data: {header:?}");
if header.sampling_rate != m::SAMPLE_RATE as u32 {
anyhow::bail!("wav file must have a {} sampling rate", m::SAMPLE_RATE)
let (pcm_data, sample_rate) = pcm_decode::pcm_decode(input)?;
if sample_rate != m::SAMPLE_RATE as u32 {
anyhow::bail!("input file must have a {} sampling rate", m::SAMPLE_RATE)
}
let data = data.as_sixteen().expect("expected 16 bit wav file");
let pcm_data: Vec<_> = data[..data.len() / header.channel_count as usize]
.iter()
.map(|v| *v as f32 / 32768.)
.collect();
println!("pcm data loaded {}", pcm_data.len());
let mel = audio::pcm_to_mel(&config, &pcm_data, &mel_filters);
let mel_len = mel.len();

View File

@ -0,0 +1,74 @@
use symphonia::core::audio::{AudioBufferRef, Signal};
use symphonia::core::codecs::{DecoderOptions, CODEC_TYPE_NULL};
use symphonia::core::conv::FromSample;
fn conv<T>(samples: &mut Vec<f32>, data: std::borrow::Cow<symphonia::core::audio::AudioBuffer<T>>)
where
T: symphonia::core::sample::Sample,
f32: symphonia::core::conv::FromSample<T>,
{
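// Convert the decoded samples from their native format to f32, keeping only the first channel.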
samples.extend(data.chan(0).iter().map(|v| f32::from_sample(*v)))
}
pub(crate) fn pcm_decode<P: AsRef<std::path::Path>>(path: P) -> anyhow::Result<(Vec<f32>, u32)> {
// Open the media source.
let src = std::fs::File::open(path)?;
// Create the media source stream.
let mss = symphonia::core::io::MediaSourceStream::new(Box::new(src), Default::default());
// Create an empty probe hint (the file extension is not provided here).
let hint = symphonia::core::probe::Hint::new();
// Use the default options for metadata and format readers.
let meta_opts: symphonia::core::meta::MetadataOptions = Default::default();
let fmt_opts: symphonia::core::formats::FormatOptions = Default::default();
// Probe the media source.
let probed = symphonia::default::get_probe().format(&hint, mss, &fmt_opts, &meta_opts)?;
// Get the instantiated format reader.
let mut format = probed.format;
// Find the first audio track with a known (decodeable) codec.
let track = format
.tracks()
.iter()
.find(|t| t.codec_params.codec != CODEC_TYPE_NULL)
.expect("no supported audio tracks");
// Use the default options for the decoder.
let dec_opts: DecoderOptions = Default::default();
// Create a decoder for the track.
let mut decoder = symphonia::default::get_codecs()
.make(&track.codec_params, &dec_opts)
.expect("unsupported codec");
let track_id = track.id;
let sample_rate = track.codec_params.sample_rate.unwrap_or(0);
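// A sample rate of 0 means the codec parameters did not report one; the caller is expected to validate it.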
let mut pcm_data = Vec::new();
// The decode loop.
while let Ok(packet) = format.next_packet() {
// Consume any new metadata that has been read since the last packet.
while !format.metadata().is_latest() {
format.metadata().pop();
}
// If the packet does not belong to the selected track, skip over it.
if packet.track_id() != track_id {
continue;
}
match decoder.decode(&packet)? {
AudioBufferRef::F32(buf) => pcm_data.extend(buf.chan(0)),
AudioBufferRef::U8(data) => conv(&mut pcm_data, data),
AudioBufferRef::U16(data) => conv(&mut pcm_data, data),
AudioBufferRef::U24(data) => conv(&mut pcm_data, data),
AudioBufferRef::U32(data) => conv(&mut pcm_data, data),
AudioBufferRef::S8(data) => conv(&mut pcm_data, data),
AudioBufferRef::S16(data) => conv(&mut pcm_data, data),
AudioBufferRef::S24(data) => conv(&mut pcm_data, data),
AudioBufferRef::S32(data) => conv(&mut pcm_data, data),
AudioBufferRef::F64(data) => conv(&mut pcm_data, data),
}
}
Ok((pcm_data, sample_rate))
}

View File

@ -104,6 +104,7 @@ impl TextGeneration {
break;
}
if let Some(t) = self.tokenizer.next_token(next_token)? {
let t = t.replace("<|im_end|>", "\n");
print!("{t}");
std::io::stdout().flush()?;
}

View File

@ -216,7 +216,7 @@ fn detect(
xs: &Tensor,
image_height: usize,
classes: usize,
anchors: &Vec<(usize, usize)>,
anchors: &[(usize, usize)],
) -> Result<Tensor> {
let (bsize, _channels, height, _width) = xs.dims4()?;
let stride = image_height / height;

View File

@ -40,7 +40,7 @@ impl TokenOutputStream {
};
self.tokens.push(token);
let text = self.decode(&self.tokens[self.prev_index..])?;
if text.len() > prev_text.len() && text.chars().last().unwrap().is_ascii() {
if text.len() > prev_text.len() && text.chars().last().unwrap().is_alphabetic() {
let text = text.split_at(prev_text.len());
self.prev_index = self.current_index;
self.current_index = self.tokens.len();

View File

@ -1,6 +1,6 @@
[package]
name = "candle-flash-attn"
version = "0.3.3"
version = "0.4.0"
edition = "2021"
description = "Flash attention layer for the candle ML framework."
@ -11,14 +11,14 @@ license = "MIT OR Apache-2.0"
readme = "README.md"
[dependencies]
candle = { path = "../candle-core", features = ["cuda"], version = "0.3.3", package = "candle-core" }
candle = { path = "../candle-core", features = ["cuda"], package = "candle-core", version = "0.4.0" }
half = { version = "2.3.1", features = ["num-traits"] }
[build-dependencies]
bindgen_cuda = "0.1.1"
anyhow = { version = "1", features = ["backtrace"] }
num_cpus = "1.15.0"
rayon = "1.7.0"
[dev-dependencies]
anyhow = { version = "1", features = ["backtrace"] }
candle-nn = { path = "../candle-nn", version = "0.3.3", features = ["cuda"] }
candle-nn = { path = "../candle-nn", features = ["cuda"] }

View File

@ -2,44 +2,32 @@
// The cuda build time is very long so one can set the CANDLE_FLASH_ATTN_BUILD_DIR environment
// variable in order to cache the compiled artifacts and avoid recompiling too often.
use anyhow::{Context, Result};
use rayon::prelude::*;
use std::path::PathBuf;
use std::str::FromStr;
const KERNEL_FILES: [&str; 17] = [
"flash_api.cu",
"flash_fwd_hdim128_fp16_sm80.cu",
"flash_fwd_hdim160_fp16_sm80.cu",
"flash_fwd_hdim192_fp16_sm80.cu",
"flash_fwd_hdim224_fp16_sm80.cu",
"flash_fwd_hdim256_fp16_sm80.cu",
"flash_fwd_hdim32_fp16_sm80.cu",
"flash_fwd_hdim64_fp16_sm80.cu",
"flash_fwd_hdim96_fp16_sm80.cu",
"flash_fwd_hdim128_bf16_sm80.cu",
"flash_fwd_hdim160_bf16_sm80.cu",
"flash_fwd_hdim192_bf16_sm80.cu",
"flash_fwd_hdim224_bf16_sm80.cu",
"flash_fwd_hdim256_bf16_sm80.cu",
"flash_fwd_hdim32_bf16_sm80.cu",
"flash_fwd_hdim64_bf16_sm80.cu",
"flash_fwd_hdim96_bf16_sm80.cu",
"kernels/flash_api.cu",
"kernels/flash_fwd_hdim128_fp16_sm80.cu",
"kernels/flash_fwd_hdim160_fp16_sm80.cu",
"kernels/flash_fwd_hdim192_fp16_sm80.cu",
"kernels/flash_fwd_hdim224_fp16_sm80.cu",
"kernels/flash_fwd_hdim256_fp16_sm80.cu",
"kernels/flash_fwd_hdim32_fp16_sm80.cu",
"kernels/flash_fwd_hdim64_fp16_sm80.cu",
"kernels/flash_fwd_hdim96_fp16_sm80.cu",
"kernels/flash_fwd_hdim128_bf16_sm80.cu",
"kernels/flash_fwd_hdim160_bf16_sm80.cu",
"kernels/flash_fwd_hdim192_bf16_sm80.cu",
"kernels/flash_fwd_hdim224_bf16_sm80.cu",
"kernels/flash_fwd_hdim256_bf16_sm80.cu",
"kernels/flash_fwd_hdim32_bf16_sm80.cu",
"kernels/flash_fwd_hdim64_bf16_sm80.cu",
"kernels/flash_fwd_hdim96_bf16_sm80.cu",
];
fn main() -> Result<()> {
let num_cpus = std::env::var("RAYON_NUM_THREADS").map_or_else(
|_| num_cpus::get_physical(),
|s| usize::from_str(&s).unwrap(),
);
rayon::ThreadPoolBuilder::new()
.num_threads(num_cpus)
.build_global()
.unwrap();
println!("cargo:rerun-if-changed=build.rs");
for kernel_file in KERNEL_FILES.iter() {
println!("cargo:rerun-if-changed=kernels/{kernel_file}");
println!("cargo:rerun-if-changed={kernel_file}");
}
println!("cargo:rerun-if-changed=kernels/flash_fwd_kernel.h");
println!("cargo:rerun-if-changed=kernels/flash_fwd_launch_template.h");
@ -66,223 +54,30 @@ fn main() -> Result<()> {
))
}
};
set_cuda_include_dir()?;
let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
let compute_cap = compute_cap()?;
let kernels = KERNEL_FILES.iter().collect();
let builder = bindgen_cuda::Builder::default()
.kernel_paths(kernels)
.out_dir(build_dir.clone())
.arg("-std=c++17")
.arg("-O3")
.arg("-U__CUDA_NO_HALF_OPERATORS__")
.arg("-U__CUDA_NO_HALF_CONVERSIONS__")
.arg("-U__CUDA_NO_HALF2_OPERATORS__")
.arg("-U__CUDA_NO_BFLOAT16_CONVERSIONS__")
.arg("-Icutlass/include")
.arg("--expt-relaxed-constexpr")
.arg("--expt-extended-lambda")
.arg("--use_fast_math")
.arg("--verbose");
let out_file = build_dir.join("libflashattention.a");
builder.build_lib(out_file);
let kernel_dir = PathBuf::from("kernels");
let cu_files: Vec<_> = KERNEL_FILES
.iter()
.map(|f| {
let mut obj_file = out_dir.join(f);
obj_file.set_extension("o");
(kernel_dir.join(f), obj_file)
})
.collect();
let out_modified: Result<_, _> = out_file.metadata().and_then(|m| m.modified());
let should_compile = if out_file.exists() {
kernel_dir
.read_dir()
.expect("kernels folder should exist")
.any(|entry| {
if let (Ok(entry), Ok(out_modified)) = (entry, &out_modified) {
let in_modified = entry.metadata().unwrap().modified().unwrap();
in_modified.duration_since(*out_modified).is_ok()
} else {
true
}
})
} else {
true
};
if should_compile {
cu_files
.par_iter()
.map(|(cu_file, obj_file)| {
let mut command = std::process::Command::new("nvcc");
command
.arg("-std=c++17")
.arg("-O3")
.arg("-U__CUDA_NO_HALF_OPERATORS__")
.arg("-U__CUDA_NO_HALF_CONVERSIONS__")
.arg("-U__CUDA_NO_HALF2_OPERATORS__")
.arg("-U__CUDA_NO_BFLOAT16_CONVERSIONS__")
.arg(format!("--gpu-architecture=sm_{compute_cap}"))
.arg("-c")
.args(["-o", obj_file.to_str().unwrap()])
.args(["--default-stream", "per-thread"])
.arg("-Icutlass/include")
.arg("--expt-relaxed-constexpr")
.arg("--expt-extended-lambda")
.arg("--use_fast_math")
.arg("--verbose");
if let Ok(ccbin_path) = &ccbin_env {
command
.arg("-allow-unsupported-compiler")
.args(["-ccbin", ccbin_path]);
}
command.arg(cu_file);
let output = command
.spawn()
.context("failed spawning nvcc")?
.wait_with_output()?;
if !output.status.success() {
anyhow::bail!(
"nvcc error while executing compiling: {:?}\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
&command,
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
)
}
Ok(())
})
.collect::<Result<()>>()?;
let obj_files = cu_files.iter().map(|c| c.1.clone()).collect::<Vec<_>>();
let mut command = std::process::Command::new("nvcc");
command
.arg("--lib")
.args(["-o", out_file.to_str().unwrap()])
.args(obj_files);
let output = command
.spawn()
.context("failed spawning nvcc")?
.wait_with_output()?;
if !output.status.success() {
anyhow::bail!(
"nvcc error while linking: {:?}\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
&command,
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
)
}
}
println!("cargo:rustc-link-search={}", build_dir.display());
println!("cargo:rustc-link-lib=flashattention");
println!("cargo:rustc-link-lib=dylib=cudart");
println!("cargo:rustc-link-lib=dylib=stdc++");
/* laurent: I tried using the cc cuda integration as below but this led to ptxas never
finishing for some reason. Calling nvcc manually worked fine.
cc::Build::new()
.cuda(true)
.include("cutlass/include")
.flag("--expt-relaxed-constexpr")
.flag("--default-stream")
.flag("per-thread")
.flag(&format!("--gpu-architecture=sm_{compute_cap}"))
.file("kernels/flash_fwd_hdim32_fp16_sm80.cu")
.compile("flashattn");
*/
Ok(())
}
fn set_cuda_include_dir() -> Result<()> {
// NOTE: copied from cudarc build.rs.
let env_vars = [
"CUDA_PATH",
"CUDA_ROOT",
"CUDA_TOOLKIT_ROOT_DIR",
"CUDNN_LIB",
];
let env_vars = env_vars
.into_iter()
.map(std::env::var)
.filter_map(Result::ok)
.map(Into::<PathBuf>::into);
let roots = [
"/usr",
"/usr/local/cuda",
"/opt/cuda",
"/usr/lib/cuda",
"C:/Program Files/NVIDIA GPU Computing Toolkit",
"C:/CUDA",
];
let roots = roots.into_iter().map(Into::<PathBuf>::into);
let root = env_vars
.chain(roots)
.find(|path| path.join("include").join("cuda.h").is_file())
.context("cannot find include/cuda.h")?;
println!(
"cargo:rustc-env=CUDA_INCLUDE_DIR={}",
root.join("include").display()
);
Ok(())
}
#[allow(unused)]
fn compute_cap() -> Result<usize> {
println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
// Try to parse compute caps from env
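// e.g. CUDA_COMPUTE_CAP=86 targets sm_86 (Ampere consumer GPUs such as the RTX 3090).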
let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
compute_cap_str
.parse::<usize>()
.context("Could not parse compute cap")?
} else {
// Use nvidia-smi to get the current compute cap
let out = std::process::Command::new("nvidia-smi")
.arg("--query-gpu=compute_cap")
.arg("--format=csv")
.output()
.context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
let mut lines = out.lines();
assert_eq!(
lines.next().context("missing line in stdout")?,
"compute_cap"
);
let cap = lines
.next()
.context("missing line in stdout")?
.replace('.', "");
let cap = cap
.parse::<usize>()
.with_context(|| format!("cannot parse as int {cap}"))?;
println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
cap
};
// Grab available GPU codes from nvcc and select the highest one
let (supported_nvcc_codes, max_nvcc_code) = {
let out = std::process::Command::new("nvcc")
.arg("--list-gpu-code")
.output()
.expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
let out = std::str::from_utf8(&out.stdout).unwrap();
let out = out.lines().collect::<Vec<&str>>();
let mut codes = Vec::with_capacity(out.len());
for code in out {
let code = code.split('_').collect::<Vec<&str>>();
if !code.is_empty() && code.contains(&"sm") {
if let Ok(num) = code[1].parse::<usize>() {
codes.push(num);
}
}
}
codes.sort();
let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?;
(codes, max_nvcc_code)
};
// Check that nvcc supports the asked compute caps
if !supported_nvcc_codes.contains(&compute_cap) {
anyhow::bail!(
"nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}."
);
}
if compute_cap > max_nvcc_code {
anyhow::bail!(
"CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}"
);
}
Ok(compute_cap)
}

View File

@ -0,0 +1,62 @@
#include <cmath>
#include <cute/tensor.hpp>
#include <cutlass/cutlass.h>
#include <cutlass/array.h>
#include "utils.h"
namespace flash {
using namespace cute;
////////////////////////////////////////////////////////////////////////////////////////////////////
template <bool Is_causal, typename Engine, typename Layout>
inline __device__ void apply_alibi(Tensor<Engine, Layout> &tensor,
const int col_idx_offset_,
const int max_seqlen_k,
const int row_idx_offset,
const int max_seqlen_q,
const int warp_row_stride,
const float alibi_slope) {
// tensor has shape (ncol=(2, MMA_M), nrow=(2, MMA_N))
static_assert(Layout::rank == 2, "Only support 2D Tensor");
const int lane_id = threadIdx.x % 32;
const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
if constexpr (Is_causal) { // Simpler, we add the same bias vector to all rows
#pragma unroll
for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
const int col_idx_base = col_idx_offset + nj * 8;
#pragma unroll
for (int j = 0; j < size<1, 0>(tensor); ++j) {
const int col_idx = col_idx_base + j;
#pragma unroll
for (int mi = 0; mi < size<0>(tensor); ++mi) {
tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx;
}
}
}
} else { // Bias depends on both row_idx and col_idx
#pragma unroll
for (int mi = 0; mi < size<0, 1>(tensor); ++mi) {
const int row_idx_base = row_idx_offset + mi * warp_row_stride;
#pragma unroll
for (int i = 0; i < size<0, 0>(tensor); ++i) {
const int row_idx = row_idx_base + i * 8;
#pragma unroll
for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
const int col_idx_base = col_idx_offset + nj * 8;
#pragma unroll
for (int j = 0; j < size<1, 0>(tensor); ++j) {
const int col_idx = col_idx_base + j;
tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
}
}
}
}
}
}
} // namespace flash

View File

@ -14,9 +14,12 @@ struct BlockInfo {
template<typename Params>
__device__ BlockInfo(const Params &params, const int bidb)
: sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb])
, sum_s_k(!Varlen || params.cu_seqlens_k == nullptr ? -1 : params.cu_seqlens_k[bidb])
, sum_s_k(!Varlen || params.cu_seqlens_k == nullptr || !params.is_seqlens_k_cumulative ? -1 : params.cu_seqlens_k[bidb])
, actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q : params.cu_seqlens_q[bidb + 1] - sum_s_q)
, actual_seqlen_k(!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k : params.cu_seqlens_k[bidb + 1] - sum_s_k)
// If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb].
// Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K.
, seqlen_k_cache(!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k : (params.is_seqlens_k_cumulative ? params.cu_seqlens_k[bidb + 1] - sum_s_k : params.cu_seqlens_k[bidb]))
, actual_seqlen_k(params.seqused_k ? params.seqused_k[bidb] : seqlen_k_cache + (params.knew_ptr == nullptr ? 0 : params.seqlen_knew))
{
}
@ -32,8 +35,10 @@ struct BlockInfo {
const int sum_s_q;
const int sum_s_k;
const uint32_t actual_seqlen_q;
const uint32_t actual_seqlen_k;
const int actual_seqlen_q;
// We have to have seqlen_k_cache declared before actual_seqlen_k, otherwise actual_seqlen_k is set to 0.
const int seqlen_k_cache;
const int actual_seqlen_k;
};
////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -7,15 +7,6 @@
#include <cuda.h>
#include <vector>
// #ifdef OLD_GENERATOR_PATH
// #include <ATen/CUDAGeneratorImpl.h>
// #else
// #include <ATen/cuda/CUDAGeneratorImpl.h>
// #endif
//
// #include <ATen/cuda/CUDAGraphsUtils.cuh>
constexpr int TOTAL_DIM = 0;
constexpr int H_DIM = 1;
constexpr int D_DIM = 2;
@ -53,6 +44,7 @@ struct Flash_fwd_params : public Qkv_params {
// The O matrix (output).
void * __restrict__ o_ptr;
void * __restrict__ oaccum_ptr;
// The stride between rows of O.
index_t o_batch_stride;
@ -64,9 +56,10 @@ struct Flash_fwd_params : public Qkv_params {
// The pointer to the softmax sum.
void * __restrict__ softmax_lse_ptr;
void * __restrict__ softmax_lseaccum_ptr;
// The dimensions.
int b, seqlen_q, seqlen_k, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded;
int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, rotary_dim;
// The scaling factors for the kernel.
float scale_softmax;
@ -76,8 +69,30 @@ struct Flash_fwd_params : public Qkv_params {
int * __restrict__ cu_seqlens_q;
int * __restrict__ cu_seqlens_k;
// If provided, the actual length of each k sequence.
int * __restrict__ seqused_k;
int *__restrict__ blockmask;
// The K_new and V_new matrices.
void * __restrict__ knew_ptr;
void * __restrict__ vnew_ptr;
// The stride between rows of the Q, K and V matrices.
index_t knew_batch_stride;
index_t vnew_batch_stride;
index_t knew_row_stride;
index_t vnew_row_stride;
index_t knew_head_stride;
index_t vnew_head_stride;
// The cos and sin matrices for rotary embedding.
void * __restrict__ rotary_cos_ptr;
void * __restrict__ rotary_sin_ptr;
// The indices to index into the KV cache.
int *__restrict__ cache_batch_idx;
// The dropout probability (probability of keeping an activation).
float p_dropout;
// uint32_t p_dropout_in_uint;
@ -88,11 +103,22 @@ struct Flash_fwd_params : public Qkv_params {
float rp_dropout;
float scale_softmax_rp_dropout;
// Random state.
// at::PhiloxCudaState philox_args;
// Local window size
int window_size_left, window_size_right;
bool is_bf16;
bool is_causal;
// If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb].
// Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K.
bool is_seqlens_k_cumulative;
bool is_rotary_interleaved;
int num_splits; // For split-KV version
void * __restrict__ alibi_slopes_ptr;
index_t alibi_slopes_batch_stride;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -132,10 +158,14 @@ struct Flash_bwd_params : public Flash_fwd_params {
// The pointer to the softmax d sum.
void *__restrict__ dsoftmax_sum;
bool deterministic;
index_t dq_accum_split_stride;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename T, int Headdim> void run_mha_fwd_(Flash_fwd_params &params, cudaStream_t stream);
template<typename T, int Headdim> void run_mha_fwd_splitkv_dispatch(Flash_fwd_params &params, cudaStream_t stream);
template<typename T, int Headdim> void run_mha_bwd_(Flash_bwd_params &params, cudaStream_t stream, const bool configure);

View File

@ -1,17 +1,15 @@
#include "flash_fwd_launch_template.h"
// void run_mha_fwd(Flash_fwd_params &params, cudaStream_t stream) {
// FWD_HEADDIM_SWITCH(params.d, [&] {
// run_mha_fwd_<cutlass::half_t, kHeadDim>(params, stream);
// });
// }
void run_mha_fwd(Flash_fwd_params &params, cudaStream_t stream) {
FP16_SWITCH(!params.is_bf16, [&] {
FWD_HEADDIM_SWITCH(params.d, [&] {
run_mha_fwd_<elem_type, kHeadDim>(params, stream);
});
});
void run_mha_fwd(Flash_fwd_params &params, cudaStream_t stream, bool force_split_kernel=false) {
FP16_SWITCH(!params.is_bf16, [&] {
FWD_HEADDIM_SWITCH(params.d, [&] {
// if (params.num_splits <= 1 && !force_split_kernel) { // If we don't set it num_splits == 0
run_mha_fwd_<elem_type, kHeadDim>(params, stream);
// } else {
// run_mha_fwd_splitkv_dispatch<elem_type, kHeadDim>(params, stream);
// }
});
});
}
extern "C" void run_mha(
@ -20,6 +18,7 @@ extern "C" void run_mha(
void *v_ptr,
void *o_ptr,
void *softmax_lse_ptr,
void *alibi_slopes_ptr,
int32_t *cu_seqlens_q_ptr,
int32_t *cu_seqlens_k_ptr,
@ -28,6 +27,7 @@ extern "C" void run_mha(
uint32_t k_batch_stride,
uint32_t v_batch_stride,
uint32_t o_batch_stride,
uint32_t alibi_slopes_batch_stride,
uint32_t q_row_stride,
uint32_t k_row_stride,
@ -51,8 +51,11 @@ extern "C" void run_mha(
uint32_t seqlen_q_rounded,
uint32_t seqlen_k_rounded,
int is_bf16,
int is_causal,
int is_bf16
int window_size_left,
int window_size_right
) {
Flash_fwd_params params;
// Reset the parameters
@ -65,12 +68,14 @@ extern "C" void run_mha(
params.o_ptr = o_ptr;
params.softmax_lse_ptr = softmax_lse_ptr;
params.alibi_slopes_ptr = alibi_slopes_ptr;
// All stride are in elements, not bytes.
params.q_batch_stride = q_batch_stride;
params.k_batch_stride = k_batch_stride;
params.v_batch_stride = v_batch_stride;
params.o_batch_stride = o_batch_stride;
params.alibi_slopes_batch_stride = alibi_slopes_batch_stride;
params.q_row_stride = q_row_stride;
params.k_row_stride = k_row_stride;
@ -92,7 +97,6 @@ extern "C" void run_mha(
params.seqlen_k_rounded = seqlen_k_rounded;
params.d = d;
params.d_rounded = d_rounded;
params.is_causal = is_causal;
// Set the different scale values.
params.scale_softmax = softmax_scale;
@ -106,6 +110,14 @@ extern "C" void run_mha(
params.cu_seqlens_q = cu_seqlens_q_ptr;
params.cu_seqlens_k = cu_seqlens_k_ptr;
params.p_ptr = nullptr; // used for `return_softmax`.
params.seqused_k = nullptr;
params.is_causal = is_causal;
params.window_size_left = window_size_left;
params.window_size_right = window_size_right;
params.is_seqlens_k_cumulative = true;
params.num_splits = 1;
cudaStream_t stream = 0; // Use the default stream.
run_mha_fwd(params, stream);

View File

@ -1,18 +1,9 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
// template<>
// void run_mha_fwd_<cutlass::bfloat16_t, 128>(Flash_fwd_params &params, cudaStream_t stream) {
// using elem_type = cutlass::bfloat16_t;
// if (params.p_dropout == 1.f) {
// run_flash_fwd<Flash_fwd_kernel_traits<128, 128, 64, 4, false, false, elem_type>, false>(params, stream);
// } else {
// run_flash_fwd<Flash_fwd_kernel_traits<128, 128, 32, 4, false, false, elem_type>, true>(params, stream);
// }
// }
template<>
void run_mha_fwd_<cutlass::bfloat16_t, 128>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim128<cutlass::bfloat16_t>(params, stream);

View File

@ -1,31 +1,9 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
// template<>
// void run_mha_fwd_<cutlass::half_t, 128>(Flash_fwd_params &params, cudaStream_t stream) {
// using elem_type = cutlass::half_t;
// if (params.p_dropout == 1.f) {
// // Using 8 warps (128 x 128 and 256 x 64) is 28% slower for seqlen=2k
// run_flash_fwd<Flash_fwd_kernel_traits<128, 128, 64, 4, false, false, elem_type>, false>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<128, 128, 64, 4, true, false, elem_type>, false>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<128, 128, 64, 4, false, true, elem_type>, false>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<128, 128, 64, 4, true, true, elem_type>, false>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<128, 128, 32, 4, false, false, elem_type>, false>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<128, 64, 64, 4, false, false, elem_type>, false>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<128, 64, 128, 4, false, false, elem_type>, false>(params, stream);
// // 1st ones are good for H100, A100
// // 2nd one is good for A6000 bc we get slightly better occupancy
// } else {
// run_flash_fwd<Flash_fwd_kernel_traits<128, 128, 32, 4, false, false, elem_type>, true>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<128, 128, 32, 4, true, false, elem_type>, true>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<128, 128, 32, 4, true, true, elem_type>, true>(params, stream);
// // 1st one is good for H100, A100, A6000
// }
// }
template<>
void run_mha_fwd_<cutlass::half_t, 128>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim128<cutlass::half_t>(params, stream);

View File

@ -1,16 +1,9 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
// template<>
// void run_mha_fwd_<cutlass::bfloat16_t, 160>(Flash_fwd_params &params, cudaStream_t stream) {
// using elem_type = cutlass::bfloat16_t;
// BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
// run_flash_fwd<Flash_fwd_kernel_traits<160, 128, 32, 4, false, false, elem_type>, Is_dropout>(params, stream);
// });
// }
template<>
void run_mha_fwd_<cutlass::bfloat16_t, 160>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim160<cutlass::bfloat16_t>(params, stream);

View File

@ -1,26 +1,9 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
// template<>
// void run_mha_fwd_<cutlass::half_t, 160>(Flash_fwd_params &params, cudaStream_t stream) {
// using elem_type = cutlass::half_t;
// BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
// run_flash_fwd<Flash_fwd_kernel_traits<160, 128, 32, 4, false, false, elem_type>, Is_dropout>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<160, 128, 32, 4, false, true, elem_type>, Is_dropout>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<160, 128, 64, 4, false, false, elem_type>, Is_dropout>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<160, 64, 64, 4, false, false, elem_type>, Is_dropout>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<160, 128, 64, 4, false, elem_type>>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<160, 64, 128, 4, false, elem_type>>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<160, 64, 64, 4, false, elem_type>>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<160, 128, 64, 8, false, elem_type>>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<160, 128, 128, 8, false, elem_type>>(params, stream);
// // For A6000, no-causal, 1st is fastest. causal, 4th is fastest.
// // For A100, H100, 1st is fastest.
// });
// }
template<>
void run_mha_fwd_<cutlass::half_t, 160>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim160<cutlass::half_t>(params, stream);

View File

@ -1,16 +1,10 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
// template<>
// void run_mha_fwd_<cutlass::bfloat16_t, 192>(Flash_fwd_params &params, cudaStream_t stream) {
// using elem_type = cutlass::bfloat16_t;
// BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
// run_flash_fwd<Flash_fwd_kernel_traits<192, 64, 64, 4, false, false, elem_type>, Is_dropout>(params, stream);
// });
// }
template<> void run_mha_fwd_<cutlass::bfloat16_t, 192>(Flash_fwd_params &params, cudaStream_t stream) {
template<>
void run_mha_fwd_<cutlass::bfloat16_t, 192>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim192<cutlass::bfloat16_t>(params, stream);
}

View File

@ -1,26 +1,9 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
// template<>
// void run_mha_fwd_<cutlass::half_t, 192>(Flash_fwd_params &params, cudaStream_t stream) {
// using elem_type = cutlass::half_t;
// BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
// run_flash_fwd<Flash_fwd_kernel_traits<192, 64, 64, 4, false, false, elem_type>, Is_dropout>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<192, 128, 32, 4, false, false, elem_type>, Is_dropout>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<192, 64, 32, 4, false, false, elem_type>, Is_dropout>(params, stream);
// // This one is slightly faster for causal?
// // run_flash_fwd<Flash_fwd_kernel_traits<192, 128, 64, 8, false, elem_type>>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<192, 128, 32, 4, false, elem_type>>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<192, 128, 64, 4, false, elem_type>>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<192, 64, 128, 4, false, elem_type>>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<192, 128, 128, 8, false, elem_type>>(params, stream);
// });
// // For A100 H100, 1st is faster with dropout, 3rd is faster without dropout
// // For A6000, 1st is faster when causal, 3rd is faster when not causal
// }
template<>
void run_mha_fwd_<cutlass::half_t, 192>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim192<cutlass::half_t>(params, stream);

View File

@ -1,9 +1,10 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
template<> void run_mha_fwd_<cutlass::bfloat16_t, 224>(Flash_fwd_params &params, cudaStream_t stream) {
template<>
void run_mha_fwd_<cutlass::bfloat16_t, 224>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim224<cutlass::bfloat16_t>(params, stream);
}

View File

@ -1,9 +1,10 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
template<> void run_mha_fwd_<cutlass::half_t, 224>(Flash_fwd_params &params, cudaStream_t stream) {
template<>
void run_mha_fwd_<cutlass::half_t, 224>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim224<cutlass::half_t>(params, stream);
}

View File

@ -1,9 +1,10 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
template<> void run_mha_fwd_<cutlass::bfloat16_t, 256>(Flash_fwd_params &params, cudaStream_t stream) {
template<>
void run_mha_fwd_<cutlass::bfloat16_t, 256>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim256<cutlass::bfloat16_t>(params, stream);
}

View File

@ -1,9 +1,10 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
template<> void run_mha_fwd_<cutlass::half_t, 256>(Flash_fwd_params &params, cudaStream_t stream) {
template<>
void run_mha_fwd_<cutlass::half_t, 256>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim256<cutlass::half_t>(params, stream);
}

View File

@ -1,6 +1,6 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"

View File

@ -1,22 +1,9 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
// template<>
// void run_mha_fwd_<cutlass::half_t, 32>(Flash_fwd_params &params, cudaStream_t stream) {
// using elem_type = cutlass::half_t;
// BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
// run_flash_fwd<Flash_fwd_kernel_traits<32, 128, 128, 4, false, false, elem_type>, Is_dropout>(params, stream);
// // For dropout there might be a lot of register spilling?
// // These two are very slow due to register spilling
// // run_flash_fwd<Flash_fwd_kernel_traits<32, 256, 128, 4, false, elem_type>>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<32, 128, 256, 4, false, elem_type>>(params, stream);
// // This one is slightly slower
// // run_flash_fwd<Flash_fwd_kernel_traits<32, 256, 64, 4, false, elem_type>>(params, stream);
// });
// }
template<>
void run_mha_fwd_<cutlass::half_t, 32>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim32<cutlass::half_t>(params, stream);

View File

@ -1,18 +1,9 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
// template<>
// void run_mha_fwd_<cutlass::bfloat16_t, 64>(Flash_fwd_params &params, cudaStream_t stream) {
// using elem_type = cutlass::bfloat16_t;
// if (params.p_dropout == 1.f) {
// run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, true, false, elem_type>, false>(params, stream);
// } else {
// run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, false, false, elem_type>, true>(params, stream);
// }
// }
template<>
void run_mha_fwd_<cutlass::bfloat16_t, 64>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim64<cutlass::bfloat16_t>(params, stream);

View File

@ -1,25 +1,9 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
// template<>
// void run_mha_fwd_<cutlass::half_t, 64>(Flash_fwd_params &params, cudaStream_t stream) {
// using elem_type = cutlass::half_t;
// if (params.p_dropout == 1.f) {
// // Using 8 warps is 18% slower for seqlen=2k, 2 warps is 5% slower
// // Using block size (64 x 256) is 27% slower for seqlen=2k
// // Using block size (256 x 64) is 85% slower for seqlen=2k, because of register spilling
// run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 128, 4, false, false, elem_type>, false>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, true, false, elem_type>, false>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, true, true, elem_type>, false>(params, stream);
// } else {
// run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, false, false, elem_type>, true>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, true, true, elem_type>, true>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<64, 128, 64, 4, true, false, elem_type>, true>(params, stream);
// }
// }
template<>
void run_mha_fwd_<cutlass::half_t, 64>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim64<cutlass::half_t>(params, stream);

View File

@ -1,16 +1,9 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
// template<>
// void run_mha_fwd_<cutlass::bfloat16_t, 96>(Flash_fwd_params &params, cudaStream_t stream) {
// using elem_type = cutlass::bfloat16_t;
// BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
// run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 64, 4, true, false, elem_type>, Is_dropout>(params, stream);
// });
// }
template<>
void run_mha_fwd_<cutlass::bfloat16_t, 96>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim96<cutlass::bfloat16_t>(params, stream);

View File

@ -1,23 +1,10 @@
// Copyright (c) 2023, Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "flash_fwd_launch_template.h"
// template<>
// void run_mha_fwd_<cutlass::half_t, 96>(Flash_fwd_params &params, cudaStream_t stream) {
// using elem_type = cutlass::half_t;
// BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
// run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 64, 4, true, false, elem_type>, Is_dropout>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 64, 4, true, true, elem_type>, Is_dropout>(params, stream);
// // This 3rd one is good for H100, and A100, A6000
// run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 64, 4, false, false, elem_type>, Is_dropout>(params, stream);
// run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 64, 4, false, true, elem_type>, Is_dropout>(params, stream);
// // These two are always slower
// // run_flash_fwd<Flash_fwd_kernel_traits<96, 128, 128, 4, true, elem_type>>(params, stream);
// // run_flash_fwd<Flash_fwd_kernel_traits<96, 64, 128, 4, true, elem_type>>(params, stream);
// });
// }
template<> void run_mha_fwd_<cutlass::half_t, 96>(Flash_fwd_params &params, cudaStream_t stream) {
template<>
void run_mha_fwd_<cutlass::half_t, 96>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim96<cutlass::half_t>(params, stream);
}

View File

@ -4,20 +4,18 @@
#pragma once
#include <cmath>
#include <cute/algorithm/copy.hpp>
#include <cute/algorithm/gemm.hpp>
#include <cutlass/cutlass.h>
#include <cutlass/array.h>
#include <cutlass/numeric_types.h>
#include <cutlass/numeric_conversion.h>
#include "block_info.h"
#include "kernel_traits.h"
#include "utils.h"
#include "softmax.h"
#include "philox.cuh"
#include "alibi.h"
namespace flash {
@ -25,49 +23,6 @@ using namespace cute;
////////////////////////////////////////////////////////////////////////////////////////////////////
template <int MMA_M,
class... Args,
class TiledMMA>
CUTE_HOST_DEVICE
auto
make_tiled_copy_A_warpcontiguousM(Copy_Atom<Args...> const& copy_atom,
TiledMMA const& tiled_mma) {
using TileShape_MNK = typename TiledMMA::TiledShape_MNK;
using AtomShape_MNK = typename TiledMMA::AtomShape_MNK;
constexpr int AtomShape_M = decltype(size<0>(AtomShape_MNK{}))::value;
constexpr int kNWarps = decltype(size<0>(TileShape_MNK{}))::value / AtomShape_M;
constexpr int MMAStride_M = MMA_M * AtomShape_M;
auto t = make_tile(Layout<Shape<Int<AtomShape_M>, Int<kNWarps>>,
Stride<_1, Int<MMAStride_M>> >{},
make_layout(size<2>(TileShape_MNK{})));
// if (cute::thread0()) {printf("make_tiled_copy_A_warpcontiguousM "); print(t); printf("\n"); }
return make_tiled_copy_impl(copy_atom, tiled_mma.get_layoutA_TV(), t);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template <int MMA_M,
class... Args,
class TiledMMA>
CUTE_HOST_DEVICE
auto
make_tiled_copy_C_warpcontiguousM(Copy_Atom<Args...> const& copy_atom,
TiledMMA const& tiled_mma) {
using TileShape_MNK = typename TiledMMA::TiledShape_MNK;
using AtomShape_MNK = typename TiledMMA::AtomShape_MNK;
constexpr int AtomShape_M = decltype(size<0>(AtomShape_MNK{}))::value;
constexpr int kNWarps = decltype(size<0>(TileShape_MNK{}))::value / AtomShape_M;
constexpr int MMAStride_M = MMA_M * AtomShape_M;
auto t = make_tile(Layout<Shape<Int<AtomShape_M>, Int<kNWarps>>,
Stride<_1, Int<MMAStride_M>> >{},
// TODO: Shouldn't this be size<1>?
make_layout(size<2>(TileShape_MNK{})));
// if (cute::thread0()) {printf("make_tiled_copy_C_warpcontiguousM "); print(t); printf("\n"); }
return make_tiled_copy_impl(copy_atom, tiled_mma.get_layoutC_TV(), t);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<bool Is_first, bool Check_inf=false, typename Tensor0, typename Tensor1, typename Tensor2>
inline __device__ void softmax_rescale_o(Tensor0 &scores, Tensor1 &scores_max, Tensor1 &scores_sum,
Tensor2 &acc_o, float softmax_scale_log2) {
@ -77,7 +32,7 @@ inline __device__ void softmax_rescale_o(Tensor0 &scores, Tensor1 &scores_max, T
flash::reduce_sum(scores, scores_sum);
} else {
Tensor scores_max_prev = make_fragment_like(scores_max);
copy(scores_max, scores_max_prev);
cute::copy(scores_max, scores_max_prev);
flash::template reduce_max</*zero_init=*/false>(scores, scores_max);
// Reshape acc_o from (MMA=4, MMA_M, MMA_K) to (nrow=(2, MMA_M), ncol=(2, MMA_K))
Tensor acc_o_rowcol = make_tensor(acc_o.data(), flash::convert_layout_acc_rowcol(acc_o.layout()));
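The rescaling here is the online-softmax trick: whenever a new tile raises the running row maximum, the previous partial sum and the accumulated output must be scaled by exp(old_max - new_max) before the new tile's contribution is folded in. A minimal scalar sketch of that bookkeeping, independent of the CUTE tensors used in softmax_rescale_o (single-row simplification; the kernel keeps the scale in log2 form via softmax_scale_log2, the sketch uses plain expf for clarity):

#include <algorithm>
#include <cmath>
#include <vector>

// Illustrative single-row online softmax: fold one tile of scores into the
// running (max, sum, weighted-output) state, mirroring softmax_rescale_o.
struct OnlineSoftmaxRow {
    float row_max = -INFINITY;
    float row_sum = 0.f;
    float acc_o   = 0.f;   // stand-in for the accumulated O row

    void fold_tile(const std::vector<float>& scores, const std::vector<float>& values) {
        float tile_max = row_max;
        for (float s : scores) tile_max = std::max(tile_max, s);
        // Rescale the previous sum and output by exp(old_max - new_max).
        float scale = std::exp(row_max - tile_max);
        row_sum *= scale;
        acc_o   *= scale;
        row_max  = tile_max;
        for (size_t j = 0; j < scores.size(); ++j) {
            float p = std::exp(scores[j] - row_max);
            row_sum += p;
            acc_o   += p * values[j];
        }
    }
    float output() const { return acc_o / row_sum; }  // final normalization (epilogue)
};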
@ -103,23 +58,22 @@ inline __device__ void softmax_rescale_o(Tensor0 &scores, Tensor1 &scores_max, T
template<typename Engine0, typename Layout0, typename Engine1, typename Layout1, typename TiledCopy>
inline __device__ void write_softmax_to_gmem(
Tensor<Engine0, Layout0> const &tOrP, Tensor<Engine1, Layout1> &tPgP, TiledCopy gmem_thr_copy_P
Tensor<Engine0, Layout0> const &tOrP, Tensor<Engine1, Layout1> &tPgP, TiledCopy gmem_tiled_copy_P
) {
// Reshape tOrP from (8, MMA_M, MMA_N) to (8, MMA_M * MMA_N)
Layout l = tOrP.layout();
Tensor tPrP = make_tensor(tOrP.data(), make_layout(get<0>(l), make_layout(get<1>(l), get<2>(l))));
CUTE_STATIC_ASSERT_V(size<2>(tPgP) == _1{});
// TODO(laurent): reactivate the following
// CUTE_STATIC_ASSERT_V(size<1>(tPrP) == size<1>(tPgP));
CUTE_STATIC_ASSERT_V(size<1>(tPrP) == size<1>(tPgP));
#pragma unroll
for (int mi = 0; mi < size<1>(tPrP); ++mi) {
copy(gmem_thr_copy_P, tPrP(_, mi), tPgP(_, mi, 0));
cute::copy(gmem_tiled_copy_P, tPrP(_, mi), tPgP(_, mi, 0));
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Is_even_N, bool Is_even_K, bool Return_softmax, typename Params>
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Return_softmax, typename Params>
inline __device__ void compute_attn_1rowblock(const Params &params, const int bidb, const int bidh, const int m_block) {
using Element = typename Kernel_traits::Element;
@ -138,16 +92,65 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
constexpr int kNWarps = Kernel_traits::kNWarps;
constexpr int MMA_M = kBlockM / decltype(size<0>(typename Kernel_traits::TiledMma::TiledShape_MNK{}))::value;
const BlockInfo</*Varlen=*/!Is_even_N> binfo(params, bidb);
if (m_block * kBlockM >= binfo.actual_seqlen_q || binfo.actual_seqlen_k == 0) return;
const BlockInfo</*Varlen=*/!Is_even_MN> binfo(params, bidb);
if (m_block * kBlockM >= binfo.actual_seqlen_q) return;
const int n_block_min = !Is_local ? 0 : std::max(0, (m_block * kBlockM + binfo.actual_seqlen_k - binfo.actual_seqlen_q - params.window_size_left) / kBlockN);
int n_block_max = cute::ceil_div(binfo.actual_seqlen_k, kBlockN);
if (Is_causal) {
n_block_max = std::min(n_block_max, cute::ceil_div((m_block + 1) * kBlockM, kBlockN));
if (Is_causal || Is_local) {
n_block_max = std::min(n_block_max,
cute::ceil_div((m_block + 1) * kBlockM + binfo.actual_seqlen_k - binfo.actual_seqlen_q + params.window_size_right, kBlockN));
// if (threadIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0) {
// printf("m_block = %d, n_block_max = %d\n", m_block, n_block_max);
// }
}
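These two bounds restrict the K/V block loop to the window that can actually contribute: n_block_min comes from window_size_left, n_block_max from causality or window_size_right. A small host-side sketch of the same arithmetic (the tile and sequence sizes below are made-up examples, not values taken from any particular configuration):

#include <algorithm>
#include <cstdio>

// Mirror of the n_block_min / n_block_max computation for one query block.
void kv_block_range(int m_block, int kBlockM, int kBlockN,
                    int seqlen_q, int seqlen_k,
                    bool is_causal, bool is_local,
                    int window_left, int window_right) {
    int n_block_min = !is_local
        ? 0
        : std::max(0, (m_block * kBlockM + seqlen_k - seqlen_q - window_left) / kBlockN);
    int n_block_max = (seqlen_k + kBlockN - 1) / kBlockN;   // ceil_div
    if (is_causal || is_local) {
        int limit = (m_block + 1) * kBlockM + seqlen_k - seqlen_q + window_right;
        n_block_max = std::min(n_block_max, (limit + kBlockN - 1) / kBlockN);
    }
    std::printf("m_block=%d -> n_block in [%d, %d)\n", m_block, n_block_min, n_block_max);
}

int main() {
    // Causal attention, seqlen_q == seqlen_k == 512, kBlockM = 128, kBlockN = 64:
    // query block 1 covers rows 128..255, so only key blocks 0..3 (columns 0..255) are needed.
    kv_block_range(/*m_block=*/1, 128, 64, 512, 512, /*causal=*/true, /*local=*/false, -1, 0);
}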
// We exit early and write 0 to gO and gLSE. This also covers the case where actual_seqlen_k == 0.
// Otherwise we might read OOB elements from gK and gV.
if ((Is_causal || Is_local || !Is_even_MN) && n_block_max <= n_block_min) {
// Save seed and offset for backward. If we don't have this here, the 0-th thread block might
// exit early and no one saves the rng state.
// if (Is_dropout && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && tidx == 0) {
// auto seeds = at::cuda::philox::unpack(params.philox_args);
// params.rng_state[0] = std::get<0>(seeds);
// params.rng_state[1] = std::get<1>(seeds);
// params.rng_state[0] = 0;
// params.rng_state[1] = 0;
// }
const index_t row_offset_o = binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb)
+ m_block * kBlockM * params.o_row_stride + bidh * params.o_head_stride;
const index_t row_offset_lse = (bidb * params.h + bidh) * params.seqlen_q + m_block * kBlockM;
Tensor gO = make_tensor(make_gmem_ptr(reinterpret_cast<Element *>(params.o_ptr) + row_offset_o),
Shape<Int<kBlockM>, Int<kHeadDim>>{},
make_stride(params.o_row_stride, _1{}));
Tensor gLSE = make_tensor(make_gmem_ptr(reinterpret_cast<ElementAccum *>(params.softmax_lse_ptr) + row_offset_lse),
Shape<Int<kBlockM>>{}, Stride<_1>{});
typename Kernel_traits::GmemTiledCopyO gmem_tiled_copy_O;
auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(tidx);
Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
Tensor tOrO = make_tensor<Element>(shape(tOgO));
clear(tOrO);
// Construct identity layout for sO
Tensor cO = make_identity_tensor(make_shape(size<0>(gO), size<1>(gO))); // (BLK_M,BLK_K) -> (blk_m,blk_k)
// Repeat the partitioning with identity layouts
Tensor tOcO = gmem_thr_copy_O.partition_D(cO);
Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOgO)));
if (!Is_even_K) {
#pragma unroll
for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(0, 0, k)) < params.d; }
}
// Clear_OOB_K must be false since we don't want to write zeros to gmem
flash::copy<Is_even_MN, Is_even_K, /*Clear_OOB_MN=*/false, /*Clear_OOB_K=*/false>(
gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO, binfo.actual_seqlen_q - m_block * kBlockM
);
#pragma unroll
for (int m = 0; m < size<1>(tOgO); ++m) {
const int row = get<0>(tOcO(0, m, 0));
if (row < binfo.actual_seqlen_q - m_block * kBlockM && get<1>(tOcO(0, m, 0)) == 0) { gLSE(row) = INFINITY; }
}
return;
}
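This early-exit path writes a zero output tile and marks its rows with LSE = INFINITY, using the same identity-tensor/predicate machinery as the epilogue: tOcO carries the (row, col) coordinate of each element a thread would write, tOpO records which head-dim columns are in bounds, and flash::copy skips (rather than zero-fills) anything out of range since Clear_OOB_MN and Clear_OOB_K are false. A stripped-down host-side sketch of that predicated store, with plain loops standing in for the CUTE partitioning:

#include <cmath>

// Host-side sketch of the early-exit store: zero the rows this block owns in gO,
// skip out-of-range rows/columns instead of zero-filling them, and mark the rows
// that attend over no keys with LSE = INFINITY.
void store_zero_tile(float* gO, float* gLSE, int rows, int cols, int row_stride,
                     int valid_rows, int valid_cols) {
    for (int r = 0; r < rows && r < valid_rows; ++r) {          // Is_even_MN predicate
        for (int c = 0; c < cols && c < valid_cols; ++c) {      // Is_even_K predicate (tOpO)
            gO[r * row_stride + c] = 0.f;                       // the cleared tOrO fragment
        }
        gLSE[r] = INFINITY;                                     // no keys contribute to this row
    }
}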
// if (tidx == 0) { printf("m_block = %d, n_block_min = %d, n_block_max = %d\n", m_block, n_block_min, n_block_max); }
// We iterate over the blocks in reverse order. This is because the last block is the only one
// that needs masking when we read K and V from global memory. Moreover, iterating in reverse
@ -185,8 +188,10 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
Tensor sVt = make_tensor(sV.data(), typename Kernel_traits::SmemLayoutVtransposed{});
Tensor sVtNoSwizzle = make_tensor(sV.data(), typename Kernel_traits::SmemLayoutVtransposedNoSwizzle{});
auto gmem_thr_copy_QKV = typename Kernel_traits::GmemTiledCopyQKV{}.get_thread_slice(tidx);
auto gmem_thr_copy_P = typename Kernel_traits::GmemTiledCopyP{}.get_thread_slice(tidx);
typename Kernel_traits::GmemTiledCopyQKV gmem_tiled_copy_QKV;
auto gmem_thr_copy_QKV = gmem_tiled_copy_QKV.get_thread_slice(tidx);
typename Kernel_traits::GmemTiledCopyP gmem_tiled_copy_P;
auto gmem_thr_copy_P = gmem_tiled_copy_P.get_thread_slice(tidx);
Tensor tQgQ = gmem_thr_copy_QKV.partition_S(gQ);
Tensor tQsQ = gmem_thr_copy_QKV.partition_D(sQ);
@ -208,16 +213,18 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
// Copy Atom retiling
//
auto smem_thr_copy_Q = make_tiled_copy_A(typename Kernel_traits::SmemCopyAtom{}, tiled_mma).get_thread_slice(tidx);
// auto smem_thr_copy_Q = make_tiled_copy_A_warpcontiguousM<MMA_M>(typename Kernel_traits::SmemCopyAtom{}, tiled_mma).get_thread_slice(tidx);
auto smem_tiled_copy_Q = make_tiled_copy_A(typename Kernel_traits::SmemCopyAtom{}, tiled_mma);
auto smem_thr_copy_Q = smem_tiled_copy_Q.get_thread_slice(tidx);
// if (cute::thread0()) {smem_thr_copy_Q.print_all();}
Tensor tSsQ = smem_thr_copy_Q.partition_S(sQ);
// if (cute::thread0()) {print(tSsQ.layout()); printf("\n");}
auto smem_thr_copy_K = make_tiled_copy_B(typename Kernel_traits::SmemCopyAtom{}, tiled_mma).get_thread_slice(tidx);
auto smem_tiled_copy_K = make_tiled_copy_B(typename Kernel_traits::SmemCopyAtom{}, tiled_mma);
auto smem_thr_copy_K = smem_tiled_copy_K.get_thread_slice(tidx);
Tensor tSsK = smem_thr_copy_K.partition_S(sK);
auto smem_thr_copy_V = make_tiled_copy_B(typename Kernel_traits::SmemCopyAtomTransposed{}, tiled_mma).get_thread_slice(tidx);
auto smem_tiled_copy_V = make_tiled_copy_B(typename Kernel_traits::SmemCopyAtomTransposed{}, tiled_mma);
auto smem_thr_copy_V = smem_tiled_copy_V.get_thread_slice(tidx);
Tensor tOsVt = smem_thr_copy_V.partition_S(sVt);
// TODO: this might need to change if we change the mma instruction in SM70
@ -268,8 +275,8 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
Tensor tQrQ = make_fragment_like(tQgQ);
// We don't need to clear the sQ smem tiles since we'll only write out the valid outputs
flash::copy</*Is_even_MN=*/false, Is_even_K>(gmem_thr_copy_QKV, tQgQ, tQsQ, tQcQ, tQpQ,
binfo.actual_seqlen_q - m_block * kBlockM);
flash::copy<Is_even_MN, Is_even_K>(gmem_tiled_copy_QKV, tQgQ, tQsQ, tQcQ, tQpQ,
binfo.actual_seqlen_q - m_block * kBlockM);
if (Kernel_traits::Is_Q_in_regs) { cute::cp_async_fence(); }
// // Copy rmem to smem
@ -285,14 +292,14 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
__syncthreads();
Tensor tSrQ_copy_view = smem_thr_copy_Q.retile_D(tSrQ);
CUTE_STATIC_ASSERT_V(size<1>(tSsQ) == size<1>(tSrQ_copy_view)); // M
copy(smem_thr_copy_Q, tSsQ, tSrQ_copy_view);
cute::copy(smem_tiled_copy_Q, tSsQ, tSrQ_copy_view);
__syncthreads();
}
int n_block = n_block_max - 1;
// We don't need to clear the sK smem tiles since we'll mask out the scores anyway.
flash::copy<Is_even_N, Is_even_K>(gmem_thr_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV,
binfo.actual_seqlen_k - n_block * kBlockN);
flash::copy<Is_even_MN, Is_even_K>(gmem_tiled_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV,
binfo.actual_seqlen_k - n_block * kBlockN);
cute::cp_async_fence();
// if (threadIdx.x == 0 && blockIdx.y == 0 && blockIdx.z < 2) { print(tKgK); }
// __syncthreads();
@ -302,7 +309,7 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
__syncthreads();
Tensor tSrQ_copy_view = smem_thr_copy_Q.retile_D(tSrQ);
CUTE_STATIC_ASSERT_V(size<1>(tSsQ) == size<1>(tSrQ_copy_view)); // M
copy(smem_thr_copy_Q, tSsQ, tSrQ_copy_view);
cute::copy(smem_tiled_copy_Q, tSsQ, tSrQ_copy_view);
}
// auto seeds = at::cuda::philox::unpack(params.philox_args);
@ -313,13 +320,19 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
clear(acc_o);
float alibi_slope = !Has_alibi ? 0.0f : reinterpret_cast<float *>(params.alibi_slopes_ptr)[bidb * params.alibi_slopes_batch_stride + bidh] / params.scale_softmax;
// For performance reason, we separate out two kinds of iterations:
// those that need masking on S, and those that don't.
// We need masking on S for the very last block when K and V have a length that is not a multiple of kBlockN.
// We also need masking on S if it's causal, for the last ceil_div(kBlockM, kBlockN) blocks.
// We will have at least 1 "masking" iteration.
constexpr int n_masking_steps = Is_causal ? cute::ceil_div(kBlockM, kBlockN) : 1;
// If not even_N, then seqlen_k might end in the middle of a block. In that case we need to
// mask 2 blocks (e.g. when kBlockM == kBlockN), not just 1.
constexpr int n_masking_steps = (!Is_causal && !Is_local)
? 1
: ((Is_even_MN && Is_causal) ? cute::ceil_div(kBlockM, kBlockN) : cute::ceil_div(kBlockM, kBlockN) + 1);
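A quick sanity check of that formula as a standalone snippet (the tile sizes are example values and may differ from the traits actually selected): for kBlockM = 128 and kBlockN = 64, a causal kernel with even sequence lengths needs ceil_div(128, 64) = 2 masked iterations, while an uneven or local one needs one extra block because seqlen_k can end mid-block.

#include <cstdio>

constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

constexpr int masking_steps(bool is_causal, bool is_local, bool is_even_mn,
                            int kBlockM, int kBlockN) {
    return (!is_causal && !is_local)
        ? 1
        : ((is_even_mn && is_causal) ? ceil_div(kBlockM, kBlockN)
                                     : ceil_div(kBlockM, kBlockN) + 1);
}

int main() {
    std::printf("%d\n", masking_steps(true,  false, true,  128, 64));  // 2
    std::printf("%d\n", masking_steps(true,  false, false, 128, 64));  // 3
    std::printf("%d\n", masking_steps(false, false, true,  128, 64));  // 1
}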
#pragma unroll
for (int masking_step = 0; masking_step < n_masking_steps; ++masking_step, --n_block) {
Tensor acc_s = partition_fragment_C(tiled_mma, Shape<Int<kBlockM>, Int<kBlockN>>{}); // (MMA=4, MMA_M, MMA_N)
@ -330,28 +343,42 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
// Advance gV
if (masking_step > 0) {
tVgV.data() = tVgV.data() + (-int(kBlockN * params.v_row_stride));
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_thr_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV);
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV);
} else {
// Clear the smem tiles to account for predicated off loads
flash::copy<Is_even_N, Is_even_K, /*Clear_OOB_MN=*/true>(
gmem_thr_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV, binfo.actual_seqlen_k - n_block * kBlockN
flash::copy<Is_even_MN, Is_even_K, /*Clear_OOB_MN=*/true>(
gmem_tiled_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV, binfo.actual_seqlen_k - n_block * kBlockN
);
}
cute::cp_async_fence();
flash::gemm</*A_in_regs=*/Kernel_traits::Is_Q_in_regs>(
acc_s, tSrQ, tSrK, tSsQ, tSsK, tiled_mma, smem_thr_copy_Q, smem_thr_copy_K
acc_s, tSrQ, tSrK, tSsQ, tSsK, tiled_mma, smem_tiled_copy_Q, smem_tiled_copy_K,
smem_thr_copy_Q, smem_thr_copy_K
);
// if (cute::thread0()) { print(acc_s); }
// Reshape acc_s from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N))
Tensor scores = make_tensor(acc_s.data(), flash::convert_layout_acc_rowcol(acc_s.layout()));
// if (cute::thread0()) { print(scores); }
// if (cute::thread0()) { print_tensor(scores); }
// We don't put the masking before the matmul S = Q K^T because we don't clear sK
// for rows outside actual_seqlen_k. So those rows could have Inf / NaN, and the matmul
// can produce Inf / NaN.
if (!Is_causal) {
if (!Is_even_N) { flash::apply_mask(scores, binfo.actual_seqlen_k - n_block * kBlockN); }
if (Has_alibi) {
flash::apply_alibi<Is_causal>(
scores,
n_block * kBlockN,
binfo.actual_seqlen_k,
m_block * kBlockM + (tidx / 32) * 16 + (tidx % 32) / 4,
binfo.actual_seqlen_q,
kNWarps * 16,
alibi_slope
);
}
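apply_alibi adds a per-head linear bias to the raw scores; note that alibi_slope is pre-divided by scale_softmax above because the bias is injected before the softmax scaling. A rough scalar sketch of the usual ALiBi form (the indexing inside apply_alibi is not shown in this diff, so the distance convention below is an assumption):

// Illustrative ALiBi bias added to one raw attention score. The per-head slope
// penalizes the distance between the (end-aligned) query position and the key
// position; the exact distance convention used by apply_alibi is assumed here.
inline float alibi_biased_score(float raw_score, float slope,
                                int q_pos, int k_pos, int seqlen_q, int seqlen_k) {
    int q_aligned = q_pos + (seqlen_k - seqlen_q);   // align sequence ends, as the kernel's
                                                     // seqlen_k - seqlen_q offsets do
    int dist = q_aligned >= k_pos ? q_aligned - k_pos : k_pos - q_aligned;
    return raw_score - slope * float(dist);
}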
if (!Is_causal && !Is_local) {
if (!Is_even_MN) { flash::apply_mask(scores, binfo.actual_seqlen_k - n_block * kBlockN); }
} else {
// Tensor caccS = make_identity_tensor(Shape<Int<kBlockM>, Int<kBlockN>>{}); // (BLK_M,BLK_N) -> (blk_m,blk_n)
// Tensor taccScS = thr_mma.partition_C(caccS); // (MMA,MMA_M,MMA_N)
@ -364,20 +391,24 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
// Idk why it's get<1> and not get<0> of the stride.
// if (cute::thread0()) { print(idx_row.layout()); print(stride<1>(idx_row)); printf("stride = %d \n", get<1>(stride<1>(idx_row))); }
// I can't get the stride from idx_row
flash::apply_mask_causal(scores, n_block * kBlockN, binfo.actual_seqlen_k,
// m_block * kBlockM + get<0>(idx_row(0)),
m_block * kBlockM + (tidx / 32) * 16 + (tidx % 32) / 4,
kNWarps * 16);
// m_block * kBlockM + (tidx / 32) * 16, kNWarps * 16);
// m_block * kBlockM + (tidx / 32) * (kBlockM / kNWarps), 16);
flash::apply_mask_local</*HasWSLeft=*/Is_local>(
scores, n_block * kBlockN, binfo.actual_seqlen_k,
// m_block * kBlockM + get<0>(idx_row(0)),
m_block * kBlockM + (tidx / 32) * 16 + (tidx % 32) / 4,
binfo.actual_seqlen_q, kNWarps * 16,
params.window_size_left, params.window_size_right
// m_block * kBlockM + (tidx / 32) * 16, kNWarps * 16
// m_block * kBlockM + (tidx / 32) * (kBlockM / kNWarps), 16
);
// if (cute::thread0()) { print_tensor(scores); }
}
flash::cp_async_wait<0>();
__syncthreads();
if (n_block > 0) {
if (n_block > n_block_min) {
// Advance gK
tKgK.data() = tKgK.data() + (-int(kBlockN * params.k_row_stride));
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_thr_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV);
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV);
// This cp_async_fence needs to be in the if block, otherwise the synchronization
// isn't right and we get race conditions.
cute::cp_async_fence();
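The fence/wait pairing is what makes the prefetch of the next K tile safe: each cp_async_fence() closes a group of asynchronous copies, and cp_async_wait plus __syncthreads() before the tile is consumed guarantees the data has landed in shared memory. Keeping the fence inside the if means the wait at the top of the next iteration matches the copies actually issued, as the comment above notes. A condensed sketch of that ordering using the plain CUDA pipeline primitives rather than the cute helpers (a minimal illustration, assuming sm_80+ and a 128-thread block):

#include <cuda_pipeline.h>

// Double-buffered global->shared prefetch: issue the next tile's copy, fence it
// inside the "is there a next tile" branch, then wait before consuming the tile.
__global__ void async_copy_pipeline(const float4* __restrict__ gmem, float4* out, int n_tiles) {
    __shared__ float4 smem[2][128];
    int buf = 0;
    __pipeline_memcpy_async(&smem[buf][threadIdx.x], &gmem[threadIdx.x], sizeof(float4));
    __pipeline_commit();                               // ~ cute::cp_async_fence()
    for (int t = 0; t < n_tiles; ++t) {
        if (t + 1 < n_tiles) {
            __pipeline_memcpy_async(&smem[buf ^ 1][threadIdx.x],
                                    &gmem[(t + 1) * 128 + threadIdx.x], sizeof(float4));
            __pipeline_commit();                       // fence stays inside the branch
            __pipeline_wait_prior(1);                  // tile t has landed, t+1 may be in flight
        } else {
            __pipeline_wait_prior(0);                  // last tile: drain everything
        }
        __syncthreads();
        out[t * 128 + threadIdx.x] = smem[buf][threadIdx.x];   // "compute" on tile t
        buf ^= 1;
    }
}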
@ -385,24 +416,24 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
// TODO: when we have key_padding_mask we'll need to Check_inf
masking_step == 0
? softmax_rescale_o</*Is_first=*/true, /*Check_inf=*/Is_causal>(scores, scores_max, scores_sum, acc_o, params.scale_softmax_log2)
: softmax_rescale_o</*Is_first=*/false, /*Check_inf=*/Is_causal>(scores, scores_max, scores_sum, acc_o, params.scale_softmax_log2);
? softmax_rescale_o</*Is_first=*/true, /*Check_inf=*/Is_causal || Is_local>(scores, scores_max, scores_sum, acc_o, params.scale_softmax_log2)
: softmax_rescale_o</*Is_first=*/false, /*Check_inf=*/Is_causal || Is_local>(scores, scores_max, scores_sum, acc_o, params.scale_softmax_log2);
// Convert scores from fp32 to fp16/bf16
Tensor rP = flash::convert_type<Element>(scores);
// Reshape rP from (nrow=(2, MMA_M), ncol=(2, MMA_N)) to ((2, 2, 2), MMA_M, MMA_N / 2)
// if using m16n8k16 or ((2, 2, 1), MMA_M, MMA_N) if using m16n8k8.
Tensor tOrP = make_tensor(rP.data(), flash::convert_layout_rowcol_Aregs<Kernel_traits::TiledMma>(rP.layout()));
uint32_t block_row_idx = m_block * (kBlockM / 16) + tidx / 32;
uint32_t block_col_idx = n_block * (kBlockN / 32);
int block_row_idx = m_block * (kBlockM / 16) + tidx / 32;
int block_col_idx = n_block * (kBlockN / 32);
if (Return_softmax) {
Tensor tOrP_copy = make_fragment_like(tOrP);
copy(tOrP, tOrP_copy);
cute::copy(tOrP, tOrP_copy);
flash::apply_dropout</*encode_dropout_in_sign_bit=*/true>(
tOrP_copy, params.p_dropout_in_uint8_t, seed, offset,
block_row_idx, block_col_idx, kNWarps
);
flash::write_softmax_to_gmem(tOrP_copy, tPgP, gmem_thr_copy_P);
flash::write_softmax_to_gmem(tOrP_copy, tPgP, gmem_tiled_copy_P);
tPgP.data() = tPgP.data() + (-kBlockN);
}
if (Is_dropout) {
@ -411,37 +442,38 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
}
// if (cute::thread0()) { print(tOrP); }
flash::gemm_A_in_regs(acc_o, tOrP, tOrVt, tOsVt, tiled_mma, smem_thr_copy_V);
flash::gemm_A_in_regs(acc_o, tOrP, tOrVt, tOsVt, tiled_mma, smem_tiled_copy_V, smem_thr_copy_V);
// if (cute::thread0()) { print(scores); }
// This check is at the end of the loop since we always have at least 1 iteration
if (n_masking_steps > 1 && n_block <= 0) {
if (n_masking_steps > 1 && n_block <= n_block_min) {
--n_block;
break;
}
}
// These are the iterations where we don't need masking on S
for (; n_block >= 0; --n_block) {
for (; n_block >= n_block_min; --n_block) {
Tensor acc_s = partition_fragment_C(tiled_mma, Shape<Int<kBlockM>, Int<kBlockN>>{}); // (MMA=4, MMA_M, MMA_N)
clear(acc_s);
flash::cp_async_wait<0>();
__syncthreads();
// Advance gV
tVgV.data() = tVgV.data() + (-int(kBlockN * params.v_row_stride));
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_thr_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV);
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV, tVgV, tVsV, tKVcKV, tKVpKV);
cute::cp_async_fence();
flash::gemm</*A_in_regs=*/Kernel_traits::Is_Q_in_regs>(
acc_s, tSrQ, tSrK, tSsQ, tSsK, tiled_mma, smem_thr_copy_Q, smem_thr_copy_K
acc_s, tSrQ, tSrK, tSsQ, tSsK, tiled_mma, smem_tiled_copy_Q, smem_tiled_copy_K,
smem_thr_copy_Q, smem_thr_copy_K
);
flash::cp_async_wait<0>();
__syncthreads();
if (n_block > 0) {
if (n_block > n_block_min) {
// Advance gK
tKgK.data() = tKgK.data() + (-int(kBlockN * params.k_row_stride));
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_thr_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV);
flash::copy</*Is_even_MN=*/true, Is_even_K>(gmem_tiled_copy_QKV, tKgK, tKsK, tKVcKV, tKVpKV);
// This cp_async_fence needs to be in the if block, otherwise the synchronization
// isn't right and we get race conditions.
cute::cp_async_fence();
@ -449,22 +481,44 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
// Reshape acc_s from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N))
Tensor scores = make_tensor(acc_s.data(), flash::convert_layout_acc_rowcol(acc_s.layout()));
softmax_rescale_o</*Is_first=*/false>(scores, scores_max, scores_sum, acc_o, params.scale_softmax_log2);
if (Has_alibi) {
flash::apply_alibi<Is_causal>(
scores,
n_block * kBlockN,
binfo.actual_seqlen_k,
m_block * kBlockM + (tidx / 32) * 16 + (tidx % 32) / 4,
binfo.actual_seqlen_q,
kNWarps * 16,
alibi_slope
);
}
if (Is_local && n_block * kBlockN < (m_block + 1) * kBlockM + binfo.actual_seqlen_k - binfo.actual_seqlen_q + params.window_size_right) {
flash::apply_mask_local(
scores, n_block * kBlockN, binfo.actual_seqlen_k,
m_block * kBlockM + (tidx / 32) * 16 + (tidx % 32) / 4,
binfo.actual_seqlen_q, kNWarps * 16,
params.window_size_left, params.window_size_right
);
}
softmax_rescale_o</*Is_first=*/false, /*Check_inf=*/Is_local>(scores, scores_max, scores_sum, acc_o, params.scale_softmax_log2);
Tensor rP = flash::convert_type<Element>(scores);
// Reshape rP from (nrow=(2, MMA_M), ncol=(2, MMA_N)) to ((2, 2, 2), MMA_M, MMA_N / 2)
// if using m16n8k16 or ((2, 2, 1), MMA_M, MMA_N) if using m16n8k8.
Tensor tOrP = make_tensor(rP.data(), flash::convert_layout_rowcol_Aregs<Kernel_traits::TiledMma>(rP.layout()));
uint32_t block_row_idx = m_block * (kBlockM / 16) + tidx / 32;
uint32_t block_col_idx = n_block * (kBlockN / 32);
int block_row_idx = m_block * (kBlockM / 16) + tidx / 32;
int block_col_idx = n_block * (kBlockN / 32);
if (Return_softmax) {
Tensor tOrP_copy = make_fragment_like(tOrP);
copy(tOrP, tOrP_copy);
cute::copy(tOrP, tOrP_copy);
flash::apply_dropout</*encode_dropout_in_sign_bit=*/true>(
tOrP_copy, params.p_dropout_in_uint8_t, seed, offset,
block_row_idx, block_col_idx, kNWarps
);
flash::write_softmax_to_gmem(tOrP_copy, tPgP, gmem_thr_copy_P);
flash::write_softmax_to_gmem(tOrP_copy, tPgP, gmem_tiled_copy_P);
tPgP.data() = tPgP.data() + (-kBlockN);
}
if (Is_dropout) {
@ -472,7 +526,7 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
block_row_idx, block_col_idx, kNWarps);
}
flash::gemm_A_in_regs(acc_o, tOrP, tOrVt, tOsVt, tiled_mma, smem_thr_copy_V);
flash::gemm_A_in_regs(acc_o, tOrP, tOrVt, tOsVt, tiled_mma, smem_tiled_copy_V, smem_thr_copy_V);
}
// Epilogue
@ -496,15 +550,15 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
Tensor rO = flash::convert_type<Element>(acc_o);
Tensor sO = make_tensor(sQ.data(), typename Kernel_traits::SmemLayoutO{}); // (SMEM_M,SMEM_N)
// Partition sO to match the accumulator partitioning
auto smem_thr_copy_O = make_tiled_copy_C(typename Kernel_traits::SmemCopyAtomO{}, tiled_mma).get_thread_slice(tidx);
// auto smem_thr_copy_O = make_tiled_copy_C_warpcontiguousM<MMA_M>(typename Kernel_traits::SmemCopyAtomO{}, tiled_mma).get_thread_slice(tidx);
auto smem_tiled_copy_O = make_tiled_copy_C(typename Kernel_traits::SmemCopyAtomO{}, tiled_mma);
auto smem_thr_copy_O = smem_tiled_copy_O.get_thread_slice(tidx);
Tensor taccOrO = smem_thr_copy_O.retile_S(rO); // ((Atom,AtomNum), MMA_M, MMA_N)
Tensor taccOsO = smem_thr_copy_O.partition_D(sO); // ((Atom,AtomNum),PIPE_M,PIPE_N)
// sO has the same size as sQ, so we don't need to sync here.
if (Kernel_traits::Share_Q_K_smem) { __syncthreads(); }
copy(smem_thr_copy_O, taccOrO, taccOsO);
cute::copy(smem_tiled_copy_O, taccOrO, taccOsO);
const index_t row_offset_o = binfo.q_offset(params.o_batch_stride, params.o_row_stride, bidb)
+ m_block * kBlockM * params.o_row_stride + bidh * params.o_head_stride;
@ -515,14 +569,15 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
Tensor gLSE = make_tensor(make_gmem_ptr(reinterpret_cast<ElementAccum *>(params.softmax_lse_ptr) + row_offset_lse),
Shape<Int<kBlockM>>{}, Stride<_1>{});
auto gmem_thr_copy_O = typename Kernel_traits::GmemTiledCopyO{}.get_thread_slice(tidx);
typename Kernel_traits::GmemTiledCopyO gmem_tiled_copy_O;
auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(tidx);
Tensor tOsO = gmem_thr_copy_O.partition_S(sO); // ((Atom,AtomNum),ATOM_M,ATOM_N)
Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
__syncthreads();
Tensor tOrO = make_tensor<Element>(shape(tOgO));
copy(gmem_thr_copy_O, tOsO, tOrO);
cute::copy(gmem_tiled_copy_O, tOsO, tOrO);
Tensor caccO = make_identity_tensor(Shape<Int<kBlockM>, Int<kHeadDim>>{}); // (BLK_M,BLK_K) -> (blk_m,blk_k)
Tensor taccOcO = thr_mma.partition_C(caccO); // (MMA,MMA_M,MMA_K)
@ -548,14 +603,15 @@ inline __device__ void compute_attn_1rowblock(const Params &params, const int bi
for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(0, 0, k)) < params.d; }
}
// Clear_OOB_K must be false since we don't want to write zeros to gmem
flash::copy</*Is_even_MN=*/false, Is_even_K, /*Clear_OOB_MN=*/false, /*Clear_OOB_K=*/false>(
gmem_thr_copy_O, tOrO, tOgO, tOcO, tOpO, binfo.actual_seqlen_q - m_block * kBlockM
flash::copy<Is_even_MN, Is_even_K, /*Clear_OOB_MN=*/false, /*Clear_OOB_K=*/false>(
gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO, binfo.actual_seqlen_q - m_block * kBlockM
);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Is_even_N, bool Is_even_K, bool Return_softmax, typename Params>
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Return_softmax, typename Params>
inline __device__ void compute_attn(const Params &params) {
const int m_block = blockIdx.x;
// The block index for the batch.
@ -571,7 +627,7 @@ inline __device__ void compute_attn(const Params &params) {
// the attention matrix. This way, as long as we have the batch, head, and the location of
// the 16 x 32 block within the attention matrix, we can generate the exact same dropout pattern.
flash::compute_attn_1rowblock<Kernel_traits, Is_dropout, Is_causal, Is_even_N, Is_even_K, Return_softmax>(params, bidb, bidh, m_block);
flash::compute_attn_1rowblock<Kernel_traits, Is_dropout, Is_causal, Is_local, Has_alibi, Is_even_MN, Is_even_K, Return_softmax>(params, bidb, bidh, m_block);
}
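The dropout pattern is made reproducible by seeding Philox from the saved RNG state and deriving the subsequence from the batch, head, and 16x32-block coordinates, so the forward and backward passes regenerate identical masks. A rough sketch with cuRAND's Philox generator (the exact offset/subsequence packing used by these kernels is not visible here, so the keying below is only illustrative):

#include <curand_kernel.h>

// Illustrative deterministic dropout bits for one 16x32 attention block:
// the same (seed, offset, batch, head, block position) always yields the same bits.
__device__ unsigned int dropout_bits_for_block(unsigned long long seed,
                                               unsigned long long offset,
                                               int batch, int head,
                                               int block_row, int block_col,
                                               int num_heads, int num_block_cols) {
    // Pack the block coordinates into the Philox subsequence (illustrative packing).
    unsigned long long subsequence =
        ((unsigned long long)(batch * num_heads + head) * num_block_cols + block_col)
        * 1024ull + block_row;
    curandStatePhilox4_32_10_t state;
    curand_init(seed, subsequence, offset, &state);
    uint4 r = curand4(&state);         // 4 x 32 random bits for part of the block
    return r.x;                        // caller compares against the keep-threshold
}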
////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -4,15 +4,14 @@
#pragma once
// #include <ATen/cuda/CUDAContext.h>
#include "static_switch.h"
#include "flash.h"
#include "flash_fwd_kernel.h"
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Is_even_N, bool Is_even_K, bool Return_softmax>
template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Is_local, bool Has_alibi, bool Is_even_MN, bool Is_even_K, bool Return_softmax>
__global__ void flash_fwd_kernel(Flash_fwd_params params) {
flash::compute_attn<Kernel_traits, Is_dropout, Is_causal, Is_even_N, Is_even_K, Return_softmax>(params);
static_assert(!(Is_causal && Is_local)); // If Is_local is true, Is_causal should be false
flash::compute_attn<Kernel_traits, Is_dropout, Is_causal, Is_local, Has_alibi, Is_even_MN, Is_even_K, Return_softmax>(params);
}
template<typename Kernel_traits, bool Is_dropout, bool Is_causal>
@ -26,35 +25,39 @@ void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {
const int num_m_block = (params.seqlen_q + Kernel_traits::kBlockM - 1) / Kernel_traits::kBlockM;
dim3 grid(num_m_block, params.b, params.h);
// We also use is_even_N to set Unpadded in the BlockInfo constructor, so we need to check
// for cu_seqlens_q as well.
const bool is_even_N = params.cu_seqlens_q == nullptr && params.cu_seqlens_k == nullptr && params.seqlen_k % Kernel_traits::kBlockN == 0;
const bool is_even_MN = params.cu_seqlens_q == nullptr && params.cu_seqlens_k == nullptr && params.seqlen_k % Kernel_traits::kBlockN == 0 && params.seqlen_q % Kernel_traits::kBlockM == 0;
const bool is_even_K = params.d == Kernel_traits::kHeadDim;
const bool return_softmax = params.p_ptr != nullptr;
BOOL_SWITCH(is_even_N, IsEvenNConst, [&] {
BOOL_SWITCH(is_even_MN, IsEvenMNConst, [&] {
BOOL_SWITCH(is_even_K, IsEvenKConst, [&] {
BOOL_SWITCH(return_softmax, ReturnSoftmaxConst, [&] {
// Will only return softmax if dropout, to reduce compilation time.
auto kernel = &flash_fwd_kernel<Kernel_traits, Is_dropout, Is_causal, IsEvenNConst, IsEvenKConst, ReturnSoftmaxConst && Is_dropout>;
// auto kernel = &flash_fwd_kernel<Kernel_traits, Is_dropout, Is_causal, IsEvenNConst, true, ReturnSoftmaxConst && Is_dropout>;
// if (smem_size >= 48 * 1024) {
// C10_CUDA_CHECK(cudaFuncSetAttribute(
// kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
// }
int ctas_per_sm;
cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&ctas_per_sm, kernel, Kernel_traits::kNThreads, smem_size);
// printf("smem_size = %d, CTAs per SM = %d\n", int(smem_size), ctas_per_sm);
kernel<<<grid, Kernel_traits::kNThreads, smem_size, stream>>>(params);
// C10_CUDA_KERNEL_LAUNCH_CHECK();
BOOL_SWITCH((params.window_size_left >= 0 || params.window_size_right >= 0) && !Is_causal, Is_local, [&] {
BOOL_SWITCH(return_softmax, ReturnSoftmaxConst, [&] {
BOOL_SWITCH(params.alibi_slopes_ptr != nullptr, Has_alibi, [&] {
// Will only return softmax if dropout, to reduce compilation time.
// If not IsEvenKConst, we also set IsEvenMNConst to false to reduce number of templates.
// If return_softmax, set IsEvenMNConst to false to reduce number of templates
// If head dim > 128, set IsEvenMNConst to false to reduce number of templates
// If Is_local, set Is_causal to false
auto kernel = &flash_fwd_kernel<Kernel_traits, Is_dropout, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && IsEvenKConst && !Is_local && !ReturnSoftmaxConst && Kernel_traits::kHeadDim <= 128, IsEvenKConst, ReturnSoftmaxConst && Is_dropout>;
// auto kernel = &flash_fwd_kernel<Kernel_traits, false, Is_causal, false, false, true, true, false>;
// printf("IsEvenMNConst = %d, IsEvenKConst = %d, Is_local = %d, Is_causal = %d, ReturnSoftmaxConst = %d, Is_dropout = %d\n", int(IsEvenMNConst), int(IsEvenKConst), int(Is_local), int(Is_causal), int(ReturnSoftmaxConst), int(Is_dropout));
// auto kernel = &flash_fwd_kernel<Kernel_traits, false, Is_causal, false, true, true, false>;
// int ctas_per_sm;
// cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
// &ctas_per_sm, kernel, Kernel_traits::kNThreads, smem_size);
// printf("smem_size = %d, CTAs per SM = %d\n", int(smem_size), ctas_per_sm);
kernel<<<grid, Kernel_traits::kNThreads, smem_size, stream>>>(params);
});
});
});
});
});
}
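run_flash_fwd turns each runtime flag into a compile-time template parameter through nested BOOL_SWITCH calls, so the kernel body can dead-code-eliminate whole branches; the cost is that every extra switch doubles the number of instantiations, which is why several of the new flags are collapsed (IsEvenMNConst forced to false when the head dim is large, softmax is returned, or Is_local is set). A minimal sketch of the dispatch idiom, assuming a BOOL_SWITCH defined roughly like the one in static_switch.h:

// Sketch of the BOOL_SWITCH idiom: lift a runtime bool into a template parameter.
#define BOOL_SWITCH(COND, CONST_NAME, ...)           \
    [&] {                                            \
        if (COND) {                                  \
            constexpr bool CONST_NAME = true;        \
            return __VA_ARGS__();                    \
        } else {                                     \
            constexpr bool CONST_NAME = false;       \
            return __VA_ARGS__();                    \
        }                                            \
    }()

template <bool IsCausal, bool IsEvenK>
void launch_kernel() { /* one fully specialized instantiation per combination */ }

void dispatch(bool is_causal, bool is_even_k) {
    BOOL_SWITCH(is_causal, IsCausalConst, [&] {
        BOOL_SWITCH(is_even_k, IsEvenKConst, [&] {
            launch_kernel<IsCausalConst, IsEvenKConst>();   // 4 instantiations total
        });
    });
}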
template<typename T>
void run_mha_fwd_hdim32(Flash_fwd_params &params, cudaStream_t stream) {
constexpr int Headdim = 32;
constexpr static int Headdim = 32;
BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
BOOL_SWITCH(params.is_causal, Is_causal, [&] {
run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 128, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
@ -64,7 +67,7 @@ void run_mha_fwd_hdim32(Flash_fwd_params &params, cudaStream_t stream) {
template<typename T>
void run_mha_fwd_hdim64(Flash_fwd_params &params, cudaStream_t stream) {
constexpr int Headdim = 64;
constexpr static int Headdim = 64;
BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
BOOL_SWITCH(params.is_causal, Is_causal, [&] {
if constexpr(!Is_dropout) {
@ -86,7 +89,7 @@ void run_mha_fwd_hdim64(Flash_fwd_params &params, cudaStream_t stream) {
template<typename T>
void run_mha_fwd_hdim96(Flash_fwd_params &params, cudaStream_t stream) {
constexpr int Headdim = 96;
constexpr static int Headdim = 96;
// auto dprops = at::cuda::getCurrentDeviceProperties();
bool is_sm8x = true; // dprops->major == 8 && dprops->minor > 0;
BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
@ -112,7 +115,7 @@ void run_mha_fwd_hdim96(Flash_fwd_params &params, cudaStream_t stream) {
template<typename T>
void run_mha_fwd_hdim128(Flash_fwd_params &params, cudaStream_t stream) {
constexpr int Headdim = 128;
constexpr static int Headdim = 128;
// auto dprops = at::cuda::getCurrentDeviceProperties();
bool is_sm8x = true; // dprops->major == 8 && dprops->minor > 0;
BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
@ -149,7 +152,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params &params, cudaStream_t stream) {
template<typename T>
void run_mha_fwd_hdim160(Flash_fwd_params &params, cudaStream_t stream) {
constexpr int Headdim = 160;
constexpr static int Headdim = 160;
// auto dprops = at::cuda::getCurrentDeviceProperties();
bool is_sm8x = true; // dprops->major == 8 && dprops->minor > 0;
BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
@ -179,7 +182,7 @@ void run_mha_fwd_hdim160(Flash_fwd_params &params, cudaStream_t stream) {
template<typename T>
void run_mha_fwd_hdim192(Flash_fwd_params &params, cudaStream_t stream) {
constexpr int Headdim = 192;
constexpr static int Headdim = 192;
BOOL_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
BOOL_SWITCH(params.is_causal, Is_causal, [&] {
if constexpr(!Is_dropout) {
@ -198,7 +201,7 @@ void run_mha_fwd_hdim192(Flash_fwd_params &params, cudaStream_t stream) {
template<typename T>
void run_mha_fwd_hdim224(Flash_fwd_params &params, cudaStream_t stream) {
constexpr int Headdim = 224;
constexpr static int Headdim = 224;
int device;
cudaGetDevice(&device);
int max_smem_per_block;
@ -224,7 +227,7 @@ void run_mha_fwd_hdim224(Flash_fwd_params &params, cudaStream_t stream) {
template<typename T>
void run_mha_fwd_hdim256(Flash_fwd_params &params, cudaStream_t stream) {
constexpr int Headdim = 256;
constexpr static int Headdim = 256;
int device;
cudaGetDevice(&device);
int max_smem_per_sm, max_smem_per_block;
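The larger head dims query how much dynamic shared memory the device allows per block and, as the commented-out cudaFuncSetAttribute call earlier in this file hints, opt the kernel into the larger carve-out before choosing a tile size. A hedged sketch of that query/opt-in pattern with the plain CUDA runtime API (the kernel and threshold are placeholders):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void some_kernel() { extern __shared__ char smem[]; smem[threadIdx.x] = 0; }

// Query the opt-in shared-memory limit and raise the kernel's dynamic-smem cap.
void configure_smem(size_t wanted_smem_bytes) {
    int device = 0, max_smem_per_block = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&max_smem_per_block,
                           cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
    std::printf("opt-in smem per block: %d bytes\n", max_smem_per_block);
    auto kernel = &some_kernel;
    if (wanted_smem_bytes >= 48 * 1024) {   // beyond the default 48 KB carve-out
        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                             int(wanted_smem_bytes));
    }
    kernel<<<1, 32, wanted_smem_bytes>>>();   // launch with dynamic smem
}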

View File

@ -91,17 +91,20 @@ struct Flash_fwd_kernel_traits : public Base {
SmemLayoutAtomQ{},
Shape<Int<kBlockN>, Int<kHeadDim>>{}));
// This has to be kBlockN and not 8, otherwise we get wrong results for d=128
using SmemLayoutAtomVtransposedNoSwizzle = Layout<Shape<Int<kBlockKSmem>, Int<kBlockN>>,
Stride<_1, Int<kBlockKSmem>>>;
using SmemLayoutAtomVtransposed = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
// This has to be kBlockN and not 8, otherwise we get wrong results for d=128
Layout<Shape<Int<kBlockKSmem>, Int<kBlockN>>,
Stride<_1, Int<kBlockKSmem>>>{}));
composition(Swizzle<kSwizzle, 3, 3>{}, SmemLayoutAtomVtransposedNoSwizzle{}));
using SmemLayoutVtransposed = decltype(tile_to_shape(
SmemLayoutAtomVtransposed{},
Shape<Int<kHeadDim>, Int<kBlockN>>{}));
// Maybe the VtransposeNoSwizzle just needs to have the right shape
// And the strides don't matter?
using SmemLayoutVtransposedNoSwizzle = decltype(SmemLayoutVtransposed{}.layout_fn());
using SmemLayoutVtransposedNoSwizzle = decltype(tile_to_shape(
SmemLayoutAtomVtransposedNoSwizzle{},
Shape<Int<kHeadDim>, Int<kBlockN>>{}));
// using SmemLayoutVtransposedNoSwizzle = decltype(SmemLayoutVtransposed{}.layout_fn());
using SmemLayoutAtomO = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
@ -110,7 +113,8 @@ struct Flash_fwd_kernel_traits : public Base {
using SmemLayoutO = decltype(tile_to_shape(
SmemLayoutAtomO{},
Shape<Int<kBlockM>, Int<kHeadDim>>{}));
using SmemCopyAtomO = Copy_Atom<DefaultCopy, elem_type>;
using SmemCopyAtomO = Copy_Atom<DefaultCopy, Element>;
using SmemCopyAtomOaccum = Copy_Atom<DefaultCopy, ElementAccum>;
static constexpr int kSmemQCount = size(SmemLayoutQ{});
static constexpr int kSmemKVCount = size(SmemLayoutKV{}) * 2;
@ -138,11 +142,11 @@ struct Flash_fwd_kernel_traits : public Base {
DefaultCopy
>;
using GmemTiledCopyQKV = decltype(
make_tiled_copy(Copy_Atom<Gmem_copy_struct, elem_type>{},
make_tiled_copy(Copy_Atom<Gmem_copy_struct, Element>{},
GmemLayoutAtom{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per read
using GmemTiledCopyO = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, elem_type>{},
make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
GmemLayoutAtom{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per store
static constexpr int kGmemThreadsPerRowP = kBlockN / kGmemElemsPerLoad;
@ -151,10 +155,30 @@ struct Flash_fwd_kernel_traits : public Base {
Stride<Int<kGmemThreadsPerRowP>, _1>>;
using GmemTiledCopyP = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, elem_type>{},
make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
GmemLayoutAtomP{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per store
using GmemLayoutAtomOaccum = std::conditional_t<
kBlockKSmem == 32,
Layout<Shape <_16, _8>, // Thread layout, 8 threads per row
Stride< _8, _1>>,
Layout<Shape <_8, _16>, // Thread layout, 16 threads per row
Stride< _16, _1>>
>;
using GmemTiledCopyOaccum = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, ElementAccum>{},
GmemLayoutAtomOaccum{},
Layout<Shape < _1, _4>>{})); // Val layout, 4 vals per store
using GmemLayoutAtomRotcossin = GmemLayoutAtom;
using GmemTiledCopyRotcossin = decltype(
make_tiled_copy(Copy_Atom<UniversalCopy<uint64_t>, Element>{},
GmemLayoutAtomRotcossin{},
Layout<Shape < _1, _4>>{})); // Val layout, 4 vals per load
using GmemTiledCopyRotcossinCont = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
GmemLayoutAtomRotcossin{},
Layout<Shape < _1, _8>>{})); // Val layout, 8 vals per load
};
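The two Rotcossin copy atoms only describe how the rotary cos/sin tables are streamed from global memory (64-bit vectorized loads for the interleaved layout, 128-bit for the contiguous one); the application of the rotation lives elsewhere. For reference, a scalar sketch of the standard pairwise rotary-embedding rotation these tables feed (the pairwise convention is assumed, not taken from this diff):

#include <cstddef>

// Standard rotary embedding applied to one head vector of even length `dim`:
// elements are rotated in pairs (x0, x1) by the per-position angle in cos/sin.
void apply_rotary_pairwise(float* x, const float* cos_tab, const float* sin_tab, size_t dim) {
    for (size_t i = 0; i < dim / 2; ++i) {
        float c = cos_tab[i], s = sin_tab[i];
        float x0 = x[2 * i], x1 = x[2 * i + 1];
        x[2 * i]     = x0 * c - x1 * s;
        x[2 * i + 1] = x0 * s + x1 * c;
    }
}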
// Is_V_in_regs is an option to reduce smem usage, but will increase register pressure.
@ -223,16 +247,19 @@ struct Flash_bwd_kernel_traits : public Base {
SmemLayoutAtomKV{},
make_shape(Int<kBlockN>{}, Int<kHeadDim>{})));
using SmemLayoutAtomKtransposedNoSwizzle = Layout<Shape<Int<kBlockKSmem>, Int<kBlockN>>,
Stride<_1, Int<kBlockKSmem>>>;
using SmemLayoutAtomKtransposed = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
Layout<Shape<Int<kBlockKSmem>, Int<kBlockN>>,
Stride<_1, Int<kBlockKSmem>>>{}));
composition(Swizzle<kSwizzle, 3, 3>{}, SmemLayoutAtomKtransposedNoSwizzle{}));
using SmemLayoutKtransposed = decltype(tile_to_shape(
SmemLayoutAtomKtransposed{},
make_shape(Int<kHeadDim>{}, Int<kBlockN>{})));
// Maybe the KtransposeNoSwizzle just needs to have the right shape
// And the strides don't matter?
using SmemLayoutKtransposedNoSwizzle = decltype(SmemLayoutKtransposed{}.layout_fn());
using SmemLayoutKtransposedNoSwizzle = decltype(tile_to_shape(
SmemLayoutAtomKtransposedNoSwizzle{},
make_shape(Int<kHeadDim>{}, Int<kBlockN>{})));
// using SmemLayoutKtransposedNoSwizzle = decltype(SmemLayoutKtransposed{}.layout_fn());
// TODO: generalize to other values of kBlockN
// TODO: what should be the Swizzle here? 3 is faster than 1, and 1 is faster than 2
@ -250,24 +277,30 @@ struct Flash_bwd_kernel_traits : public Base {
using SmemLayoutPdS = decltype(tile_to_shape(
SmemLayoutAtomPdS{},
make_shape(Int<kBlockM>{}, Int<kBlockN>{})));
using SmemLayoutAtomPdStransposedNoSwizzle = Layout<Shape<Int<kPBlockN>, Int<kBlockM>>,
Stride<_1, Int<kPBlockN>>>;
using SmemLayoutAtomPdStransposed = decltype(
composition(Swizzle<kSwizzlePdS, 3, 3>{},
Layout<Shape<Int<kPBlockN>, Int<kBlockM>>,
Stride<_1, Int<kPBlockN>>>{}));
composition(Swizzle<kSwizzlePdS, 3, 3>{}, SmemLayoutAtomPdStransposedNoSwizzle{}));
using SmemLayoutPdStransposed = decltype(tile_to_shape(
SmemLayoutAtomPdStransposed{},
make_shape(Int<kBlockN>{}, Int<kBlockM>{})));
using SmemLayoutPdStransposedNoSwizzle = decltype(SmemLayoutPdStransposed{}.layout_fn());
using SmemLayoutPdStransposedNoSwizzle = decltype(tile_to_shape(
SmemLayoutAtomPdStransposedNoSwizzle{},
make_shape(Int<kBlockN>{}, Int<kBlockM>{})));
// using SmemLayoutPdStransposedNoSwizzle = decltype(SmemLayoutPdStransposed{}.layout_fn());
using SmemCopyAtomPdS = Copy_Atom<DefaultCopy, elem_type>;
using SmemLayoutAtomQdOtransposedNoSwizzle = Layout<Shape<Int<kBlockKSmem>, Int<kBlockM>>,
Stride<_1, Int<kBlockKSmem>>>;
using SmemLayoutAtomQdOtransposed = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
Layout<Shape<Int<kBlockKSmem>, Int<kBlockM>>,
Stride<_1, Int<kBlockKSmem>>>{}));
composition(Swizzle<kSwizzle, 3, 3>{}, SmemLayoutAtomQdOtransposedNoSwizzle{}));
using SmemLayoutQdOtransposed = decltype(tile_to_shape(
SmemLayoutAtomQdOtransposed{},
make_shape(Int<kHeadDim>{}, Int<kBlockM>{})));
using SmemLayoutQdOtransposedNoSwizzle = decltype(SmemLayoutQdOtransposed{}.layout_fn());
using SmemLayoutQdOtransposedNoSwizzle = decltype(tile_to_shape(
SmemLayoutAtomQdOtransposedNoSwizzle{},
make_shape(Int<kHeadDim>{}, Int<kBlockM>{})));
// using SmemLayoutQdOtransposedNoSwizzle = decltype(SmemLayoutQdOtransposed{}.layout_fn());
using SmemLayoutAtomdKV = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
@ -292,13 +325,11 @@ struct Flash_bwd_kernel_traits : public Base {
static constexpr int kSmemdSCount = size(SmemLayoutPdS{});
static constexpr int kSmemPCount = size(SmemLayoutPdS{});
static constexpr int kSmemdQCount = size(SmemLayoutdQ{});
static constexpr int kSmemdPsumCount = kBlockM;
static constexpr int kSmemQdOSize = kSmemQdOCount * sizeof(Element);
static constexpr int kSmemKVSize = kSmemKVCount * sizeof(Element);
static constexpr int kSmemdSSize = kSmemdSCount * sizeof(Element);
static constexpr int kSmemPSize = kSmemPCount * sizeof(Element);
static constexpr int kSmemdQSize = kSmemdQCount * sizeof(Element);
static constexpr int kSmemdPsumSize = kSmemdPsumCount * sizeof(ElementAccum);
static constexpr int kSmemSize = kSmemQdOSize
+ (!Is_V_in_regs
? kSmemKVSize + kSmemdSSize + std::max(kSmemPSize, kSmemdQSize)

View File

@ -0,0 +1,159 @@
/******************************************************************************
* Copyright (c) 2023, Tri Dao.
******************************************************************************/
#pragma once
#include "cute/algorithm/copy.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/layout/layout.h"
#include <cutlass/numeric_types.h>
using namespace cute;
template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, typename elem_type=cutlass::half_t>
struct Flash_kernel_traits_sm90 {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
using Element = elem_type;
static constexpr bool Has_cp_async = true;
#else
using Element = cutlass::half_t;
static constexpr bool Has_cp_async = false;
#endif
using ElementAccum = float;
using index_t = uint32_t;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
using MMA_Atom_Arch = std::conditional_t<
std::is_same_v<elem_type, cutlass::half_t>,
MMA_Atom<SM80_16x8x16_F32F16F16F32_TN>,
MMA_Atom<SM80_16x8x16_F32BF16BF16F32_TN>
>;
using ValLayoutMNK = Layout<Shape<_1, _2, _1>>;
#else
using MMA_Atom_Arch = MMA_Atom<SM75_16x8x8_F32F16F16F32_TN>;
using ValLayoutMNK = Layout<Shape<_1, _2, _2>>;
#endif
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, elem_type>;
using SmemCopyAtomTransposed = Copy_Atom<SM75_U16x8_LDSM_T, elem_type>;
#else
using SmemCopyAtom = Copy_Atom<DefaultCopy, elem_type>;
using SmemCopyAtomTransposed = Copy_Atom<DefaultCopy, elem_type>;
#endif
};
template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, bool Is_Q_in_regs_=false, bool Share_Q_K_smem_=false, typename elem_type=cutlass::half_t,
typename Base=Flash_kernel_traits_sm90<kHeadDim_, kBlockM_, kBlockN_, kNWarps_, elem_type> >
struct Flash_fwd_kernel_traits : public Base {
using Element = typename Base::Element;
using ElementAccum = typename Base::ElementAccum;
using index_t = typename Base::index_t;
static constexpr bool Has_cp_async = Base::Has_cp_async;
using SmemCopyAtom = typename Base::SmemCopyAtom;
using SmemCopyAtomTransposed = typename Base::SmemCopyAtomTransposed;
static constexpr bool Share_Q_K_smem = Share_Q_K_smem_;
static constexpr bool Is_Q_in_regs = Is_Q_in_regs_ || Share_Q_K_smem;
// The number of threads.
static constexpr int kNWarps = kNWarps_;
static constexpr int kNThreads = kNWarps * 32;
static constexpr int kBlockM = kBlockM_;
static constexpr int kBlockN = kBlockN_;
static constexpr int kHeadDim = kHeadDim_;
static_assert(kHeadDim % 32 == 0);
static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : 32;
static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 64 : 32);
static constexpr int kSwizzle = kBlockKSmem == 32 ? 2 : 3;
using TiledMma = TiledMMA<
typename Base::MMA_Atom_Arch,
Layout<Shape<Int<kNWarps>,_1,_1>>, // 4x1x1 or 8x1x1 thread group
typename Base::ValLayoutMNK>; // 1x2x1 or 1x2x2 value group for 16x16x16 MMA and LDSM
using SmemLayoutAtomQ = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
// This has to be kBlockKSmem, using kHeadDim gives wrong results for d=128
Layout<Shape<_8, Int<kBlockKSmem>>,
Stride<Int<kBlockKSmem>, _1>>{}));
using SmemLayoutQ = decltype(tile_to_shape(
SmemLayoutAtomQ{},
Shape<Int<kBlockM>, Int<kHeadDim>>{}));
using SmemLayoutKV = decltype(tile_to_shape(
SmemLayoutAtomQ{},
Shape<Int<kBlockN>, Int<kHeadDim>>{}));
using SmemLayoutAtomVtransposed = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
// This has to be kBlockN and not 8, otherwise we get wrong results for d=128
Layout<Shape<Int<kBlockKSmem>, Int<kBlockN>>,
Stride<_1, Int<kBlockKSmem>>>{}));
using SmemLayoutVtransposed = decltype(tile_to_shape(
SmemLayoutAtomVtransposed{},
Shape<Int<kHeadDim>, Int<kBlockN>>{}));
// Maybe the VtransposeNoSwizzle just needs to have the right shape
// And the strides don't matter?
using SmemLayoutVtransposedNoSwizzle = decltype(SmemLayoutVtransposed{}.layout_fn());
using SmemLayoutAtomO = decltype(
composition(Swizzle<kSwizzle, 3, 3>{},
Layout<Shape<Int<8>, Int<kBlockKSmem>>,
Stride<Int<kBlockKSmem>, _1>>{}));
using SmemLayoutO = decltype(tile_to_shape(
SmemLayoutAtomO{},
Shape<Int<kBlockM>, Int<kHeadDim>>{}));
using SmemCopyAtomO = Copy_Atom<DefaultCopy, elem_type>;
static constexpr int kSmemQCount = size(SmemLayoutQ{});
static constexpr int kSmemKVCount = size(SmemLayoutKV{}) * 2;
static constexpr int kSmemQSize = kSmemQCount * sizeof(Element);
static constexpr int kSmemKVSize = kSmemKVCount * sizeof(Element);
static constexpr int kSmemSize = Share_Q_K_smem ? std::max(kSmemQSize, kSmemKVSize) : kSmemQSize + kSmemKVSize;
static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element);
static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad");
// Using kBlockKSmem here is 6-10% faster than kBlockKGmem for d=128 because of bank conflicts.
// For example, for d=128, smem is split into 2 "pages", each page takes care of columns
// 0-63 and 64-127. If we have 16 threads per row for gmem read, when we write to smem,
// thread 0 - 7 will write to the first page and thread 8 - 15 will write to the second page,
// to the same banks.
static constexpr int kGmemThreadsPerRow = kBlockKSmem / kGmemElemsPerLoad;
static_assert(kNThreads % kGmemThreadsPerRow == 0, "kNThreads must be a multiple of kGmemThreadsPerRow");
using GmemLayoutAtom = Layout<Shape <Int<kNThreads / kGmemThreadsPerRow>, Int<kGmemThreadsPerRow>>,
Stride<Int<kGmemThreadsPerRow>, _1>>;
// We use CACHEGLOBAL instead of CACHEALWAYS for both Q and K/V, since we won't be reading
// from the same address by the same threadblock. This is slightly faster.
using Gmem_copy_struct = std::conditional_t<
Has_cp_async,
SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>,
DefaultCopy
>;
using GmemTiledCopyQKV = decltype(
make_tiled_copy(Copy_Atom<Gmem_copy_struct, elem_type>{},
GmemLayoutAtom{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per read
using GmemTiledCopyO = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, elem_type>{},
GmemLayoutAtom{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per store
static constexpr int kGmemThreadsPerRowP = kBlockN / kGmemElemsPerLoad;
static_assert(kNThreads % kGmemThreadsPerRowP == 0, "kNThreads must be a multiple of kGmemThreadsPerRowP");
using GmemLayoutAtomP = Layout<Shape <Int<kNThreads / kGmemThreadsPerRowP>, Int<kGmemThreadsPerRowP>>,
Stride<Int<kGmemThreadsPerRowP>, _1>>;
using GmemTiledCopyP = decltype(
make_tiled_copy(Copy_Atom<DefaultCopy, elem_type>{},
GmemLayoutAtomP{},
Layout<Shape<_1, _8>>{})); // Val layout, 8 vals per store
};
////////////////////////////////////////////////////////////////////////////////////////////////////
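The kGmemThreadsPerRow comment above is easiest to see with the bank arithmetic written out: shared memory has 32 four-byte banks, and with d = 128 in fp16 the smem tile is stored as two 64-column "pages". With 16 threads per row doing 16-byte stores, threads 0-7 and 8-15 start on the same banks (just in different pages), whereas 8 threads per row spread each row's stores over all 32 banks. A small host-side sketch of that computation (a simplification that ignores the swizzle):

#include <cstdio>

// Which 4-byte bank does each thread's 16-byte store start in, for one smem page
// of `page_cols` fp16 columns written by `threads_per_row` threads per gmem row?
// Each 16-byte store spans 4 consecutive banks starting from the reported one.
void report_banks(int threads_per_row, int page_cols) {
    const int elems_per_store = 8;                          // 16 bytes of fp16
    for (int t = 0; t < threads_per_row; ++t) {
        int col  = (t * elems_per_store) % page_cols;       // column inside the page
        int page = (t * elems_per_store) / page_cols;       // which 64-column page
        int bank = (col * 2 / 4) % 32;                      // fp16 -> bytes -> bank
        std::printf("thread %2d -> page %d, starting bank %2d\n", t, page, bank);
    }
}

int main() {
    report_banks(/*threads_per_row=*/16, /*page_cols=*/64);  // threads 0-7 and 8-15 collide
    report_banks(/*threads_per_row=*/ 8, /*page_cols=*/64);  // banks 0..31 covered once
}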

View File

@ -8,8 +8,7 @@
#include <cute/tensor.hpp>
#include <cutlass/cutlass.h>
#include <cutlass/array.h>
#include <cutlass/numeric_types.h>
#include "philox.cuh"
#include "utils.h"
@ -117,15 +116,18 @@ inline __device__ void max_scale_exp2_sum(Tensor<Engine0, Layout0> &tensor, Tens
}
template <typename Engine, typename Layout>
inline __device__ void apply_mask(Tensor<Engine, Layout> &tensor, const uint32_t max_seqlen_k) {
inline __device__ void apply_mask(Tensor<Engine, Layout> &tensor, const int max_seqlen_k,
const int col_idx_offset_ = 0) {
// tensor has shape (ncol=(2, MMA_M), nrow=(2, MMA_N))
static_assert(Layout::rank == 2, "Only support 2D Tensor");
const uint32_t lane_id = threadIdx.x % 32;
const int lane_id = threadIdx.x % 32;
const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
#pragma unroll
for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
const int col_idx_base = col_idx_offset + nj * 8;
#pragma unroll
for (int j = 0; j < size<1, 0>(tensor); ++j) {
const uint32_t col_idx = nj * 8 + j + (lane_id % 4) * 2;
const int col_idx = col_idx_base + j;
if (col_idx >= max_seqlen_k) {
// Without the "make_coord" we get wrong results
#pragma unroll
@ -137,30 +139,30 @@ inline __device__ void apply_mask(Tensor<Engine, Layout> &tensor, const uint32_t
}
}
template <typename Engine, typename Layout>
inline __device__ void apply_mask_causal(Tensor<Engine, Layout> &tensor, const uint32_t col_idx_offset_,
const uint32_t max_seqlen_k, const uint32_t row_idx_offset_,
const uint32_t warp_row_stride) {
template <bool HasWSLeft=true, typename Engine, typename Layout>
inline __device__ void apply_mask_local(Tensor<Engine, Layout> &tensor, const int col_idx_offset_,
const int max_seqlen_k, const int row_idx_offset,
const int max_seqlen_q, const int warp_row_stride,
const int window_size_left, const int window_size_right) {
// tensor has shape (ncol=(2, MMA_M), nrow=(2, MMA_N))
static_assert(Layout::rank == 2, "Only support 2D Tensor");
const uint32_t lane_id = threadIdx.x % 32;
// const uint32_t row_idx_offset = row_idx_offset_ + lane_id / 4;
const uint32_t row_idx_offset = row_idx_offset_;
const uint32_t col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
const int lane_id = threadIdx.x % 32;
const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
#pragma unroll
for (int mi = 0; mi < size<0, 1>(tensor); ++mi) {
const uint32_t row_idx_base = row_idx_offset + mi * warp_row_stride;
const int row_idx_base = row_idx_offset + mi * warp_row_stride;
#pragma unroll
for (int i = 0; i < size<0, 0>(tensor); ++i) {
const uint32_t row_idx = row_idx_base + i * 8;
const uint32_t col_idx_limit = std::min(max_seqlen_k, row_idx + 1);
const int row_idx = row_idx_base + i * 8;
const int col_idx_limit_left = std::max(0, row_idx + max_seqlen_k - max_seqlen_q - window_size_left);
const int col_idx_limit_right = std::min(max_seqlen_k, row_idx + 1 + max_seqlen_k - max_seqlen_q + window_size_right);
#pragma unroll
for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
const uint32_t col_idx_base = col_idx_offset + nj * 8;
const int col_idx_base = col_idx_offset + nj * 8;
#pragma unroll
for (int j = 0; j < size<1, 0>(tensor); ++j) {
const uint32_t col_idx = col_idx_base + j;
if (col_idx >= col_idx_limit) {
const int col_idx = col_idx_base + j;
if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) {
tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
}
}
@ -174,10 +176,19 @@ inline __device__ void apply_mask_causal(Tensor<Engine, Layout> &tensor, const u
}
}
template <typename Engine, typename Layout>
inline __device__ void apply_mask_causal(Tensor<Engine, Layout> &tensor, const int col_idx_offset_,
const int max_seqlen_k, const int row_idx_offset,
const int max_seqlen_q, const int warp_row_stride) {
// Causal masking is equivalent to local masking with window_size_left = infinity and window_size_right = 0
apply_mask_local</*HasWSLeft=*/false>(tensor, col_idx_offset_, max_seqlen_k, row_idx_offset,
max_seqlen_q, warp_row_stride, -1, 0);
}
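apply_mask_local generalizes the old causal mask: a column survives only if it lies between col_idx_limit_left and col_idx_limit_right as computed above, and passing window (-1, 0) with the left bound disabled recovers plain causal masking, as the wrapper notes. A small worked example of those limits (example sizes only):

#include <algorithm>
#include <cstdio>

// Column limits for one query row under local (sliding-window) attention,
// mirroring col_idx_limit_left / col_idx_limit_right in apply_mask_local.
void mask_limits(int row_idx, int max_seqlen_q, int max_seqlen_k,
                 int window_left, int window_right, bool has_left_window) {
    int left  = !has_left_window
        ? 0
        : std::max(0, row_idx + max_seqlen_k - max_seqlen_q - window_left);
    int right = std::min(max_seqlen_k, row_idx + 1 + max_seqlen_k - max_seqlen_q + window_right);
    std::printf("row %d keeps columns [%d, %d)\n", row_idx, left, right);
}

int main() {
    // seqlen_q == seqlen_k == 8, window (2, 0): row 5 keeps columns 3, 4, 5.
    mask_limits(5, 8, 8, 2, 0, /*has_left_window=*/true);
    // Causal is the window (-1, 0) special case with the left bound disabled:
    mask_limits(5, 8, 8, -1, 0, /*has_left_window=*/false);   // keeps columns 0..5
}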
template <typename Engine0, typename Layout0, typename Engine1, typename Layout1>
inline __device__ void apply_mask_causal_w_idx(
Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, Layout1> const &idx_rowcol,
const uint32_t col_idx_offset_, const uint32_t max_seqlen_k, const uint32_t row_idx_offset_)
const int col_idx_offset_, const int max_seqlen_k, const int row_idx_offset)
{
// tensor has shape (ncol=(2, MMA_M), nrow=(2, MMA_N))
static_assert(Layout0::rank == 2, "Only support 2D Tensor");
@ -186,7 +197,7 @@ inline __device__ void apply_mask_causal_w_idx(
CUTE_STATIC_ASSERT_V(size<1>(tensor) == size<1>(idx_rowcol));
#pragma unroll
for (int mi = 0; mi < size<0>(tensor); ++mi) {
const uint32_t col_idx_limit = std::min(max_seqlen_k, 1 + row_idx_offset_ + get<0>(idx_rowcol(mi, 0)));
const int col_idx_limit = std::min(max_seqlen_k, 1 + row_idx_offset + get<0>(idx_rowcol(mi, 0)));
#pragma unroll
for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {
if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {
@ -204,8 +215,8 @@ inline __device__ void apply_mask_causal_w_idx(
template <bool encode_dropout_in_sign_bit=false, typename Engine, typename Layout>
inline __device__ void apply_dropout(Tensor<Engine, Layout> &tensor, uint8_t p_dropout_in_uint8_t,
unsigned long long seed, unsigned long long offset,
uint32_t block_row_start, uint32_t block_col_start,
uint32_t block_row_stride) {
int block_row_start, int block_col_start,
int block_row_stride) {
// tensor has shape (8, MMA_M, MMA_N / 2)
using T = typename Engine::value_type;
auto encode_dropout = [](bool keep, T val) {

View File

@ -87,46 +87,6 @@ inline __device__ uint32_t convert_relu2<cutlass::bfloat16_t>(const float2 x) {
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename T>
inline __device__ float2 half2_unpack(uint32_t a);
template <>
inline __device__ float2 half2_unpack<__half>(uint32_t a) {
return __half22float2(reinterpret_cast<__half2 (&)>(a));
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
template <>
inline __device__ float2 half2_unpack<__nv_bfloat16>(uint32_t a) {
return __bfloat1622float2(reinterpret_cast<__nv_bfloat162 (&)>(a));
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert two half2's or bf162's into float, then take their dot product.
template <typename T>
inline __device__ float hfma2_to_float(const uint32_t a, const uint32_t b) {
float2 af = flash::half2_unpack<T>(a);
float2 bf = flash::half2_unpack<T>(b);
return af.x * bf.x + af.y * bf.y;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Converted two vectors of 8 half's or bf16's into float, then take their dot product.
template<typename T>
inline __device__ float hmulsum8(const uint4 a, const uint4 b) {
float sum;
sum = flash::hfma2_to_float<T>(a.x, b.x);
sum += flash::hfma2_to_float<T>(a.y, b.y);
sum += flash::hfma2_to_float<T>(a.z, b.z);
sum += flash::hfma2_to_float<T>(a.w, b.w);
return sum;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename T>
struct MaxOp {
__device__ inline T operator()(T const & x, T const & y) { return x > y ? x : y; }
@ -173,10 +133,12 @@ static __device__ inline T run(T x, Operator &op) {
template<bool A_in_regs=false, bool B_in_regs=false, typename Tensor0, typename Tensor1,
typename Tensor2, typename Tensor3, typename Tensor4,
typename TiledMma, typename TiledCopy0, typename TiledCopy1>
typename TiledMma, typename TiledCopyA, typename TiledCopyB,
typename ThrCopyA, typename ThrCopyB>
inline __device__ void gemm(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3 const& tCsA,
Tensor4 const& tCsB, TiledMma tiled_mma,
TiledCopy0 smem_thr_copy_A, TiledCopy1 smem_thr_copy_B) {
TiledCopyA smem_tiled_copy_A, TiledCopyB smem_tiled_copy_B,
ThrCopyA smem_thr_copy_A, ThrCopyB smem_thr_copy_B) {
CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc)); // MMA_M
CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc)); // MMA_N
CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB)); // MMA_K
@ -184,13 +146,13 @@ inline __device__ void gemm(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3
CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view)); // M
Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // N
if (!A_in_regs) { copy(smem_thr_copy_A, tCsA(_, _, _0{}), tCrA_copy_view(_, _, _0{})); }
if (!B_in_regs) { copy(smem_thr_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{})); }
if (!A_in_regs) { cute::copy(smem_tiled_copy_A, tCsA(_, _, _0{}), tCrA_copy_view(_, _, _0{})); }
if (!B_in_regs) { cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{})); }
#pragma unroll
for (int i = 0; i < size<2>(tCrA); ++i) {
if (i < size<2>(tCrA) - 1) {
if (!A_in_regs) { copy(smem_thr_copy_A, tCsA(_, _, i + 1), tCrA_copy_view(_, _, i + 1)); }
if (!B_in_regs) { copy(smem_thr_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1)); }
if (!A_in_regs) { cute::copy(smem_tiled_copy_A, tCsA(_, _, i + 1), tCrA_copy_view(_, _, i + 1)); }
if (!B_in_regs) { cute::copy(smem_tiled_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1)); }
}
cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
}
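This loop overlaps shared-memory-to-register copies with the MMAs: k-slice i+1 is copied into its register view while slice i is being multiplied, and slice 0 is loaded before the loop (or skipped entirely when the operand already lives in registers via A_in_regs/B_in_regs). A scalar sketch of that software-pipelining shape, with plain arrays standing in for the CUTE tensors and copy views:

// Software-pipelined K-slice loop: prefetch slice i+1 while computing on slice i.
// smem_a[k][M] / smem_b[k][N] stand in for tCsA / tCsB; acc must be zeroed by the caller,
// as clear(acc_s) does before the real gemm is called.
template <int K, int M, int N>
void pipelined_gemm(const float (&smem_a)[K][M], const float (&smem_b)[K][N],
                    float (&acc)[M][N]) {
    float reg_a[K][M], reg_b[K][N];
    // Load slice 0 up front (the copies issued before the #pragma unroll loop).
    for (int m = 0; m < M; ++m) reg_a[0][m] = smem_a[0][m];
    for (int n = 0; n < N; ++n) reg_b[0][n] = smem_b[0][n];
    for (int i = 0; i < K; ++i) {
        if (i + 1 < K) {   // prefetch slice i+1 while slice i is consumed below
            for (int m = 0; m < M; ++m) reg_a[i + 1][m] = smem_a[i + 1][m];
            for (int n = 0; n < N; ++n) reg_b[i + 1][n] = smem_b[i + 1][n];
        }
        for (int m = 0; m < M; ++m)          // stands in for cute::gemm on slice i
            for (int n = 0; n < N; ++n)
                acc[m][n] += reg_a[i][m] * reg_b[i][n];
    }
}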
@ -199,19 +161,20 @@ inline __device__ void gemm(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename Tensor0, typename Tensor1, typename Tensor2, typename Tensor3,
typename TiledMma, typename TiledCopy>
typename TiledMma, typename TiledCopy, typename ThrCopy>
inline __device__ void gemm_A_in_regs(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB, Tensor3 const& tCsB,
TiledMma tiled_mma, TiledCopy smem_thr_copy_B) {
TiledMma tiled_mma, TiledCopy smem_tiled_copy_B,
ThrCopy smem_thr_copy_B) {
CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc)); // MMA_M
CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc)); // MMA_N
CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB)); // MMA_K
Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // N
copy(smem_thr_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));
cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));
#pragma unroll
for (int i = 0; i < size<2>(tCrA); ++i) {
if (i < size<2>(tCrA) - 1) {
copy(smem_thr_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1));
cute::copy(smem_tiled_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1));
}
cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
}
@ -225,7 +188,10 @@ inline __device__ auto convert_layout_acc_rowcol(Layout acc_layout) {
static_assert(decltype(size<0>(acc_layout))::value == 4);
static_assert(decltype(rank(acc_layout))::value == 3);
auto l = logical_divide(acc_layout, Shape<_2>{}); // ((2, 2), MMA_M, MMA_N)
return make_layout(make_layout(get<0, 1>(l), get<1>(l)), make_layout(get<0, 0>(l), get<2>(l)));
// TD [2023-08-13]: Idk why but get<0, 1>(l) doesn't work for Cutlass 3.2, I'm getting
// "int_tuple.hpp(74): error: conversion to inaccessible base class"
// return make_layout(make_layout(get<0, 1>(l), get<1>(l)), make_layout(get<0, 0>(l), get<2>(l)));
return make_layout(make_layout(get<1>(get<0>(l)), get<1>(l)), make_layout(get<0>(get<0>(l)), get<2>(l)));
};
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -241,9 +207,13 @@ inline __device__ auto convert_layout_rowcol_Aregs(Layout rowcol_layout) {
static_assert(mma_shape_K == 8 || mma_shape_K == 16);
constexpr int MMA_N_divisor = mma_shape_K == 8 ? 1 : 2;
auto l = logical_divide(rowcol_layout, Shape<X, Shape<X, Int<MMA_N_divisor>>>{}); // ((2, MMA_M), (2, (2, MMA_N / 2)))
return make_layout(make_layout(get<1, 0>(l), get<0, 0>(l), get<1, 1, 0>(l)),
get<0, 1>(l),
get<1, 1, 1>(l));
// TD [2023-08-13]: Same error as above on Cutlass 3.2
// return make_layout(make_layout(get<1, 0>(l), get<0, 0>(l), get<1, 1, 0>(l)),
// get<0, 1>(l),
// get<1, 1, 1>(l));
return make_layout(make_layout(get<0>(get<1>(l)), get<0>(get<0>(l)), get<0>(get<1>(get<1>(l)))),
get<1>(get<0>(l)),
get<1>(get<1>(get<1>(l))));
};
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -319,9 +289,9 @@ void cp_async_wait() {
template <bool Is_even_MN=true, bool Is_even_K=true, bool Clear_OOB_MN=false, bool Clear_OOB_K=true,
typename TiledCopy, typename Engine0, typename Layout0, typename Engine1, typename Layout1,
typename Engine2, typename Layout2, typename Engine3, typename Layout3>
inline __device__ void copy(TiledCopy thr_copy, Tensor<Engine0, Layout0> const &S,
inline __device__ void copy(TiledCopy tiled_copy, Tensor<Engine0, Layout0> const &S,
Tensor<Engine1, Layout1> &D, Tensor<Engine2, Layout2> const &identity_MN,
Tensor<Engine3, Layout3> const &predicate_K, int max_MN=0) {
Tensor<Engine3, Layout3> const &predicate_K, const int max_MN=0) {
CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D)); // MMA
@ -335,13 +305,13 @@ inline __device__ void copy(TiledCopy thr_copy, Tensor<Engine0, Layout0> const &
#pragma unroll
for (int k = 0; k < size<2>(S); ++k) {
if (Is_even_K || predicate_K(k)) {
copy(thr_copy, S(_, m, k), D(_, m, k));
cute::copy(tiled_copy, S(_, m, k), D(_, m, k));
} else if (Clear_OOB_K) {
clear(D(_, m, k));
cute::clear(D(_, m, k));
}
}
} else if (Clear_OOB_MN) {
clear(D(_, m, _));
cute::clear(D(_, m, _));
}
}
// TD [2023-04-13]: Strange that the code below can cause race condition.
@ -350,7 +320,7 @@ inline __device__ void copy(TiledCopy thr_copy, Tensor<Engine0, Layout0> const &
// #pragma unroll
// for (int m = 0; m < size<1>(S); ++m) {
// if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
// copy(thr_copy, S(_, m, _), D(_, m, _));
// copy(tiled_copy, S(_, m, _), D(_, m, _));
// } else if (Clear_OOB_MN) {
// clear(D(_, m, _));
// }
@ -362,7 +332,7 @@ inline __device__ void copy(TiledCopy thr_copy, Tensor<Engine0, Layout0> const &
// #pragma unroll
// for (int m = 0; m < size<1>(S); ++m) {
// if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) {
// copy(thr_copy, S(_, m, k), D(_, m, k));
// copy(tiled_copy, S(_, m, k), D(_, m, k));
// } else if (Clear_OOB_MN) {
// clear(D(_, m, k));
// }

View File

@ -7,6 +7,8 @@ extern "C" {
v_ptr: *const c_void,
o_ptr: *const c_void,
softmax_lse_ptr: *const c_void,
alibi_slopes_ptr: *const c_void,
cu_seqlens_q_ptr: *const i32,
cu_seqlens_k_ptr: *const i32,
@ -14,6 +16,7 @@ extern "C" {
k_batch_stride: u32,
v_batch_stride: u32,
o_batch_stride: u32,
alibi_slopes_batch_stride: u32,
q_row_stride: u32,
k_row_stride: u32,
@ -37,8 +40,11 @@ extern "C" {
seqlen_q_rounded: u32,
seqlen_k_rounded: u32,
is_causal: c_int,
is_bf16: c_int,
is_causal: c_int,
window_size_left: c_int,
window_size_right: c_int,
);
}

View File

@ -3,12 +3,14 @@ mod ffi;
use candle::backend::BackendStorage;
use candle::cuda_backend::cudarc::driver::DevicePtr;
use candle::cuda_backend::WrapErr;
use candle::{CpuStorage, Layout, Result, Shape, Tensor};
use candle::{CpuStorage, DType, Layout, Result, Shape, Tensor};
use half::{bf16, f16};
pub struct FlashAttn {
pub softmax_scale: f32,
pub causal: bool,
pub alibi_slopes: Option<Tensor>,
pub window_size_left: Option<usize>,
pub window_size_right: Option<usize>,
}
fn round_multiple(x: usize, m: usize) -> usize {
@ -85,6 +87,51 @@ impl FlashAttn {
candle::bail!("number of k/v heads {num_heads_k} must divide number of heads in query {num_heads}")
}
let alibi_slopes_ptr = if let Some(alibi_slopes) = &self.alibi_slopes {
if alibi_slopes.dtype() != DType::F32 {
candle::bail!(
"DType mismatch alibi_slopes {:?}, expected {:?}",
alibi_slopes.dtype(),
DType::F32
);
}
let (alibi_slopes, alibi_slopes_layout) = alibi_slopes.storage_and_layout();
if num_heads != alibi_slopes_layout.shape().dims1()? {
candle::bail!(
"shape mismatch alibi_slopes {:?}, expected {:?}",
alibi_slopes_layout.shape(),
(num_heads)
);
}
let alibi_slopes = match &*alibi_slopes {
candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
_ => candle::bail!("alibi_slopes must be a cuda tensor"),
};
let alibi_slopes = alibi_slopes.slice(alibi_slopes_layout.start_offset()..);
*alibi_slopes.device_ptr() as *const core::ffi::c_void
} else {
std::ptr::null()
};
// if window_size_left > seqlen_k or None => -1
let mut window_size_left = self
.window_size_left
.filter(|v| v <= &seqlen_k)
.map(|v| v as i32)
.unwrap_or(-1);
// if window_size_right > seqlen_k or None => -1
let mut window_size_right = self
.window_size_right
.filter(|v| v <= &seqlen_k)
.map(|v| v as i32)
.unwrap_or(-1);
let head_size = round_multiple(head_size_og, 8);
let head_size_rounded = round_multiple(head_size, 32);
let seqlen_q_rounded = round_multiple(seqlen_q, 128);
@ -94,9 +141,22 @@ impl FlashAttn {
let dst = unsafe { dev.alloc::<T>(elem_count) }.w()?;
let softmax_lse = dev.alloc_zeros::<f32>(b_sz * num_heads * seqlen_q).w()?;
let causal = if self.causal { 1 } else { 0 };
let is_bf16 = if is_bf16 { 1 } else { 0 };
// Causal is the special case where window_size_right == 0 and window_size_left < 0.
// Local is the more general case where window_size_right >= 0 or window_size_left >= 0.
let is_causal = if window_size_left < 0 && window_size_right == 0 {
1
} else {
0
};
if window_size_left < 0 && window_size_right >= 0 {
window_size_left = seqlen_k as i32;
}
if window_size_left >= 0 && window_size_right < 0 {
window_size_right = seqlen_k as i32;
}
unsafe {
let q_ptr = *q.device_ptr() as *const core::ffi::c_void;
let k_ptr = *k.device_ptr() as *const core::ffi::c_void;
@ -109,12 +169,14 @@ impl FlashAttn {
v_ptr,
dst_ptr,
softmax_lse_ptr,
/* alibi_slopes_ptr */ alibi_slopes_ptr,
/* cu_seqlens_q_ptr */ std::ptr::null(),
/* cu_seqlens_k_ptr */ std::ptr::null(),
/* q_batch_stride */ q_stride[0] as u32,
/* k_batch_stride */ k_stride[0] as u32,
/* v_batch_stride */ v_stride[0] as u32,
/* o_batch_stride */ o_stride[0] as u32,
/* alibi_slopes_batch_stride */ 0,
/* q_row_stride */ q_stride[q_rank - 3] as u32,
/* k_row_stride */ k_stride[k_rank - 3] as u32,
/* v_row_stride */ v_stride[v_rank - 3] as u32,
@ -133,8 +195,10 @@ impl FlashAttn {
/* seqlen_k */ seqlen_k as u32,
/* seqlen_q_rounded */ seqlen_q_rounded as u32,
/* seqlen_k_rounded */ seqlen_k_rounded as u32,
/* is_causal */ causal,
/* is_bf16 */ is_bf16,
/* is_causal */ is_causal,
/* window_size_left */ window_size_left,
/* window_size_right */ window_size_right,
)
}
@ -197,20 +261,137 @@ pub fn flash_attn(
softmax_scale: f32,
causal: bool,
) -> Result<Tensor> {
let window_size_left = None;
let window_size_right = if causal { Some(0) } else { None };
let op = FlashAttn {
softmax_scale,
causal,
alibi_slopes: None,
window_size_left,
window_size_right,
};
q.apply_op3(k, v, op)
}
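For reference, a minimal sketch (not part of this diff) of what the mapping above means for callers, assuming the core crate is imported as `candle` and this crate as `candle_flash_attn`: a causal `flash_attn` call is equivalent to `flash_attn_windowed` with no left limit and a right window of 0.
use candle::{Result, Tensor};
// Sketch only: `q`, `k`, `v` are assumed to be contiguous f16/bf16 CUDA tensors
// of shape (batch, seq_len, num_heads, head_size) with head_size = 64.
fn causal_two_ways(q: &Tensor, k: &Tensor, v: &Tensor) -> Result<(Tensor, Tensor)> {
    let scale = 1.0 / 64f32.sqrt();
    let a = candle_flash_attn::flash_attn(q, k, v, scale, true)?;
    // Same causal masking expressed through the windowed entry point.
    let b = candle_flash_attn::flash_attn_windowed(q, k, v, scale, None, Some(0))?;
    Ok((a, b))
}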
/// Flash-attention v2 layer.
///
/// This implements scaled dot-product attention, `softmax(Q @ K^T . softmax_scale) @ V`.
/// Multi-query and grouped-query attention are supported by using tensors k and v with fewer heads
/// than q; the number of heads in q must be divisible by the number of heads in k and v.
///
/// # Arguments
///
/// * `q` - Query tensor with shape `(batch, seq_len_q, num_heads_q, head_size)`.
/// * `k` - Key tensor with shape `(batch, seq_len_kv, num_heads_kv, head_size)`.
/// * `v` - Value tensor with shape `(batch, seq_len_kv, num_heads_kv, head_size)`.
/// * `window_size_left` - Limit attention to at most this many key/value tokens to the left of each query position (`None` for no limit).
/// * `window_size_right` - Limit attention to at most this many key/value tokens to the right of each query position (`None` for no limit).
///
/// # Causal mask
///
/// `window_size_left=None` with `window_size_right=Some(0)` applies a causal mask to the result
/// of `Q @ K^T`
///
/// The resulting tensor has dimensions `(batch, seq_len_q, num_heads_q, head_size)`.
pub fn flash_attn_windowed(
q: &Tensor,
k: &Tensor,
v: &Tensor,
softmax_scale: f32,
window_size_left: Option<usize>,
window_size_right: Option<usize>,
) -> Result<Tensor> {
let op = FlashAttn {
softmax_scale,
alibi_slopes: None,
window_size_left,
window_size_right,
};
q.apply_op3(k, v, op)
}
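As a hedged usage sketch (shapes, dtypes and the 4096-token window are illustrative assumptions), a causal sliding-window attention, as used by models with local attention, limits the left window while keeping the right window at 0:
use candle::{Result, Tensor};
// Sketch: causal attention restricted to the previous 4096 key/value tokens.
// `q`, `k`, `v` are assumed to be contiguous f16/bf16 CUDA tensors of shape
// (batch, seq_len, num_heads, head_size).
fn sliding_window_attn(q: &Tensor, k: &Tensor, v: &Tensor, head_size: usize) -> Result<Tensor> {
    let scale = 1.0 / (head_size as f32).sqrt();
    candle_flash_attn::flash_attn_windowed(q, k, v, scale, Some(4096), Some(0))
}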
/// Flash-attention v2 layer.
///
/// This implements scaled dot-product attention, `softmax(Q @ K^T . softmax_scale) @ V`.
/// Multi-query and grouped-query attention are supported by using tensors k and v with fewer heads
/// than q; the number of heads in q must be divisible by the number of heads in k and v.
///
/// # Arguments
///
/// * `q` - Query tensor with shape `(batch, seq_len_q, num_heads_q, head_size)`.
/// * `k` - Key tensor with shape `(batch, seq_len_kv, num_heads_kv, head_size)`.
/// * `v` - Value tensor with shape `(batch, seq_len_kv, num_heads_kv, head_size)`.
/// * `alibi_slopes` - Alibi slopes tensor with shape `(num_heads_q)`.
///
/// The resulting tensor has dimensions `(batch, seq_len_q, num_heads_q, head_size)`.
pub fn flash_attn_alibi(
q: &Tensor,
k: &Tensor,
v: &Tensor,
alibi_slopes: &Tensor,
softmax_scale: f32,
causal: bool,
) -> Result<Tensor> {
let window_size_left = None;
let window_size_right = if causal { Some(0) } else { None };
let op = FlashAttn {
softmax_scale,
alibi_slopes: Some(alibi_slopes.clone()),
window_size_left,
window_size_right,
};
q.apply_op3(k, v, op)
}
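A sketch of building the `alibi_slopes` argument with the common ALiBi slope schedule (2^(-8i/num_heads) for i = 1..num_heads, assuming a power-of-two head count); the kernel itself only requires an f32 tensor of shape `(num_heads)` on the same CUDA device, so the schedule below is just one reasonable choice:
use candle::{Device, Result, Tensor};
// Sketch: geometric ALiBi slopes, one per query head.
fn alibi_attn(
    q: &Tensor,
    k: &Tensor,
    v: &Tensor,
    num_heads: usize,
    head_size: usize,
    dev: &Device,
) -> Result<Tensor> {
    let slopes: Vec<f32> = (1..=num_heads)
        .map(|i| 2f32.powf(-8.0 * i as f32 / num_heads as f32))
        .collect();
    let alibi_slopes = Tensor::from_vec(slopes, num_heads, dev)?;
    let scale = 1.0 / (head_size as f32).sqrt();
    candle_flash_attn::flash_attn_alibi(q, k, v, &alibi_slopes, scale, true)
}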
/// Flash-attention v2 layer.
///
/// This implements scaled dot-product attention, `softmax(Q @ K^T . softmax_scale) @ V`.
/// Multi-query and grouped-query attention are supported by using tensors k and v with fewer heads
/// than q; the number of heads in q must be divisible by the number of heads in k and v.
///
/// # Arguments
///
/// * `q` - Query tensor with shape `(batch, seq_len_q, num_heads_q, head_size)`.
/// * `k` - Key tensor with shape `(batch, seq_len_kv, num_heads_kv, head_size)`.
/// * `v` - Value tensor with shape `(batch, seq_len_kv, num_heads_kv, head_size)`.
/// * `alibi_slopes` - Alibi slopes tensor with shape `(num_heads_q)`.
/// * `window_size_left` - Limit attention to at most this many key/value tokens to the left of each query position (`None` for no limit).
/// * `window_size_right` - Limit attention to at most this many key/value tokens to the right of each query position (`None` for no limit).
///
/// # Causal mask
///
/// `window_size_left=None` with `window_size_right=Some(0)` applies a causal mask to the result
/// of `Q @ K^T`
///
/// The resulting tensor has dimensions `(batch, seq_len_q, num_heads_q, head_size)`.
pub fn flash_attn_alibi_windowed(
q: &Tensor,
k: &Tensor,
v: &Tensor,
alibi_slopes: &Tensor,
softmax_scale: f32,
window_size_left: Option<usize>,
window_size_right: Option<usize>,
) -> Result<Tensor> {
let op = FlashAttn {
softmax_scale,
alibi_slopes: Some(alibi_slopes.clone()),
window_size_left,
window_size_right,
};
q.apply_op3(k, v, op)
}
struct FlashAttnVarLen {
softmax_scale: f32,
causal: bool,
max_seqlen_q: usize,
max_seqlen_k: usize,
seqlens_q: Tensor,
seqlens_k: Tensor,
pub softmax_scale: f32,
pub max_seqlen_q: usize,
pub max_seqlen_k: usize,
pub seqlens_q: Tensor,
pub seqlens_k: Tensor,
pub alibi_slopes: Option<Tensor>,
pub window_size_left: Option<usize>,
pub window_size_right: Option<usize>,
}
impl FlashAttnVarLen {
@ -311,7 +492,54 @@ impl FlashAttnVarLen {
if nseqlens_k != nseqlens_q {
candle::bail!("seqlens_q and seqlens_k should have the same number of elements {nseqlens_q} <> {nseqlens_k}")
}
let batch_size = nseqlens_q - 1;
let alibi_slopes_ptr = if let Some(alibi_slopes) = &self.alibi_slopes {
if alibi_slopes.dtype() != DType::F32 {
candle::bail!(
"DType mismatch alibi_slopes {:?}, expected {:?}",
alibi_slopes.dtype(),
DType::F32
);
}
let (alibi_slopes, alibi_slopes_layout) = alibi_slopes.storage_and_layout();
if num_heads != alibi_slopes_layout.shape().dims1()? {
candle::bail!(
"shape mismatch alibi_slopes {:?}, expected {:?}",
alibi_slopes_layout.shape(),
(num_heads)
);
}
let alibi_slopes = match &*alibi_slopes {
candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
_ => candle::bail!("alibi_slopes must be a cuda tensor"),
};
let alibi_slopes = alibi_slopes.slice(alibi_slopes_layout.start_offset()..);
*alibi_slopes.device_ptr() as *const core::ffi::c_void
} else {
std::ptr::null()
};
// if window_size_left > self.max_seqlen_k or None => -1
let mut window_size_left = self
.window_size_left
.filter(|v| v <= &self.max_seqlen_k)
.map(|v| v as i32)
.unwrap_or(-1);
// if window_size_right > self.max_seqlen_k or None => -1
let mut window_size_right = self
.window_size_right
.filter(|v| v <= &self.max_seqlen_k)
.map(|v| v as i32)
.unwrap_or(-1);
let head_size = round_multiple(head_size_og, 8);
let head_size_rounded = round_multiple(head_size, 32);
let seqlen_q_rounded = round_multiple(self.max_seqlen_q, 128);
@ -323,9 +551,22 @@ impl FlashAttnVarLen {
.alloc_zeros::<f32>(batch_size * num_heads * self.max_seqlen_q)
.w()?;
let causal = if self.causal { 1 } else { 0 };
let is_bf16 = if is_bf16 { 1 } else { 0 };
// Causal is the special case where window_size_right == 0 and window_size_left < 0.
// Local is the more general case where window_size_right >= 0 or window_size_left >= 0.
let is_causal = if window_size_left < 0 && window_size_right == 0 {
1
} else {
0
};
if window_size_left < 0 && window_size_right >= 0 {
window_size_left = self.max_seqlen_k as i32;
}
if window_size_left >= 0 && window_size_right < 0 {
window_size_right = self.max_seqlen_k as i32;
}
unsafe {
let q_ptr = *q.device_ptr() as *const core::ffi::c_void;
let k_ptr = *k.device_ptr() as *const core::ffi::c_void;
@ -340,12 +581,14 @@ impl FlashAttnVarLen {
v_ptr,
dst_ptr,
softmax_lse_ptr,
/* alibi_slopes_ptr */ alibi_slopes_ptr,
/* cu_seqlens_q_ptr */ seqlens_q_ptr,
/* cu_seqlens_k_ptr */ seqlens_k_ptr,
/* q_batch_stride */ 0,
/* k_batch_stride */ 0,
/* v_batch_stride */ 0,
/* o_batch_stride */ 0,
/* alibi_slopes_batch_stride */ 0,
/* q_row_stride */ q_stride[q_rank - 3] as u32,
/* k_row_stride */ k_stride[k_rank - 3] as u32,
/* v_row_stride */ v_stride[v_rank - 3] as u32,
@ -364,8 +607,10 @@ impl FlashAttnVarLen {
/* seqlen_k */ self.max_seqlen_k as u32,
/* seqlen_q_rounded */ seqlen_q_rounded as u32,
/* seqlen_k_rounded */ seqlen_k_rounded as u32,
/* is_causal */ causal,
/* is_bf16 */ is_bf16,
/* is_causal */ is_causal,
/* window_size_left */ window_size_left,
/* window_size_right */ window_size_right,
)
}
@ -440,13 +685,176 @@ pub fn flash_attn_varlen(
softmax_scale: f32,
causal: bool,
) -> Result<Tensor> {
let window_size_left = None;
let window_size_right = if causal { Some(0) } else { None };
let op = FlashAttnVarLen {
softmax_scale,
causal,
max_seqlen_q,
max_seqlen_k,
seqlens_q: seqlens_q.clone(),
seqlens_k: seqlens_k.clone(),
alibi_slopes: None,
window_size_left,
window_size_right,
};
q.apply_op3(k, v, op)
}
#[allow(clippy::too_many_arguments)]
/// Flash-attention v2 layer with variable-length batching.
///
/// This implements scaled dot-product attention, `softmax(Q @ K^T . softmax_scale) @ V`.
/// Multi-query and grouped-query attention are supported by using tensors k and v with fewer heads
/// than q; the number of heads in q must be divisible by the number of heads in k and v.
///
/// # Arguments
///
/// * `q` - Query tensor with shape `(total_q, num_heads_q, head_size)`.
/// * `k` - Key tensor with shape `(total_kv, num_heads_kv, head_size)`.
/// * `v` - Value tensor with shape `(total_kv, num_heads_kv, head_size)`.
/// * `seqlens_q` - The cumulative lengths of the sequences in the batch, used to index in q.
/// * `seqlens_k` - The cumulative lengths of the sequences in the batch, used to index in k and v.
/// * `max_seqlen_q` - The maximum query sequence length for q in the batch.
/// * `max_seqlen_k` - The maximum sequence length for k and v in the batch.
/// * `window_size_left` - Limit attention to at most this many key/value tokens to the left of each query position (`None` for no limit).
/// * `window_size_right` - Limit attention to at most this many key/value tokens to the right of each query position (`None` for no limit).
///
/// `seqlens_q` and `seqlens_k` contain `batch_size + 1` elements, typically `0`, `seqlen_1`,
/// `seqlen_1 + seqlen_2`, etc.
///
/// The resulting tensor has dimensions `(total_q, num_heads_q, head_size)`.
///
/// # Causal mask
///
/// `window_size_left=None` with `window_size_right=Some(0)` applies a causal mask to the result
/// of `Q @ K^T`
pub fn flash_attn_varlen_windowed(
q: &Tensor,
k: &Tensor,
v: &Tensor,
seqlens_q: &Tensor,
seqlens_k: &Tensor,
max_seqlen_q: usize,
max_seqlen_k: usize,
softmax_scale: f32,
window_size_left: Option<usize>,
window_size_right: Option<usize>,
) -> Result<Tensor> {
let op = FlashAttnVarLen {
softmax_scale,
max_seqlen_q,
max_seqlen_k,
seqlens_q: seqlens_q.clone(),
seqlens_k: seqlens_k.clone(),
alibi_slopes: None,
window_size_left,
window_size_right,
};
q.apply_op3(k, v, op)
}
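A sketch of how the cumulative `seqlens_q`/`seqlens_k` tensors can be built from per-sequence lengths (two packed sequences of lengths 3 and 5 here); the u32 dtype of the offsets tensor and the packed (total_tokens, num_heads, head_size) layout of q/k/v are assumptions for illustration:
use candle::{Device, Result, Tensor};
// Sketch: variable-length (packed) causal attention over two sequences.
fn varlen_attn(q: &Tensor, k: &Tensor, v: &Tensor, head_size: usize, dev: &Device) -> Result<Tensor> {
    let lens = [3u32, 5];
    // Cumulative offsets: batch_size + 1 entries, starting at 0 => [0, 3, 8].
    let mut cu = vec![0u32];
    for l in lens {
        let last = *cu.last().unwrap();
        cu.push(last + l);
    }
    let seqlens = Tensor::from_vec(cu, lens.len() + 1, dev)?;
    let max_seqlen = *lens.iter().max().unwrap() as usize;
    let scale = 1.0 / (head_size as f32).sqrt();
    candle_flash_attn::flash_attn_varlen_windowed(
        q, k, v, &seqlens, &seqlens, max_seqlen, max_seqlen, scale,
        /* window_size_left */ None,
        /* window_size_right */ Some(0), // causal
    )
}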
#[allow(clippy::too_many_arguments)]
/// Flash-attention v2 layer with variable-length batching.
///
/// This implements scaled dot-product attention, `softmax(Q @ K^T . softmax_scale) @ V`.
/// Multi-query and grouped-query attention are supported by using tensors k and v with fewer heads
/// than q; the number of heads in q must be divisible by the number of heads in k and v.
///
/// # Arguments
///
/// * `q` - Query tensor with shape `(total_q, num_heads_q, head_size)`.
/// * `k` - Key tensor with shape `(total_kv, num_heads_kv, head_size)`.
/// * `v` - Value tensor with shape `(total_kv, num_heads_kv, head_size)`.
/// * `alibi_slopes` - Alibi slopes tensor with shape `(num_heads_q)`.
/// * `seqlens_q` - The cumulative lengths of the sequences in the batch, used to index in q.
/// * `seqlens_k` - The cumulative lengths of the sequences in the batch, used to index in k and v.
/// * `max_seqlen_q` - The maximum query sequence length for q in the batch.
/// * `max_seqlen_k` - The maximum sequence length for k and v in the batch.
///
/// `seqlens_q` and `seqlens_k` contain `batch_size + 1` elements, typically `0`, `seqlen_1`,
/// `seqlen_1 + seqlen_2`, etc.
///
/// The resulting tensor has dimensions `(total_q, num_heads_q, head_size)`.
pub fn flash_attn_varlen_alibi(
q: &Tensor,
k: &Tensor,
v: &Tensor,
alibi_slopes: &Tensor,
seqlens_q: &Tensor,
seqlens_k: &Tensor,
max_seqlen_q: usize,
max_seqlen_k: usize,
softmax_scale: f32,
causal: bool,
) -> Result<Tensor> {
let window_size_left = None;
let window_size_right = if causal { Some(0) } else { None };
let op = FlashAttnVarLen {
softmax_scale,
max_seqlen_q,
max_seqlen_k,
seqlens_q: seqlens_q.clone(),
seqlens_k: seqlens_k.clone(),
alibi_slopes: Some(alibi_slopes.clone()),
window_size_left,
window_size_right,
};
q.apply_op3(k, v, op)
}
#[allow(clippy::too_many_arguments)]
/// Flash-attention v2 layer with variable-length batching.
///
/// This implements scaled dot-product attention, `softmax(Q @ K^T . softmax_scale) @ V`.
/// Multi-query and grouped-query attention are supported by using tensors k and v with fewer heads
/// than q; the number of heads in q must be divisible by the number of heads in k and v.
///
/// # Arguments
///
/// * `q` - Query tensor with shape `(total_q, num_heads_q, head_size)`.
/// * `k` - Key tensor with shape `(total_kv, num_heads_kv, head_size)`.
/// * `v` - Value tensor with shape `(total_kv, num_heads_kv, head_size)`.
/// * `alibi_slopes` - Alibi slopes tensor with shape `(num_heads_q)`.
/// * `seqlens_q` - The cumulative lengths of the sequences in the batch, used to index in q.
/// * `seqlens_k` - The cumulative lengths of the sequences in the batch, used to index in k and v.
/// * `max_seqlen_q` - The maximum query sequence length for q in the batch.
/// * `max_seqlen_k` - The maximum sequence length for k and v in the batch.
/// * `window_size_left` - Limit attention to at most this many key/value tokens to the left of each query position (`None` for no limit).
/// * `window_size_right` - Limit attention to at most this many key/value tokens to the right of each query position (`None` for no limit).
///
/// `seqlens_q` and `seqlens_k` contain `batch_size + 1` elements, typically `0`, `seqlen_1`,
/// `seqlen_1 + seqlen_2`, etc.
///
/// The resulting tensor has dimensions `(total_q, num_heads_q, head_size)`.
///
/// # Causal mask
///
/// `window_size_left=None` with `window_size_right=Some(0)` applies a causal mask to the result
/// of `Q @ K^T`
pub fn flash_attn_varlen_alibi_windowed(
q: &Tensor,
k: &Tensor,
v: &Tensor,
alibi_slopes: &Tensor,
seqlens_q: &Tensor,
seqlens_k: &Tensor,
max_seqlen_q: usize,
max_seqlen_k: usize,
softmax_scale: f32,
window_size_left: Option<usize>,
window_size_right: Option<usize>,
) -> Result<Tensor> {
let op = FlashAttnVarLen {
softmax_scale,
max_seqlen_q,
max_seqlen_k,
seqlens_q: seqlens_q.clone(),
seqlens_k: seqlens_k.clone(),
alibi_slopes: Some(alibi_slopes.clone()),
window_size_left,
window_size_right,
};
q.apply_op3(k, v, op)
}
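For clarity, a standalone sketch (names are illustrative; the logic mirrors the normalization shown above) of how the window arguments are turned into the values passed over FFI: windows larger than the key length, or `None`, collapse to -1, the pure (-1, 0) case is reported as causal, and a one-sided window widens the unspecified side to the full key length:
// Sketch of the window-size normalization performed before the kernel call.
fn normalize_windows(left: Option<usize>, right: Option<usize>, seqlen_k: usize) -> (i32, i32, i32) {
    // Windows larger than the key sequence (or None) become -1, i.e. unlimited.
    let mut left = left.filter(|v| *v <= seqlen_k).map(|v| v as i32).unwrap_or(-1);
    let mut right = right.filter(|v| *v <= seqlen_k).map(|v| v as i32).unwrap_or(-1);
    // Causal is the special case (-1, 0); anything with a bounded side is "local".
    let is_causal = i32::from(left < 0 && right == 0);
    if left < 0 && right >= 0 {
        left = seqlen_k as i32;
    }
    if left >= 0 && right < 0 {
        right = seqlen_k as i32;
    }
    (left, right, is_causal)
}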

View File

@ -1,6 +1,6 @@
[package]
name = "candle-kernels"
version = "0.3.3"
version = "0.4.0"
edition = "2021"
description = "CUDA kernels for Candle"
@ -12,6 +12,4 @@ license = "MIT OR Apache-2.0"
[dependencies]
[build-dependencies]
anyhow = { version = "1", features = ["backtrace"] }
glob = "0.3.1"
rayon = "1.7.0"
bindgen_cuda = "0.1.1"

View File

@ -1,243 +1,8 @@
use std::io::Write;
fn main() {
println!("cargo:rerun-if-changed=build.rs");
cuda::set_include_dir();
let (write, kernel_paths) = cuda::build_ptx();
if write {
let mut file = std::fs::File::create("src/lib.rs").unwrap();
for kernel_path in kernel_paths {
let name = kernel_path.file_stem().unwrap().to_str().unwrap();
file.write_all(
format!(
r#"pub const {}: &str = include_str!(concat!(env!("OUT_DIR"), "/{}.ptx"));"#,
name.to_uppercase().replace('.', "_"),
name
)
.as_bytes(),
)
.unwrap();
file.write_all(&[b'\n']).unwrap();
}
}
}
mod cuda {
use anyhow::{Context, Result};
pub fn set_include_dir() {
use std::path::PathBuf;
// NOTE: copied from cudarc build.rs.
// We can't actually set a env!() value from another crate,
// so we have to do that here.
// use PathBuf;
let env_vars = [
"CUDA_PATH",
"CUDA_ROOT",
"CUDA_TOOLKIT_ROOT_DIR",
"CUDNN_LIB",
];
#[allow(unused)]
let env_vars = env_vars
.into_iter()
.map(std::env::var)
.filter_map(Result::ok)
.map(Into::<PathBuf>::into);
let roots = [
"/usr",
"/usr/local/cuda",
"/opt/cuda",
"/usr/lib/cuda",
"C:/Program Files/NVIDIA GPU Computing Toolkit",
"C:/CUDA",
];
#[allow(unused)]
let roots = roots.into_iter().map(Into::<PathBuf>::into);
#[cfg(feature = "ci-check")]
let root: PathBuf = "ci".into();
#[cfg(not(feature = "ci-check"))]
let root = env_vars
.chain(roots)
.find(|path| path.join("include").join("cuda.h").is_file())
.unwrap();
println!(
"cargo:rustc-env=CUDA_INCLUDE_DIR={}",
root.join("include").display()
);
}
pub fn build_ptx() -> (bool, Vec<std::path::PathBuf>) {
use rayon::prelude::*;
use std::path::PathBuf;
let out_dir = std::env::var("OUT_DIR").unwrap();
let kernel_paths: Vec<PathBuf> = glob::glob("src/*.cu")
.unwrap()
.map(|p| p.unwrap())
.collect();
let mut include_directories: Vec<PathBuf> = glob::glob("src/**/*.cuh")
.unwrap()
.map(|p| p.unwrap())
.collect();
println!("cargo:rerun-if-changed=src/");
// for path in &kernel_paths {
// println!("cargo:rerun-if-changed={}", path.display());
// }
for path in &mut include_directories {
// println!("cargo:rerun-if-changed={}", path.display());
let destination =
std::format!("{out_dir}/{}", path.file_name().unwrap().to_str().unwrap());
std::fs::copy(path.clone(), destination).unwrap();
// remove the filename from the path so it's just the directory
path.pop();
}
include_directories.sort();
include_directories.dedup();
let compute_cap = compute_cap().expect("Could not get Cuda compute cap");
#[allow(unused)]
let include_options: Vec<String> = include_directories
.into_iter()
.map(|s| "-I".to_string() + &s.into_os_string().into_string().unwrap())
.collect::<Vec<_>>();
let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
let children = kernel_paths
.par_iter()
.flat_map(|p| {
let mut output = p.clone();
output.set_extension("ptx");
let output_filename = std::path::Path::new(&out_dir).to_path_buf().join("out").with_file_name(output.file_name().unwrap());
let ignore = if output_filename.exists() {
let out_modified = output_filename.metadata().unwrap().modified().unwrap();
let in_modified = p.metadata().unwrap().modified().unwrap();
out_modified.duration_since(in_modified).is_ok()
} else {
false
};
if ignore {
None
} else {
let mut command = std::process::Command::new("nvcc");
command.arg(format!("--gpu-architecture=sm_{compute_cap}"))
.arg("--ptx")
.args(["--default-stream", "per-thread"])
.args(["--output-directory", &out_dir])
// Flash attention only
// .arg("--expt-relaxed-constexpr")
.args(&include_options);
if let Ok(ccbin_path) = &ccbin_env {
command
.arg("-allow-unsupported-compiler")
.args(["-ccbin", ccbin_path]);
}
command.arg(p);
Some((p, command.spawn()
.expect("nvcc failed to start. Ensure that you have CUDA installed and that `nvcc` is in your PATH.").wait_with_output()))
}
})
.collect::<Vec<_>>();
let ptx_paths: Vec<PathBuf> = glob::glob(&format!("{out_dir}/**/*.ptx"))
.unwrap()
.map(|p| p.unwrap())
.collect();
// We should rewrite `src/lib.rs` only if there are some newly compiled kernels, or removed
// some old ones
let write = !children.is_empty() || kernel_paths.len() < ptx_paths.len();
for (kernel_path, child) in children {
let output = child.expect("nvcc failed to run. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
assert!(
output.status.success(),
"nvcc error while compiling {kernel_path:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
);
}
(write, kernel_paths)
}
#[allow(unused)]
fn compute_cap() -> Result<usize> {
println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
// Try to parse compute caps from env
let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
compute_cap_str
.parse::<usize>()
.context("Could not parse code")?
} else {
// Use nvidia-smi to get the current compute cap
let out = std::process::Command::new("nvidia-smi")
.arg("--query-gpu=compute_cap")
.arg("--format=csv")
.output()
.context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
let mut lines = out.lines();
assert_eq!(
lines.next().context("missing line in stdout")?,
"compute_cap"
);
let cap = lines
.next()
.context("missing line in stdout")?
.replace('.', "");
let cap = cap
.parse::<usize>()
.with_context(|| format!("cannot parse as int {cap}"))?;
println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
cap
};
// Grab available GPU codes from nvcc and select the highest one
let (supported_nvcc_codes, max_nvcc_code) = {
let out = std::process::Command::new("nvcc")
.arg("--list-gpu-code")
.output()
.expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
let out = std::str::from_utf8(&out.stdout).unwrap();
let out = out.lines().collect::<Vec<&str>>();
let mut codes = Vec::with_capacity(out.len());
for code in out {
let code = code.split('_').collect::<Vec<&str>>();
if !code.is_empty() && code.contains(&"sm") {
if let Ok(num) = code[1].parse::<usize>() {
codes.push(num);
}
}
}
codes.sort();
let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?;
(codes, max_nvcc_code)
};
// Check that nvcc supports the asked compute caps
if !supported_nvcc_codes.contains(&compute_cap) {
anyhow::bail!(
"nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}."
);
}
if compute_cap > max_nvcc_code {
anyhow::bail!(
"CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}"
);
}
Ok(compute_cap)
}
let builder = bindgen_cuda::Builder::default();
println!("cargo:info={builder:?}");
let bindings = builder.build_ptx().unwrap();
bindings.write("src/lib.rs").unwrap();
}

Some files were not shown because too many files have changed in this diff.