Merge pull request #1318 from huggingface/metal4

Starting to fix some tests.
2025-06-16 10:38:54 +00:00 · 2023-12-20 15:37:31 +01:00
parent 96f1a28e39 9b5e4843a6
commit 9fc210fae8
25 changed files with 2784 additions and 785 deletions
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@ -34,6 +34,8 @@ zip = { workspace = true }
 [dev-dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
+criterion = { workspace = true }
+

 [features]
 default = []
@ -42,3 +44,8 @@ cudnn = ["cuda", "cudarc/cudnn"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
 accelerate = ["dep:libc", "dep:accelerate-src"]
 metal = ["dep:metal", "dep:candle-metal-kernels"]
+
+[[bench]]
+name = "matmul"
+harness = false
+required-features = ["metal"] # benches/matmul.rs unwraps Device::new_metal(0), so skip it unless `metal` is enabled
--- a/candle-core/benches/matmul.rs
+++ b/candle-core/benches/matmul.rs
@ -0,0 +1,43 @@
+use candle_core::{DType, Device, Tensor};
+use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
+use std::time::Instant;
+
+fn run(a: &Tensor, b: &Tensor) { // The operation being benchmarked: `a.matmul` applied to `b.t()`.
+    a.matmul(&b.t().unwrap()).unwrap(); // `b.t()` is presumably a transpose (confirm); the product is dropped, and either `unwrap` panics on failure.
+}
+
+fn criterion_benchmark(c: &mut Criterion) { // Registers the "matmul_metal" group with a single benchmark named "iter".
+    let b = 1;
+    let m = 1;
+    let n = 2048;
+    let k = 2048;
+    // Zero-filled F32 operands created on Metal device 0: lhs is (b, m, k), rhs is (b, n, k).
+    let device = Device::new_metal(0).unwrap();
+    let dtype = DType::F32;
+    let lhs = Tensor::zeros((b, m, k), dtype, &device).unwrap();
+    let rhs = Tensor::zeros((b, n, k), dtype, &device).unwrap();
+    // NOTE(review): `flops` is the product of the four dimensions, not a byte count.
+    let flops = b * m * n * k;
+    // It is still passed to `Throughput::Bytes` below, so the report is labelled in bytes; confirm that is intended.
+    let mut group = c.benchmark_group("matmul_metal");
+    group.throughput(Throughput::Bytes(flops as u64));
+    group.bench_function("iter", move |b| { // the closure argument `b` shadows the dimension `b` above
+        b.iter_custom(|iters| { // manual timing: the returned duration covers all `iters` calls
+            let start = Instant::now();
+            for _i in 0..iters {
+                run(black_box(&lhs), black_box(&rhs));
+            }
+            if let Device::Metal(device) = &device { // wait before reading the clock; presumably the Metal work is queued asynchronously (confirm)
+                device.wait_until_completed().unwrap();
+            } else {
+                panic!("Expected metal device");
+            }
+            start.elapsed()
+        })
+    });
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
+
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@ -201,10 +201,9 @@ impl Device {
                    Ok(Storage::Cuda(storage))
                }
            }
-            Device::Metal(_device) => {
-                // let storage = device.rand_uniform(shape, dtype, lo, up)?;
-                // Ok(Storage::Metal(storage))
-                crate::bail!("Metal rand_uniform not implemented")
+            Device::Metal(device) => {
+                let storage = device.rand_uniform(shape, dtype, lo, up)?;
+                Ok(Storage::Metal(storage))
            }
        }
    }
--- a/candle-core/src/metal_backend.rs
+++ b/candle-core/src/metal_backend.rs
--- a/candle-core/src/tensor.rs
+++ b/candle-core/src/tensor.rs
@ -1877,10 +1877,7 @@ impl Tensor {
                    Storage::Metal(metal.storage_from_cpu_storage(storage)?)
                }
                (Storage::Cuda(storage), Device::Cpu) => Storage::Cpu(storage.to_cpu_storage()?),
-                (Storage::Metal(storage), Device::Cpu) => {
-                    println!("{storage:?} - {:?}", storage.to_cpu_storage()?);
-                    Storage::Cpu(storage.to_cpu_storage()?)
-                }
+                (Storage::Metal(storage), Device::Cpu) => Storage::Cpu(storage.to_cpu_storage()?),
                (Storage::Cuda(storage), Device::Cuda(cuda)) => {
                    // TODO: Avoid passing through the cpu storage here, especially if the gpu ids
                    // are the same.