Add a broadcast variant to matmul. (#523)

* Add a broadcast variant to matmul. * Get the test to pass.
2025-06-20 04:00:28 +00:00 · 2023-08-20 13:20:42 +01:00
parent a8f61e66cc
commit 2fcb386f17
3 changed files with 108 additions and 43 deletions
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@ -747,6 +747,25 @@ fn matmul(device: &Device) -> Result<()> {
    Ok(())
 }

+fn broadcast_matmul(device: &Device) -> Result<()> {
+    let lhs = Tensor::randn(0f32, 1f32, (3, 1, 4, 5), device)?;
+    let rhs = Tensor::randn(0f32, 1f32, (6, 5, 2), device)?;
+    let out = lhs.broadcast_matmul(&rhs)?;
+    assert_eq!(out.dims(), &[3, 6, 4, 2]);
+    for idx1 in 0..3 {
+        for idx2 in 0..6 {
+            let out = out.i((idx1, idx2))?;
+            let lhs = lhs.i((idx1, 0))?;
+            let rhs = rhs.i(idx2)?;
+            let out2 = lhs.matmul(&rhs);
+            let sum_diff2 = (out - out2)?.sqr()?.sum_all()?;
+            // With cuda, we see errors of up to ~1e-12.
+            assert!(sum_diff2.to_vec0::<f32>()? < 1e-6)
+        }
+    }
+    Ok(())
+}
+
 fn broadcasting(device: &Device) -> Result<()> {
    let t1 = Tensor::arange(0f32, 24f32, device)?.reshape((4, 2, 3))?;
    let t2 = Tensor::new(&[100f32, 200f32], device)?;
@ -864,6 +883,7 @@ test_device!(binary_op, binary_op_cpu, binary_op_gpu);
 test_device!(embeddings, embeddings_cpu, embeddings_gpu);
 test_device!(cmp, cmp_cpu, cmp_gpu);
 test_device!(matmul, matmul_cpu, matmul_gpu);
+test_device!(broadcast_matmul, broadcast_matmul_cpu, broadcast_matmul_gpu);
 test_device!(broadcasting, broadcasting_cpu, broadcasting_gpu);
 test_device!(index_select, index_select_cpu, index_select_gpu);
 test_device!(index_add, index_add_cpu, index_add_gpu);