Add a toggle for F16/BF16 accumulation in gemm. (#2141)

* Add a toggle to control f16/bf16 gemm precision. * Use the faster variant in the quantized example. * Bugfix.
2025-06-18 19:47:12 +00:00 · 2024-04-29 09:21:07 +02:00
parent 287013ef28
commit ed7b99f525
4 changed files with 153 additions and 15 deletions
--- a/candle-core/src/dummy_cuda_backend.rs
+++ b/candle-core/src/dummy_cuda_backend.rs
@ -238,3 +238,23 @@ impl crate::backend::BackendDevice for CudaDevice {
        Ok(())
    }
 }
+
+/// This bool controls whether reduced precision reductions (e.g., with fp16 accumulation type) are
+/// allowed with f16 GEMMs.
+pub fn gemm_reduced_precision_f16() -> bool {
+    true
+}
+
+/// This bool controls whether reduced precision reductions (e.g., with fp16 accumulation type) are
+/// allowed with f16 GEMMs.
+pub fn set_gemm_reduced_precision_f16(_: bool) {}
+
+/// This bool controls whether reduced precision reductions (e.g., with fp16 accumulation type) are
+/// allowed with bf16 GEMMs.
+pub fn gemm_reduced_precision_bf16() -> bool {
+    true
+}
+
+/// This bool controls whether reduced precision reductions (e.g., with fp16 accumulation type) are
+/// allowed with bf16 GEMMs.
+pub fn set_gemm_reduced_precision_bf16(_: bool) {}