Add a toggle for F16/BF16 accumulation in gemm. (#2141)

* Add a toggle to control f16/bf16 gemm precision.

* Use the faster variant in the quantized example.

* Bugfix.
This commit is contained in:
Laurent Mazare
2024-04-29 09:21:07 +02:00
committed by GitHub
parent 287013ef28
commit ed7b99f525
4 changed files with 153 additions and 15 deletions

View File

@ -238,3 +238,23 @@ impl crate::backend::BackendDevice for CudaDevice {
Ok(())
}
}
/// This bool controls whether reduced precision reductions (e.g., with fp16 accumulation type) are
/// allowed with f16 GEMMs.
pub fn gemm_reduced_precision_f16() -> bool {
true
}
/// This bool controls whether reduced precision reductions (e.g., with fp16 accumulation type) are
/// allowed with f16 GEMMs.
pub fn set_gemm_reduced_precision_f16(_: bool) {}
/// This bool controls whether reduced precision reductions (e.g., with fp16 accumulation type) are
/// allowed with bf16 GEMMs.
pub fn gemm_reduced_precision_bf16() -> bool {
true
}
/// This bool controls whether reduced precision reductions (e.g., with fp16 accumulation type) are
/// allowed with bf16 GEMMs.
pub fn set_gemm_reduced_precision_bf16(_: bool) {}