Add a toggle for F16/BF16 accumulation in gemm. (#2141)

* Add a toggle to control f16/bf16 gemm precision.

* Use the faster variant in the quantized example.

* Bugfix.
This commit is contained in:
Laurent Mazare
2024-04-29 09:21:07 +02:00
committed by GitHub
parent 287013ef28
commit ed7b99f525
4 changed files with 153 additions and 15 deletions

View File

@@ -374,6 +374,9 @@ fn main() -> anyhow::Result<()> {
#[cfg(feature = "cuda")]
candle::quantized::cuda::set_force_dmmv(args.force_dmmv);
candle::cuda::set_gemm_reduced_precision_f16(true);
candle::cuda::set_gemm_reduced_precision_bf16(true);
let _guard = if args.tracing {
let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
tracing_subscriber::registry().with(chrome_layer).init();