CUDA cleanup. (#2880)

* CUDA cleanup.

* More fixes.
This commit is contained in:
Laurent Mazare
2025-04-11 21:43:35 +02:00
committed by GitHub
parent eb478ece92
commit acc5bd335f
8 changed files with 193 additions and 161 deletions

View File

@ -2,7 +2,6 @@ mod ffi;
use candle::backend::BackendStorage;
use candle::cuda_backend::cudarc::driver::DevicePtr;
use candle::cuda_backend::WrapErr;
use candle::{CpuStorage, DType, Layout, Result, Shape, Tensor};
use half::{bf16, f16};
@ -142,10 +141,8 @@ impl FlashAttn {
let seqlen_k_rounded = round_multiple(seqlen_k, 128);
let elem_count = out_shape.elem_count();
let dst = unsafe { dev.alloc::<T>(elem_count) }.w()?;
let softmax_lse = dev
.alloc_zeros::<f32>(b_sz * 128 * num_heads * seqlen_q)
.w()?;
let dst = unsafe { dev.alloc::<T>(elem_count)? };
let softmax_lse = dev.alloc_zeros::<f32>(b_sz * 128 * num_heads * seqlen_q)?;
let is_bf16 = if is_bf16 { 1 } else { 0 };
@ -607,8 +604,8 @@ impl FlashAttnVarLen {
let seqlen_k_rounded = round_multiple(self.max_seqlen_k, 128);
let elem_count = out_shape.elem_count();
let dst = unsafe { dev.alloc::<f16>(elem_count) }.w()?;
let softmax_lse = dev.alloc_zeros::<f32>(num_heads * total_q).w()?;
let dst = unsafe { dev.alloc::<f16>(elem_count)? };
let softmax_lse = dev.alloc_zeros::<f32>(num_heads * total_q)?;
let is_bf16 = if is_bf16 { 1 } else { 0 };