mirror of
https://github.com/huggingface/candle.git
synced 2025-06-22 04:22:50 +00:00
@ -2,7 +2,6 @@ mod ffi;
|
||||
|
||||
use candle::backend::BackendStorage;
|
||||
use candle::cuda_backend::cudarc::driver::DevicePtr;
|
||||
use candle::cuda_backend::WrapErr;
|
||||
use candle::{CpuStorage, DType, Layout, Result, Shape, Tensor};
|
||||
use half::{bf16, f16};
|
||||
|
||||
@ -142,10 +141,8 @@ impl FlashAttn {
|
||||
let seqlen_k_rounded = round_multiple(seqlen_k, 128);
|
||||
|
||||
let elem_count = out_shape.elem_count();
|
||||
let dst = unsafe { dev.alloc::<T>(elem_count) }.w()?;
|
||||
let softmax_lse = dev
|
||||
.alloc_zeros::<f32>(b_sz * 128 * num_heads * seqlen_q)
|
||||
.w()?;
|
||||
let dst = unsafe { dev.alloc::<T>(elem_count)? };
|
||||
let softmax_lse = dev.alloc_zeros::<f32>(b_sz * 128 * num_heads * seqlen_q)?;
|
||||
|
||||
let is_bf16 = if is_bf16 { 1 } else { 0 };
|
||||
|
||||
@ -607,8 +604,8 @@ impl FlashAttnVarLen {
|
||||
let seqlen_k_rounded = round_multiple(self.max_seqlen_k, 128);
|
||||
|
||||
let elem_count = out_shape.elem_count();
|
||||
let dst = unsafe { dev.alloc::<f16>(elem_count) }.w()?;
|
||||
let softmax_lse = dev.alloc_zeros::<f32>(num_heads * total_q).w()?;
|
||||
let dst = unsafe { dev.alloc::<f16>(elem_count)? };
|
||||
let softmax_lse = dev.alloc_zeros::<f32>(num_heads * total_q)?;
|
||||
|
||||
let is_bf16 = if is_bf16 { 1 } else { 0 };
|
||||
|
||||
|
Reference in New Issue
Block a user