Files
candle/candle-flash-attn/src/ffi.rs
Laurent Mazare 0ace420e66 Flash attention without padding (varlen). (#281)
* Expose the seqlen variable for flash-attn without padding.

* Fix the batched call.

* Adapt for the varlen variant.

* No need to set the batch strides when in varlen mode.

* Add a test (disabled at the moment).

* Get the test to work properly.
2023-07-31 09:45:39 +01:00

44 lines
913 B
Rust

use core::ffi::{c_int, c_void};
extern "C" {
pub(crate) fn run_mha(
q_ptr: *const c_void,
k_ptr: *const c_void,
v_ptr: *const c_void,
o_ptr: *const c_void,
softmax_lse_ptr: *const c_void,
cu_seqlens_q_ptr: *const i32,
cu_seqlens_k_ptr: *const i32,
q_batch_stride: u32,
k_batch_stride: u32,
v_batch_stride: u32,
o_batch_stride: u32,
q_row_stride: u32,
k_row_stride: u32,
v_row_stride: u32,
o_row_stride: u32,
q_head_stride: u32,
k_head_stride: u32,
v_head_stride: u32,
o_head_stride: u32,
b: u32,
h: u32,
h_k: u32,
d: u32,
d_rounded: u32,
softmax_scale: f32,
seqlen_q: u32,
seqlen_k: u32,
seqlen_q_rounded: u32,
seqlen_k_rounded: u32,
is_causal: c_int,
);
}