Add flash attention (#241)

* Add some flash-attn kernel, import the code for flash-attn v2 from Dao-AILab.
* More flash attn.
* Set up the flash attn parameters.
* Get things to compile locally.
* Move the flash attention files to a different directory.
* Build the static C library with nvcc.
* Add more flash attention.
* Update the build part.
* Better caching.
* Exclude flash attention from the default workspace.
* Put flash-attn behind a feature gate.
* Get the flash attn kernel to run.
* Move the flags to a more appropriate place.
* Enable flash attention in llama.
* Use flash attention in llama.
2025-06-16 02:38:10 +00:00 · 2023-07-26 07:48:10 +01:00
parent c97d51243c
commit d9f9c859af
22 changed files with 2699 additions and 9 deletions
--- a/candle-examples/build.rs
+++ b/candle-examples/build.rs
@@ -6,11 +6,13 @@ use std::path::PathBuf;
 struct KernelDirectories {
    kernel_dir: &'static str,
    rust_target: &'static str,
+    include_dirs: &'static [&'static str],
 }

 const DIRS: [KernelDirectories; 1] = [KernelDirectories {
    kernel_dir: "examples/custom-ops/kernels/",
    rust_target: "examples/custom-ops/cuda_kernels.rs",
+    include_dirs: &[],
 }];

 impl KernelDirectories {
@@ -32,12 +34,15 @@ impl KernelDirectories {
            {
                let mut command = std::process::Command::new("nvcc");
                let out_dir = ptx_file.parent().context("no parent for ptx file")?;
+                let include_dirs: Vec<String> =
+                    self.include_dirs.iter().map(|c| format!("-I{c}")).collect();
                command
                    .arg(format!("--gpu-architecture=sm_{compute_cap}"))
                    .arg("--ptx")
                    .args(["--default-stream", "per-thread"])
                    .args(["--output-directory", out_dir.to_str().unwrap()])
                    .arg(format!("-I/{}", self.kernel_dir))
+                    .args(include_dirs)
                    .arg(cu_file);
                let output = command
                    .spawn()
@@ -221,6 +226,7 @@ fn compute_cap() -> Result<usize> {
    }

    println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
+
    if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
        compute_cap = compute_cap_str
            .parse::<usize>()