From 75629981bc2b101400a301803c027da2362a4ff9 Mon Sep 17 00:00:00 2001
From: OlivierDehaene
Date: Mon, 16 Oct 2023 16:37:38 +0200
Subject: [PATCH] feat: parse Cuda compute cap from env (#1066)

* feat: add support for multiple compute caps

* Revert to one compute cap

* fmt

* fix
---
 candle-book/src/guide/installation.md |   3 +
 candle-flash-attn/build.rs            |  90 +++++++-----
 candle-kernels/Cargo.toml             |   3 +-
 candle-kernels/build.rs               | 200 ++++++++++++++------------
 4 files changed, 168 insertions(+), 128 deletions(-)

diff --git a/candle-book/src/guide/installation.md b/candle-book/src/guide/installation.md
index 394cef35..ca8b7968 100644
--- a/candle-book/src/guide/installation.md
+++ b/candle-book/src/guide/installation.md
@@ -12,6 +12,9 @@ compute_cap
 8.9
 ```
 
+You can also compile the Cuda kernels for a specific compute cap using the
+`CUDA_COMPUTE_CAP=` environment variable.
+
 If any of the above commands errors out, please make sure to update your Cuda version.
 
 2. Create a new app and add [`candle-core`](https://github.com/huggingface/candle/tree/main/candle-core) with Cuda support.
diff --git a/candle-flash-attn/build.rs b/candle-flash-attn/build.rs
index 64275fda..fde3aeed 100644
--- a/candle-flash-attn/build.rs
+++ b/candle-flash-attn/build.rs
@@ -84,12 +84,19 @@ fn main() -> Result<()> {
             (kernel_dir.join(f), obj_file)
         })
         .collect();
+    let out_modified: Result<_, _> = out_file.metadata().and_then(|m| m.modified());
     let should_compile = if out_file.exists() {
-        cu_files.iter().any(|(cu_file, _)| {
-            let out_modified = out_file.metadata().unwrap().modified().unwrap();
-            let in_modified = cu_file.metadata().unwrap().modified().unwrap();
-            in_modified.duration_since(out_modified).is_ok()
-        })
+        kernel_dir
+            .read_dir()
+            .expect("kernels folder should exist")
+            .any(|entry| {
+                if let (Ok(entry), Ok(out_modified)) = (entry, &out_modified) {
+                    let in_modified = entry.metadata().unwrap().modified().unwrap();
+                    in_modified.duration_since(*out_modified).is_ok()
+                } else {
+                    true
+                }
+            })
     } else {
         true
     };
@@ -100,12 +107,19 @@ fn main() -> Result<()> {
         let mut command = std::process::Command::new("nvcc");
         command
             .arg("-std=c++17")
+            .arg("-O3")
+            .arg("-U__CUDA_NO_HALF_OPERATORS__")
+            .arg("-U__CUDA_NO_HALF_CONVERSIONS__")
+            .arg("-U__CUDA_NO_HALF2_OPERATORS__")
+            .arg("-U__CUDA_NO_BFLOAT16_CONVERSIONS__")
             .arg(format!("--gpu-architecture=sm_{compute_cap}"))
             .arg("-c")
             .args(["-o", obj_file.to_str().unwrap()])
             .args(["--default-stream", "per-thread"])
             .arg("-Icutlass/include")
             .arg("--expt-relaxed-constexpr")
+            .arg("--expt-extended-lambda")
+            .arg("--use_fast_math")
             .arg("--verbose");
         if let Ok(ccbin_path) = &ccbin_env {
             command
@@ -203,13 +217,21 @@ fn set_cuda_include_dir() -> Result<()> {
 
 #[allow(unused)]
 fn compute_cap() -> Result<usize> {
-    // Grab compute code from nvidia-smi
-    let mut compute_cap = {
+    println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
+
+    // Try to parse compute caps from env
+    let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
+        println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
+        compute_cap_str
+            .parse::<usize>()
+            .context("Could not parse compute cap")?
+    } else {
+        // Use nvidia-smi to get the current compute cap
         let out = std::process::Command::new("nvidia-smi")
-            .arg("--query-gpu=compute_cap")
-            .arg("--format=csv")
-            .output()
-            .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
+            .arg("--query-gpu=compute_cap")
+            .arg("--format=csv")
+            .output()
+            .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
         let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
         let mut lines = out.lines();
         assert_eq!(
@@ -220,16 +242,19 @@ fn compute_cap() -> Result<usize> {
             .next()
             .context("missing line in stdout")?
             .replace('.', "");
-        cap.parse::<usize>()
-            .with_context(|| format!("cannot parse as int {cap}"))?
+        let cap = cap
+            .parse::<usize>()
+            .with_context(|| format!("cannot parse as int {cap}"))?;
+        println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
+        cap
     };
 
     // Grab available GPU codes from nvcc and select the highest one
-    let max_nvcc_code = {
+    let (supported_nvcc_codes, max_nvcc_code) = {
         let out = std::process::Command::new("nvcc")
-            .arg("--list-gpu-code")
-            .output()
-            .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
+            .arg("--list-gpu-code")
+            .output()
+            .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
         let out = std::str::from_utf8(&out.stdout).unwrap();
 
         let out = out.lines().collect::<Vec<&str>>();
@@ -243,30 +268,21 @@ fn compute_cap() -> Result<usize> {
             }
         }
         codes.sort();
-        if !codes.contains(&compute_cap) {
-            anyhow::bail!(
-                "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}."
-            );
-        }
-        *codes.last().unwrap()
+        let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?;
+        (codes, max_nvcc_code)
     };
 
-    // If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
-    // then choose the highest gpu code in nvcc
-    if compute_cap > max_nvcc_code {
-        println!(
-            "cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
+    // Check that nvcc supports the asked compute caps
+    if !supported_nvcc_codes.contains(&compute_cap) {
+        anyhow::bail!(
+            "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}."
+        );
+    }
+    if compute_cap > max_nvcc_code {
+        anyhow::bail!(
+            "CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}"
         );
-        compute_cap = max_nvcc_code;
     }
-    println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
-    if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
-        compute_cap = compute_cap_str
-            .parse::<usize>()
-            .with_context(|| format!("cannot parse as usize '{compute_cap_str}'"))?;
-        println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
-    }
-    println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
 
     Ok(compute_cap)
 }
diff --git a/candle-kernels/Cargo.toml b/candle-kernels/Cargo.toml
index 01f132a3..b2238699 100644
--- a/candle-kernels/Cargo.toml
+++ b/candle-kernels/Cargo.toml
@@ -12,5 +12,6 @@ license = "MIT OR Apache-2.0"
 [dependencies]
 
 [build-dependencies]
+anyhow = { version = "1", features = ["backtrace"] }
 glob = "0.3.1"
-rayon = "1.7.0"
+rayon = "1.7.0"
\ No newline at end of file
diff --git a/candle-kernels/build.rs b/candle-kernels/build.rs
index ad084671..17a0bf9c 100644
--- a/candle-kernels/build.rs
+++ b/candle-kernels/build.rs
@@ -1,4 +1,5 @@
 use std::io::Write;
+
 fn main() {
     println!("cargo:rerun-if-changed=build.rs");
 
@@ -23,6 +24,8 @@
 }
 
 mod cuda {
+    use anyhow::{Context, Result};
+
     pub fn set_include_dir() {
         use std::path::PathBuf;
         // NOTE: copied from cudarc build.rs.
@@ -100,107 +103,52 @@ mod cuda {
         include_directories.sort();
         include_directories.dedup();
 
+        let compute_cap = compute_cap().expect("Could not get Cuda compute cap");
+
         #[allow(unused)]
         let include_options: Vec<String> = include_directories
             .into_iter()
             .map(|s| "-I".to_string() + &s.into_os_string().into_string().unwrap())
             .collect::<Vec<_>>();
 
-        // let start = std::time::Instant::now();
-
-        // Grab compute code from nvidia-smi
-        let mut compute_cap = {
-            let out = std::process::Command::new("nvidia-smi")
-                .arg("--query-gpu=compute_cap")
-                .arg("--format=csv")
-                .output()
-                .expect("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.");
-            let out = std::str::from_utf8(&out.stdout).unwrap();
-            let mut lines = out.lines();
-            assert_eq!(lines.next().unwrap(), "compute_cap");
-            let cap = lines.next().unwrap().replace('.', "");
-            cap.parse::<usize>().unwrap()
-        };
-
-        // Grab available GPU codes from nvcc and select the highest one
-        let max_nvcc_code = {
-            let out = std::process::Command::new("nvcc")
-                .arg("--list-gpu-code")
-                .output()
-                .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
-            let out = std::str::from_utf8(&out.stdout).unwrap();
-
-            let out = out.lines().collect::<Vec<&str>>();
-            let mut codes = Vec::with_capacity(out.len());
-            for code in out {
-                let code = code.split('_').collect::<Vec<&str>>();
-                if !code.is_empty() && code.contains(&"sm") {
-                    if let Ok(num) = code[1].parse::<usize>() {
-                        codes.push(num);
-                    }
-                }
-            }
-            codes.sort();
-            if !codes.contains(&compute_cap) {
-                panic!("nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}.");
-            }
-            *codes.last().unwrap()
-        };
-
-        // If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
-        // then choose the highest gpu code in nvcc
-        if compute_cap > max_nvcc_code {
-            println!(
-                "cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
-            );
-            compute_cap = max_nvcc_code;
-        }
-
-        println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
-        if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
-            compute_cap = compute_cap_str.parse::<usize>().unwrap();
-            println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
-        }
-
-        println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
-
         let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
         println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
         let children = kernel_paths
-            .par_iter()
-            .flat_map(|p| {
-                let mut output = p.clone();
-                output.set_extension("ptx");
-                let output_filename = std::path::Path::new(&out_dir).to_path_buf().join("out").with_file_name(output.file_name().unwrap());
+        .par_iter()
+        .flat_map(|p| {
+            let mut output = p.clone();
+            output.set_extension("ptx");
+            let output_filename = std::path::Path::new(&out_dir).to_path_buf().join("out").with_file_name(output.file_name().unwrap());
 
-                let ignore = if output_filename.exists() {
-                    let out_modified = output_filename.metadata().unwrap().modified().unwrap();
-                    let in_modified = p.metadata().unwrap().modified().unwrap();
-                    out_modified.duration_since(in_modified).is_ok()
-                }else{
-                    false
-                };
-                if ignore{
-                    None
-                }else{
-                    let mut command = std::process::Command::new("nvcc");
-                    command.arg(format!("--gpu-architecture=sm_{compute_cap}"))
-                        .arg("--ptx")
-                        .args(["--default-stream", "per-thread"])
-                        .args(["--output-directory", &out_dir])
-                        // Flash attention only
-                        // .arg("--expt-relaxed-constexpr")
-                        .args(&include_options);
-                    if let Ok(ccbin_path) = &ccbin_env {
-                        command
-                            .arg("-allow-unsupported-compiler")
-                            .args(["-ccbin", ccbin_path]);
-                    }
-                    command.arg(p);
-                    Some((p, command.spawn()
+            let ignore = if output_filename.exists() {
+                let out_modified = output_filename.metadata().unwrap().modified().unwrap();
+                let in_modified = p.metadata().unwrap().modified().unwrap();
+                out_modified.duration_since(in_modified).is_ok()
+            } else {
+                false
+            };
+            if ignore {
+                None
+            } else {
+                let mut command = std::process::Command::new("nvcc");
+                command.arg(format!("--gpu-architecture=sm_{compute_cap}"))
+                    .arg("--ptx")
+                    .args(["--default-stream", "per-thread"])
+                    .args(["--output-directory", &out_dir])
+                    // Flash attention only
+                    // .arg("--expt-relaxed-constexpr")
+                    .args(&include_options);
+                if let Ok(ccbin_path) = &ccbin_env {
+                    command
+                        .arg("-allow-unsupported-compiler")
+                        .args(["-ccbin", ccbin_path]);
+                }
+                command.arg(p);
+                Some((p, command.spawn()
                 .expect("nvcc failed to start. Ensure that you have CUDA installed and that `nvcc` is in your PATH.").wait_with_output()))
-        }})
-        .collect::<Vec<_>>();
+            }
+        })
+        .collect::<Vec<_>>();
 
         let ptx_paths: Vec<std::path::PathBuf> = glob::glob(&format!("{out_dir}/**/*.ptx"))
             .unwrap()
@@ -220,4 +168,76 @@ mod cuda {
         }
         (write, kernel_paths)
     }
+
+    #[allow(unused)]
+    fn compute_cap() -> Result<usize> {
+        println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
+
+        // Try to parse compute caps from env
+        let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
+            println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
+            compute_cap_str
+                .parse::<usize>()
+                .context("Could not parse code")?
+        } else {
+            // Use nvidia-smi to get the current compute cap
+            let out = std::process::Command::new("nvidia-smi")
+                .arg("--query-gpu=compute_cap")
+                .arg("--format=csv")
+                .output()
+                .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
+            let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
+            let mut lines = out.lines();
+            assert_eq!(
+                lines.next().context("missing line in stdout")?,
+                "compute_cap"
+            );
+            let cap = lines
+                .next()
+                .context("missing line in stdout")?
+                .replace('.', "");
+            let cap = cap
+                .parse::<usize>()
+                .with_context(|| format!("cannot parse as int {cap}"))?;
+            println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
+            cap
+        };
+
+        // Grab available GPU codes from nvcc and select the highest one
+        let (supported_nvcc_codes, max_nvcc_code) = {
+            let out = std::process::Command::new("nvcc")
+                .arg("--list-gpu-code")
+                .output()
+                .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
+            let out = std::str::from_utf8(&out.stdout).unwrap();
+
+            let out = out.lines().collect::<Vec<&str>>();
+            let mut codes = Vec::with_capacity(out.len());
+            for code in out {
+                let code = code.split('_').collect::<Vec<&str>>();
+                if !code.is_empty() && code.contains(&"sm") {
+                    if let Ok(num) = code[1].parse::<usize>() {
+                        codes.push(num);
+                    }
+                }
+            }
+            codes.sort();
+            let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?;
+            (codes, max_nvcc_code)
+        };
+
+        // Check that nvcc supports the asked compute caps
+        if !supported_nvcc_codes.contains(&compute_cap) {
+            anyhow::bail!(
+                "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}."
+            );
+        }
+        if compute_cap > max_nvcc_code {
+            anyhow::bail!(
+                "CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}"
            );
+        }
+
+        Ok(compute_cap)
+    }
 }
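
For reference, with this patch the target GPU architecture can be pinned at build time instead of being probed through `nvidia-smi`. A minimal usage sketch (illustrative only; the exact cargo invocation depends on the crate being built, here assumed to be one enabling candle's `cuda` feature on an sm_89 GPU):

```bash
# Hypothetical invocation: build the CUDA kernels for compute capability 8.9
# without querying nvidia-smi (the env var is parsed as an integer, e.g. 89).
CUDA_COMPUTE_CAP=89 cargo build --release --features cuda
```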