From 75629981bc2b101400a301803c027da2362a4ff9 Mon Sep 17 00:00:00 2001
From: OlivierDehaene
Date: Mon, 16 Oct 2023 16:37:38 +0200
Subject: [PATCH] feat: parse Cuda compute cap from env (#1066)

* feat: add support for multiple compute caps

* Revert to one compute cap

* fmt

* fix
---
 candle-book/src/guide/installation.md |   3 +
 candle-flash-attn/build.rs            |  90 +++++++-----
 candle-kernels/Cargo.toml             |   3 +-
 candle-kernels/build.rs               | 200 ++++++++++++++------------
 4 files changed, 168 insertions(+), 128 deletions(-)

diff --git a/candle-book/src/guide/installation.md b/candle-book/src/guide/installation.md
index 394cef35..ca8b7968 100644
--- a/candle-book/src/guide/installation.md
+++ b/candle-book/src/guide/installation.md
@@ -12,6 +12,9 @@ compute_cap
 8.9
 ```
 
+You can also compile the Cuda kernels for a specific compute cap using the
+`CUDA_COMPUTE_CAP=` environment variable.
+
 If any of the above commands errors out, please make sure to update your Cuda version.
 
 2. Create a new app and add [`candle-core`](https://github.com/huggingface/candle/tree/main/candle-core) with Cuda support.
diff --git a/candle-flash-attn/build.rs b/candle-flash-attn/build.rs
index 64275fda..fde3aeed 100644
--- a/candle-flash-attn/build.rs
+++ b/candle-flash-attn/build.rs
@@ -84,12 +84,19 @@ fn main() -> Result<()> {
             (kernel_dir.join(f), obj_file)
         })
         .collect();
+    let out_modified: Result<_, _> = out_file.metadata().and_then(|m| m.modified());
     let should_compile = if out_file.exists() {
-        cu_files.iter().any(|(cu_file, _)| {
-            let out_modified = out_file.metadata().unwrap().modified().unwrap();
-            let in_modified = cu_file.metadata().unwrap().modified().unwrap();
-            in_modified.duration_since(out_modified).is_ok()
-        })
+        kernel_dir
+            .read_dir()
+            .expect("kernels folder should exist")
+            .any(|entry| {
+                if let (Ok(entry), Ok(out_modified)) = (entry, &out_modified) {
+                    let in_modified = entry.metadata().unwrap().modified().unwrap();
+                    in_modified.duration_since(*out_modified).is_ok()
+                } else {
+                    true
+                }
+            })
     } else {
         true
     };
@@ -100,12 +107,19 @@ fn main() -> Result<()> {
         let mut command = std::process::Command::new("nvcc");
         command
             .arg("-std=c++17")
+            .arg("-O3")
+            .arg("-U__CUDA_NO_HALF_OPERATORS__")
+            .arg("-U__CUDA_NO_HALF_CONVERSIONS__")
+            .arg("-U__CUDA_NO_HALF2_OPERATORS__")
+            .arg("-U__CUDA_NO_BFLOAT16_CONVERSIONS__")
             .arg(format!("--gpu-architecture=sm_{compute_cap}"))
             .arg("-c")
             .args(["-o", obj_file.to_str().unwrap()])
             .args(["--default-stream", "per-thread"])
             .arg("-Icutlass/include")
             .arg("--expt-relaxed-constexpr")
+            .arg("--expt-extended-lambda")
+            .arg("--use_fast_math")
             .arg("--verbose");
         if let Ok(ccbin_path) = &ccbin_env {
             command
@@ -203,13 +217,21 @@ fn set_cuda_include_dir() -> Result<()> {
 
 #[allow(unused)]
 fn compute_cap() -> Result<usize> {
-    // Grab compute code from nvidia-smi
-    let mut compute_cap = {
+    println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
+
+    // Try to parse compute caps from env
+    let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
+        println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
+        compute_cap_str
+            .parse::<usize>()
+            .context("Could not parse compute cap")?
+    } else {
+        // Use nvidia-smi to get the current compute cap
         let out = std::process::Command::new("nvidia-smi")
-            .arg("--query-gpu=compute_cap")
-            .arg("--format=csv")
-            .output()
-            .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
+            .arg("--query-gpu=compute_cap")
+            .arg("--format=csv")
+            .output()
+            .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
         let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
         let mut lines = out.lines();
         assert_eq!(
@@ -220,16 +242,19 @@ fn compute_cap() -> Result<usize> {
             .next()
             .context("missing line in stdout")?
             .replace('.', "");
-        cap.parse::<usize>()
-            .with_context(|| format!("cannot parse as int {cap}"))?
+        let cap = cap
+            .parse::<usize>()
+            .with_context(|| format!("cannot parse as int {cap}"))?;
+        println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
+        cap
     };
 
     // Grab available GPU codes from nvcc and select the highest one
-    let max_nvcc_code = {
+    let (supported_nvcc_codes, max_nvcc_code) = {
         let out = std::process::Command::new("nvcc")
-            .arg("--list-gpu-code")
-            .output()
-            .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
+            .arg("--list-gpu-code")
+            .output()
+            .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
         let out = std::str::from_utf8(&out.stdout).unwrap();
 
         let out = out.lines().collect::<Vec<&str>>();
@@ -243,30 +268,21 @@ fn compute_cap() -> Result<usize> {
             }
         }
         codes.sort();
-        if !codes.contains(&compute_cap) {
-            anyhow::bail!(
-                "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}."
-            );
-        }
-        *codes.last().unwrap()
+        let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?;
+        (codes, max_nvcc_code)
     };
 
-    // If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
-    // then choose the highest gpu code in nvcc
-    if compute_cap > max_nvcc_code {
-        println!(
-            "cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
+    // Check that nvcc supports the asked compute caps
+    if !supported_nvcc_codes.contains(&compute_cap) {
+        anyhow::bail!(
+            "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}."
+        );
+    }
+    if compute_cap > max_nvcc_code {
+        anyhow::bail!(
+            "CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}"
         );
-        compute_cap = max_nvcc_code;
     }
-    println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
-    if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
-        compute_cap = compute_cap_str
-            .parse::<usize>()
-            .with_context(|| format!("cannot parse as usize '{compute_cap_str}'"))?;
-        println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
-    }
-    println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
 
     Ok(compute_cap)
 }
diff --git a/candle-kernels/Cargo.toml b/candle-kernels/Cargo.toml
index 01f132a3..b2238699 100644
--- a/candle-kernels/Cargo.toml
+++ b/candle-kernels/Cargo.toml
@@ -12,5 +12,6 @@ license = "MIT OR Apache-2.0"
 [dependencies]
 
 [build-dependencies]
+anyhow = { version = "1", features = ["backtrace"] }
 glob = "0.3.1"
-rayon = "1.7.0"
+rayon = "1.7.0"
\ No newline at end of file
diff --git a/candle-kernels/build.rs b/candle-kernels/build.rs
index ad084671..17a0bf9c 100644
--- a/candle-kernels/build.rs
+++ b/candle-kernels/build.rs
@@ -1,4 +1,5 @@
 use std::io::Write;
+
 fn main() {
     println!("cargo:rerun-if-changed=build.rs");
 
@@ -23,6 +24,8 @@
 }
 
 mod cuda {
+    use anyhow::{Context, Result};
+
     pub fn set_include_dir() {
         use std::path::PathBuf;
         // NOTE: copied from cudarc build.rs.
@@ -100,107 +103,52 @@ mod cuda {
         include_directories.sort();
         include_directories.dedup();
 
+        let compute_cap = compute_cap().expect("Could not get Cuda compute cap");
+
         #[allow(unused)]
         let include_options: Vec<String> = include_directories
             .into_iter()
             .map(|s| "-I".to_string() + &s.into_os_string().into_string().unwrap())
             .collect::<Vec<_>>();
 
-        // let start = std::time::Instant::now();
-
-        // Grab compute code from nvidia-smi
-        let mut compute_cap = {
-            let out = std::process::Command::new("nvidia-smi")
-                .arg("--query-gpu=compute_cap")
-                .arg("--format=csv")
-                .output()
-                .expect("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.");
-            let out = std::str::from_utf8(&out.stdout).unwrap();
-            let mut lines = out.lines();
-            assert_eq!(lines.next().unwrap(), "compute_cap");
-            let cap = lines.next().unwrap().replace('.', "");
-            cap.parse::<usize>().unwrap()
-        };
-
-        // Grab available GPU codes from nvcc and select the highest one
-        let max_nvcc_code = {
-            let out = std::process::Command::new("nvcc")
-                .arg("--list-gpu-code")
-                .output()
-                .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
-            let out = std::str::from_utf8(&out.stdout).unwrap();
-
-            let out = out.lines().collect::<Vec<&str>>();
-            let mut codes = Vec::with_capacity(out.len());
-            for code in out {
-                let code = code.split('_').collect::<Vec<&str>>();
-                if !code.is_empty() && code.contains(&"sm") {
-                    if let Ok(num) = code[1].parse::<usize>() {
-                        codes.push(num);
-                    }
-                }
-            }
-            codes.sort();
-            if !codes.contains(&compute_cap) {
-                panic!("nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}.");
-            }
-            *codes.last().unwrap()
-        };
-
-        // If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
-        // then choose the highest gpu code in nvcc
-        if compute_cap > max_nvcc_code {
-            println!(
-                "cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
-            );
-            compute_cap = max_nvcc_code;
-        }
-
-        println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
-        if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
-            compute_cap = compute_cap_str.parse::<usize>().unwrap();
-            println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
-        }
-
-        println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
-
         let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
         println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
         let children = kernel_paths
-            .par_iter()
-            .flat_map(|p| {
-                let mut output = p.clone();
-                output.set_extension("ptx");
-                let output_filename = std::path::Path::new(&out_dir).to_path_buf().join("out").with_file_name(output.file_name().unwrap());
+        .par_iter()
+        .flat_map(|p| {
+            let mut output = p.clone();
+            output.set_extension("ptx");
+            let output_filename = std::path::Path::new(&out_dir).to_path_buf().join("out").with_file_name(output.file_name().unwrap());
 
-                let ignore = if output_filename.exists() {
-                    let out_modified = output_filename.metadata().unwrap().modified().unwrap();
-                    let in_modified = p.metadata().unwrap().modified().unwrap();
-                    out_modified.duration_since(in_modified).is_ok()
-                }else{
-                    false
-                };
-                if ignore{
-                    None
-                }else{
-                    let mut command = std::process::Command::new("nvcc");
-                    command.arg(format!("--gpu-architecture=sm_{compute_cap}"))
-                        .arg("--ptx")
-                        .args(["--default-stream", "per-thread"])
-                        .args(["--output-directory", &out_dir])
-                        // Flash attention only
-                        // .arg("--expt-relaxed-constexpr")
-                        .args(&include_options);
-                    if let Ok(ccbin_path) = &ccbin_env {
-                        command
-                            .arg("-allow-unsupported-compiler")
-                            .args(["-ccbin", ccbin_path]);
-                    }
-                    command.arg(p);
-                    Some((p, command.spawn()
+            let ignore = if output_filename.exists() {
+                let out_modified = output_filename.metadata().unwrap().modified().unwrap();
+                let in_modified = p.metadata().unwrap().modified().unwrap();
+                out_modified.duration_since(in_modified).is_ok()
+            } else {
+                false
+            };
+            if ignore {
+                None
+            } else {
+                let mut command = std::process::Command::new("nvcc");
+                command.arg(format!("--gpu-architecture=sm_{compute_cap}"))
+                    .arg("--ptx")
+                    .args(["--default-stream", "per-thread"])
+                    .args(["--output-directory", &out_dir])
+                    // Flash attention only
+                    // .arg("--expt-relaxed-constexpr")
+                    .args(&include_options);
+                if let Ok(ccbin_path) = &ccbin_env {
+                    command
+                        .arg("-allow-unsupported-compiler")
+                        .args(["-ccbin", ccbin_path]);
+                }
+                command.arg(p);
+                Some((p, command.spawn()
                 .expect("nvcc failed to start. Ensure that you have CUDA installed and that `nvcc` is in your PATH.").wait_with_output()))
-        }})
-        .collect::<Vec<_>>();
+            }
+        })
+        .collect::<Vec<_>>();
 
         let ptx_paths: Vec<std::path::PathBuf> = glob::glob(&format!("{out_dir}/**/*.ptx"))
             .unwrap()
@@ -220,4 +168,76 @@ mod cuda {
         }
         (write, kernel_paths)
     }
+
+    #[allow(unused)]
+    fn compute_cap() -> Result<usize> {
+        println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
+
+        // Try to parse compute caps from env
+        let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
+            println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
+            compute_cap_str
+                .parse::<usize>()
+                .context("Could not parse code")?
+        } else {
+            // Use nvidia-smi to get the current compute cap
+            let out = std::process::Command::new("nvidia-smi")
+                .arg("--query-gpu=compute_cap")
+                .arg("--format=csv")
+                .output()
+                .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
+            let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
+            let mut lines = out.lines();
+            assert_eq!(
+                lines.next().context("missing line in stdout")?,
+                "compute_cap"
+            );
+            let cap = lines
+                .next()
+                .context("missing line in stdout")?
+                .replace('.', "");
+            let cap = cap
+                .parse::<usize>()
+                .with_context(|| format!("cannot parse as int {cap}"))?;
+            println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
+            cap
+        };
+
+        // Grab available GPU codes from nvcc and select the highest one
+        let (supported_nvcc_codes, max_nvcc_code) = {
+            let out = std::process::Command::new("nvcc")
+                .arg("--list-gpu-code")
+                .output()
+                .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
+            let out = std::str::from_utf8(&out.stdout).unwrap();
+
+            let out = out.lines().collect::<Vec<&str>>();
+            let mut codes = Vec::with_capacity(out.len());
+            for code in out {
+                let code = code.split('_').collect::<Vec<&str>>();
+                if !code.is_empty() && code.contains(&"sm") {
+                    if let Ok(num) = code[1].parse::<usize>() {
+                        codes.push(num);
+                    }
+                }
+            }
+            codes.sort();
+            let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?;
+            (codes, max_nvcc_code)
+        };
+
+        // Check that nvcc supports the asked compute caps
+        if !supported_nvcc_codes.contains(&compute_cap) {
+            anyhow::bail!(
+                "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}."
+            );
+        }
+        if compute_cap > max_nvcc_code {
+            anyhow::bail!(
+                "CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}"
            );
+        }
+
+        Ok(compute_cap)
+    }
 }
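
For reference, with this patch the target GPU architecture can be pinned at build time instead of being probed through `nvidia-smi`. A minimal usage sketch (illustrative only; the exact cargo invocation depends on the crate being built, here assumed to be one enabling candle's `cuda` feature on an sm_89 GPU):

```bash
# Hypothetical invocation: build the CUDA kernels for compute capability 8.9
# without querying nvidia-smi (the env var is parsed as an integer, e.g. 89).
CUDA_COMPUTE_CAP=89 cargo build --release --features cuda
```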