feat: parse Cuda compute cap from env (#1066)

* feat: add support for multiple compute caps

* Revert to one compute cap

* fmt

* fix
Author: OlivierDehaene
Date: 2023-10-16 16:37:38 +02:00
Committed by: GitHub
Parent: 0106b0b04c
Commit: 75629981bc
4 changed files with 168 additions and 128 deletions


@@ -12,6 +12,9 @@ compute_cap
 8.9
 ```
+You can also compile the Cuda kernels for a specific compute cap using the
+`CUDA_COMPUTE_CAP=<compute cap>` environment variable.
 If any of the above commands errors out, please make sure to update your Cuda version.
 2. Create a new app and add [`candle-core`](https://github.com/huggingface/candle/tree/main/candle-core) with Cuda support.
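
The build scripts touched by this commit consume that variable with an env-var-first, `nvidia-smi`-fallback order, as the diffs below show. A minimal self-contained sketch of the resolution logic, using plain `std` error handling instead of the `anyhow` context calls in the real build.rs:

```rust
use std::process::Command;

// Sketch of the resolution order this commit introduces:
// 1. honour CUDA_COMPUTE_CAP if it is set, 2. otherwise query nvidia-smi.
fn resolve_compute_cap() -> Result<usize, String> {
    // Ask Cargo to re-run the build script whenever the variable changes.
    println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
    if let Ok(cap) = std::env::var("CUDA_COMPUTE_CAP") {
        return cap
            .parse::<usize>()
            .map_err(|e| format!("could not parse CUDA_COMPUTE_CAP={cap}: {e}"));
    }
    // Fallback: `nvidia-smi --query-gpu=compute_cap --format=csv` prints a
    // "compute_cap" header followed by a value such as "8.9", which maps to 89.
    let out = Command::new("nvidia-smi")
        .args(["--query-gpu=compute_cap", "--format=csv"])
        .output()
        .map_err(|e| format!("failed to run nvidia-smi: {e}"))?;
    let stdout = String::from_utf8_lossy(&out.stdout);
    let cap = stdout
        .lines()
        .nth(1)
        .ok_or("missing compute_cap line in nvidia-smi output")?
        .trim()
        .replace('.', "");
    cap.parse::<usize>()
        .map_err(|e| format!("could not parse compute cap '{cap}': {e}"))
}
```

The real implementations in the diffs below additionally validate the chosen cap against `nvcc --list-gpu-code` before using it.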


@@ -84,12 +84,19 @@ fn main() -> Result<()> {
             (kernel_dir.join(f), obj_file)
         })
         .collect();
+    let out_modified: Result<_, _> = out_file.metadata().and_then(|m| m.modified());
     let should_compile = if out_file.exists() {
-        cu_files.iter().any(|(cu_file, _)| {
-            let out_modified = out_file.metadata().unwrap().modified().unwrap();
-            let in_modified = cu_file.metadata().unwrap().modified().unwrap();
-            in_modified.duration_since(out_modified).is_ok()
-        })
+        kernel_dir
+            .read_dir()
+            .expect("kernels folder should exist")
+            .any(|entry| {
+                if let (Ok(entry), Ok(out_modified)) = (entry, &out_modified) {
+                    let in_modified = entry.metadata().unwrap().modified().unwrap();
+                    in_modified.duration_since(*out_modified).is_ok()
+                } else {
+                    true
+                }
+            })
     } else {
         true
     };
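
The `duration_since(..).is_ok()` call above is a freshness test: `SystemTime::duration_since` only returns `Ok` when the receiver is not earlier than its argument, so `in_modified.duration_since(*out_modified).is_ok()` reads as "this kernel was modified at or after the output file". A small self-contained illustration of the idiom; the helper name and paths are made up for the sketch:

```rust
use std::path::Path;

// True when `src` is at least as new as `out` (or either mtime is unavailable),
// i.e. the output should be rebuilt. Mirrors the check in the diff above.
fn needs_rebuild(src: &Path, out: &Path) -> bool {
    let out_modified = match out.metadata().and_then(|m| m.modified()) {
        Ok(t) => t,
        Err(_) => return true, // missing output: always rebuild
    };
    let in_modified = match src.metadata().and_then(|m| m.modified()) {
        Ok(t) => t,
        Err(_) => return true, // can't stat the source: rebuild to be safe
    };
    // duration_since(earlier) is Ok exactly when in_modified >= out_modified.
    in_modified.duration_since(out_modified).is_ok()
}

fn main() {
    // Hypothetical paths, just to exercise the helper.
    let rebuild = needs_rebuild(Path::new("kernels/example.cu"), Path::new("out/example.o"));
    println!("rebuild: {rebuild}");
}
```
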
@@ -100,12 +107,19 @@ fn main() -> Result<()> {
         let mut command = std::process::Command::new("nvcc");
         command
             .arg("-std=c++17")
+            .arg("-O3")
+            .arg("-U__CUDA_NO_HALF_OPERATORS__")
+            .arg("-U__CUDA_NO_HALF_CONVERSIONS__")
+            .arg("-U__CUDA_NO_HALF2_OPERATORS__")
+            .arg("-U__CUDA_NO_BFLOAT16_CONVERSIONS__")
             .arg(format!("--gpu-architecture=sm_{compute_cap}"))
             .arg("-c")
             .args(["-o", obj_file.to_str().unwrap()])
             .args(["--default-stream", "per-thread"])
             .arg("-Icutlass/include")
             .arg("--expt-relaxed-constexpr")
+            .arg("--expt-extended-lambda")
+            .arg("--use_fast_math")
             .arg("--verbose");
         if let Ok(ccbin_path) = &ccbin_env {
             command
@@ -203,13 +217,21 @@ fn set_cuda_include_dir() -> Result<()> {
 #[allow(unused)]
 fn compute_cap() -> Result<usize> {
-    // Grab compute code from nvidia-smi
-    let mut compute_cap = {
+    println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
+
+    // Try to parse compute caps from env
+    let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
+        println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
+        compute_cap_str
+            .parse::<usize>()
+            .context("Could not parse compute cap")?
+    } else {
+        // Use nvidia-smi to get the current compute cap
         let out = std::process::Command::new("nvidia-smi")
             .arg("--query-gpu=compute_cap")
             .arg("--format=csv")
             .output()
             .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
         let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
         let mut lines = out.lines();
         assert_eq!(
@@ -220,16 +242,19 @@ fn compute_cap() -> Result<usize> {
             .next()
             .context("missing line in stdout")?
             .replace('.', "");
-        cap.parse::<usize>()
-            .with_context(|| format!("cannot parse as int {cap}"))?
+        let cap = cap
+            .parse::<usize>()
+            .with_context(|| format!("cannot parse as int {cap}"))?;
+        println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
+        cap
     };
 
     // Grab available GPU codes from nvcc and select the highest one
-    let max_nvcc_code = {
+    let (supported_nvcc_codes, max_nvcc_code) = {
         let out = std::process::Command::new("nvcc")
             .arg("--list-gpu-code")
             .output()
             .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
         let out = std::str::from_utf8(&out.stdout).unwrap();
         let out = out.lines().collect::<Vec<&str>>();
@@ -243,30 +268,21 @@ fn compute_cap() -> Result<usize> {
             }
         }
         codes.sort();
-        if !codes.contains(&compute_cap) {
-            anyhow::bail!(
-                "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}."
-            );
-        }
-        *codes.last().unwrap()
+        let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?;
+        (codes, max_nvcc_code)
     };
 
-    // If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
-    // then choose the highest gpu code in nvcc
-    if compute_cap > max_nvcc_code {
-        println!(
-            "cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
-        );
-        compute_cap = max_nvcc_code;
-    }
-
-    println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
-    if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
-        compute_cap = compute_cap_str
-            .parse::<usize>()
-            .with_context(|| format!("cannot parse as usize '{compute_cap_str}'"))?;
-        println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
-    }
-    println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
+    // Check that nvcc supports the asked compute caps
+    if !supported_nvcc_codes.contains(&compute_cap) {
+        anyhow::bail!(
+            "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}."
+        );
+    }
+    if compute_cap > max_nvcc_code {
+        anyhow::bail!(
+            "CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}"
+        );
+    }
     Ok(compute_cap)
 }
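
After the cap is resolved, the function validates it against the local toolchain: `nvcc --list-gpu-code` prints one target per line (`sm_70`, `sm_75`, `sm_80`, ...), the script collects the numeric parts, and it now bails out instead of lowering the arch to the nvcc maximum as the old code did. A standalone sketch of that validation step, with slightly simplified parsing (`strip_prefix` instead of the `split('_')` loop used in the diff):

```rust
use std::process::Command;

// Collect the sm_XX codes the local nvcc can generate, sorted ascending.
fn supported_nvcc_codes() -> Result<Vec<usize>, String> {
    let out = Command::new("nvcc")
        .arg("--list-gpu-code")
        .output()
        .map_err(|e| format!("failed to run nvcc: {e}"))?;
    let stdout = String::from_utf8_lossy(&out.stdout);
    let mut codes: Vec<usize> = stdout
        .lines()
        .filter_map(|line| line.trim().strip_prefix("sm_"))
        .filter_map(|num| num.parse().ok())
        .collect();
    codes.sort();
    Ok(codes)
}

// Fail loudly when the requested cap is not a valid nvcc target.
fn check_compute_cap(compute_cap: usize) -> Result<(), String> {
    let codes = supported_nvcc_codes()?;
    let max = *codes.last().ok_or("no gpu codes parsed from nvcc")?;
    if !codes.contains(&compute_cap) {
        return Err(format!(
            "nvcc cannot target gpu arch {compute_cap}; available targets are {codes:?}"
        ));
    }
    if compute_cap > max {
        return Err(format!(
            "compute cap {compute_cap} is higher than the highest nvcc gpu code {max}"
        ));
    }
    Ok(())
}
```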


@@ -12,5 +12,6 @@ license = "MIT OR Apache-2.0"
 [dependencies]
 
 [build-dependencies]
+anyhow = { version = "1", features = ["backtrace"] }
 glob = "0.3.1"
 rayon = "1.7.0"


@@ -1,4 +1,5 @@
 use std::io::Write;
 
 fn main() {
     println!("cargo:rerun-if-changed=build.rs");
@@ -23,6 +24,8 @@ fn main() {
 }
 
 mod cuda {
+    use anyhow::{Context, Result};
+
     pub fn set_include_dir() {
         use std::path::PathBuf;
         // NOTE: copied from cudarc build.rs.
@@ -100,107 +103,52 @@ mod cuda {
         include_directories.sort();
         include_directories.dedup();
 
+        let compute_cap = compute_cap().expect("Could not get Cuda compute cap");
+
         #[allow(unused)]
         let include_options: Vec<String> = include_directories
             .into_iter()
             .map(|s| "-I".to_string() + &s.into_os_string().into_string().unwrap())
             .collect::<Vec<_>>();
 
-        // let start = std::time::Instant::now();
-
-        // Grab compute code from nvidia-smi
-        let mut compute_cap = {
-            let out = std::process::Command::new("nvidia-smi")
-                .arg("--query-gpu=compute_cap")
-                .arg("--format=csv")
-                .output()
-                .expect("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.");
-            let out = std::str::from_utf8(&out.stdout).unwrap();
-            let mut lines = out.lines();
-            assert_eq!(lines.next().unwrap(), "compute_cap");
-            let cap = lines.next().unwrap().replace('.', "");
-            cap.parse::<usize>().unwrap()
-        };
-
-        // Grab available GPU codes from nvcc and select the highest one
-        let max_nvcc_code = {
-            let out = std::process::Command::new("nvcc")
-                .arg("--list-gpu-code")
-                .output()
-                .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
-            let out = std::str::from_utf8(&out.stdout).unwrap();
-            let out = out.lines().collect::<Vec<&str>>();
-            let mut codes = Vec::with_capacity(out.len());
-            for code in out {
-                let code = code.split('_').collect::<Vec<&str>>();
-                if !code.is_empty() && code.contains(&"sm") {
-                    if let Ok(num) = code[1].parse::<usize>() {
-                        codes.push(num);
-                    }
-                }
-            }
-            codes.sort();
-            if !codes.contains(&compute_cap) {
-                panic!("nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}.");
-            }
-            *codes.last().unwrap()
-        };
-
-        // If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
-        // then choose the highest gpu code in nvcc
-        if compute_cap > max_nvcc_code {
-            println!(
-                "cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
-            );
-            compute_cap = max_nvcc_code;
-        }
-
-        println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
-        if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
-            compute_cap = compute_cap_str.parse::<usize>().unwrap();
-            println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
-        }
-        println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
-
         let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
         println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
         let children = kernel_paths
             .par_iter()
             .flat_map(|p| {
                 let mut output = p.clone();
                 output.set_extension("ptx");
                 let output_filename = std::path::Path::new(&out_dir).to_path_buf().join("out").with_file_name(output.file_name().unwrap());
                 let ignore = if output_filename.exists() {
                     let out_modified = output_filename.metadata().unwrap().modified().unwrap();
                     let in_modified = p.metadata().unwrap().modified().unwrap();
                     out_modified.duration_since(in_modified).is_ok()
-                }else{
+                } else {
                     false
                 };
-                if ignore{
+                if ignore {
                     None
-                }else{
+                } else {
                     let mut command = std::process::Command::new("nvcc");
                     command.arg(format!("--gpu-architecture=sm_{compute_cap}"))
                         .arg("--ptx")
                         .args(["--default-stream", "per-thread"])
                         .args(["--output-directory", &out_dir])
                         // Flash attention only
                         // .arg("--expt-relaxed-constexpr")
                         .args(&include_options);
                     if let Ok(ccbin_path) = &ccbin_env {
                         command
                             .arg("-allow-unsupported-compiler")
                             .args(["-ccbin", ccbin_path]);
                     }
                     command.arg(p);
                     Some((p, command.spawn()
                         .expect("nvcc failed to start. Ensure that you have CUDA installed and that `nvcc` is in your PATH.").wait_with_output()))
-            }})
-        .collect::<Vec<_>>();
+                }
+            })
+            .collect::<Vec<_>>();
 
         let ptx_paths: Vec<PathBuf> = glob::glob(&format!("{out_dir}/**/*.ptx"))
             .unwrap()
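
The compilation itself is fanned out with rayon: `kernel_paths.par_iter().flat_map(..)` launches one `nvcc` process per `.cu` file, skipping up-to-date outputs, and collects the results. A reduced sketch of that spawn-in-parallel pattern; the kernel names are placeholders and `echo` stands in for the real `nvcc` command line:

```rust
use rayon::prelude::*;
use std::process::Command;

fn main() {
    // Placeholder inputs standing in for `kernel_paths`.
    let kernels = vec!["affine.cu", "binary.cu", "reduce.cu"];

    // One child process per kernel, run on the rayon thread pool.
    let results: Vec<(&str, std::process::Output)> = kernels
        .par_iter()
        .map(|name| {
            let output = Command::new("echo") // stand-in for `nvcc --ptx ...`
                .arg(*name)
                .output()
                .expect("failed to spawn compiler process");
            (*name, output)
        })
        .collect();

    for (name, out) in &results {
        println!("{name}: exit status {}", out.status);
    }
}
```
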
@@ -220,4 +168,76 @@ mod cuda {
         }
         (write, kernel_paths)
     }
+
+    #[allow(unused)]
+    fn compute_cap() -> Result<usize> {
+        println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
+
+        // Try to parse compute caps from env
+        let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
+            println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
+            compute_cap_str
+                .parse::<usize>()
+                .context("Could not parse code")?
+        } else {
+            // Use nvidia-smi to get the current compute cap
+            let out = std::process::Command::new("nvidia-smi")
+                .arg("--query-gpu=compute_cap")
+                .arg("--format=csv")
+                .output()
+                .context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
+            let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
+            let mut lines = out.lines();
+            assert_eq!(
+                lines.next().context("missing line in stdout")?,
+                "compute_cap"
+            );
+            let cap = lines
+                .next()
+                .context("missing line in stdout")?
+                .replace('.', "");
+            let cap = cap
+                .parse::<usize>()
+                .with_context(|| format!("cannot parse as int {cap}"))?;
+            println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
+            cap
+        };
+
+        // Grab available GPU codes from nvcc and select the highest one
+        let (supported_nvcc_codes, max_nvcc_code) = {
+            let out = std::process::Command::new("nvcc")
+                .arg("--list-gpu-code")
+                .output()
+                .expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
+            let out = std::str::from_utf8(&out.stdout).unwrap();
+            let out = out.lines().collect::<Vec<&str>>();
+            let mut codes = Vec::with_capacity(out.len());
+            for code in out {
+                let code = code.split('_').collect::<Vec<&str>>();
+                if !code.is_empty() && code.contains(&"sm") {
+                    if let Ok(num) = code[1].parse::<usize>() {
+                        codes.push(num);
+                    }
+                }
+            }
+            codes.sort();
+            let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?;
+            (codes, max_nvcc_code)
+        };
+
+        // Check that nvcc supports the asked compute caps
+        if !supported_nvcc_codes.contains(&compute_cap) {
+            anyhow::bail!(
+                "nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}."
+            );
+        }
+        if compute_cap > max_nvcc_code {
+            anyhow::bail!(
+                "CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}"
+            );
+        }
+        Ok(compute_cap)
+    }
 }
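
Both build scripts also emit `cargo:rustc-env=CUDA_COMPUTE_CAP=...`; note that the new code stores the bare number (for example `89`) where the old build script stored `sm_89`. Cargo passes such variables to rustc, so the crate being built can read the value at compile time. A small illustrative consumer (the constant name is made up):

```rust
// The build script printed `cargo:rustc-env=CUDA_COMPUTE_CAP=<cap>`, so the
// value is baked into the crate at compile time and readable via env!.
const CUDA_COMPUTE_CAP: &str = env!("CUDA_COMPUTE_CAP");

fn main() {
    // With the new scheme this prints e.g. "kernels were compiled for sm_89".
    println!("kernels were compiled for sm_{CUDA_COMPUTE_CAP}");
}
```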