From b2788342673f76920a12e6feb6c6cc1209062193 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Sat, 5 Aug 2023 17:25:24 +0100
Subject: [PATCH] Support the Accelerate BLAS on macOS. (#325)

* Add the accelerate feature.

* Ffi tweaks.
---
 Cargo.toml                                |   1 +
 candle-core/Cargo.toml                    |   2 +
 candle-core/examples/basics.rs            |   3 +
 candle-core/src/accelerate.rs             | 111 ++++++++++++++++++++++
 candle-core/src/cpu_backend.rs            | 105 +++++++++++++++++++-
 candle-core/src/lib.rs                    |   2 +
 candle-core/src/utils.rs                  |  14 ++-
 candle-examples/Cargo.toml                |   2 +
 candle-examples/examples/llama/main.rs    |   3 +
 candle-examples/examples/llama2-c/main.rs |   3 +
 candle-nn/Cargo.toml                      |   2 +
 candle-transformers/Cargo.toml            |   2 +
 12 files changed, 241 insertions(+), 9 deletions(-)
 create mode 100644 candle-core/src/accelerate.rs

diff --git a/Cargo.toml b/Cargo.toml
index ea008f00..55d851e1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,6 +24,7 @@ categories = ["science"]
 license = "MIT/Apache-2.0"
 
 [workspace.dependencies]
+accelerate-src = { version = "0.3.2" }
 anyhow = { version = "1", features = ["backtrace"] }
 byteorder = "1.4.3"
 clap = { version = "4.2.4", features = ["derive"] }
diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml
index 5a59aedc..af77a0e0 100644
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@@ -10,6 +10,7 @@ license.workspace = true
 readme = "README.md"
 
 [dependencies]
+accelerate-src = { workspace = true, optional = true }
 byteorder = { workspace = true }
 candle-kernels = { path = "../candle-kernels", version = "0.1.0", optional = true }
 cudarc = { workspace = true, optional = true }
@@ -32,3 +33,4 @@ anyhow = { workspace = true }
 default = []
 cuda = ["dep:cudarc", "dep:candle-kernels"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
+accelerate = ["dep:libc", "dep:accelerate-src"]
diff --git a/candle-core/examples/basics.rs b/candle-core/examples/basics.rs
index d028db66..18db4248 100644
--- a/candle-core/examples/basics.rs
+++ b/candle-core/examples/basics.rs
@@ -1,6 +1,9 @@
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;
 
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+
 use anyhow::Result;
 use candle_core::{Device, Tensor};
 
diff --git a/candle-core/src/accelerate.rs b/candle-core/src/accelerate.rs
new file mode 100644
index 00000000..8b0df5c1
--- /dev/null
+++ b/candle-core/src/accelerate.rs
@@ -0,0 +1,111 @@
+#![allow(dead_code)]
+use libc::{c_char, c_double, c_float, c_int};
+
+mod ffi {
+    use super::*;
+    extern "C" {
+        // It would be nice to be able to switch to the NEWLAPACK version of the function but this
+        // seems to trigger some link error. Available function names can be seen here:
+        // /Library/Developer/CommandLineTools/SDKs/MacOSX13.3.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Accelerate.tbd
+        #[link_name = "sgemm_"]
+        pub fn sgemm_ffi(
+            transa: *const c_char,
+            transb: *const c_char,
+            m: *const c_int,
+            n: *const c_int,
+            k: *const c_int,
+            alpha: *const c_float,
+            a: *const c_float,
+            lda: *const c_int,
+            b: *const c_float,
+            ldb: *const c_int,
+            beta: *const c_float,
+            c: *mut c_float,
+            ldc: *const c_int,
+        );
+        #[link_name = "dgemm_"]
+        pub fn dgemm_ffi(
+            transa: *const c_char,
+            transb: *const c_char,
+            m: *const c_int,
+            n: *const c_int,
+            k: *const c_int,
+            alpha: *const c_double,
+            a: *const c_double,
+            lda: *const c_int,
+            b: *const c_double,
+            ldb: *const c_int,
+            beta: *const c_double,
+            c: *mut c_double,
+            ldc: *const c_int,
+        );
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+#[inline]
+pub unsafe fn sgemm(
+    transa: u8,
+    transb: u8,
+    m: i32,
+    n: i32,
+    k: i32,
+    alpha: f32,
+    a: &[f32],
+    lda: i32,
+    b: &[f32],
+    ldb: i32,
+    beta: f32,
+    c: &mut [f32],
+    ldc: i32,
+) {
+    ffi::sgemm_ffi(
+        &(transa as c_char),
+        &(transb as c_char),
+        &m,
+        &n,
+        &k,
+        &alpha,
+        a.as_ptr(),
+        &lda,
+        b.as_ptr(),
+        &ldb,
+        &beta,
+        c.as_mut_ptr(),
+        &ldc,
+    )
+}
+
+#[allow(clippy::too_many_arguments)]
+#[inline]
+pub unsafe fn dgemm(
+    transa: u8,
+    transb: u8,
+    m: i32,
+    n: i32,
+    k: i32,
+    alpha: f64,
+    a: &[f64],
+    lda: i32,
+    b: &[f64],
+    ldb: i32,
+    beta: f64,
+    c: &mut [f64],
+    ldc: i32,
+) {
+    ffi::dgemm_ffi(
+        &(transa as c_char),
+        &(transb as c_char),
+        &m,
+        &n,
+        &k,
+        &alpha,
+        a.as_ptr(),
+        &lda,
+        b.as_ptr(),
+        &ldb,
+        &beta,
+        c.as_mut_ptr(),
+        &ldc,
+    )
+}
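
For reference, a minimal sketch of how these wrappers end up being called (this snippet is not part of the patch, and it would have to live inside candle-core since the module is declared crate-private in lib.rs below). BLAS gemm follows the Fortran convention, so every matrix is read as column-major and lda/ldb/ldc are column strides:

    // C = A * B with A = [[1, 2], [3, 4]] and B = I, all buffers column-major.
    let (m, n, k) = (2i32, 2i32, 2i32);
    let a = [1.0f32, 3.0, 2.0, 4.0]; // columns of A: (1, 3) and (2, 4)
    let b = [1.0f32, 0.0, 0.0, 1.0]; // 2x2 identity
    let mut c = vec![0.0f32; (m * n) as usize];
    unsafe {
        // b'N' selects "no transpose" for the corresponding operand.
        crate::accelerate::sgemm(b'N', b'N', m, n, k, 1.0, &a, m, &b, k, 0.0, &mut c, m)
    }
    assert_eq!(c, [1.0, 3.0, 2.0, 4.0]); // C == A, still in column-major order
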
diff --git a/candle-core/src/cpu_backend.rs b/candle-core/src/cpu_backend.rs
index 8563721c..4aa2f880 100644
--- a/candle-core/src/cpu_backend.rs
+++ b/candle-core/src/cpu_backend.rs
@@ -974,7 +974,7 @@ impl MatMul {
 
 impl Map2 for MatMul {
     const OP: &'static str = "mat_mul";
-    #[cfg(not(feature = "mkl"))]
+    #[cfg(all(not(feature = "mkl"), not(feature = "accelerate")))]
     fn f<T: 'static + WithDType + num_traits::Num + Copy>(
         &self,
         lhs: &[T],
@@ -1053,6 +1053,109 @@ impl Map2 for MatMul {
         Ok(dst)
     }
 
+    #[cfg(feature = "accelerate")]
+    fn f<T: 'static + WithDType + num_traits::Num + Copy>(
+        &self,
+        lhs: &[T],
+        lhs_l: &Layout,
+        rhs: &[T],
+        rhs_l: &Layout,
+    ) -> Result<Vec<T>> {
+        let (b, m, n, k) = self.0;
+        let lhs = &lhs[lhs_l.start_offset()..];
+        let rhs = &rhs[rhs_l.start_offset()..];
+
+        let lhs_stride = lhs_l.stride();
+        let rhs_stride = rhs_l.stride();
+        let rank = lhs_stride.len();
+
+        let a_skip: usize = match lhs_stride[..rank - 2] {
+            [s1, stride] if s1 == stride * lhs_l.dims()[1] => stride,
+            [stride] => stride,
+            [] => m * k,
+            _ => Err(self.striding_error(lhs_l, rhs_l, "non-contiguous lhs"))?,
+        };
+        let b_skip: usize = match rhs_stride[..rank - 2] {
+            [s1, stride] if s1 == stride * rhs_l.dims()[1] => stride,
+            [stride] => stride,
+            [] => n * k,
+            _ => Err(self.striding_error(lhs_l, rhs_l, "non-contiguous rhs"))?,
+        };
+        let c_skip: usize = m * n;
+
+        let rhs_m1 = rhs_stride[rhs_stride.len() - 1];
+        let rhs_m2 = rhs_stride[rhs_stride.len() - 2];
+        let lhs_m1 = lhs_stride[lhs_stride.len() - 1];
+        let lhs_m2 = lhs_stride[lhs_stride.len() - 2];
+
+        let (lda, transa) = if rhs_m1 == 1 && rhs_m2 == n {
+            (n as i32, b'N')
+        } else if rhs_m1 == k && rhs_m2 == 1 {
+            (k as i32, b'T')
+        } else {
+            Err(self.striding_error(lhs_l, rhs_l, "non-contiguous rhs"))?
+        };
+        // The b tensor has dims batching, m, k (lhs)
+        let (ldb, transb) = if lhs_m1 == 1 && lhs_m2 == k {
+            (k as i32, b'N')
+        } else if lhs_m1 == m && lhs_m2 == 1 {
+            (m as i32, b'T')
+        } else {
+            Err(self.striding_error(lhs_l, rhs_l, "non-contiguous lhs"))?
+        };
+
+        let mut dst = vec![T::zero(); b * m * n];
+        match T::DTYPE {
+            DType::F16 => {
+                crate::bail!("the accelerate backend does not support f16 matmul")
+            }
+            DType::F32 => {
+                for step in 0..b {
+                    let lhs_p = &lhs[step * a_skip..];
+                    let rhs_p = &rhs[step * b_skip..];
+                    let dst_p = &mut dst[step * c_skip..];
+                    unsafe {
+                        let a = rhs_p.as_ptr() as *const f32;
+                        let b = lhs_p.as_ptr() as *const f32;
+                        let c = dst_p.as_mut_ptr() as *mut f32;
+                        let a = std::slice::from_raw_parts(a, a_skip);
+                        let b = std::slice::from_raw_parts(b, b_skip);
+                        let c = std::slice::from_raw_parts_mut(c, c_skip);
+                        crate::accelerate::sgemm(
+                            transa, transb, /* m= */ n as i32, /* n= */ m as i32,
+                            /* k= */ k as i32, /* alpha= */ 1., /* a= */ a,
+                            /* lda= */ lda, /* b= */ b, /* ldb= */ ldb,
+                            /* beta= */ 0., /* c= */ c, /* ldc= */ n as i32,
+                        )
+                    }
+                }
+            }
+            DType::F64 => {
+                for step in 0..b {
+                    let lhs_p = &lhs[step * a_skip..];
+                    let rhs_p = &rhs[step * b_skip..];
+                    let dst_p = &mut dst[step * c_skip..];
+                    unsafe {
+                        let a = rhs_p.as_ptr() as *const f64;
+                        let b = lhs_p.as_ptr() as *const f64;
+                        let c = dst_p.as_mut_ptr() as *mut f64;
+                        let a = std::slice::from_raw_parts(a, a_skip);
+                        let b = std::slice::from_raw_parts(b, b_skip);
+                        let c = std::slice::from_raw_parts_mut(c, c_skip);
+                        crate::accelerate::dgemm(
+                            transa, transb, /* m= */ n as i32, /* n= */ m as i32,
+                            /* k= */ k as i32, /* alpha= */ 1., /* a= */ a,
+                            /* lda= */ lda, /* b= */ b, /* ldb= */ ldb,
+                            /* beta= */ 0., /* c= */ c, /* ldc= */ n as i32,
+                        )
+                    }
+                }
+            }
+            dtype => Err(Error::UnsupportedDTypeForOp(dtype, "matmul").bt())?,
+        }
+        Ok(dst)
+    }
+
     #[cfg(feature = "mkl")]
     fn f<T: 'static + WithDType + num_traits::Num + Copy>(
         &self,
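
A note on the two gemm calls above: candle stores tensors row-major, while sgemm/dgemm expect column-major data. A row-major matrix X occupies memory exactly like the column-major view of X^T, so instead of transposing anything the code computes

    dst^T = (lhs · rhs)^T = rhs^T · lhs^T

entirely in column-major terms. That is why the operands are swapped (a = rhs, b = lhs), why the m and n arguments are exchanged relative to the row-major shapes, and why ldc is n: a column of dst^T holds n elements. Reading the column-major dst^T buffer back as row-major then yields dst = lhs · rhs with no extra copy.
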
diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs
index e46a87cf..016d3806 100644
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@@ -33,6 +33,8 @@
 //!
 //! Rust is cool, and a lot of the HF ecosystem already has Rust crates [safetensors](https://github.com/huggingface/safetensors) and [tokenizers](https://github.com/huggingface/tokenizers)
 
+#[cfg(feature = "accelerate")]
+mod accelerate;
 pub mod backend;
 pub mod backprop;
 mod conv;
diff --git a/candle-core/src/utils.rs b/candle-core/src/utils.rs
index 895c97e1..d3f5b50e 100644
--- a/candle-core/src/utils.rs
+++ b/candle-core/src/utils.rs
@@ -11,16 +11,14 @@ pub fn get_num_threads() -> usize {
     }
 }
 
+pub fn has_accelerate() -> bool {
+    cfg!(feature = "accelerate")
+}
+
 pub fn has_mkl() -> bool {
-    #[cfg(feature = "mkl")]
-    return true;
-    #[cfg(not(feature = "mkl"))]
-    return false;
+    cfg!(feature = "mkl")
 }
 
 pub fn cuda_is_available() -> bool {
-    #[cfg(feature = "cuda")]
-    return true;
-    #[cfg(not(feature = "cuda"))]
-    return false;
+    cfg!(feature = "cuda")
 }
diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml
index 47490f42..f3a4e325 100644
--- a/candle-examples/Cargo.toml
+++ b/candle-examples/Cargo.toml
@@ -10,6 +10,7 @@ license.workspace = true
 readme = "README.md"
 
 [dependencies]
+accelerate-src = { workspace = true, optional = true }
 candle = { path = "../candle-core", version = "0.1.0", package = "candle-core" }
 candle-datasets = { path = "../candle-datasets", version = "0.1.0" }
 candle-nn = { path = "../candle-nn", version = "0.1.0" }
@@ -41,6 +42,7 @@ anyhow = { workspace = true }
 
 [features]
 default = []
+accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
 cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
 flash-attn = ["cuda", "dep:candle-flash-attn"]
 mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl", "candle-transformers/mkl"]
diff --git a/candle-examples/examples/llama/main.rs b/candle-examples/examples/llama/main.rs
index d0a55be1..b2c4e55a 100644
--- a/candle-examples/examples/llama/main.rs
+++ b/candle-examples/examples/llama/main.rs
@@ -9,6 +9,9 @@
 // In order to convert the llama weights to a .npz file, run:
 // python examples/llama/convert_checkpoint.py ..../LLaMA/7B/consolidated.00.pth
 
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;
 
diff --git a/candle-examples/examples/llama2-c/main.rs b/candle-examples/examples/llama2-c/main.rs
index c3b94df1..418218b6 100644
--- a/candle-examples/examples/llama2-c/main.rs
+++ b/candle-examples/examples/llama2-c/main.rs
@@ -1,5 +1,8 @@
 // https://github.com/karpathy/llama2.c
 
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+
 #[cfg(feature = "mkl")]
 extern crate intel_mkl_src;
 
diff --git a/candle-nn/Cargo.toml b/candle-nn/Cargo.toml
index bb44acd3..6db9ccab 100644
--- a/candle-nn/Cargo.toml
+++ b/candle-nn/Cargo.toml
@@ -10,6 +10,7 @@ license.workspace = true
 readme = "README.md"
 
 [dependencies]
+accelerate-src = { workspace = true, optional = true }
 candle = { path = "../candle-core", version = "0.1.0", package = "candle-core" }
 thiserror = { workspace = true }
 intel-mkl-src = { workspace = true, optional = true }
@@ -20,5 +21,6 @@ anyhow = { workspace = true }
 
 [features]
 default = []
+accelerate = ["dep:accelerate-src", "candle/accelerate"]
 cuda = ["candle/cuda"]
 mkl = ["dep:intel-mkl-src", "candle/mkl"]
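
Downstream code can report which BLAS backend a binary was compiled with through the helpers touched in utils.rs above. A small sketch, assuming candle-core exposes the utils module publicly (as the existing has_mkl/cuda_is_available helpers suggest):

    fn print_backend_info() {
        // cfg! expands to a compile-time constant, so these calls cost nothing at runtime.
        println!("accelerate: {}", candle_core::utils::has_accelerate());
        println!("mkl:        {}", candle_core::utils::has_mkl());
        println!("cuda:       {}", candle_core::utils::cuda_is_available());
    }
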
diff --git a/candle-transformers/Cargo.toml b/candle-transformers/Cargo.toml
index a37cc12a..457c0776 100644
--- a/candle-transformers/Cargo.toml
+++ b/candle-transformers/Cargo.toml
@@ -10,6 +10,7 @@ license.workspace = true
 readme = "README.md"
 
 [dependencies]
+accelerate-src = { workspace = true, optional = true }
 candle = { path = "../candle-core", version = "0.1.0", package = "candle-core" }
 hf-hub = { workspace = true}
 candle-nn = { path = "../candle-nn", version = "0.1.0" }
@@ -20,5 +21,6 @@ wav = { workspace = true }
 
 [features]
 default = []
+accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate"]
 cuda = ["candle/cuda", "candle-nn/cuda"]
 mkl = ["dep:intel-mkl-src", "candle/mkl", "candle-nn/mkl"]
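
With the feature wiring above, the Accelerate path stays opt-in: nothing changes unless a build enables the accelerate feature on the relevant crate (for instance cargo build --features accelerate inside candle-core, or the forwarded feature on the example crates). A minimal downstream check, assuming the candle-core 0.1 tensor API (Tensor::randn, Tensor::matmul):

    use candle_core::{Device, Tensor};

    fn main() -> candle_core::Result<()> {
        let dev = Device::Cpu;
        let a = Tensor::randn(0f32, 1.0, (64, 128), &dev)?;
        let b = Tensor::randn(0f32, 1.0, (128, 32), &dev)?;
        // With the accelerate feature enabled, this matmul is dispatched to the
        // sgemm-based path added in cpu_backend.rs; otherwise the default CPU
        // implementation is used.
        let c = a.matmul(&b)?;
        println!("{:?}", c.shape());
        Ok(())
    }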