From 0a3487a776a08171f39ec7784c0a66ff94f98ec5 Mon Sep 17 00:00:00 2001
From: Niklas Hallqvist
Date: Fri, 8 Mar 2024 08:17:36 +0100
Subject: [PATCH 001/131] Add a --seed argument to the stable-diffusion
 example. (#1812)

* Add a --seed argument to the stable-diffusion example.

* When no seed is specified, do not set one but use the engine's default.
  This makes the CPU engine work again when no --seed is given, and causes
  a bailout when a seed is given, as the engine does not currently support
  seeding.

---------

Co-authored-by: niklas
---
 candle-examples/examples/stable-diffusion/main.rs | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/candle-examples/examples/stable-diffusion/main.rs b/candle-examples/examples/stable-diffusion/main.rs
index 8c3ca2ee..14642e9a 100644
--- a/candle-examples/examples/stable-diffusion/main.rs
+++ b/candle-examples/examples/stable-diffusion/main.rs
@@ -96,6 +96,10 @@ struct Args {
     /// information.
     #[arg(long, default_value_t = 0.8)]
     img2img_strength: f64,
+
+    /// The seed to use when generating random samples.
+    #[arg(long)]
+    seed: Option<u64>,
 }
 
 #[derive(Debug, Clone, Copy, clap::ValueEnum, PartialEq, Eq)]
@@ -374,6 +378,7 @@ fn run(args: Args) -> Result<()> {
         use_flash_attn,
         img2img,
         img2img_strength,
+        seed,
         ..
     } = args;
 
@@ -427,6 +432,9 @@ fn run(args: Args) -> Result<()> {
 
     let scheduler = sd_config.build_scheduler(n_steps)?;
     let device = candle_examples::device(cpu)?;
+    if let Some(seed) = seed {
+        device.set_seed(seed)?;
+    }
     let use_guide_scale = guidance_scale > 1.0;
 
     let which = match sd_version {

From 758366160e26a493fb5a1d151dcdab9c8abf99c8 Mon Sep 17 00:00:00 2001
From: Kirpal Grewal <45569241+KGrewal1@users.noreply.github.com>
Date: Fri, 8 Mar 2024 07:18:01 +0000
Subject: [PATCH 002/131] add clone to candle dropout (#1814)

---
 candle-nn/src/ops.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/candle-nn/src/ops.rs b/candle-nn/src/ops.rs
index fdd67142..88d1b3d6 100644
--- a/candle-nn/src/ops.rs
+++ b/candle-nn/src/ops.rs
@@ -74,7 +74,7 @@ pub fn dropout(xs: &Tensor, drop_p: f32) -> Result<Tensor> {
     xs * mask
 }
 
-#[derive(Debug)]
+#[derive(Clone, Debug)]
 pub struct Dropout {
     drop_p: f32,
 }

From 96345837817ce407edb8c465ce9fce61bd22d947 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Fri, 8 Mar 2024 10:52:22 +0100
Subject: [PATCH 003/131] Expose a couple layout methods. (#1816)
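
For illustration, a minimal sketch of how the now-public methods can be
called from outside the crate (hypothetical shapes, inside a function
returning candle_core::Result<()>; not part of this change):

    use candle_core::Layout;

    // A contiguous layout for a (2, 3, 4) tensor.
    let layout = Layout::contiguous((2, 3, 4));
    // Swap the first and last dimensions, then keep 2 entries of dim 1.
    let transposed = layout.transpose(0, 2)?;
    let narrowed = transposed.narrow(1, 0, 2)?;
    // Reorder all three dimensions at once.
    let permuted = layout.permute(&[2, 0, 1])?;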
---
 candle-core/src/layout.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/candle-core/src/layout.rs b/candle-core/src/layout.rs
index bf346cf2..e6824b29 100644
--- a/candle-core/src/layout.rs
+++ b/candle-core/src/layout.rs
@@ -70,7 +70,7 @@ impl Layout {
         self.shape.is_fortran_contiguous(&self.stride)
     }
 
-    pub(crate) fn narrow(&self, dim: usize, start: usize, len: usize) -> Result<Self> {
+    pub fn narrow(&self, dim: usize, start: usize, len: usize) -> Result<Self> {
         let dims = self.shape().dims();
         if dim >= dims.len() {
             Err(Error::DimOutOfRange {
@@ -99,7 +99,7 @@ impl Layout {
         })
     }
 
-    pub(crate) fn transpose(&self, dim1: usize, dim2: usize) -> Result<Self> {
+    pub fn transpose(&self, dim1: usize, dim2: usize) -> Result<Self> {
         let rank = self.shape.rank();
         if rank <= dim1 || rank <= dim2 {
             Err(Error::UnexpectedNumberOfDims {
@@ -120,7 +120,7 @@ impl Layout {
         })
     }
 
-    pub(crate) fn permute(&self, idxs: &[usize]) -> Result<Self> {
+    pub fn permute(&self, idxs: &[usize]) -> Result<Self> {
         let is_permutation =
             idxs.len() == self.shape.rank() && (0..idxs.len()).all(|i| idxs.contains(&i));
         if !is_permutation {

From ea984d04210cf882953d1149a5bbc6b66f4157fb Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Fri, 8 Mar 2024 15:04:18 +0100
Subject: [PATCH 004/131] Expose more printer options. (#1817)

---
 candle-core/src/display.rs | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/candle-core/src/display.rs b/candle-core/src/display.rs
index 4f5a390e..7e6e3cf8 100644
--- a/candle-core/src/display.rs
+++ b/candle-core/src/display.rs
@@ -65,12 +65,13 @@ impl std::fmt::Debug for Tensor {
 }
 
 /// Options for Tensor pretty printing
+#[derive(Debug, Clone)]
 pub struct PrinterOptions {
-    precision: usize,
-    threshold: usize,
-    edge_items: usize,
-    line_width: usize,
-    sci_mode: Option<bool>,
+    pub precision: usize,
+    pub threshold: usize,
+    pub edge_items: usize,
+    pub line_width: usize,
+    pub sci_mode: Option<bool>,
 }
 
 static PRINT_OPTS: std::sync::Mutex<PrinterOptions> =
@@ -89,6 +90,10 @@ impl PrinterOptions {
     }
 }
 
+pub fn print_options() -> &'static std::sync::Mutex<PrinterOptions> {
+    &PRINT_OPTS
+}
+
 pub fn set_print_options(options: PrinterOptions) {
     *PRINT_OPTS.lock().unwrap() = options
 }
@@ -117,6 +122,26 @@ pub fn set_print_options_full() {
     }
 }
 
+pub fn set_line_width(line_width: usize) {
+    PRINT_OPTS.lock().unwrap().line_width = line_width
+}
+
+pub fn set_precision(precision: usize) {
+    PRINT_OPTS.lock().unwrap().precision = precision
+}
+
+pub fn set_edge_items(edge_items: usize) {
+    PRINT_OPTS.lock().unwrap().edge_items = edge_items
+}
+
+pub fn set_threshold(threshold: usize) {
+    PRINT_OPTS.lock().unwrap().threshold = threshold
+}
+
+pub fn set_sci_mode(sci_mode: Option<bool>) {
+    PRINT_OPTS.lock().unwrap().sci_mode = sci_mode
+}
+
 struct FmtSize {
     current_size: usize,
 }

From be5b68cd0ba49424b0b100c0ea48ad35b2bd67b9 Mon Sep 17 00:00:00 2001
From: Niklas Hallqvist
Date: Fri, 8 Mar 2024 16:11:50 +0100
Subject: [PATCH 005/131] Metal random-generation bug fixes (#1811)

* use_resource API misunderstood. It is not additive. Several usages must be
  bit-ORed together.

* The seeding was incorrect and used the address instead of the value of the
  passed in seed.

* Add a check that likely exhibits failure to update the seed between
  generation of random tensors.

* Buffer overrun, the length given to the std::ptr::copy call was in bytes,
  and not 32-bit units.

* By default seed the RNG with a time-based value, so that different runs may
  produce different output, just like the CPU engine.
  Use device.set_seed if determinism is warranted.

* Revert "By default seed the RNG with a time-based value, so that different
  runs may produce different output, just like the CPU engine. Use
  device.set_seed if determinism is warranted."

  This reverts commit d7302de9

  Discussion in
  https://github.com/huggingface/candle/pull/1811#issuecomment-1983079119

* The Metal random kernel failed to set element N/2 of tensors with N
  elements, N being even. The reason was that all threads but thread 0
  created 2 random samples, while thread 0 created only one, i.e. an odd
  number. In order to produce an even number of samples, the early
  termination of thread 0 should only ever occur for odd-sized tensors.

* Add a test catching any deterministic tensor element in rand and randn
  output.

---------

Co-authored-by: niklas
Co-authored-by: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com>
---
 candle-core/src/metal_backend.rs      |  2 +-
 candle-core/tests/tensor_tests.rs     | 25 +++++++++++++++++++++++++
 candle-metal-kernels/src/lib.rs       | 12 ++++++++----
 candle-metal-kernels/src/random.metal | 24 ++++++++++++++++--------
 4 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs
index 6e1ecc5e..02078db5 100644
--- a/candle-core/src/metal_backend.rs
+++ b/candle-core/src/metal_backend.rs
@@ -1727,7 +1727,7 @@ impl BackendDevice for MetalDevice {
         let seed_buffer = self.seed.try_lock().map_err(MetalError::from)?;
         let contents = seed_buffer.contents();
         unsafe {
-            std::ptr::copy([seed].as_ptr(), contents as *mut u32, 4);
+            std::ptr::copy([seed].as_ptr(), contents as *mut u32, 1);
         }
         seed_buffer.did_modify_range(metal::NSRange::new(0, 4));
 
diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs
index 40737e7b..31a27422 100644
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@@ -1080,8 +1080,33 @@ fn broadcasting(device: &Device) -> Result<()> {
 fn randn(device: &Device) -> Result<()> {
     let tensor = Tensor::randn(0f32, 1f32, (5, 3), device)?;
     assert_eq!(tensor.dims(), [5, 3]);
+    // Check that the seed gets updated by checking that
+    // a new series of numbers is generated each time
+    let tensor2 = Tensor::randn(0f32, 1f32, (5, 3), device)?;
+    assert_ne!(tensor.to_vec2::<f32>()?, tensor2.to_vec2::<f32>()?);
     let tensor = Tensor::rand(0f32, 1f32, (5, 3), device)?;
     assert_eq!(tensor.dims(), [5, 3]);
+    // Check that the seed gets updated by checking that
+    // a new series of numbers is generated each time
+    let tensor2 = Tensor::rand(0f32, 1f32, (5, 3), device)?;
+    assert_ne!(tensor.to_vec2::<f32>()?, tensor2.to_vec2::<f32>()?);
+    // We do not expect deterministic elements at any index.
+    // There once was a bug that had a deterministic zero element in evenly sized tensors.
+    const N: usize = 2;
+    let v = (0..100)
+        .map(|_| Tensor::randn(0f32, 1f32, N, device).and_then(|t| t.to_vec1::<f32>()))
+        .collect::<Result<Vec<_>>>()?;
+    assert!(
+        (0..N).all(|i| v.windows(2).any(|pair| pair[0][i] != pair[1][i])),
+        "There are deterministic values in the randn tensors"
+    );
+    let v = (0..100)
+        .map(|_| Tensor::rand(0f32, 1f32, N, device).and_then(|t| t.to_vec1::<f32>()))
+        .collect::<Result<Vec<_>>>()?;
+    assert!(
+        (0..N).all(|i| v.windows(2).any(|pair| pair[0][i] != pair[1][i])),
+        "There are deterministic values in the rand tensors"
+    );
     Ok(())
 }
 
diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs
index 33bc3453..47ce7e96 100644
--- a/candle-metal-kernels/src/lib.rs
+++ b/candle-metal-kernels/src/lib.rs
@@ -1558,8 +1558,10 @@ pub fn call_random_uniform(
 
     set_params!(encoder, (length, min, max, seed, buffer));
 
-    encoder.use_resource(seed, metal::MTLResourceUsage::Read);
-    encoder.use_resource(seed, metal::MTLResourceUsage::Write);
+    encoder.use_resource(
+        seed,
+        metal::MTLResourceUsage::Read | metal::MTLResourceUsage::Write,
+    );
     encoder.use_resource(buffer, metal::MTLResourceUsage::Write);
     encoder.dispatch_thread_groups(thread_group_count, thread_group_size);
     encoder.end_encoding();
@@ -1589,8 +1591,10 @@ pub fn call_random_normal(
 
     set_params!(encoder, (length, mean, stddev, seed, buffer));
 
-    encoder.use_resource(seed, metal::MTLResourceUsage::Read);
-    encoder.use_resource(seed, metal::MTLResourceUsage::Write);
+    encoder.use_resource(
+        seed,
+        metal::MTLResourceUsage::Read | metal::MTLResourceUsage::Write,
+    );
     encoder.use_resource(buffer, metal::MTLResourceUsage::Write);
     encoder.dispatch_thread_groups(thread_group_count, thread_group_size);
     encoder.end_encoding();
 
diff --git a/candle-metal-kernels/src/random.metal b/candle-metal-kernels/src/random.metal
index a7e48393..c1a94199 100644
--- a/candle-metal-kernels/src/random.metal
+++ b/candle-metal-kernels/src/random.metal
@@ -123,16 +123,20 @@ template<typename T> METAL_FUNC void rand_uniform(
         return;
     }
 
+    // Evenly sized vectors need an offset when writing the mirror element.
+    uint off = 1 - size % 2;
     float diff = abs(min - max);
-    HybridTaus rng = HybridTaus::init({ulong(seed), tid, 1, 1});
+    uint s = atomic_load_explicit(seed, memory_order_relaxed);
+    HybridTaus rng = HybridTaus::init({ulong(s), tid, 1, 1});
     out[tid] = static_cast<T>(rng.rand() * diff + min);
     if (tid == 0) {
         atomic_store_explicit(seed, uint(rng.rand() * UNIF01_NORM32), memory_order_relaxed);
-        // Return early if tid == 0, otherwise we will write to out[size].
-        return;
+        // Return early if tid == 0 && off == 0, otherwise we will write to out[size].
+        if (off == 0)
+            return;
     }
     // Use symmetry to fill the other half of the array.
-    out[size - tid] = static_cast<T>(rng.rand() * diff + min);
+    out[size - off - tid] = static_cast<T>(rng.rand() * diff + min);
 }
 
 // Create Gaussian normal distribution using Box-Muller transform:
@@ -148,7 +152,10 @@ template<typename T> METAL_FUNC void normal(
     if (tid >= size) {
         return;
     }
-    HybridTaus rng = HybridTaus::init({ulong(seed), tid, 1, 1});
+    // Evenly sized vectors need an offset when writing the mirror element.
+    uint off = 1 - size % 2;
+    uint s = atomic_load_explicit(seed, memory_order_relaxed);
+    HybridTaus rng = HybridTaus::init({ulong(s), tid, 1, 1});
     float u1 = rng.rand();
     float u2 = rng.rand();
 
@@ -162,11 +169,12 @@ template<typename T> METAL_FUNC void normal(
 
     if (tid == 0) {
         atomic_store_explicit(seed, uint(rng.rand() * UNIF01_NORM32), memory_order_relaxed);
-        // Return early if tid == 0, otherwise we will write to out[size].
-        return;
+        // Return early if tid == 0 && off == 0, otherwise we will write to out[size].
+        if (off == 0)
+            return;
     }
     // Use symmetry to fill the other half of the array.
-    out[size - tid] = static_cast<T>(z1);
+    out[size - off - tid] = static_cast<T>(z1);
 }
 
 #define UNIFORM_OP(NAME, T) \

From e7fc1daa21c0ae8fe92d1619f84817821fa5429d Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Fri, 8 Mar 2024 22:01:51 +0100
Subject: [PATCH 006/131] Bump the crate versions to 0.4.2. (#1821)

---
 Cargo.toml                      | 18 +++++++++---------
 candle-flash-attn/Cargo.toml    |  4 ++--
 candle-kernels/Cargo.toml       |  2 +-
 candle-metal-kernels/Cargo.toml |  2 +-
 candle-onnx/Cargo.toml          |  6 +++---
 5 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 40f51fea..1a3e1983 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,7 +19,7 @@ exclude = [
 resolver = "2"
 
 [workspace.package]
-version = "0.4.1"
+version = "0.4.2"
 edition = "2021"
 description = "Minimalist ML framework."
 repository = "https://github.com/huggingface/candle"
@@ -31,14 +31,14 @@ license = "MIT OR Apache-2.0"
 accelerate-src = { version = "0.3.2" }
 anyhow = { version = "1", features = ["backtrace"] }
 byteorder = "1.4.3"
-candle = { path = "./candle-core", package = "candle-core", version = "0.4.1" }
-candle-datasets = { path = "./candle-datasets", version = "0.4.1" }
-candle-flash-attn = { path = "./candle-flash-attn", version = "0.4.1" }
-candle-kernels = { path = "./candle-kernels", version = "0.4.1" }
-candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.4.1" }
-candle-nn = { path = "./candle-nn", version = "0.4.1" }
-candle-onnx = { path = "./candle-onnx", version = "0.4.1" }
-candle-transformers = { path = "./candle-transformers", version = "0.4.1" }
+candle = { path = "./candle-core", package = "candle-core", version = "0.4.2" }
+candle-datasets = { path = "./candle-datasets", version = "0.4.2" }
+candle-flash-attn = { path = "./candle-flash-attn", version = "0.4.2" }
+candle-kernels = { path = "./candle-kernels", version = "0.4.2" }
+candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.4.2" }
+candle-nn = { path = "./candle-nn", version = "0.4.2" }
+candle-onnx = { path = "./candle-onnx", version = "0.4.2" }
+candle-transformers = { path = "./candle-transformers", version = "0.4.2" }
 clap = { version = "4.2.4", features = ["derive"] }
 criterion = { version = "0.5.1", default-features=false }
 cudarc = { version = "0.10.0", features = ["f16"] }
diff --git a/candle-flash-attn/Cargo.toml b/candle-flash-attn/Cargo.toml
index 29d72cd7..3b570776 100644
--- a/candle-flash-attn/Cargo.toml
+++ b/candle-flash-attn/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "candle-flash-attn"
-version = "0.4.1"
+version = "0.4.2"
 edition = "2021"
 description = "Flash attention layer for the candle ML framework."
@@ -11,7 +11,7 @@ license = "MIT OR Apache-2.0"
 readme = "README.md"
 
 [dependencies]
-candle = { path = "../candle-core", features = ["cuda"], package = "candle-core", version = "0.4.1" }
+candle = { path = "../candle-core", features = ["cuda"], package = "candle-core", version = "0.4.2" }
 half = { version = "2.3.1", features = ["num-traits"] }
 
 [build-dependencies]
diff --git a/candle-kernels/Cargo.toml b/candle-kernels/Cargo.toml
index ab059f89..ceee23fe 100644
--- a/candle-kernels/Cargo.toml
+++ b/candle-kernels/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "candle-kernels"
-version = "0.4.1"
+version = "0.4.2"
 edition = "2021"
 description = "CUDA kernels for Candle"
 
diff --git a/candle-metal-kernels/Cargo.toml b/candle-metal-kernels/Cargo.toml
index a2837ddb..569ad032 100644
--- a/candle-metal-kernels/Cargo.toml
+++ b/candle-metal-kernels/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "candle-metal-kernels"
-version = "0.4.1"
+version = "0.4.2"
 edition = "2021"
 description = "Metal kernels for Candle"
 
diff --git a/candle-onnx/Cargo.toml b/candle-onnx/Cargo.toml
index 9a75f802..01472a5f 100644
--- a/candle-onnx/Cargo.toml
+++ b/candle-onnx/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "candle-onnx"
-version = "0.4.1"
+version = "0.4.2"
 edition = "2021"
 description = "ONNX support for Candle"
 
@@ -10,8 +10,8 @@ categories = ["science"]
 license = "MIT OR Apache-2.0"
 
 [dependencies]
-candle = { path = "../candle-core", package = "candle-core", version = "0.4.1" }
-candle-nn = { path = "../candle-nn", version = "0.4.1" }
+candle = { path = "../candle-core", package = "candle-core", version = "0.4.2" }
+candle-nn = { path = "../candle-nn", version = "0.4.2" }
 prost = "0.12.1"
 
 [build-dependencies]

From 3440cec3a0f01e87a9a92b4de7878069c0b7f057 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Fri, 8 Mar 2024 22:43:07 +0100
Subject: [PATCH 007/131] Fast CPU kernel for transposed 1d convolutions.
 (#1822)

* Fast CPU kernel for transposed 1d convolutions.

* Bugfix.
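
The fast path below rewrites the transposed convolution as one batched
matmul followed by a col2im accumulation, with
l_out = (l_in - 1) * stride + k_size. A rough usage sketch exercising it
through the tensor API (shapes chosen for illustration, assuming dilation 1
and no padding or output padding so that the col2im path applies):

    use candle_core::{Device, Result, Tensor};

    fn demo() -> Result<()> {
        let dev = Device::Cpu;
        // (batch, c_in, l_in) input, contiguous (c_in, c_out, k_size) kernel.
        let t = Tensor::randn(0f32, 1f32, (1, 4, 6), &dev)?;
        let w = Tensor::randn(0f32, 1f32, (4, 2, 3), &dev)?;
        // l_out = (6 - 1) * 1 + 3 = 8
        let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 1)?;
        assert_eq!(res.dims(), [1, 2, 8]);
        Ok(())
    }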
---
 candle-core/src/cpu_backend.rs             | 76 +++++++++++++++++++++-
 candle-core/tests/conv_tests.rs            | 44 +++++++------
 candle-examples/examples/metavoice/main.rs |  2 +-
 3 files changed, 100 insertions(+), 22 deletions(-)

diff --git a/candle-core/src/cpu_backend.rs b/candle-core/src/cpu_backend.rs
index 8ae39020..181fbb61 100644
--- a/candle-core/src/cpu_backend.rs
+++ b/candle-core/src/cpu_backend.rs
@@ -5,6 +5,7 @@ use half::{bf16, f16};
 use rayon::prelude::*;
 
 const USE_IM2COL_CONV1D: bool = true;
+const USE_IM2COL_CONV1D_TR: bool = true;
 const USE_IM2COL_CONV2D: bool = true;
 
 // TODO: Maybe we should not implement [Clone] here and instead have an explicit allocator +
@@ -1256,6 +1257,34 @@ impl Map1 for Im2Col {
     }
 }
 
+struct Col2Im1D {
+    stride: usize,
+}
+
+impl Map1 for Col2Im1D {
+    fn f<T: WithDType>(&self, col: &[T], l: &Layout) -> Result<Vec<T>> {
+        let (b_size, l_in, c_out, k_size) = l.shape().dims4()?;
+        let stride = self.stride;
+        let l_out = (l_in - 1) * stride + k_size;
+        let mut im = vec![T::zero(); b_size * c_out * l_out];
+        let (dst_s0, dst_s1) = (c_out * l_out, l_out);
+        let (src_s0, src_s1, src_s2) = (c_out * k_size * l_in, c_out * k_size, k_size);
+        for l_in_i in 0..l_in {
+            for k_i in 0..k_size {
+                let l_out_i = l_in_i * stride + k_i;
+                for b_i in 0..b_size {
+                    for c_i in 0..c_out {
+                        let dst_idx = b_i * dst_s0 + c_i * dst_s1 + l_out_i;
+                        let src_idx = b_i * src_s0 + l_in_i * src_s1 + c_i * src_s2 + k_i;
+                        im[dst_idx] += col[src_idx]
+                    }
+                }
+            }
+        }
+        Ok(im)
+    }
+}
+
 struct ConvTranspose1D<'a>(&'a crate::conv::ParamsConvTranspose1D);
 
 impl<'a> Map2 for ConvTranspose1D<'a> {
@@ -2511,7 +2540,52 @@ impl BackendStorage for CpuStorage {
         kernel_l: &Layout,
         params: &crate::conv::ParamsConvTranspose1D,
     ) -> Result<Self> {
-        ConvTranspose1D(params).map(self, l, kernel, kernel_l)
+        let can_use_col2im = kernel_l.is_contiguous()
+            && params.dilation == 1
+            && params.padding == 0
+            && params.output_padding == 0;
+        if USE_IM2COL_CONV1D_TR && can_use_col2im {
+            let (b_size, c_in, l_in) = l.shape().dims3()?;
+            let (c_in2, c_out, k_size) = kernel_l.shape().dims3()?;
+            if !kernel_l.is_contiguous() {
+                crate::bail!(
+                    "convtr1d: the second argument (kernel) has to be contiguous {kernel_l:?}"
+                )
+            }
+            if c_in != c_in2 {
+                crate::bail!(
+                    "convtr1d: shape mismatch on c_in {:?} {:?}",
+                    l.shape(),
+                    kernel_l.shape()
+                )
+            }
+            let col = {
+                // This merges the last two dimensions of the kernel together.
+                let kernel_l_mm = Layout::new(
+                    (b_size, c_in, k_size * c_out).into(),
+                    vec![0, k_size * c_out, 1],
+                    kernel_l.start_offset(),
+                );
+                self.matmul(
+                    kernel,
+                    (
+                        b_size,
+                        /* m */ l_in,
+                        /* n */ c_out * k_size,
+                        /* k */ c_in,
+                    ),
+                    &l.transpose(1, 2)?,
+                    &kernel_l_mm,
+                )?
+ }; + let col_l = Layout::contiguous((b_size, l_in, c_out, k_size)); + Col2Im1D { + stride: params.stride, + } + .map(&col, &col_l) + } else { + ConvTranspose1D(params).map(self, l, kernel, kernel_l) + } } fn conv2d( diff --git a/candle-core/tests/conv_tests.rs b/candle-core/tests/conv_tests.rs index b967515d..f0f1b7f2 100644 --- a/candle-core/tests/conv_tests.rs +++ b/candle-core/tests/conv_tests.rs @@ -53,26 +53,30 @@ fn conv1d(dev: &Device) -> Result<()> { test_utils::to_vec1_round(&res.flatten_all()?, 4)?, [2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352] ); - let res = t.conv_transpose1d(&w.transpose(0, 1)?, 0, 0, 1, 1, 1)?; - assert_eq!(res.dims(), [1, 2, 7]); - assert_eq!( - test_utils::to_vec1_round(&res.flatten_all()?, 4)?, - [ - 0.0699, -1.2899, 8.3018, 5.5873, 2.4572, -2.6143, -0.0706, 1.8765, 4.8318, 1.1538, - 4.7076, -5.9745, -0.8276, 1.621 - ], - ); - let res = t.conv_transpose1d(&w.transpose(0, 1)?, 0, 0, 1, 1, 2)?; - assert_eq!(res.dims(), [1, 4, 7]); - assert_eq!( - test_utils::to_vec2_round(&res.squeeze(0)?, 4)?, - [ - [-1.5596, -1.8099, 2.0407, 4.8764, -0.1743, -0.735, -0.7819], - [0.7816, 3.8152, -0.5926, 2.2515, -5.1844, -0.3157, 1.4721], - [1.6295, 0.52, 6.2611, 0.7109, 2.6315, -1.8793, 0.7113], - [1.0949, 1.0166, 1.7464, 2.4561, -0.79, -0.5119, 0.1488] - ] - ); + let w = w.transpose(0, 1)?; + // The CPU kernels applied in the contiguous and non contiguous cases are different. + for w in [w.clone(), w.contiguous()?] { + let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 1)?; + assert_eq!(res.dims(), [1, 2, 7]); + assert_eq!( + test_utils::to_vec1_round(&res.flatten_all()?, 4)?, + [ + 0.0699, -1.2899, 8.3018, 5.5873, 2.4572, -2.6143, -0.0706, 1.8765, 4.8318, 1.1538, + 4.7076, -5.9745, -0.8276, 1.621 + ], + ); + let res = t.conv_transpose1d(&w, 0, 0, 1, 1, 2)?; + assert_eq!(res.dims(), [1, 4, 7]); + assert_eq!( + test_utils::to_vec2_round(&res.squeeze(0)?, 4)?, + [ + [-1.5596, -1.8099, 2.0407, 4.8764, -0.1743, -0.735, -0.7819], + [0.7816, 3.8152, -0.5926, 2.2515, -5.1844, -0.3157, 1.4721], + [1.6295, 0.52, 6.2611, 0.7109, 2.6315, -1.8793, 0.7113], + [1.0949, 1.0166, 1.7464, 2.4561, -0.79, -0.5119, 0.1488] + ] + ); + } Ok(()) } diff --git a/candle-examples/examples/metavoice/main.rs b/candle-examples/examples/metavoice/main.rs index ae571929..7635277c 100644 --- a/candle-examples/examples/metavoice/main.rs +++ b/candle-examples/examples/metavoice/main.rs @@ -120,7 +120,7 @@ fn main() -> Result<()> { Some(w) => std::path::PathBuf::from(w), None => repo.get("first_stage.safetensors")?, }; - let second_stage_weights = match &args.first_stage_weights { + let second_stage_weights = match &args.second_stage_weights { Some(w) => std::path::PathBuf::from(w), None => repo.get("second_stage.safetensors")?, }; From 936f6a48407ee111f52742cf48eccc61f6b62325 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Fri, 8 Mar 2024 23:12:13 +0100 Subject: [PATCH 008/131] Fix dequantization. 
---
 candle-core/src/quantized/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/candle-core/src/quantized/mod.rs b/candle-core/src/quantized/mod.rs
index f7abcd93..47307f2e 100644
--- a/candle-core/src/quantized/mod.rs
+++ b/candle-core/src/quantized/mod.rs
@@ -398,7 +398,7 @@ impl QMatMul {
             _ => DEQUANTIZE_ALL.with(|b| *b),
         };
         let t = if dequantize {
-            let tensor = qtensor.dequantize(&Device::Cpu)?;
+            let tensor = qtensor.dequantize(&qtensor.device())?;
             Self::Tensor(tensor)
         } else {
             Self::QTensor(qtensor)

From dd00482ea3456111482ec1cee045d2ae8efaf8ba Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Sat, 9 Mar 2024 11:06:04 +0100
Subject: [PATCH 009/131] Quantized version of the metavoice model. (#1824)

* Quantized version of the metavoice model.

* Integrate the quantized version of metavoice.
---
 candle-examples/examples/metavoice/main.rs    |  44 +++-
 candle-transformers/src/models/metavoice.rs   |   8 +-
 candle-transformers/src/models/mod.rs         |   1 +
 .../src/models/quantized_metavoice.rs         | 226 ++++++++++++++++++
 candle-transformers/src/quantized_nn.rs       |  10 +
 5 files changed, 277 insertions(+), 12 deletions(-)
 create mode 100644 candle-transformers/src/models/quantized_metavoice.rs

diff --git a/candle-examples/examples/metavoice/main.rs b/candle-examples/examples/metavoice/main.rs
index 7635277c..7a7ec3e4 100644
--- a/candle-examples/examples/metavoice/main.rs
+++ b/candle-examples/examples/metavoice/main.rs
@@ -11,6 +11,7 @@ use std::io::Write;
 use candle_transformers::generation::LogitsProcessor;
 use candle_transformers::models::encodec;
 use candle_transformers::models::metavoice::{adapters, gpt, tokenizers, transformer};
+use candle_transformers::models::quantized_metavoice::transformer as qtransformer;
 
 use candle::{DType, IndexOp, Tensor};
 use candle_nn::VarBuilder;
@@ -26,6 +27,11 @@ enum ArgDType {
     Bf16,
 }
 
+enum Transformer {
+    Normal(transformer::Model),
+    Quantized(qtransformer::Model),
+}
+
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
 struct Args {
@@ -40,6 +46,10 @@ struct Args {
     #[arg(long)]
     prompt: String,
 
+    /// Use the quantized version of the model.
+    #[arg(long)]
+    quantized: bool,
+
     /// The guidance scale.
     #[arg(long, default_value_t = 3.0)]
     guidance_scale: f64,
@@ -116,10 +126,6 @@ fn main() -> Result<()> {
     };
     let fs_tokenizer = tokenizers::BPE::from_json(first_stage_tokenizer, 512)?;
 
-    let first_stage_weights = match &args.first_stage_weights {
-        Some(w) => std::path::PathBuf::from(w),
-        None => repo.get("first_stage.safetensors")?,
-    };
     let second_stage_weights = match &args.second_stage_weights {
         Some(w) => std::path::PathBuf::from(w),
         None => repo.get("second_stage.safetensors")?,
@@ -135,10 +141,27 @@ fn main() -> Result<()> {
         ArgDType::F16 => DType::F16,
         ArgDType::Bf16 => DType::BF16,
     };
-    let first_stage_vb =
-        unsafe { VarBuilder::from_mmaped_safetensors(&[first_stage_weights], dtype, &device)? };
-    let first_stage_config = transformer::Config::cfg1b_v0_1();
-    let mut first_stage_model = transformer::Model::new(&first_stage_config, first_stage_vb)?;
+
+    let first_stage_config = transformer::Config::cfg1b_v0_1();
+    let mut first_stage_model = if args.quantized {
+        let filename = match &args.first_stage_weights {
+            Some(w) => std::path::PathBuf::from(w),
+            None => repo.get("first_stage_q4k.gguf")?,
+        };
+        let vb =
+            candle_transformers::quantized_var_builder::VarBuilder::from_gguf(filename, &device)?;
+        let first_stage_model = qtransformer::Model::new(&first_stage_config, vb)?;
+        Transformer::Quantized(first_stage_model)
+    } else {
+        let first_stage_weights = match &args.first_stage_weights {
+            Some(w) => std::path::PathBuf::from(w),
+            None => repo.get("first_stage.safetensors")?,
+        };
+        let first_stage_vb =
+            unsafe { VarBuilder::from_mmaped_safetensors(&[first_stage_weights], dtype, &device)? };
+        let first_stage_model = transformer::Model::new(&first_stage_config, first_stage_vb)?;
+        Transformer::Normal(first_stage_model)
+    };
 
     let second_stage_vb =
         unsafe { VarBuilder::from_mmaped_safetensors(&[second_stage_weights], dtype, &device)? };
@@ -178,7 +201,12 @@ fn main() -> Result<()> {
         let ctxt = &tokens[start_pos..];
         let input = Tensor::new(ctxt, &device)?;
         let input = Tensor::stack(&[&input, &input], 0)?;
-        let logits = first_stage_model.forward(&input, &spk_emb, tokens.len() - context_size)?;
+        let logits = match &mut first_stage_model {
+            Transformer::Normal(m) => m.forward(&input, &spk_emb, tokens.len() - context_size)?,
+            Transformer::Quantized(m) => {
+                m.forward(&input, &spk_emb, tokens.len() - context_size)?
+            }
+        };
         let logits0 = logits.i((0, 0))?;
         let logits1 = logits.i((1, 0))?;
         let logits = ((logits0 * args.guidance_scale)? + logits1 * (1. - args.guidance_scale))?;
diff --git a/candle-transformers/src/models/metavoice.rs b/candle-transformers/src/models/metavoice.rs
index 35cb30c7..2eeb0713 100644
--- a/candle-transformers/src/models/metavoice.rs
+++ b/candle-transformers/src/models/metavoice.rs
@@ -2,7 +2,7 @@ use candle::{DType, Device, Error as E, IndexOp, Module, Result, Tensor, D};
 use candle_nn::{embedding, linear_b, rms_norm, Embedding, Linear, RmsNorm, VarBuilder};
 
 // Equivalent to torch.repeat_interleave
-fn repeat_interleave(img: &Tensor, repeats: usize, dim: usize) -> Result<Tensor> {
+pub(crate) fn repeat_interleave(img: &Tensor, repeats: usize, dim: usize) -> Result<Tensor> {
     let img = img.unsqueeze(dim + 1)?;
     let mut dims = img.dims().to_vec();
     dims[dim + 1] = repeats;
@@ -664,15 +664,15 @@ pub mod transformer {
             }
         }
 
-        fn n_local_heads(&self) -> usize {
+        pub(crate) fn n_local_heads(&self) -> usize {
            self.n_local_heads.unwrap_or(self.n_head)
         }
 
-        fn head_dim(&self) -> usize {
+        pub(crate) fn head_dim(&self) -> usize {
             self.dim / self.n_head
         }
 
-        fn intermediate_size(&self) -> usize {
+        pub(crate) fn intermediate_size(&self) -> usize {
             match self.intermediate_size {
                 Some(intermediate_size) => intermediate_size,
                 None => {
diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs
index 66e06e0e..389d1a80 100644
--- a/candle-transformers/src/models/mod.rs
+++ b/candle-transformers/src/models/mod.rs
@@ -30,6 +30,7 @@ pub mod quantized_blip;
 pub mod quantized_blip_text;
 pub mod quantized_llama;
 pub mod quantized_llama2_c;
+pub mod quantized_metavoice;
 pub mod quantized_mistral;
 pub mod quantized_mixformer;
 pub mod quantized_mpt;
diff --git a/candle-transformers/src/models/quantized_metavoice.rs b/candle-transformers/src/models/quantized_metavoice.rs
new file mode 100644
index 00000000..16545150
--- /dev/null
+++ b/candle-transformers/src/models/quantized_metavoice.rs
@@ -0,0 +1,226 @@
+use crate::quantized_nn::{linear_b, Embedding, Linear, RmsNorm};
+pub use crate::quantized_var_builder::VarBuilder;
+
+use crate::models::metavoice::repeat_interleave;
+use candle::{Module, Result, Tensor, D};
+
+pub mod transformer {
+    use super::*;
+
+    type Config = crate::models::metavoice::transformer::Config;
+
+    #[derive(Debug, Clone)]
+    struct FeedForward {
+        w1: Linear,
+        w2: Linear,
+        w3: Linear,
+    }
+
+    impl FeedForward {
+        fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
+            let i_size = cfg.intermediate_size();
+            let w1 = linear_b(cfg.dim, i_size, false, vb.pp("swiglu.w1"))?;
+            let w2 = linear_b(i_size, cfg.dim, false, vb.pp("w2"))?;
+            let w3 = linear_b(cfg.dim, i_size, false, vb.pp("swiglu.w3"))?;
+            Ok(Self { w1, w2, w3 })
+        }
+    }
+
+    impl Module for FeedForward {
+        fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+            let swiglu = (candle_nn::ops::silu(&xs.apply(&self.w1)?)? * xs.apply(&self.w3))?;
+            swiglu.apply(&self.w2)
+        }
+    }
+
+    #[derive(Debug, Clone)]
+    struct Attention {
+        wqkv: Linear,
+        wo: Linear,
+        dim: usize,
+        kv_size: usize,
+        n_local_heads: usize,
+        head_dim: usize,
+        n_head: usize,
+        kv_cache: Option<(Tensor, Tensor)>,
+    }
+
+    impl Attention {
+        fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
+            let n_local_heads = cfg.n_local_heads();
+            let head_dim = cfg.head_dim();
+            let total_head_dim = (cfg.n_head + 2 * n_local_heads) * head_dim;
+            let wqkv = linear_b(cfg.dim, total_head_dim, false, vb.pp("wqkv"))?;
+            let wo = linear_b(cfg.dim, cfg.dim, false, vb.pp("wo"))?;
+            Ok(Self {
+                wqkv,
+                wo,
+                dim: cfg.dim,
+                kv_size: n_local_heads * head_dim,
+                n_local_heads,
+                head_dim,
+                n_head: cfg.n_head,
+                kv_cache: None,
+            })
+        }
+
+        fn forward(&mut self, xs: &Tensor, _pos: usize, mask: &Tensor) -> Result<Tensor> {
+            let (b_sz, seqlen, _) = xs.dims3()?;
+
+            let qkv = xs.apply(&self.wqkv)?;
+            let q = qkv.narrow(D::Minus1, 0, self.dim)?;
+            let k = qkv.narrow(D::Minus1, self.dim, self.kv_size)?;
+            let v = qkv.narrow(D::Minus1, self.dim + self.kv_size, self.kv_size)?;
+            let q = q
+                .reshape((b_sz, seqlen, self.n_head, self.head_dim))?
+                .transpose(1, 2)?
+                .contiguous()?;
+            let k = k
+                .reshape((b_sz, seqlen, self.n_local_heads, self.head_dim))?
+                .transpose(1, 2)?;
+            let v = v
+                .reshape((b_sz, seqlen, self.n_local_heads, self.head_dim))?
+                .transpose(1, 2)?;
+
+            let (k, v) = match &self.kv_cache {
+                None => (k, v),
+                Some((prev_k, prev_v)) => {
+                    let k = Tensor::cat(&[prev_k, &k], 2)?;
+                    let v = Tensor::cat(&[prev_v, &v], 2)?;
+                    (k, v)
+                }
+            };
+            self.kv_cache = Some((k.clone(), v.clone()));
+
+            let k = repeat_interleave(&k, self.n_head / self.n_local_heads, 1)?;
+            let v = repeat_interleave(&v, self.n_head / self.n_local_heads, 1)?;
+
+            let scale = 1f64 / f64::sqrt(self.head_dim as f64);
+            let attn_weights = (q.matmul(&k.transpose(2, 3)?)? * scale)?;
+
+            let attn_weights = attn_weights.broadcast_add(mask)?;
+            let attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?;
+            let attn_output = attn_weights.matmul(&v)?;
+            attn_output
+                .transpose(1, 2)?
+                .reshape((b_sz, seqlen, self.dim))?
+                .apply(&self.wo)
+        }
+
+        fn clear_kv_cache(&mut self) {
+            self.kv_cache = None
+        }
+    }
+
+    #[derive(Debug, Clone)]
+    struct Block {
+        attention: Attention,
+        feed_forward: FeedForward,
+        ffn_norm: RmsNorm,
+        attention_norm: RmsNorm,
+    }
+
+    impl Block {
+        fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
+            let attention = Attention::new(cfg, vb.pp("attention"))?;
+            let feed_forward = FeedForward::new(cfg, vb.pp("feed_forward"))?;
+            let ffn_norm = RmsNorm::new(cfg.dim, cfg.norm_eps, vb.pp("ffn_norm"))?;
+            let attention_norm = RmsNorm::new(cfg.dim, cfg.norm_eps, vb.pp("attention_norm"))?;
+            Ok(Self {
+                attention,
+                feed_forward,
+                ffn_norm,
+                attention_norm,
+            })
+        }
+
+        fn forward(&mut self, xs: &Tensor, pos: usize, mask: &Tensor) -> Result<Tensor> {
+            let hs = xs.apply(&self.attention_norm)?;
+            let hs = (xs + self.attention.forward(&hs, pos, mask))?;
+            &hs + hs.apply(&self.ffn_norm)?.apply(&self.feed_forward)
+        }
+
+        fn clear_kv_cache(&mut self) {
+            self.attention.clear_kv_cache()
+        }
+    }
+
+    #[derive(Debug, Clone)]
+    pub struct Model {
+        tok_embeddings: Embedding,
+        pos_embeddings: Embedding,
+        speaker_cond_pos: Linear,
+        layers: Vec<Block>,
+        norm: RmsNorm,
+        output: Linear,
+        spk_cond_mask: Tensor,
+    }
+
+    impl Model {
+        pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
+            let tok_embeddings = Embedding::new(cfg.vocab_size, cfg.dim, vb.pp("tok_embeddings"))?;
+            let pos_embeddings = Embedding::new(cfg.block_size, cfg.dim, vb.pp("pos_embeddings"))?;
+            let speaker_cond_pos = linear_b(
+                cfg.speaker_emb_dim,
+                cfg.dim,
+                false,
+                vb.pp("speaker_cond_pos"),
+            )?;
+            let mut layers = Vec::with_capacity(cfg.n_layer);
+            let vb_l = vb.pp("layers");
+            for layer_idx in 0..cfg.n_layer {
+                let layer = Block::new(cfg, vb_l.pp(layer_idx))?;
+                layers.push(layer)
+            }
+            let norm = RmsNorm::new(cfg.dim, cfg.norm_eps, vb.pp("norm"))?;
+            let output = linear_b(cfg.dim, cfg.vocab_size, false, vb.pp("output"))?;
+            let spk_cond_mask = Tensor::cat(
+                &[
+                    Tensor::ones((1, 1, cfg.dim), candle::DType::F32, vb.device())?,
+                    Tensor::zeros((1, 1, cfg.dim), candle::DType::F32, vb.device())?,
+                ],
+                0,
+            )?;
+            Ok(Self {
+                tok_embeddings,
+                pos_embeddings,
+                speaker_cond_pos,
+                layers,
+                norm,
+                output,
+                spk_cond_mask,
+            })
+        }
+
+        pub fn clear_kv_cache(&mut self) {
+            for layer in self.layers.iter_mut() {
+                layer.clear_kv_cache()
+            }
+        }
+
+        pub fn forward(&mut self, xs: &Tensor, spk_emb: &Tensor, pos: usize) -> Result<Tensor> {
+            let (_b_sz, seqlen) = xs.dims2()?;
+            let mask: Vec<_> = (0..seqlen)
+                .flat_map(|i| (0..seqlen).map(move |j| if i < j { f32::NEG_INFINITY } else { 0. }))
+                .collect();
+            let mask = Tensor::from_slice(&mask, (1, 1, seqlen, seqlen), xs.device())?;
+            let input_pos = Tensor::arange(pos as u32, (pos + seqlen) as u32, xs.device())?;
+            let tok_embeddings = xs.apply(&self.tok_embeddings)?;
+            let pos_embeddings = input_pos.apply(&self.pos_embeddings)?;
+            let mut xs = tok_embeddings
+                .broadcast_add(&pos_embeddings)?
+                .broadcast_add(
+                    &spk_emb
+                        .apply(&self.speaker_cond_pos)?
+                        .broadcast_mul(&self.spk_cond_mask)?,
+                )?;
+            let mask = mask.to_dtype(xs.dtype())?;
+            for layer in self.layers.iter_mut() {
+                xs = layer.forward(&xs, pos, &mask)?
+            }
+            xs.narrow(1, seqlen - 1, 1)?
+                .apply(&self.norm)?
+                .apply(&self.output)
+        }
+    }
+}
diff --git a/candle-transformers/src/quantized_nn.rs b/candle-transformers/src/quantized_nn.rs
index 99e8d45b..21c88430 100644
--- a/candle-transformers/src/quantized_nn.rs
+++ b/candle-transformers/src/quantized_nn.rs
@@ -50,6 +50,16 @@ impl Module for Linear {
     }
 }
 
+pub fn linear_b(in_dim: usize, out_dim: usize, bias: bool, vb: VarBuilder) -> Result<Linear> {
+    let bias = if bias {
+        Some(vb.get(out_dim, "bias")?.dequantize(vb.device())?)
+    } else {
+        None
+    };
+    let weight = QMatMul::new(in_dim, out_dim, vb)?;
+    Ok(Linear { weight, bias })
+}
+
 pub fn linear(in_dim: usize, out_dim: usize, vb: VarBuilder) -> Result<Linear> {
     let bias = vb.get(out_dim, "bias")?.dequantize(vb.device())?;
     let weight = QMatMul::new(in_dim, out_dim, vb)?;

From 56c9d3ee7b92bd1605be9f94416fa89976ebba87 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Sat, 9 Mar 2024 11:21:48 +0100
Subject: [PATCH 010/131] Fix the model path for rwkv. (#1825)

---
 candle-examples/examples/rwkv/main.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/candle-examples/examples/rwkv/main.rs b/candle-examples/examples/rwkv/main.rs
index e971a1cc..8fb2c0d4 100644
--- a/candle-examples/examples/rwkv/main.rs
+++ b/candle-examples/examples/rwkv/main.rs
@@ -141,7 +141,7 @@ impl std::fmt::Display for Which {
 impl Which {
     fn model_id(&self) -> &'static str {
         match self {
-            Self::Eagle7b => "RWKV/HF_v5-Eagle-7B",
+            Self::Eagle7b => "RWKV/v5-Eagle-7B-HF",
             Self::World1b5 => "RWKV/rwkv-5-world-1b5",
             Self::World3b => "RWKV/rwkv-5-world-3b",
             Self::World6_1b6 => "paperfun/rwkv",

From 0c5eecbc0faa7e642210800c735ad8137d5a9e08 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Sat, 9 Mar 2024 12:24:11 +0100
Subject: [PATCH 011/131] Add some tracing to metavoice. (#1826)

---
 candle-transformers/src/models/metavoice.rs   | 72 +++++++++++++++++--
 .../src/models/quantized_metavoice.rs         | 18 ++++-
 2 files changed, 82 insertions(+), 8 deletions(-)

diff --git a/candle-transformers/src/models/metavoice.rs b/candle-transformers/src/models/metavoice.rs
index 2eeb0713..43de594f 100644
--- a/candle-transformers/src/models/metavoice.rs
+++ b/candle-transformers/src/models/metavoice.rs
@@ -181,6 +181,7 @@ pub mod tokenizers {
         pub end_of_text: usize,
         pub offset: usize,
         pub ranks: HashMap<Vec<u8>, Rank>,
+        span: tracing::Span,
     }
 
     impl BPE {
@@ -231,6 +232,7 @@ pub mod tokenizers {
                 end_of_text,
                 offset,
                 ranks,
+                span: tracing::span!(tracing::Level::TRACE, "bpe"),
             })
         }
 
@@ -310,6 +312,7 @@ pub mod tokenizers {
         }
 
         pub fn encode(&self, text: &str) -> Result<Vec<u32>> {
+            let _enter = self.span.enter();
             let mut bpe_tokens: Vec<u32> = Vec::new();
             for word in self.re.find_iter(text) {
                 let word = word.map_err(E::wrap)?;
@@ -426,6 +429,7 @@ pub mod gpt {
         c_attn: Linear,
         c_proj: Linear,
         n_head: usize,
+        span: tracing::Span,
     }
 
     impl SelfAttention {
@@ -444,12 +448,14 @@ pub mod gpt {
                 c_attn,
                 c_proj,
                 n_head: cfg.n_head,
+                span: tracing::span!(tracing::Level::TRACE, "self-attn"),
             })
         }
     }
 
     impl Module for SelfAttention {
         fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+            let _enter = self.span.enter();
             let (b, t, c) = xs.dims3()?;
             let c_x = xs
                 .apply(&self.c_attn)?
@@ -474,11 +480,13 @@ pub mod gpt {
         Gelu {
             c_fc: Linear,
             c_proj: Linear,
+            span: tracing::Span,
         },
         Swiglu {
             w1: Linear,
            w3: Linear,
             c_proj: Linear,
+            span: tracing::Span,
         },
     }
 
@@ -489,7 +497,11 @@ pub mod gpt {
             NonLinearityType::Gelu => {
                 let c_fc = linear_b(cfg.n_embd, hidden_dim, cfg.bias, vb.pp("c_fc"))?;
                 let c_proj = linear_b(hidden_dim, cfg.n_embd, cfg.bias, vb.pp("c_proj"))?;
-                Self::Gelu { c_fc, c_proj }
+                Self::Gelu {
+                    c_fc,
+                    c_proj,
+                    span: tracing::span!(tracing::Level::TRACE, "mlp-gelu"),
+                }
             }
             NonLinearityType::Swiglu => {
                 let hidden_dim = (2 * hidden_dim) / 3;
@@ -502,7 +514,12 @@ pub mod gpt {
                 let w1 = linear_b(cfg.n_embd, hidden_dim, cfg.bias, vb.pp("w1"))?;
                 let w3 = linear_b(cfg.n_embd, hidden_dim, cfg.bias, vb.pp("w3"))?;
                 let c_proj = linear_b(hidden_dim, cfg.n_embd, cfg.bias, vb.pp("c_proj"))?;
-                Self::Swiglu { w1, w3, c_proj }
+                Self::Swiglu {
+                    w1,
+                    w3,
+                    c_proj,
+                    span: tracing::span!(tracing::Level::TRACE, "mlp-swiglu"),
+                }
             }
         };
         Ok(slf)
@@ -512,8 +529,17 @@ pub mod gpt {
     impl Module for MLP {
         fn forward(&self, xs: &Tensor) -> Result<Tensor> {
             match self {
-                Self::Gelu { c_fc, c_proj } => xs.apply(c_fc)?.gelu()?.apply(c_proj),
-                Self::Swiglu { w1, w3, c_proj } => {
+                Self::Gelu { c_fc, c_proj, span } => {
+                    let _enter = span.enter();
+                    xs.apply(c_fc)?.gelu()?.apply(c_proj)
+                }
+                Self::Swiglu {
+                    w1,
+                    w3,
+                    c_proj,
+                    span,
+                } => {
+                    let _enter = span.enter();
                     let w1 = xs.apply(w1)?;
                     let w3 = xs.apply(w3)?;
                     (w1.silu()? * w3)?.apply(c_proj)
@@ -528,6 +554,7 @@ pub mod gpt {
         ln_2: Norm,
         attn: SelfAttention,
         mlp: MLP,
+        span: tracing::Span,
     }
 
     impl Block {
@@ -541,12 +568,14 @@ pub mod gpt {
                 ln_2,
                 attn,
                 mlp,
+                span: tracing::span!(tracing::Level::TRACE, "gpt-block"),
             })
         }
     }
 
     impl Module for Block {
         fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+            let _enter = self.span.enter();
             let xs = (xs + xs.apply(&self.ln_1)?.apply(&self.attn))?;
             let xs = (&xs + xs.apply(&self.ln_2)?.apply(&self.mlp))?;
             Ok(xs)
@@ -563,6 +592,7 @@ pub mod gpt {
         lm_heads: Vec<Linear>,
         cfg: Config,
         dtype: DType,
+        span: tracing::Span,
     }
 
     impl Model {
@@ -598,6 +628,7 @@ pub mod gpt {
                 lm_heads,
                 cfg,
                 dtype: vb.dtype(),
+                span: tracing::span!(tracing::Level::TRACE, "gpt"),
             })
         }
 
@@ -606,6 +637,7 @@ pub mod gpt {
         }
 
         pub fn forward(&self, idx: &Tensor) -> Result<Vec<Tensor>> {
+            let _enter = self.span.enter();
             let device = idx.device();
             let (b, _num_hierarchies, t) = idx.dims3()?;
             let pos = Tensor::arange(0u32, t as u32, device)?;
@@ -689,6 +721,7 @@ pub mod transformer {
         w1: Linear,
         w2: Linear,
         w3: Linear,
+        span: tracing::Span,
     }
 
     impl FeedForward {
@@ -697,12 +730,18 @@ pub mod transformer {
             let w1 = linear_b(cfg.dim, i_size, false, vb.pp("swiglu.w1"))?;
             let w2 = linear_b(i_size, cfg.dim, false, vb.pp("w2"))?;
             let w3 = linear_b(cfg.dim, i_size, false, vb.pp("swiglu.w3"))?;
-            Ok(Self { w1, w2, w3 })
+            Ok(Self {
+                w1,
+                w2,
+                w3,
+                span: tracing::span!(tracing::Level::TRACE, "feed-forward"),
+            })
         }
     }
 
     impl Module for FeedForward {
         fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+            let _enter = self.span.enter();
             let swiglu = (candle_nn::ops::silu(&xs.apply(&self.w1)?)? * xs.apply(&self.w3))?;
             swiglu.apply(&self.w2)
         }
     }
@@ -718,6 +757,7 @@ pub mod transformer {
         head_dim: usize,
         n_head: usize,
         kv_cache: Option<(Tensor, Tensor)>,
+        span: tracing::Span,
     }
 
     impl Attention {
@@ -736,10 +776,12 @@ pub mod transformer {
                 head_dim,
                 n_head: cfg.n_head,
                 kv_cache: None,
+                span: tracing::span!(tracing::Level::TRACE, "feed-forward"),
             })
         }
 
         fn forward(&mut self, xs: &Tensor, _pos: usize, mask: &Tensor) -> Result<Tensor> {
+            let _enter = self.span.enter();
             let (b_sz, seqlen, _) = xs.dims3()?;
 
             let qkv = xs.apply(&self.wqkv)?;
@@ -793,6 +835,7 @@ pub mod transformer {
         feed_forward: FeedForward,
         ffn_norm: RmsNorm,
         attention_norm: RmsNorm,
+        span: tracing::Span,
     }
 
     impl Block {
@@ -806,10 +849,12 @@ pub mod transformer {
                 feed_forward,
                 ffn_norm,
                 attention_norm,
+                span: tracing::span!(tracing::Level::TRACE, "block"),
             })
         }
 
         fn forward(&mut self, xs: &Tensor, pos: usize, mask: &Tensor) -> Result<Tensor> {
+            let _enter = self.span.enter();
             let hs = xs.apply(&self.attention_norm)?;
             let hs = (xs + self.attention.forward(&hs, pos, mask))?;
             &hs + hs.apply(&self.ffn_norm)?.apply(&self.feed_forward)
@@ -829,6 +874,7 @@ pub mod transformer {
         norm: RmsNorm,
         output: Linear,
         spk_cond_mask: Tensor,
+        span: tracing::Span,
     }
 
     impl Model {
@@ -865,6 +911,7 @@ pub mod transformer {
                 norm,
                 output,
                 spk_cond_mask,
+                span: tracing::span!(tracing::Level::TRACE, "transformer"),
             })
         }
 
@@ -875,6 +922,7 @@ pub mod transformer {
         }
 
         pub fn forward(&mut self, xs: &Tensor, spk_emb: &Tensor, pos: usize) -> Result<Tensor> {
+            let _enter = self.span.enter();
             let (_b_sz, seqlen) = xs.dims2()?;
             let mask: Vec<_> = (0..seqlen)
                 .flat_map(|i| (0..seqlen).map(move |j| if i < j { f32::NEG_INFINITY } else { 0. }))
@@ -905,14 +953,19 @@ pub mod adapters {
     // https://github.com/metavoiceio/metavoice-src/blob/9078234c496d76adbec06df789b6b04b1875f129/fam/llm/adapters/tilted_encodec.py
     pub struct TiltedEncodec {
         end_of_audio_token: u32,
+        span: tracing::Span,
     }
 
     impl TiltedEncodec {
         pub fn new(end_of_audio_token: u32) -> Self {
-            Self { end_of_audio_token }
+            Self {
+                end_of_audio_token,
+                span: tracing::span!(tracing::Level::TRACE, "tilted-encodec"),
+            }
         }
 
         pub fn decode(&self, tokens: &[Vec<u32>]) -> (Vec<u32>, Vec<Vec<u32>>) {
+            let _enter = self.span.enter();
             let mut text_ids = vec![];
             let mut extracted_audio_ids = vec![];
             let mut min_audio_ids_len = usize::MAX;
@@ -941,14 +994,19 @@ pub mod adapters {
     // https://github.com/metavoiceio/metavoice-src/blob/9078234c496d76adbec06df789b6b04b1875f129/fam/llm/adapters/flattened_encodec.py#L4
     pub struct FlattenedInterleavedEncodec2Codebook {
         end_of_audio_token: u32,
+        span: tracing::Span,
     }
 
     impl FlattenedInterleavedEncodec2Codebook {
         pub fn new(end_of_audio_token: u32) -> Self {
-            Self { end_of_audio_token }
+            Self {
+                end_of_audio_token,
+                span: tracing::span!(tracing::Level::TRACE, "encodec2codebook"),
+            }
         }
 
         pub fn decode(&self, tokens: &[u32]) -> (Vec<u32>, Vec<u32>, Vec<u32>) {
+            let _enter = self.span.enter();
             let mut text_ids = vec![];
             let mut audio_ids1 = vec![];
             let mut audio_ids2 = vec![];
diff --git a/candle-transformers/src/models/quantized_metavoice.rs b/candle-transformers/src/models/quantized_metavoice.rs
index 16545150..84c0388c 100644
--- a/candle-transformers/src/models/quantized_metavoice.rs
+++ b/candle-transformers/src/models/quantized_metavoice.rs
@@ -14,6 +14,7 @@ pub mod transformer {
         w1: Linear,
         w2: Linear,
         w3: Linear,
+        span: tracing::Span,
     }
 
     impl FeedForward {
@@ -22,12 +23,18 @@ pub mod transformer {
             let w1 = linear_b(cfg.dim, i_size, false, vb.pp("swiglu.w1"))?;
             let w2 = linear_b(i_size, cfg.dim, false, vb.pp("w2"))?;
             let w3 = linear_b(cfg.dim, i_size, false, vb.pp("swiglu.w3"))?;
-            Ok(Self { w1, w2, w3 })
+            Ok(Self {
+                w1,
+                w2,
+                w3,
+                span: tracing::span!(tracing::Level::TRACE, "feed-forward"),
+            })
         }
     }
 
     impl Module for FeedForward {
         fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+            let _enter = self.span.enter();
             let swiglu = (candle_nn::ops::silu(&xs.apply(&self.w1)?)? * xs.apply(&self.w3))?;
             swiglu.apply(&self.w2)
         }
@@ -43,6 +50,7 @@ pub mod transformer {
         head_dim: usize,
         n_head: usize,
         kv_cache: Option<(Tensor, Tensor)>,
+        span: tracing::Span,
     }
 
     impl Attention {
@@ -61,10 +69,12 @@ pub mod transformer {
                 head_dim,
                 n_head: cfg.n_head,
                 kv_cache: None,
+                span: tracing::span!(tracing::Level::TRACE, "attention"),
             })
         }
 
         fn forward(&mut self, xs: &Tensor, _pos: usize, mask: &Tensor) -> Result<Tensor> {
+            let _enter = self.span.enter();
             let (b_sz, seqlen, _) = xs.dims3()?;
 
             let qkv = xs.apply(&self.wqkv)?;
@@ -118,6 +128,7 @@ pub mod transformer {
         feed_forward: FeedForward,
         ffn_norm: RmsNorm,
         attention_norm: RmsNorm,
+        span: tracing::Span,
     }
 
     impl Block {
@@ -131,10 +142,12 @@ pub mod transformer {
                 feed_forward,
                 ffn_norm,
                 attention_norm,
+                span: tracing::span!(tracing::Level::TRACE, "block"),
             })
         }
 
         fn forward(&mut self, xs: &Tensor, pos: usize, mask: &Tensor) -> Result<Tensor> {
+            let _enter = self.span.enter();
             let hs = xs.apply(&self.attention_norm)?;
             let hs = (xs + self.attention.forward(&hs, pos, mask))?;
             &hs + hs.apply(&self.ffn_norm)?.apply(&self.feed_forward)
@@ -154,6 +167,7 @@ pub mod transformer {
         norm: RmsNorm,
         output: Linear,
         spk_cond_mask: Tensor,
+        span: tracing::Span,
     }
 
     impl Model {
@@ -189,6 +203,7 @@ pub mod transformer {
                 norm,
                 output,
                 spk_cond_mask,
+                span: tracing::span!(tracing::Level::TRACE, "qtransformer"),
             })
         }
 
@@ -199,6 +214,7 @@ pub mod transformer {
         }
 
         pub fn forward(&mut self, xs: &Tensor, spk_emb: &Tensor, pos: usize) -> Result<Tensor> {
+            let _enter = self.span.enter();
             let (_b_sz, seqlen) = xs.dims2()?;
             let mask: Vec<_> = (0..seqlen)
                 .flat_map(|i| (0..seqlen).map(move |j| if i < j { f32::NEG_INFINITY } else { 0. }))

From df5f69444e438a7cd8d8ab4971579bf309b72114 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Sun, 10 Mar 2024 20:23:43 +0100
Subject: [PATCH 012/131] Properly handle the batch dimension in cuda
 quantized matmul. (#1832)

---
 candle-core/src/quantized/cuda.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/candle-core/src/quantized/cuda.rs b/candle-core/src/quantized/cuda.rs
index 5b684573..c90cf576 100644
--- a/candle-core/src/quantized/cuda.rs
+++ b/candle-core/src/quantized/cuda.rs
@@ -313,7 +313,7 @@ impl QCudaStorage {
         }
 
         let data_f32 = self.dequantize(n * k)?;
-        let rhs_l = crate::Layout::new((k, n).into(), vec![1, k], 0);
+        let rhs_l = crate::Layout::new((k, n).into(), vec![1, k], 0).broadcast_as((b, k, n))?;
         let out = storage.matmul(&data_f32, (b, m, n, k), layout, &rhs_l)?;
         let mut out_shape = layout.shape().dims().to_vec();
         out_shape.pop();

From ff03fd3fb314980d3273ffc49826d764541d76e2 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Tue, 12 Mar 2024 11:30:24 +0100
Subject: [PATCH 013/131] Expose some helper functions to create quantized
 models. (#1837)
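
A rough sketch of the intended use (hypothetical tensor names and
dimensions, assuming a GGUF checkpoint loaded through the quantized
VarBuilder):

    use candle_transformers::quantized_nn::Linear;
    use candle_transformers::quantized_var_builder::VarBuilder;

    let vb = VarBuilder::from_gguf("model.gguf", &device)?;
    // VarBuilder is now Clone, so sub-builders can be fanned out freely.
    let vb_attn = vb.pp("attn");
    // Build a quantized linear layer directly from an Arc<QTensor>.
    let ws = vb_attn.get((dim, dim), "weight")?;
    let proj = Linear::from_arc(ws, None)?;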
---
 candle-transformers/src/models/with_tracing.rs   | 6 ++++++
 candle-transformers/src/quantized_nn.rs          | 8 ++++++++
 candle-transformers/src/quantized_var_builder.rs | 1 +
 3 files changed, 15 insertions(+)

diff --git a/candle-transformers/src/models/with_tracing.rs b/candle-transformers/src/models/with_tracing.rs
index 383ae71c..2ffec724 100644
--- a/candle-transformers/src/models/with_tracing.rs
+++ b/candle-transformers/src/models/with_tracing.rs
@@ -116,6 +116,12 @@ impl QMatMul {
         let span = tracing::span!(tracing::Level::TRACE, "qmatmul");
         Ok(Self { inner, span })
     }
+
+    pub fn from_weights(ws: std::sync::Arc<candle::quantized::QTensor>) -> Result<Self> {
+        let inner = candle::quantized::QMatMul::from_arc(ws)?;
+        let span = tracing::span!(tracing::Level::TRACE, "qmatmul");
+        Ok(Self { inner, span })
+    }
 }
 
 impl Module for QMatMul {
diff --git a/candle-transformers/src/quantized_nn.rs b/candle-transformers/src/quantized_nn.rs
index 21c88430..bb0a8641 100644
--- a/candle-transformers/src/quantized_nn.rs
+++ b/candle-transformers/src/quantized_nn.rs
@@ -35,6 +35,14 @@ pub struct Linear {
 }
 
 impl Linear {
+    pub fn from_arc(
+        weight: std::sync::Arc<candle::quantized::QTensor>,
+        bias: Option<Tensor>,
+    ) -> Result<Self> {
+        let weight = QMatMul::from_weights(weight)?;
+        Ok(Self { weight, bias })
+    }
+
     pub fn from_weights(weight: QMatMul, bias: Option<Tensor>) -> Self {
         Self { weight, bias }
     }
diff --git a/candle-transformers/src/quantized_var_builder.rs b/candle-transformers/src/quantized_var_builder.rs
index bfd0629f..a963e311 100644
--- a/candle-transformers/src/quantized_var_builder.rs
+++ b/candle-transformers/src/quantized_var_builder.rs
@@ -3,6 +3,7 @@ use candle::{Device, Result, Shape};
 use std::sync::Arc;
 
 // VarBuilder specialized for QTensors
+#[derive(Clone)]
 pub struct VarBuilder {
     data: Arc<HashMap<String, Arc<QTensor>>>,
     path: Vec<String>,

From 2bb9c683b9a2c7e6cfc775a0f4dd6af97b7193a7 Mon Sep 17 00:00:00 2001
From: Thomas Santerre
Date: Wed, 13 Mar 2024 09:36:25 -0400
Subject: [PATCH 014/131] Update README.md (#1840)

Adds the candle-einops to the readme as an external resource
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index fd80069e..b0a3b118 100644
--- a/README.md
+++ b/README.md
@@ -175,6 +175,7 @@ And then head over to
 - [`kalosm`](https://github.com/floneum/floneum/tree/master/interfaces/kalosm): A multi-modal meta-framework in Rust for interfacing with local pre-trained models with support for controlled generation, custom samplers, in-memory vector databases, audio transcription, and more.
 - [`candle-sampling`](https://github.com/EricLBuehler/candle-sampling): Sampling techniques for Candle.
 - [`gpt-from-scratch-rs`](https://github.com/jeroenvlek/gpt-from-scratch-rs): A port of Andrej Karpathy's _Let's build GPT_ tutorial on YouTube showcasing the Candle API on a toy problem.
+- [`candle-einops`](https://github.com/tomsanbear/candle-einops): A pure rust implementation of the python [einops](https://github.com/arogozhnikov/einops) library.
 
 If you have an addition to this list, please submit a pull request.

From 3318fe30fb3d8c3b92ba404a2a33de81c2731ad9 Mon Sep 17 00:00:00 2001
From: Tyler Rockwood
Date: Wed, 13 Mar 2024 15:41:36 -0500
Subject: [PATCH 015/131] Update gemma README (#1843)

* Update gemma README

* Fixit
---
 candle-examples/examples/gemma/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/candle-examples/examples/gemma/README.md b/candle-examples/examples/gemma/README.md
index 8319cf44..5d77c7a4 100644
--- a/candle-examples/examples/gemma/README.md
+++ b/candle-examples/examples/gemma/README.md
@@ -1,4 +1,4 @@
-# candle-mistral: 2b and 7b LLMs from Google DeepMind
+# candle-gemma: 2b and 7b LLMs from Google DeepMind
 
 [Gemma](https://ai.google.dev/gemma/docs) is a collection of lightweight open
 models published by Google Deepmind with a 2b and a 7b variant.

From e1f9c3776d4e3b77cc2958c835314e680ac6f54f Mon Sep 17 00:00:00 2001
From: Jani Monoses
Date: Thu, 14 Mar 2024 22:01:36 +0200
Subject: [PATCH 016/131] StableLM-2 models were updated to use GPT-2
 tokenization. (#1847)

---
 candle-examples/examples/stable-lm/README.md | 5 -----
 candle-examples/examples/stable-lm/main.rs   | 9 +--------
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/candle-examples/examples/stable-lm/README.md b/candle-examples/examples/stable-lm/README.md
index 546124a2..6f5e7597 100644
--- a/candle-examples/examples/stable-lm/README.md
+++ b/candle-examples/examples/stable-lm/README.md
@@ -10,11 +10,6 @@ order to be able to use it.
 
 Other available models are Stable-Code-3B, StableLM-2 and Zephyr variants.
 
-StableLM-2 uses a Tiktoken based GPT-3.5/GPT-4 tokenizer not supported by
-Candle, so to run it you can download a somewhat compatible
-[tokenizer.json](https://huggingface.co/Xenova/gpt-4/resolve/main/tokenizer.json?download=true)
-and pass it via the --tokenizer-file argument.
-
 ## Running some example
 
 ```bash
diff --git a/candle-examples/examples/stable-lm/main.rs b/candle-examples/examples/stable-lm/main.rs
index abe7020c..f467903a 100644
--- a/candle-examples/examples/stable-lm/main.rs
+++ b/candle-examples/examples/stable-lm/main.rs
@@ -239,14 +239,7 @@ fn main() -> Result<()> {
     ));
     let tokenizer_filename = match args.tokenizer_file {
         Some(file) => std::path::PathBuf::from(file),
-        None => match args.which {
-            Which::V1Orig | Which::V1 | Which::V1Zephyr | Which::Code => {
-                repo.get("tokenizer.json")?
-            }
-            Which::V2 | Which::V2Zephyr => api
-                .model("lmz/candle-stablelm".to_string())
-                .get("tokenizer-gpt4.json")?,
-        },
+        None => repo.get("tokenizer.json")?,
     };
     let filenames = match args.weight_files {
         Some(files) => files

From cdc4c172c42b5c31b3063afd20cc7055d60f9af8 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Fri, 15 Mar 2024 08:37:27 +0100
Subject: [PATCH 017/131] Implement the error trait for DTypeParseError.
 (#1852)
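
With Display and std::error::Error implemented, parse failures now compose
with the usual error-propagation helpers. A small illustrative sketch (not
part of the change itself):

    use std::str::FromStr;
    use candle_core::DType;

    let ok = DType::from_str("f16"); // Ok(DType::F16)
    let err = DType::from_str("f42").unwrap_err();
    // The error now carries the offending string and implements
    // std::error::Error, so it works with Box<dyn Error>, anyhow, etc.
    println!("{err}"); // prints: cannot parse 'f42' as a dtype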
---
 candle-core/src/dtype.rs | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/candle-core/src/dtype.rs b/candle-core/src/dtype.rs
index 94ca57d8..1a698a35 100644
--- a/candle-core/src/dtype.rs
+++ b/candle-core/src/dtype.rs
@@ -23,7 +23,15 @@ pub enum DType {
 }
 
 #[derive(Debug, PartialEq, Eq)]
-pub struct DTypeParseError;
+pub struct DTypeParseError(String);
+
+impl std::fmt::Display for DTypeParseError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "cannot parse '{}' as a dtype", self.0)
+    }
+}
+
+impl std::error::Error for DTypeParseError {}
 
 impl std::str::FromStr for DType {
     type Err = DTypeParseError;
@@ -36,7 +44,7 @@ impl std::str::FromStr for DType {
             "f16" => Ok(Self::F16),
             "f32" => Ok(Self::F32),
             "f64" => Ok(Self::F64),
-            _ => Err(DTypeParseError),
+            _ => Err(DTypeParseError(s.to_string())),
         }
     }
 }

From 74bf6994b172f364c6e8bea2ac6e1bfbc6ca0c25 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Sat, 16 Mar 2024 22:25:46 +0100
Subject: [PATCH 018/131] Move the image tensor to the appropriate device.
 (#1856)

---
 candle-examples/examples/convmixer/main.rs    | 2 +-
 candle-examples/examples/convnext/main.rs     | 2 +-
 candle-examples/examples/dinov2/main.rs       | 2 +-
 candle-examples/examples/efficientnet/main.rs | 2 +-
 candle-examples/examples/efficientvit/main.rs | 2 +-
 candle-examples/examples/mobileone/main.rs    | 2 +-
 candle-examples/examples/repvgg/main.rs       | 2 +-
 candle-examples/examples/resnet/main.rs       | 2 +-
 candle-examples/examples/vgg/main.rs          | 2 +-
 candle-examples/examples/vit/main.rs          | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/candle-examples/examples/convmixer/main.rs b/candle-examples/examples/convmixer/main.rs
index feae536f..d8c2e619 100644
--- a/candle-examples/examples/convmixer/main.rs
+++ b/candle-examples/examples/convmixer/main.rs
@@ -28,7 +28,7 @@ pub fn main() -> anyhow::Result<()> {
 
     let device = candle_examples::device(args.cpu)?;
 
-    let image = candle_examples::imagenet::load_image224(args.image)?;
+    let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
     println!("loaded image {image:?}");
 
     let model_file = match args.model {
diff --git a/candle-examples/examples/convnext/main.rs b/candle-examples/examples/convnext/main.rs
index 8fc72e16..e5b235fa 100644
--- a/candle-examples/examples/convnext/main.rs
+++ b/candle-examples/examples/convnext/main.rs
@@ -93,7 +93,7 @@ pub fn main() -> anyhow::Result<()> {
 
     let device = candle_examples::device(args.cpu)?;
 
-    let image = candle_examples::imagenet::load_image224(args.image)?;
+    let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
     println!("loaded image {image:?}");
 
     let model_file = match args.model {
diff --git a/candle-examples/examples/dinov2/main.rs b/candle-examples/examples/dinov2/main.rs
index 6b3edeb4..d718ee6f 100644
--- a/candle-examples/examples/dinov2/main.rs
+++ b/candle-examples/examples/dinov2/main.rs
@@ -31,7 +31,7 @@ pub fn main() -> anyhow::Result<()> {
 
     let device = candle_examples::device(args.cpu)?;
 
-    let image = candle_examples::imagenet::load_image224(args.image)?;
+    let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?;
     println!("loaded image {image:?}");
 
     let model_file = match args.model {
diff --git a/candle-examples/examples/efficientnet/main.rs b/candle-examples/examples/efficientnet/main.rs
index 0e4a2864..a8f17cca 100644
--- a/candle-examples/examples/efficientnet/main.rs
b/candle-examples/examples/efficientnet/main.rs @@ -47,7 +47,7 @@ pub fn main() -> anyhow::Result<()> { let device = candle_examples::device(args.cpu)?; - let image = candle_examples::imagenet::load_image224(args.image)?; + let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?; println!("loaded image {image:?}"); let model_file = match args.model { diff --git a/candle-examples/examples/efficientvit/main.rs b/candle-examples/examples/efficientvit/main.rs index 1eb80a2d..efbf813c 100644 --- a/candle-examples/examples/efficientvit/main.rs +++ b/candle-examples/examples/efficientvit/main.rs @@ -66,7 +66,7 @@ pub fn main() -> anyhow::Result<()> { let device = candle_examples::device(args.cpu)?; - let image = candle_examples::imagenet::load_image224(args.image)?; + let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?; println!("loaded image {image:?}"); let model_file = match args.model { diff --git a/candle-examples/examples/mobileone/main.rs b/candle-examples/examples/mobileone/main.rs index 4cd55001..76533fe3 100644 --- a/candle-examples/examples/mobileone/main.rs +++ b/candle-examples/examples/mobileone/main.rs @@ -63,7 +63,7 @@ pub fn main() -> anyhow::Result<()> { let device = candle_examples::device(args.cpu)?; - let image = candle_examples::imagenet::load_image224(args.image)?; + let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?; println!("loaded image {image:?}"); let model_file = match args.model { diff --git a/candle-examples/examples/repvgg/main.rs b/candle-examples/examples/repvgg/main.rs index 0864c559..7cc90ba1 100644 --- a/candle-examples/examples/repvgg/main.rs +++ b/candle-examples/examples/repvgg/main.rs @@ -78,7 +78,7 @@ pub fn main() -> anyhow::Result<()> { let device = candle_examples::device(args.cpu)?; - let image = candle_examples::imagenet::load_image224(args.image)?; + let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?; println!("loaded image {image:?}"); let model_file = match args.model { diff --git a/candle-examples/examples/resnet/main.rs b/candle-examples/examples/resnet/main.rs index 4a4592ad..bdf02fb1 100644 --- a/candle-examples/examples/resnet/main.rs +++ b/candle-examples/examples/resnet/main.rs @@ -45,7 +45,7 @@ pub fn main() -> anyhow::Result<()> { let device = candle_examples::device(args.cpu)?; - let image = candle_examples::imagenet::load_image224(args.image)?; + let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?; println!("loaded image {image:?}"); let model_file = match args.model { diff --git a/candle-examples/examples/vgg/main.rs b/candle-examples/examples/vgg/main.rs index 27e141cb..e7bfe7d2 100644 --- a/candle-examples/examples/vgg/main.rs +++ b/candle-examples/examples/vgg/main.rs @@ -33,7 +33,7 @@ struct Args { pub fn main() -> anyhow::Result<()> { let args = Args::parse(); let device = candle_examples::device(args.cpu)?; - let image = candle_examples::imagenet::load_image224(args.image)?; + let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?; println!("loaded image {image:?}"); diff --git a/candle-examples/examples/vit/main.rs b/candle-examples/examples/vit/main.rs index 168caf9e..b38bae15 100644 --- a/candle-examples/examples/vit/main.rs +++ b/candle-examples/examples/vit/main.rs @@ -28,7 +28,7 @@ pub fn main() -> anyhow::Result<()> { let device = candle_examples::device(args.cpu)?; - let image = 
candle_examples::imagenet::load_image224(args.image)?; + let image = candle_examples::imagenet::load_image224(args.image)?.to_device(&device)?; println!("loaded image {image:?}"); let model_file = match args.model { From db8b24ae92419377283821ee0a65fb224a4f3c4d Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Sun, 17 Mar 2024 03:09:43 -0400 Subject: [PATCH 019/131] Add support for index u8/i64 and input f16/bf16 scatter-add on metal (#1849) * add support and tests for scatter add on metal * add support for all datatypes --- candle-core/src/metal_backend.rs | 8 ++ candle-metal-kernels/src/indexing.metal | 13 ++- candle-metal-kernels/src/tests.rs | 104 ++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 2 deletions(-) diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index 02078db5..a17b87b8 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -1128,7 +1128,15 @@ impl BackendStorage for MetalStorage { None => Err(crate::Error::RequiresContiguous { op: "scatter-add" }.bt())?, }; let name = match (ids.dtype, self.dtype) { + (DType::U8, DType::F32) => "sa_u8_f32", + (DType::U8, DType::F16) => "sa_u8_f16", + (DType::U8, DType::BF16) => "sa_u8_bf16", (DType::U32, DType::F32) => "sa_u32_f32", + (DType::U32, DType::F16) => "sa_u32_f16", + (DType::U32, DType::BF16) => "sa_u32_bf16", + (DType::I64, DType::F32) => "sa_i64_f32", + (DType::I64, DType::F16) => "sa_i64_f16", + (DType::I64, DType::BF16) => "sa_i64_bf16", _ => Err(MetalError::UnexpectedDType { msg: "scatter-add ids should be u8/u32/i64", expected: DType::U32, diff --git a/candle-metal-kernels/src/indexing.metal b/candle-metal-kernels/src/indexing.metal index 2a57bdbb..f6b81be0 100644 --- a/candle-metal-kernels/src/indexing.metal +++ b/candle-metal-kernels/src/indexing.metal @@ -167,11 +167,16 @@ kernel void NAME( \ INDEX_OP(is_u32_f32, uint, float) INDEX_OP(is_u32_f16, uint, half) + GATHER_OP(gather_u32_f32, uint, float) GATHER_OP(gather_u32_f16, uint, half) -SCATTER_ADD_OP(sa_u32_f32, uint, float) -SCATTER_ADD_OP(sa_u32_f16, uint, half) +SCATTER_ADD_OP(sa_u32_f32, uint32_t, float) +SCATTER_ADD_OP(sa_u8_f32, uint8_t, float) +SCATTER_ADD_OP(sa_i64_f32, int64_t, float) +SCATTER_ADD_OP(sa_u32_f16, uint32_t, half) +SCATTER_ADD_OP(sa_u8_f16, uint8_t, half) +SCATTER_ADD_OP(sa_i64_f16, int64_t, half) #if defined(__HAVE_BFLOAT__) INDEX_OP(is_u32_bf16, uint32_t, bfloat) @@ -180,6 +185,10 @@ INDEX_OP(is_u8_bf16, uint8_t, bfloat) INDEX_ADD_OP(ia_i64_bf16, int64_t, bfloat) INDEX_ADD_OP(ia_u32_bf16, uint32_t, bfloat) INDEX_ADD_OP(ia_u8_bf16, uint8_t, bfloat) + +SCATTER_ADD_OP(sa_u32_bf16, uint32_t, bfloat) +SCATTER_ADD_OP(sa_u8_bf16, uint8_t, bfloat) +SCATTER_ADD_OP(sa_i64_bf16, int64_t, bfloat) #endif INDEX_ADD_OP(ia_u32_f16, uint32_t, half) diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index 459c8edb..b47fff6a 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -1066,3 +1066,107 @@ fn random() { validate_random!(f16); validate_random!(bf16); } + +fn run_scatter_add( + input: &[T], + ids: &[I], + shape: &[usize], + dim: usize, + name: &'static str, +) -> Vec { + let device = device(); + let kernels = Kernels::new(); + let command_queue = device.new_command_queue(); + let command_buffer = command_queue.new_command_buffer(); + let options = MTLResourceOptions::StorageModeManaged; + let input_buffer = new_buffer(&device, input); + let ids_buffer = new_buffer(&device, ids); + let output = 
device.new_buffer(std::mem::size_of_val(input) as u64, options); + call_scatter_add( + &device, + command_buffer, + &kernels, + name, + shape, + shape, + dim, + &input_buffer, + 0, + &ids_buffer, + 0, + &output, + ) + .unwrap(); + command_buffer.commit(); + command_buffer.wait_until_completed(); + read_to_vec(&output, input.len()) +} + +#[test] +fn scatter_add() { + let ids_u8 = [0u8, 0, 1, 0, 2, 2, 3, 3]; + let ids_u32 = [0u32, 0, 1, 0, 2, 2, 3, 3]; + let ids_i64 = [0i64, 0, 1, 0, 2, 2, 3, 3]; + + let input_f32 = [5.0f32, 1.0, 7.0, 2.0, 3.0, 2.0, 1.0, 3.0]; + let input_f16 = input_f32 + .iter() + .map(|v| f16::from_f32(*v)) + .collect::>(); + let input_bf16 = input_f32 + .iter() + .map(|v| bf16::from_f32(*v)) + .collect::>(); + + let output_dim1_f32 = vec![8.0, 7.0, 5.0, 4.0, 0.0, 0.0, 0.0, 0.0]; + let output_dim1_f16 = output_dim1_f32 + .iter() + .map(|v| f16::from_f32(*v)) + .collect::>(); + let output_dim1_bf16 = output_dim1_f32 + .iter() + .map(|v| bf16::from_f32(*v)) + .collect::>(); + + let output_dim2_f32 = vec![5.0, 3.0, 7.0, 0.0, 3.0, 2.0, 1.0, 3.0]; + let output_dim2_f16 = output_dim2_f32 + .iter() + .map(|v| f16::from_f32(*v)) + .collect::>(); + let output_dim2_bf16 = output_dim2_f32 + .iter() + .map(|v| bf16::from_f32(*v)) + .collect::>(); + + for (shape, output_f32, output_f16, output_bf16) in [ + (vec![8], output_dim1_f32, output_dim1_f16, output_dim1_bf16), + ( + vec![4, 2], + output_dim2_f32, + output_dim2_f16, + output_dim2_bf16, + ), + ] { + for results in [ + run_scatter_add(&input_f32, &ids_u8, &shape, 0, "sa_u8_f32"), + run_scatter_add(&input_f32, &ids_u32, &shape, 0, "sa_u32_f32"), + run_scatter_add(&input_f32, &ids_i64, &shape, 0, "sa_i64_f32"), + ] { + assert_eq!(results, output_f32); + } + for results in [ + run_scatter_add(&input_f16, &ids_u8, &shape, 0, "sa_u8_f16"), + run_scatter_add(&input_f16, &ids_u32, &shape, 0, "sa_u32_f16"), + run_scatter_add(&input_f16, &ids_i64, &shape, 0, "sa_i64_f16"), + ] { + assert_eq!(results, output_f16); + } + for results in [ + run_scatter_add(&input_bf16, &ids_u8, &shape, 0, "sa_u8_bf16"), + run_scatter_add(&input_bf16, &ids_u32, &shape, 0, "sa_u32_bf16"), + run_scatter_add(&input_bf16, &ids_i64, &shape, 0, "sa_i64_bf16"), + ] { + assert_eq!(results, output_bf16); + } + } +} From ce9fbc368211815ef2dddff01575ca1f9d4eccd5 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 17 Mar 2024 10:49:13 +0100 Subject: [PATCH 020/131] Optimize the cat operation on contiguous tensors (#1855) * Add a specialized kernel for copy2d. * Move the cat operations. * Avoid transpositions in cat. * Bugfix. * Bugfix for the cuda kernel. * Add a benchmark. * Add more testing. * Test fix. * Faster kernel. * Add the missing kernel. * Tweak the test. * Add a metal kernel. * Fix for the metal kernel. * Get the tests to pass on metal. * Also use this opportunity to fix the metal kernel for ELU. * Add some bf16 kernels. * Clippy fixes. 
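[Editor's note] Before the diff: the `copy2d` primitive this patch introduces is a strided two-dimensional copy whose sizes and strides are element counts rather than bytes (the backend comment in the diff likens it to cudaMemcpy2D). As a reading aid only — the free function and parameter names below are illustrative and not part of the patch — here is a minimal CPU-side sketch of the intended semantics:

```rust
// Reference sketch of the copy2d semantics added by this patch.
// All sizes and strides are in elements, not bytes.
fn copy2d<T: Copy>(
    src: &[T],
    dst: &mut [T],
    d1: usize,    // number of "rows" to copy
    d2: usize,    // contiguous elements per row
    src_s: usize, // source stride between rows
    dst_s: usize, // destination stride between rows
    src_o: usize, // source start offset
    dst_o: usize, // destination start offset
) {
    for i1 in 0..d1 {
        let src_idx = i1 * src_s + src_o;
        let dst_idx = i1 * dst_s + dst_o;
        dst[dst_idx..dst_idx + d2].copy_from_slice(&src[src_idx..src_idx + d2]);
    }
}
```

Concatenating contiguous tensors along a dimension `d` then reduces to one such call per input: `d1` is the product of the dimensions before `d`, `d2` the input's contiguous chunk size per outer row, and `dst_s` the output's chunk size — exactly the mapping that `cat_contiguous` computes further down in this patch.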
--- candle-core/src/backend.rs | 13 ++ candle-core/src/cpu_backend.rs | 62 +++++++ candle-core/src/cuda_backend.rs | 61 +++++++ candle-core/src/dummy_cuda_backend.rs | 13 ++ candle-core/src/dummy_metal_backend.rs | 13 ++ candle-core/src/lib.rs | 1 + candle-core/src/metal_backend.rs | 65 +++++++ candle-core/src/storage.rs | 28 +++ candle-core/src/tensor.rs | 148 +-------------- candle-core/src/tensor_cat.rs | 240 +++++++++++++++++++++++++ candle-core/tests/conv_tests.rs | 128 +++++++------ candle-core/tests/grad_tests.rs | 18 +- candle-core/tests/pool_tests.rs | 9 + candle-core/tests/tensor_tests.rs | 25 +++ candle-kernels/src/fill.cu | 30 +++- candle-metal-kernels/src/affine.metal | 2 +- candle-metal-kernels/src/lib.rs | 50 ++++++ candle-metal-kernels/src/unary.metal | 27 +++ candle-nn/examples/cpu_benchmarks.rs | 19 ++ 19 files changed, 744 insertions(+), 208 deletions(-) create mode 100644 candle-core/src/tensor_cat.rs diff --git a/candle-core/src/backend.rs b/candle-core/src/backend.rs index 2125af69..ea1ac1a9 100644 --- a/candle-core/src/backend.rs +++ b/candle-core/src/backend.rs @@ -98,6 +98,19 @@ pub trait BackendStorage: Sized { ) -> Result; fn copy_strided_src(&self, _: &mut Self, _: usize, _: &Layout) -> Result<()>; + + #[allow(clippy::too_many_arguments)] + // Similar to cudaMemcpy2D, though values are in elements and not in bytes. + fn copy2d( + &self, + _: &mut Self, + _d1: usize, + _d2: usize, + _src_stride1: usize, + _dst_stride1: usize, + _src_offset: usize, + _dst_offset: usize, + ) -> Result<()>; } pub trait BackendDevice: Sized + std::fmt::Debug + Clone { diff --git a/candle-core/src/cpu_backend.rs b/candle-core/src/cpu_backend.rs index 181fbb61..1504d5b8 100644 --- a/candle-core/src/cpu_backend.rs +++ b/candle-core/src/cpu_backend.rs @@ -1023,6 +1023,26 @@ impl<'a, I: IntDType> Map2 for IndexAdd<'a, I> { } } +#[allow(clippy::too_many_arguments)] +fn copy2d_( + src: &[T], + dst: &mut [T], + d1: usize, + d2: usize, + src_stride1: usize, + dst_stride1: usize, + src_offset: usize, + dst_offset: usize, +) { + for i1 in 0..d1 { + let dst_idx = i1 * dst_stride1 + dst_offset; + let src_idx = i1 * src_stride1 + src_offset; + let dst = &mut dst[dst_idx..dst_idx + d2]; + let src = &src[src_idx..src_idx + d2]; + dst.copy_from_slice(src) + } +} + fn copy_strided_src_(src: &[T], dst: &mut [T], dst_offset: usize, src_l: &Layout) { match src_l.strided_blocks() { crate::StridedBlocks::SingleBlock { start_offset, len } => { @@ -2452,6 +2472,48 @@ impl BackendStorage for CpuStorage { } } + fn copy2d( + &self, + dst: &mut Self, + d1: usize, + d2: usize, + src_s: usize, + dst_s: usize, + src_o: usize, + dst_o: usize, + ) -> Result<()> { + match (self, dst) { + (Self::U8(src), Self::U8(dst)) => copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o), + (Self::U32(src), Self::U32(dst)) => { + copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o) + } + (Self::I64(src), Self::I64(dst)) => { + copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o) + } + (Self::BF16(src), Self::BF16(dst)) => { + copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o) + } + (Self::F16(src), Self::F16(dst)) => { + copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o) + } + (Self::F32(src), Self::F32(dst)) => { + copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o) + } + (Self::F64(src), Self::F64(dst)) => { + copy2d_(src, dst, d1, d2, src_s, dst_s, src_o, dst_o) + } + (_, dst) => { + return Err(Error::DTypeMismatchBinaryOp { + lhs: self.dtype(), + rhs: dst.dtype(), + op: "copy2d", + } + .bt()); + } + } + Ok(()) + } + 
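// [Editor's note, illustrative only -- not part of the patch] With copy2d_
// above, concatenating a contiguous (2, 3) tensor `a` and a contiguous (2, 2)
// tensor `b` along dim 1 into a (2, 5) output amounts to two calls:
//   a: copy2d_(src, dst, d1: 2, d2: 3, src_s: 3, dst_s: 5, src_o: 0, dst_o: 0)
//   b: copy2d_(src, dst, d1: 2, d2: 2, src_s: 2, dst_s: 5, src_o: 0, dst_o: 3)
// Each input contributes one contiguous chunk per outer row; `cat_contiguous`
// later in this patch derives these parameters from the shapes.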
fn copy_strided_src(&self, dst: &mut Self, dst_offset: usize, src_l: &Layout) -> Result<()> { match (self, dst) { (Self::U8(src), Self::U8(dst)) => copy_strided_src_(src, dst, dst_offset, src_l), diff --git a/candle-core/src/cuda_backend.rs b/candle-core/src/cuda_backend.rs index b7756fa6..52d1b558 100644 --- a/candle-core/src/cuda_backend.rs +++ b/candle-core/src/cuda_backend.rs @@ -2145,6 +2145,67 @@ impl BackendStorage for CudaStorage { Ok(Self { slice, device }) } + fn copy2d( + &self, + dst: &mut Self, + d1: usize, + d2: usize, + src_s: usize, + dst_s: usize, + src_o: usize, + dst_o: usize, + ) -> Result<()> { + let dev = &self.device; + let d1 = d1 as u32; + let d2 = d2 as u32; + let dst_s = dst_s as u32; + let src_s = src_s as u32; + let (src, dst, kname) = match (&self.slice, &mut dst.slice) { + (S::U8(s), S::U8(d)) => ( + *s.slice(src_o..).device_ptr(), + *d.slice(dst_o..).device_ptr(), + "copy2d_u8", + ), + (S::U32(s), S::U32(d)) => ( + *s.slice(src_o..).device_ptr(), + *d.slice(dst_o..).device_ptr(), + "copy2d_u32", + ), + (S::I64(s), S::I64(d)) => ( + *s.slice(src_o..).device_ptr(), + *d.slice(dst_o..).device_ptr(), + "copy2d_i64", + ), + (S::BF16(s), S::BF16(d)) => ( + *s.slice(src_o..).device_ptr(), + *d.slice(dst_o..).device_ptr(), + "copy2d_bf16", + ), + (S::F16(s), S::F16(d)) => ( + *s.slice(src_o..).device_ptr(), + *d.slice(dst_o..).device_ptr(), + "copy2d_f16", + ), + (S::F32(s), S::F32(d)) => ( + *s.slice(src_o..).device_ptr(), + *d.slice(dst_o..).device_ptr(), + "copy2d_f32", + ), + (S::F64(s), S::F64(d)) => ( + *s.slice(src_o..).device_ptr(), + *d.slice(dst_o..).device_ptr(), + "copy2d_f64", + ), + _ => Err(CudaError::InternalError("dtype mismatch in copy2d"))?, + }; + let func = dev.get_or_load_func(kname, kernels::FILL)?; + let cfg = LaunchConfig::for_num_elems(d1 * d2); + let params = (src, dst, d1, d2, src_s, dst_s); + // SAFETY: ffi. 
+ unsafe { func.launch(cfg, params) }.w()?; + Ok(()) + } + fn copy_strided_src(&self, dst: &mut Self, dst_offset: usize, src_l: &Layout) -> Result<()> { let src_shape = src_l.shape(); let dims = src_shape.dims(); diff --git a/candle-core/src/dummy_cuda_backend.rs b/candle-core/src/dummy_cuda_backend.rs index 34c5d97f..43d55fa4 100644 --- a/candle-core/src/dummy_cuda_backend.rs +++ b/candle-core/src/dummy_cuda_backend.rs @@ -154,6 +154,19 @@ impl crate::backend::BackendStorage for CudaStorage { Err(Error::NotCompiledWithCudaSupport) } + fn copy2d( + &self, + _: &mut Self, + _: usize, + _: usize, + _: usize, + _: usize, + _: usize, + _: usize, + ) -> Result<()> { + Err(Error::NotCompiledWithCudaSupport) + } + fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result { Err(Error::NotCompiledWithCudaSupport) } diff --git a/candle-core/src/dummy_metal_backend.rs b/candle-core/src/dummy_metal_backend.rs index e9d92331..791ec153 100644 --- a/candle-core/src/dummy_metal_backend.rs +++ b/candle-core/src/dummy_metal_backend.rs @@ -166,6 +166,19 @@ impl crate::backend::BackendStorage for MetalStorage { Err(Error::NotCompiledWithMetalSupport) } + fn copy2d( + &self, + _: &mut Self, + _: usize, + _: usize, + _: usize, + _: usize, + _: usize, + _: usize, + ) -> Result<()> { + Err(Error::NotCompiledWithMetalSupport) + } + fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result { Err(Error::NotCompiledWithMetalSupport) } diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs index fcc17afc..31ef1169 100644 --- a/candle-core/src/lib.rs +++ b/candle-core/src/lib.rs @@ -67,6 +67,7 @@ pub mod shape; mod storage; mod strided_index; mod tensor; +mod tensor_cat; pub mod test_utils; pub mod utils; mod variable; diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index a17b87b8..2e07cce5 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -422,6 +422,7 @@ impl BackendStorage for MetalStorage { let name = match self.dtype { DType::F32 => "powf_f32", DType::F16 => "powf_f16", + DType::BF16 => "powf_bf16", dtype => crate::bail!("Metal contiguous powf {dtype:?} not implemented"), }; candle_metal_kernels::call_powf( @@ -439,6 +440,7 @@ impl BackendStorage for MetalStorage { let name = match self.dtype { DType::F32 => "powf_f32_strided", DType::F16 => "powf_f16_strided", + DType::BF16 => "powf_bf16_strided", dtype => crate::bail!("Metal strided powf {dtype:?} not implemented"), }; candle_metal_kernels::call_powf_strided( @@ -471,6 +473,7 @@ impl BackendStorage for MetalStorage { let name = match self.dtype { DType::F32 => "elu_f32", DType::F16 => "elu_f16", + DType::BF16 => "elu_bf16", dtype => crate::bail!("Metal contiguous elu {dtype:?} not implemented"), }; candle_metal_kernels::call_elu( @@ -488,6 +491,7 @@ impl BackendStorage for MetalStorage { let name = match self.dtype { DType::F32 => "elu_f32_strided", DType::F16 => "elu_f16_strided", + DType::BF16 => "elu_bf16_strided", dtype => crate::bail!("Metal strided elu {dtype:?} not implemented"), }; candle_metal_kernels::call_elu_strided( @@ -1292,6 +1296,67 @@ impl BackendStorage for MetalStorage { )) } + fn copy2d( + &self, + dst: &mut Self, + d1: usize, + d2: usize, + src_s: usize, + dst_s: usize, + src_o: usize, + dst_o: usize, + ) -> Result<()> { + if self.dtype() != dst.dtype() { + crate::bail!( + "copy2d with inconsistent dtypes {:?} {:?}", + self.dtype(), + dst.dtype() + ) + } + let command_buffer = self.device.command_buffer()?; + if src_s == 
d2 && dst_s == d2 { + command_buffer.set_label("copy2d_contiguous"); + let blit = command_buffer.new_blit_command_encoder(); + blit.set_label("copy2d_contiguous"); + let src_offset = (src_o * self.dtype.size_in_bytes()) as NSUInteger; + let length = (d1 * d2 * self.dtype.size_in_bytes()) as NSUInteger; + let dst_offset = (dst_o * dst.dtype().size_in_bytes()) as NSUInteger; + blit.copy_from_buffer(&self.buffer, src_offset, dst.buffer(), dst_offset, length); + blit.end_encoding(); + } else { + let el_count = d1 * d2; + if el_count == 0 { + return Ok(()); + } + let kernel_name = match self.dtype { + DType::F32 => candle_metal_kernels::copy2d::FLOAT, + DType::F16 => candle_metal_kernels::copy2d::HALF, + DType::BF16 => candle_metal_kernels::copy2d::BFLOAT, + DType::I64 => candle_metal_kernels::copy2d::I64, + DType::U32 => candle_metal_kernels::copy2d::U32, + DType::U8 => candle_metal_kernels::copy2d::U8, + dtype => crate::bail!("Metal copy2d {dtype:?} not implemented"), + }; + candle_metal_kernels::call_copy2d( + &self.device.device, + &command_buffer, + &self.device.kernels, + kernel_name, + &self.buffer, + &dst.buffer, + d1, + d2, + src_s, + dst_s, + src_o * self.dtype.size_in_bytes(), + dst_o * self.dtype.size_in_bytes(), + ) + .map_err(MetalError::from)?; + command_buffer.set_label("copy2d"); + } + Ok(()) + } + fn copy_strided_src(&self, dst: &mut Self, dst_offset: usize, src_l: &Layout) -> Result<()> { let command_buffer = self.device.command_buffer()?; if src_l.is_contiguous() && self.dtype == dst.dtype() { diff --git a/candle-core/src/storage.rs b/candle-core/src/storage.rs index 65bcc6aa..3bd4b022 100644 --- a/candle-core/src/storage.rs +++ b/candle-core/src/storage.rs @@ -701,4 +701,32 @@ impl Storage { .bt()), } } + + #[allow(clippy::too_many_arguments)] + pub(crate) fn copy2d( + &self, + dst: &mut Self, + d1: usize, + d2: usize, + src_s: usize, + dst_s: usize, + src_o: usize, + dst_o: usize, + ) -> Result<()> { + match (self, dst) { + (Self::Cpu(src), Self::Cpu(dst)) => src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o), + (Self::Cuda(src), Self::Cuda(dst)) => { + Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?) + } + (Self::Metal(src), Self::Metal(dst)) => { + Ok(src.copy2d(dst, d1, d2, src_s, dst_s, src_o, dst_o)?) + } + (lhs, rhs) => Err(Error::DeviceMismatchBinaryOp { + lhs: lhs.device().location(), + rhs: rhs.device().location(), + op: "copy2d", + } + .bt()), + } + } } diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs index 0e2c3e8f..22cd4950 100644 --- a/candle-core/src/tensor.rs +++ b/candle-core/src/tensor.rs @@ -666,7 +666,7 @@ impl Tensor { Ok(from_storage(storage, self.shape(), op, false)) } - fn check_dim(&self, dim: usize, op: &'static str) -> Result<()> { + pub(crate) fn check_dim(&self, dim: usize, op: &'static str) -> Result<()> { if dim >= self.dims().len() { Err(Error::DimOutOfRange { shape: self.shape().clone(), @@ -2149,152 +2149,6 @@ impl Tensor { Self::cat(&args, dim) } - /// Concatenates two or more tensors along a particular dimension. 
- /// - /// All tensors must of the same rank, and the output will have - /// the same rank - /// - /// ```rust - /// # use candle_core::{Tensor, DType, Device}; - /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?; - /// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?; - /// - /// let c = Tensor::cat(&[&a, &b], 0)?; - /// assert_eq!(c.shape().dims(), &[4, 3]); - /// - /// let c = Tensor::cat(&[&a, &b], 1)?; - /// assert_eq!(c.shape().dims(), &[2, 6]); - /// # Ok::<(), candle_core::Error>(()) - /// ``` - pub fn cat, D: Dim>(args: &[A], dim: D) -> Result { - if args.is_empty() { - Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())? - } - let arg0 = args[0].as_ref(); - if args.len() == 1 { - return Ok(arg0.clone()); - } - let dim = dim.to_index(arg0.shape(), "cat")?; - for arg in args { - arg.as_ref().check_dim(dim, "cat")?; - } - for (arg_idx, arg) in args.iter().enumerate() { - let arg = arg.as_ref(); - if arg0.rank() != arg.rank() { - Err(Error::UnexpectedNumberOfDims { - expected: arg0.rank(), - got: arg.rank(), - shape: arg.shape().clone(), - } - .bt())? - } - for (dim_idx, (v1, v2)) in arg0 - .shape() - .dims() - .iter() - .zip(arg.shape().dims().iter()) - .enumerate() - { - if dim_idx != dim && v1 != v2 { - Err(Error::ShapeMismatchCat { - dim: dim_idx, - first_shape: arg0.shape().clone(), - n: arg_idx + 1, - nth_shape: arg.shape().clone(), - } - .bt())? - } - } - } - if dim == 0 { - Self::cat0(args) - } else { - // TODO: Avoid these transpositions and have an implementation that works - // for dim != 0... - let args: Vec = args - .iter() - .map(|a| a.as_ref().transpose(0, dim)) - .collect::>>()?; - let cat = Self::cat0(&args)?; - cat.transpose(0, dim) - } - } - - fn cat0>(args: &[A]) -> Result { - if args.is_empty() { - Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())? - } - let arg0 = args[0].as_ref(); - if args.len() == 1 { - return Ok(arg0.clone()); - } - let rank = arg0.rank(); - let device = arg0.device(); - let dtype = arg0.dtype(); - let first_dims = arg0.shape().dims(); - let mut cat_dims = first_dims.to_vec(); - cat_dims[0] = 0; - let mut offsets = vec![0usize]; - for (arg_idx, arg) in args.iter().enumerate() { - let arg = arg.as_ref(); - if arg.dtype() != dtype { - Err(Error::DTypeMismatchBinaryOp { - lhs: dtype, - rhs: arg.dtype(), - op: "cat", - } - .bt())? - } - if arg.device().location() != device.location() { - Err(Error::DeviceMismatchBinaryOp { - lhs: device.location(), - rhs: arg.device().location(), - op: "cat", - } - .bt())? - } - if rank != arg.rank() { - Err(Error::UnexpectedNumberOfDims { - expected: rank, - got: arg.rank(), - shape: arg.shape().clone(), - } - .bt())? - } - for (dim_idx, (v1, v2)) in arg0 - .shape() - .dims() - .iter() - .zip(arg.shape().dims().iter()) - .enumerate() - { - if dim_idx == 0 { - cat_dims[0] += v2; - } - if dim_idx != 0 && v1 != v2 { - Err(Error::ShapeMismatchCat { - dim: dim_idx, - first_shape: arg0.shape().clone(), - n: arg_idx + 1, - nth_shape: arg.shape().clone(), - } - .bt())? - } - } - let next_offset = offsets.last().unwrap() + arg.elem_count(); - offsets.push(next_offset); - } - let shape = Shape::from(cat_dims); - let op = BackpropOp::new(args, |args| Op::Cat(args, 0)); - let mut storage = device.zeros(&shape, dtype)?; - for (arg, &offset) in args.iter().zip(offsets.iter()) { - let arg = arg.as_ref(); - arg.storage() - .copy_strided_src(&mut storage, offset, arg.layout())?; - } - Ok(from_storage(storage, shape, op, false)) - } - /// Pad the input tensor using 0s along dimension `dim`. 
This adds `left` elements before the
     /// input tensor values and `right` elements after.
     pub fn pad_with_zeros<D: Dim>(&self, dim: D, left: usize, right: usize) -> Result<Self> {
diff --git a/candle-core/src/tensor_cat.rs b/candle-core/src/tensor_cat.rs
new file mode 100644
index 00000000..25acc80e
--- /dev/null
+++ b/candle-core/src/tensor_cat.rs
@@ -0,0 +1,240 @@
+use crate::{shape::Dim, Error, Result, Shape, Tensor};
+
+impl Tensor {
+    /// Concatenates two or more tensors along a particular dimension.
+    ///
+    /// All tensors must be of the same rank, and the output will have
+    /// the same rank.
+    ///
+    /// ```rust
+    /// # use candle_core::{Tensor, DType, Device};
+    /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
+    /// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
+    ///
+    /// let c = Tensor::cat(&[&a, &b], 0)?;
+    /// assert_eq!(c.shape().dims(), &[4, 3]);
+    ///
+    /// let c = Tensor::cat(&[&a, &b], 1)?;
+    /// assert_eq!(c.shape().dims(), &[2, 6]);
+    /// # Ok::<(), candle_core::Error>(())
+    /// ```
+    pub fn cat<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
+        if args.is_empty() {
+            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
+        }
+        let arg0 = args[0].as_ref();
+        if args.len() == 1 {
+            return Ok(arg0.clone());
+        }
+        let dim = dim.to_index(arg0.shape(), "cat")?;
+        for arg in args {
+            arg.as_ref().check_dim(dim, "cat")?;
+        }
+        for (arg_idx, arg) in args.iter().enumerate() {
+            let arg = arg.as_ref();
+            if arg0.rank() != arg.rank() {
+                Err(Error::UnexpectedNumberOfDims {
+                    expected: arg0.rank(),
+                    got: arg.rank(),
+                    shape: arg.shape().clone(),
+                }
+                .bt())?
+            }
+            for (dim_idx, (v1, v2)) in arg0
+                .shape()
+                .dims()
+                .iter()
+                .zip(arg.shape().dims().iter())
+                .enumerate()
+            {
+                if dim_idx != dim && v1 != v2 {
+                    Err(Error::ShapeMismatchCat {
+                        dim: dim_idx,
+                        first_shape: arg0.shape().clone(),
+                        n: arg_idx + 1,
+                        nth_shape: arg.shape().clone(),
+                    }
+                    .bt())?
+                }
+            }
+        }
+        if dim == 0 {
+            Self::cat0(args)
+        } else {
+            let all_contiguous = args.iter().all(|v| v.as_ref().is_contiguous());
+            if all_contiguous {
+                Self::cat_contiguous(args, dim)
+            } else {
+                let args: Vec<Tensor> = args
+                    .iter()
+                    .map(|a| a.as_ref().transpose(0, dim))
+                    .collect::<Result<Vec<_>>>()?;
+                let cat = Self::cat0(&args)?;
+                cat.transpose(0, dim)
+            }
+        }
+    }
+
+    fn cat0<A: AsRef<Tensor>>(args: &[A]) -> Result<Self> {
+        if args.is_empty() {
+            Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())?
+        }
+        let arg0 = args[0].as_ref();
+        if args.len() == 1 {
+            return Ok(arg0.clone());
+        }
+        let rank = arg0.rank();
+        let device = arg0.device();
+        let dtype = arg0.dtype();
+        let first_dims = arg0.shape().dims();
+        let mut cat_dims = first_dims.to_vec();
+        cat_dims[0] = 0;
+        let mut offsets = vec![0usize];
+        for (arg_idx, arg) in args.iter().enumerate() {
+            let arg = arg.as_ref();
+            if arg.dtype() != dtype {
+                Err(Error::DTypeMismatchBinaryOp {
+                    lhs: dtype,
+                    rhs: arg.dtype(),
+                    op: "cat",
+                }
+                .bt())?
+            }
+            if arg.device().location() != device.location() {
+                Err(Error::DeviceMismatchBinaryOp {
+                    lhs: device.location(),
+                    rhs: arg.device().location(),
+                    op: "cat",
+                }
+                .bt())?
+            }
+            if rank != arg.rank() {
+                Err(Error::UnexpectedNumberOfDims {
+                    expected: rank,
+                    got: arg.rank(),
+                    shape: arg.shape().clone(),
+                }
+                .bt())?
+ } + for (dim_idx, (v1, v2)) in arg0 + .shape() + .dims() + .iter() + .zip(arg.shape().dims().iter()) + .enumerate() + { + if dim_idx == 0 { + cat_dims[0] += v2; + } + if dim_idx != 0 && v1 != v2 { + Err(Error::ShapeMismatchCat { + dim: dim_idx, + first_shape: arg0.shape().clone(), + n: arg_idx + 1, + nth_shape: arg.shape().clone(), + } + .bt())? + } + } + let next_offset = offsets.last().unwrap() + arg.elem_count(); + offsets.push(next_offset); + } + let shape = Shape::from(cat_dims); + let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, 0)); + let mut storage = device.zeros(&shape, dtype)?; + for (arg, &offset) in args.iter().zip(offsets.iter()) { + let arg = arg.as_ref(); + arg.storage() + .copy_strided_src(&mut storage, offset, arg.layout())?; + } + Ok(crate::tensor::from_storage(storage, shape, op, false)) + } + + fn cat_contiguous>(args: &[A], dim: usize) -> Result { + if args.is_empty() { + Err(Error::OpRequiresAtLeastOneTensor { op: "cat" }.bt())? + } + let arg0 = args[0].as_ref(); + if args.len() == 1 { + return Ok(arg0.clone()); + } + let rank = arg0.rank(); + let device = arg0.device(); + let dtype = arg0.dtype(); + let first_dims = arg0.shape().dims(); + let mut cat_dims = first_dims.to_vec(); + cat_dims[dim] = 0; + for (arg_idx, arg) in args.iter().enumerate() { + let arg = arg.as_ref(); + if arg.dtype() != dtype { + Err(Error::DTypeMismatchBinaryOp { + lhs: dtype, + rhs: arg.dtype(), + op: "cat", + } + .bt())? + } + if arg.device().location() != device.location() { + Err(Error::DeviceMismatchBinaryOp { + lhs: device.location(), + rhs: arg.device().location(), + op: "cat", + } + .bt())? + } + if rank != arg.rank() { + Err(Error::UnexpectedNumberOfDims { + expected: rank, + got: arg.rank(), + shape: arg.shape().clone(), + } + .bt())? + } + for (dim_idx, (v1, v2)) in arg0 + .shape() + .dims() + .iter() + .zip(arg.shape().dims().iter()) + .enumerate() + { + if dim_idx == dim { + cat_dims[dim] += v2; + } + if dim_idx != dim && v1 != v2 { + Err(Error::ShapeMismatchCat { + dim: dim_idx, + first_shape: arg0.shape().clone(), + n: arg_idx + 1, + nth_shape: arg.shape().clone(), + } + .bt())? + } + } + } + let cat_target_dim_len = cat_dims[dim]; + let block_size: usize = cat_dims.iter().skip(1 + dim).product(); + let shape = Shape::from(cat_dims); + let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, dim)); + let mut storage = device.zeros(&shape, dtype)?; + let mut dst_o = 0; + for arg in args.iter() { + let arg = arg.as_ref(); + let arg_dims = arg.shape().dims(); + let d1: usize = arg_dims.iter().take(dim).product(); + let d2 = block_size * arg_dims[dim]; + let dst_s = block_size * cat_target_dim_len; + let src_o = arg.layout().start_offset(); + arg.storage().copy2d( + &mut storage, + d1, + d2, + /* src_s */ d2, + dst_s, + src_o, + dst_o, + )?; + dst_o += d2; + } + Ok(crate::tensor::from_storage(storage, shape, op, false)) + } +} diff --git a/candle-core/tests/conv_tests.rs b/candle-core/tests/conv_tests.rs index f0f1b7f2..ba60b778 100644 --- a/candle-core/tests/conv_tests.rs +++ b/candle-core/tests/conv_tests.rs @@ -53,6 +53,12 @@ fn conv1d(dev: &Device) -> Result<()> { test_utils::to_vec1_round(&res.flatten_all()?, 4)?, [2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352] ); + + // conv-transposes are not implemented for metal. + if dev.is_metal() { + return Ok(()); + } + let w = w.transpose(0, 1)?; // The CPU kernels applied in the contiguous and non contiguous cases are different. 
for w in [w.clone(), w.contiguous()?] { @@ -162,31 +168,33 @@ fn conv2d(dev: &Device) -> Result<()> { 10.389, 3.6023, -4.2808, 0.2672, 5.3646, -5.2023, -2.1955, -9.4075 ] ); - let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?; - assert_eq!(res.dims(), [1, 2, 7, 7]); - assert_eq!( - test_utils::to_vec3_round(&res.i(0)?, 4)?, - [ + if !dev.is_metal() { + let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?; + assert_eq!(res.dims(), [1, 2, 7, 7]); + assert_eq!( + test_utils::to_vec3_round(&res.i(0)?, 4)?, [ - [-1.9918, 2.6797, -0.4599, -1.6037, 1.4131, -2.4012, 2.9277], - [1.8016, -3.5361, 1.0757, 3.5395, -8.2168, -3.2023, 0.5375], - [0.8243, 1.8675, 7.8929, -4.0746, -6.4415, 5.1139, 1.6889], - [0.2722, 8.9679, 3.3477, 1.8514, -4.2896, -3.8228, -7.5632], - [-8.5412, -5.8142, -7.1587, -1.6095, 0.4651, 0.2748, -2.0985], - [2.0833, -0.6482, -12.1692, -4.1284, -2.9765, -0.0656, -4.5114], - [5.307, 2.6957, 2.3087, 1.0478, 0.7808, -1.1519, -0.9579] - ], - [ - [1.089, 0.1872, -0.6408, -0.9897, 0.8503, 1.1019, -0.9211], - [-0.1741, -0.2915, 4.2472, 1.9417, 1.65, 0.6303, -4.7131], - [1.6555, 2.4026, -2.9293, 2.9953, 0.5328, 3.5873, -0.9621], - [-1.4289, -3.2787, 4.1747, -6.0341, -4.6341, -5.7945, 4.142], - [7.5973, 6.4431, 5.9872, 2.1639, -8.6566, 3.3143, -3.4059], - [-0.8775, -3.048, 11.6543, 0.6442, 2.3218, -0.4765, 1.1516], - [-5.5423, -2.5188, 1.0754, -0.0563, -2.9386, -1.1504, 1.0171] + [ + [-1.9918, 2.6797, -0.4599, -1.6037, 1.4131, -2.4012, 2.9277], + [1.8016, -3.5361, 1.0757, 3.5395, -8.2168, -3.2023, 0.5375], + [0.8243, 1.8675, 7.8929, -4.0746, -6.4415, 5.1139, 1.6889], + [0.2722, 8.9679, 3.3477, 1.8514, -4.2896, -3.8228, -7.5632], + [-8.5412, -5.8142, -7.1587, -1.6095, 0.4651, 0.2748, -2.0985], + [2.0833, -0.6482, -12.1692, -4.1284, -2.9765, -0.0656, -4.5114], + [5.307, 2.6957, 2.3087, 1.0478, 0.7808, -1.1519, -0.9579] + ], + [ + [1.089, 0.1872, -0.6408, -0.9897, 0.8503, 1.1019, -0.9211], + [-0.1741, -0.2915, 4.2472, 1.9417, 1.65, 0.6303, -4.7131], + [1.6555, 2.4026, -2.9293, 2.9953, 0.5328, 3.5873, -0.9621], + [-1.4289, -3.2787, 4.1747, -6.0341, -4.6341, -5.7945, 4.142], + [7.5973, 6.4431, 5.9872, 2.1639, -8.6566, 3.3143, -3.4059], + [-0.8775, -3.048, 11.6543, 0.6442, 2.3218, -0.4765, 1.1516], + [-5.5423, -2.5188, 1.0754, -0.0563, -2.9386, -1.1504, 1.0171] + ] ] - ] - ); + ); + } // Dilations. let res = t.conv2d(&w, 0, 1, 2, 1)?; assert_eq!(res.dims(), [1, 2, 1, 1]); @@ -195,36 +203,44 @@ fn conv2d(dev: &Device) -> Result<()> { [2.45, -2.3504], ); - // Transpose and dilations. - let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 2)?; - assert_eq!(res.dims(), [1, 2, 9, 9]); - assert_eq!( - test_utils::to_vec3_round(&res.i(0)?, 4)?, - [ + if !dev.is_metal() { + // Transpose and dilations. 
+ let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 2)?; + assert_eq!(res.dims(), [1, 2, 9, 9]); + assert_eq!( + test_utils::to_vec3_round(&res.i(0)?, 4)?, [ - [-1.9918, 3.1652, -0.6778, -4.3442, 4.4351, 0.6652, -3.0124, -0.6031, 2.9277], - [2.7036, -1.7156, -0.3969, 1.0516, 1.6381, -2.8886, -0.205, 2.4682, -1.0499], - [-0.9459, 3.1631, 3.707, -4.8369, -8.5166, -1.4496, -2.7559, -3.2698, 1.4376], - [-0.2157, 3.7786, -2.0252, -4.2633, 3.6731, -1.5142, 5.9391, -0.2622, -0.141], - [-6.8121, -3.1744, 1.5945, 3.0637, -9.6088, 1.4446, 2.9489, -3.0082, -7.3822], - [0.2371, 3.3303, 0.3861, 2.2646, -4.6784, 4.1235, -0.0109, 0.3176, -0.03], - [-2.5339, -2.9564, -3.4518, -4.4594, -9.1873, -1.9709, -0.4676, 0.51, -3.5024], - [4.007, 0.3067, -2.2954, 1.1105, -0.1992, 1.6372, -2.9268, 0.2807, -1.2787], - [5.307, 1.1317, 1.3518, 0.9049, 3.8116, -0.4075, -0.8874, -0.2241, -0.9579] - ], - [ - [1.089, -0.6483, 0.0726, -0.4752, -1.3283, 1.7103, 1.0703, 0.1076, -0.9211], - [-0.8629, 0.1376, 0.3202, 2.0955, 0.9696, 2.8988, -1.0012, 1.5049, -0.1278], - [1.9286, -1.5255, -2.9563, 2.4589, 3.3611, -0.6951, 0.3525, -1.7724, -5.9861], - [1.1226, 2.1561, 3.6417, 4.7546, -0.692, 4.4126, -5.1902, 6.0805, 2.3185], - [1.0111, 0.3604, 0.6432, -3.6605, 7.9517, -9.2955, -5.2988, -3.7803, -2.0642], - [3.3172, -1.7967, -3.6576, -2.0942, 1.3158, 0.112, -1.7405, 2.9167, 0.7957], - [5.1001, 1.8995, -1.8639, 1.1262, 9.9629, 2.683, -3.6319, -1.1607, 0.5856], - [-4.8445, -0.5642, 4.2317, 0.0856, 1.2267, -0.5712, 1.736, 1.0997, 0.6908], - [-5.5423, -1.1831, -1.2176, 0.0843, 0.0446, -0.7545, -2.4798, -0.0827, 1.0171] + [ + [-1.9918, 3.1652, -0.6778, -4.3442, 4.4351, 0.6652, -3.0124, -0.6031, 2.9277], + [2.7036, -1.7156, -0.3969, 1.0516, 1.6381, -2.8886, -0.205, 2.4682, -1.0499], + [-0.9459, 3.1631, 3.707, -4.8369, -8.5166, -1.4496, -2.7559, -3.2698, 1.4376], + [-0.2157, 3.7786, -2.0252, -4.2633, 3.6731, -1.5142, 5.9391, -0.2622, -0.141], + [-6.8121, -3.1744, 1.5945, 3.0637, -9.6088, 1.4446, 2.9489, -3.0082, -7.3822], + [0.2371, 3.3303, 0.3861, 2.2646, -4.6784, 4.1235, -0.0109, 0.3176, -0.03], + [ + -2.5339, -2.9564, -3.4518, -4.4594, -9.1873, -1.9709, -0.4676, 0.51, + -3.5024 + ], + [4.007, 0.3067, -2.2954, 1.1105, -0.1992, 1.6372, -2.9268, 0.2807, -1.2787], + [5.307, 1.1317, 1.3518, 0.9049, 3.8116, -0.4075, -0.8874, -0.2241, -0.9579] + ], + [ + [1.089, -0.6483, 0.0726, -0.4752, -1.3283, 1.7103, 1.0703, 0.1076, -0.9211], + [-0.8629, 0.1376, 0.3202, 2.0955, 0.9696, 2.8988, -1.0012, 1.5049, -0.1278], + [1.9286, -1.5255, -2.9563, 2.4589, 3.3611, -0.6951, 0.3525, -1.7724, -5.9861], + [1.1226, 2.1561, 3.6417, 4.7546, -0.692, 4.4126, -5.1902, 6.0805, 2.3185], + [1.0111, 0.3604, 0.6432, -3.6605, 7.9517, -9.2955, -5.2988, -3.7803, -2.0642], + [3.3172, -1.7967, -3.6576, -2.0942, 1.3158, 0.112, -1.7405, 2.9167, 0.7957], + [5.1001, 1.8995, -1.8639, 1.1262, 9.9629, 2.683, -3.6319, -1.1607, 0.5856], + [-4.8445, -0.5642, 4.2317, 0.0856, 1.2267, -0.5712, 1.736, 1.0997, 0.6908], + [ + -5.5423, -1.1831, -1.2176, 0.0843, 0.0446, -0.7545, -2.4798, -0.0827, + 1.0171 + ] + ] ] - ] - ); + ); + } Ok(()) } @@ -278,6 +294,12 @@ fn conv2d_small(dev: &Device) -> Result<()> { 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000 ] ); + + // conv-transposes are not implemented for metal + if dev.is_metal() { + return Ok(()); + } + let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?; assert_eq!(res.dims(), [1, 1, 3, 3]); assert_eq!( @@ -379,6 +401,10 @@ print(w.grad.shape) print(w.grad[0]) */ fn conv2d_grad(dev: &Device) -> Result<()> 
{ + // conv-transposes are not implemented for metal + if dev.is_metal() { + return Ok(()); + } use candle_core::Var; let t = Var::from_slice( &[ diff --git a/candle-core/tests/grad_tests.rs b/candle-core/tests/grad_tests.rs index a4d81618..b8b6be8d 100644 --- a/candle-core/tests/grad_tests.rs +++ b/candle-core/tests/grad_tests.rs @@ -1,3 +1,4 @@ +#![allow(clippy::approx_constant)] use anyhow::{Context, Result}; use candle_core::{test_device, test_utils, Device, Shape, Tensor, Var}; @@ -96,24 +97,24 @@ fn unary_grad(device: &Device) -> Result<()> { let grads = y.backward()?; let grad_x = grads.get(x).context("no grad for x")?; assert_eq!( - y.to_vec1::()?, - [20.085537, 2.7182817, 54.59815, 1.1618342] + test_utils::to_vec1_round(&y, 4)?, + [20.0855, 2.7183, 54.5982, 1.1618] ); assert_eq!( - grad_x.to_vec1::()?, - [20.085537, 2.7182817, 54.59815, 1.1618342] + test_utils::to_vec1_round(grad_x, 4)?, + [20.0855, 2.7183, 54.5982, 1.1618] ); let y = x.exp()?.sqr()?; let grads = y.backward()?; let grad_x = grads.get(x).context("no grad for x")?; assert_eq!( - y.to_vec1::()?, - [403.4288, 7.3890557, 2980.9578, 1.3498588] + test_utils::to_vec1_round(&y, 3)?, + [403.429, 7.389, 2980.958, 1.35] ); // exp(x)^2 = exp(2*x) assert_eq!( - grad_x.to_vec1::()?, - [806.8576, 14.778111, 5961.9155, 2.6997175] + test_utils::to_vec1_round(grad_x, 2)?, + [806.86, 14.78, 5961.92, 2.7] ); let y = x.sin()?; let grads = y.backward()?; @@ -261,6 +262,7 @@ fn unary_grad(device: &Device) -> Result<()> { let y = elu_x.elu(2.)?; let grads = y.backward()?; let grad_x = grads.get(&elu_x).context("no grad for x")?; + assert_eq!( test_utils::to_vec1_round(&y, 4)?, [-1.2642, 0.0000, -1.7293, 3.0000] diff --git a/candle-core/tests/pool_tests.rs b/candle-core/tests/pool_tests.rs index a3708ec4..a6530e03 100644 --- a/candle-core/tests/pool_tests.rs +++ b/candle-core/tests/pool_tests.rs @@ -2,6 +2,9 @@ use candle_core::{test_device, test_utils, Device, IndexOp, Result, Tensor}; // https://github.com/huggingface/candle/issues/364 fn avg_pool2d(dev: &Device) -> Result<()> { + if dev.is_metal() { + return Ok(()); + } let data: Vec = vec![ 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; @@ -19,6 +22,9 @@ fn avg_pool2d(dev: &Device) -> Result<()> { } fn max_pool2d(dev: &Device) -> Result<()> { + if dev.is_metal() { + return Ok(()); + } let data: Vec = vec![ 1., 2., 1., 3., 0., 0., 1., 1., 1., 1., 1., 1., 5., 1., 1., 1., ]; @@ -43,6 +49,9 @@ res = torch.nn.functional.avg_pool2d(t, 2) print(res) */ fn avg_pool2d_pytorch(dev: &Device) -> Result<()> { + if dev.is_metal() { + return Ok(()); + } let t = Tensor::new( &[ 0.4056f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, 3.0616, diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs index 31a27422..b2475adc 100644 --- a/candle-core/tests/tensor_tests.rs +++ b/candle-core/tests/tensor_tests.rs @@ -672,6 +672,31 @@ fn cat(device: &Device) -> Result<()> { [2.0, 7.0, 1.0, 8.0, 2.0, 2.0, 7.0, 1.0, 8.0, 2.0] ] ); + + // 3D + let t1 = Tensor::arange(0, 48i64, device)?.reshape((2, 6, 4))?; + let t2 = Tensor::arange(100, 124i64, device)?.reshape((2, 3, 4))?; + let t3 = Tensor::arange(10000, 10032i64, device)?.reshape((2, 4, 4))?; + + let t_cat = Tensor::cat(&[&t1, &t2, &t3], 1)?; + + let t1 = t1.t()?.contiguous()?.t()?; + let t2 = t2.t()?.contiguous()?.t()?; + let t3 = t3.t()?.contiguous()?.t()?; + let t_cat2 = Tensor::cat(&[&t1, &t2, &t3], 1)?; + + let diff = t_cat.eq(&t_cat2)?.to_dtype(DType::F32)?.sum_all()?; + 
assert_eq!(diff.to_vec0::()?, 104.0); + assert_eq!(t_cat.i((0, 0, 0))?.to_vec0::()?, 0); + assert_eq!(t_cat.i((0, 4, 0))?.to_vec0::()?, 16); + assert_eq!(t_cat.i((0, 5, 0))?.to_vec0::()?, 20); + assert_eq!(t_cat.i((1, 5, 0))?.to_vec0::()?, 44); + assert_eq!(t_cat.i((0, 6, 0))?.to_vec0::()?, 100); + assert_eq!(t_cat.i((1, 6, 0))?.to_vec0::()?, 112); + assert_eq!(t_cat.i((0, 6, 1))?.to_vec0::()?, 101); + assert_eq!(t_cat.i((0, 7, 1))?.to_vec0::()?, 105); + assert_eq!(t_cat.i((0, 12, 1))?.to_vec0::()?, 10013); + assert_eq!(t_cat.i((1, 12, 3))?.to_vec0::()?, 10031); Ok(()) } diff --git a/candle-kernels/src/fill.cu b/candle-kernels/src/fill.cu index 883ca072..ca448d98 100644 --- a/candle-kernels/src/fill.cu +++ b/candle-kernels/src/fill.cu @@ -10,11 +10,39 @@ __device__ void fill_with(T *buf, T value, const size_t numel) { extern "C" __global__ void fill_u8(uint8_t *buf, uint8_t value, const size_t numel) { fill_with(buf, value, numel); } extern "C" __global__ void fill_u32(uint32_t *buf, uint32_t value, const size_t numel) { fill_with(buf, value, numel); } extern "C" __global__ void fill_i64(int64_t *buf, int64_t value, const size_t numel) { fill_with(buf, value, numel); } -extern "C" __global__ void fill_f16(__half *buf, __half value, const size_t numel) { fill_with(buf, value, numel); } extern "C" __global__ void fill_f32(float *buf, float value, const size_t numel) { fill_with(buf, value, numel); } extern "C" __global__ void fill_f64(double *buf, double value, const size_t numel) { fill_with(buf, value, numel); } +template +__device__ void copy2d(const T *src, T *dst, uint32_t d1, uint32_t d2, uint32_t src_s, uint32_t dst_s) { + uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= d1 * d2) { + return; + } + uint32_t idx1 = idx / d2; + uint32_t idx2 = idx - d2 * idx1; + dst[idx1 * dst_s + idx2] = src[idx1 * src_s + idx2]; +} + +#define COPY2D_OP(TYPENAME, FNNAME) \ +extern "C" __global__ \ +void FNNAME(const TYPENAME *src, TYPENAME *dst, uint32_t d1, uint32_t d2, uint32_t src_s, uint32_t dst_s) { \ + copy2d(src, dst, d1, d2, src_s, dst_s); \ +} \ + +COPY2D_OP(float, copy2d_f32) +COPY2D_OP(double, copy2d_f64) +COPY2D_OP(uint8_t, copy2d_u8) +COPY2D_OP(uint32_t, copy2d_u32) +COPY2D_OP(int64_t, copy2d_i64) + +#if __CUDA_ARCH__ >= 530 +extern "C" __global__ void fill_f16(__half *buf, __half value, const size_t numel) { fill_with(buf, value, numel); } +COPY2D_OP(__half, copy2d_f16) +#endif + #if __CUDA_ARCH__ >= 800 #include extern "C" __global__ void fill_bf16(__nv_bfloat16 *buf, __nv_bfloat16 value, const size_t numel) { fill_with(buf, value, numel); } +COPY2D_OP(__nv_bfloat16, copy2d_bf16) #endif diff --git a/candle-metal-kernels/src/affine.metal b/candle-metal-kernels/src/affine.metal index a4484998..76c0365a 100644 --- a/candle-metal-kernels/src/affine.metal +++ b/candle-metal-kernels/src/affine.metal @@ -89,7 +89,7 @@ kernel void FN_NAME( \ return; \ } \ const TYPENAME x = input[id]; \ - output[id] = TYPENAME((x > 0)?x: mul * exp(x - 1)); \ + output[id] = TYPENAME((x > 0)?x: mul * (exp(x) - 1)); \ } \ kernel void FN_NAME##_strided( \ constant size_t &dim, \ diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index 47ce7e96..a879c86a 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -127,6 +127,16 @@ pub enum Source { Quantized, } +pub mod copy2d { + pub struct Kernel(pub &'static str); + pub const FLOAT: Kernel = Kernel("copy2d_f32"); + pub const HALF: Kernel = Kernel("copy2d_f16"); + pub const BFLOAT: Kernel = 
Kernel("copy2d_bf16");
+    pub const I64: Kernel = Kernel("copy2d_i64");
+    pub const U32: Kernel = Kernel("copy2d_u32");
+    pub const U8: Kernel = Kernel("copy2d_u8");
+}
+
 macro_rules! ops{
     ($($name:ident),+) => {
@@ -365,6 +375,46 @@ pub fn call_unary_contiguous(
     Ok(())
 }
 
+#[allow(clippy::too_many_arguments)]
+pub fn call_copy2d(
+    device: &Device,
+    command_buffer: &CommandBufferRef,
+    kernels: &Kernels,
+    name: copy2d::Kernel,
+    input: &Buffer,
+    output: &Buffer,
+    d1: usize,
+    d2: usize,
+    src_s: usize,
+    dst_s: usize,
+    src_o_in_bytes: usize,
+    dst_o_in_bytes: usize,
+) -> Result<(), MetalKernelError> {
+    let pipeline = kernels.load_pipeline(device, Source::Unary, name.0)?;
+    let encoder = command_buffer.new_compute_command_encoder();
+    encoder.set_compute_pipeline_state(&pipeline);
+    set_params!(
+        encoder,
+        (
+            d1,
+            d2,
+            src_s,
+            dst_s,
+            (input, src_o_in_bytes),
+            (output, dst_o_in_bytes)
+        )
+    );
+
+    let width: usize = d1 * d2;
+    let (thread_group_count, thread_group_size) = linear_split(&pipeline, width);
+
+    encoder.use_resource(input, metal::MTLResourceUsage::Read);
+    encoder.use_resource(output, metal::MTLResourceUsage::Write);
+    encoder.dispatch_thread_groups(thread_group_count, thread_group_size);
+    encoder.end_encoding();
+    Ok(())
+}
+
 #[allow(clippy::too_many_arguments)]
 pub fn call_unary_strided(
     device: &Device,
diff --git a/candle-metal-kernels/src/unary.metal b/candle-metal-kernels/src/unary.metal
index 1e0d5526..bdc13f9e 100644
--- a/candle-metal-kernels/src/unary.metal
+++ b/candle-metal-kernels/src/unary.metal
@@ -102,6 +102,30 @@ UNARY(NAME, half, NAME##_f16, NAME##_f16_strided);
 #define BFLOAT_UNARY_OP(NAME) \
 UNARY(NAME, bfloat, NAME##_bf16, NAME##_bf16_strided);
 
+#define COPY2D(FN_NAME, TYPENAME) \
+kernel void FN_NAME( \
+    constant size_t &d1, \
+    constant size_t &d2, \
+    constant size_t &src_s, \
+    constant size_t &dst_s, \
+    device const TYPENAME *input, \
+    device TYPENAME *output, \
+    uint tid [[ thread_position_in_grid ]] \
+) { \
+    if (tid >= d1 * d2) { \
+        return; \
+    } \
+    size_t idx1 = tid / d2; \
+    size_t idx2 = tid - idx1 * d2; \
+    size_t src_idx = idx1 * src_s + idx2; \
+    size_t dst_idx = idx1 * dst_s + idx2; \
+    output[dst_idx] = input[src_idx]; \
+}
+
+COPY2D(copy2d_f32, float)
+COPY2D(copy2d_f16, half)
+COPY2D(copy2d_u8, uint8_t)
+COPY2D(copy2d_u32, uint32_t)
 
 UNARY_OP(cos)
 UNARY_OP(sin)
@@ -128,6 +152,7 @@ UNARY(id, uint32_t, copy_u32, copy_u32_strided)
 
 #if __METAL_VERSION__ >= 220
 UNARY(id, int64_t, copy_i64, copy_i64_strided)
+COPY2D(copy2d_i64, int64_t)
 #endif
 
 #if defined(__HAVE_BFLOAT__)
@@ -151,4 +176,6 @@ BFLOAT_UNARY_OP(recip)
 BFLOAT_UNARY_OP(relu)
 
 UNARY(id, bfloat, copy_bf16, copy_bf16_strided)
+
+COPY2D(copy2d_bf16, bfloat)
 #endif
diff --git a/candle-nn/examples/cpu_benchmarks.rs b/candle-nn/examples/cpu_benchmarks.rs
index 001be116..430316b8 100644
--- a/candle-nn/examples/cpu_benchmarks.rs
+++ b/candle-nn/examples/cpu_benchmarks.rs
@@ -238,6 +238,23 @@ impl Benchmark for QMatMul {
     const ITERS: usize = 100;
 }
 
+struct Cat;
+impl Benchmark for Cat {
+    type PreProcessData = (Tensor, Tensor);
+    type RunResult = Tensor;
+    fn preprocess() -> Result<Self::PreProcessData> {
+        let lhs = Tensor::randn(0f32, 1., (1, 32, 2000, 128), &Device::Cpu)?;
+        let rhs = Tensor::randn(0f32, 1., (1, 32, 1, 128), &Device::Cpu)?;
+        Ok((lhs, rhs))
+    }
+
+    fn run_one(d: &Self::PreProcessData) -> Result<Self::RunResult> {
+        Tensor::cat(&[&d.0, &d.1], 2)
+    }
+
+    const ITERS: usize = 1000;
+}
+
 struct Softmax;
 impl Benchmark for Softmax {
     type PreProcessData = Tensor;
@@ -295,6 +312,7 @@ enum Task {
Qmatmul, Softmax, SoftmaxLastDim, + Cat, } #[derive(Parser, Debug)] @@ -319,6 +337,7 @@ fn main() -> Result<()> { Task::Softmax => run::(args.iters)?, Task::SoftmaxLastDim => run::(args.iters)?, Task::Qmatmul => run::(args.iters)?, + Task::Cat => run::(args.iters)?, } Ok(()) } From e316cb699743b5d45ab4a1067057b8f6d8687a02 Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Sun, 17 Mar 2024 15:55:11 -0400 Subject: [PATCH 021/131] add support for casting between all datatypes (#1860) --- candle-core/src/metal_backend.rs | 29 +++- candle-metal-kernels/src/cast.metal | 57 +++++-- candle-metal-kernels/src/tests.rs | 226 +++++++++++++++++++--------- 3 files changed, 220 insertions(+), 92 deletions(-) diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index 2e07cce5..a6513b1c 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -609,28 +609,41 @@ impl BackendStorage for MetalStorage { let command_buffer = device.command_buffer()?; if layout.is_contiguous() && layout.start_offset() == 0 { let kernel_name = match (self.dtype, dtype) { - (DType::U32, DType::F32) => "cast_u32_f32", - (DType::U32, DType::U8) => "cast_u32_u8", - (DType::U32, DType::I64) => "cast_u32_i64", (DType::U32, DType::BF16) => "cast_u32_bf16", + (DType::U32, DType::F16) => "cast_u32_f16", + (DType::U32, DType::F32) => "cast_u32_f32", + (DType::U32, DType::I64) => "cast_u32_i64", + (DType::U32, DType::U8) => "cast_u32_u8", - (DType::U8, DType::U32) => "cast_u8_u32", + (DType::U8, DType::BF16) => "cast_u8_bf16", + (DType::U8, DType::F16) => "cast_u8_f16", (DType::U8, DType::F32) => "cast_u8_f32", (DType::U8, DType::I64) => "cast_u8_i64", - (DType::U8, DType::BF16) => "cast_u8_bf16", + (DType::U8, DType::U32) => "cast_u8_u32", - (DType::F32, DType::F16) => "cast_f32_f16", (DType::F32, DType::BF16) => "cast_f32_bf16", + (DType::F32, DType::F16) => "cast_f32_f16", + (DType::F32, DType::I64) => "cast_f32_i64", + (DType::F32, DType::U32) => "cast_f32_u32", + (DType::F32, DType::U8) => "cast_f32_u8", + (DType::I64, DType::BF16) => "cast_i64_bf16", + (DType::I64, DType::F16) => "cast_i64_f16", (DType::I64, DType::F32) => "cast_i64_f32", + (DType::I64, DType::U32) => "cast_i64_u32", + (DType::I64, DType::U8) => "cast_i64_u8", (DType::F16, DType::BF16) => "cast_f16_bf16", (DType::F16, DType::F32) => "cast_f16_f32", + (DType::F16, DType::I64) => "cast_f16_i64", + (DType::F16, DType::U32) => "cast_f16_u32", + (DType::F16, DType::U8) => "cast_f16_u8", - (DType::BF16, DType::U8) => "cast_bf16_u8", - (DType::BF16, DType::U32) => "cast_bf16_u32", (DType::BF16, DType::F16) => "cast_bf16_f16", (DType::BF16, DType::F32) => "cast_bf16_f32", + (DType::BF16, DType::I64) => "cast_bf16_i64", + (DType::BF16, DType::U32) => "cast_bf16_u32", + (DType::BF16, DType::U8) => "cast_bf16_u8", (left, right) => { crate::bail!("Metal contiguous to_dtype {left:?} {right:?} not implemented") diff --git a/candle-metal-kernels/src/cast.metal b/candle-metal-kernels/src/cast.metal index 9aead139..2af3fdce 100644 --- a/candle-metal-kernels/src/cast.metal +++ b/candle-metal-kernels/src/cast.metal @@ -72,27 +72,60 @@ kernel void FN_NAME_STRIDED( \ output[tid] = static_cast(static_cast(input[get_strided_index(tid, num_dims, dims, strides)])); \ } \ +// u32 CAST(cast_u32_f32, cast_u32_f32_strided, uint32_t, float) CAST(cast_u32_u8, cast_u32_u8_strided, uint32_t, uint8_t) -CAST(cast_u8_u32, cast_u8_u32_strided, uint8_t, uint32_t) -CAST(cast_u8_f32, cast_u8_f32_strided, uint8_t, float) -CAST(cast_f16_f32, 
cast_f16_f32_strided, half, float) -CAST(cast_f32_f16, cast_f32_f16_strided, float, half) - +CAST(cast_u32_f16, cast_u32_f16_strided, uint32_t, half) #if __METAL_VERSION__ >= 220 -CAST(cast_u8_i64, cast_u8_i64_strided, uint8_t, int64_t) CAST(cast_u32_i64, cast_u32_i64_strided, uint32_t, int64_t) -CAST(cast_i64_f32, cast_i64_f32_strided, int64_t, float) +#endif +#if defined(__HAVE_BFLOAT__) +CAST(cast_u32_bf16, cast_u32_bf16_strided, uint32_t, bfloat) #endif +// u8 +CAST(cast_u8_u32, cast_u8_u32_strided, uint8_t, uint32_t) +CAST(cast_u8_f32, cast_u8_f32_strided, uint8_t, float) +CAST(cast_u8_f16, cast_u8_f16_strided, uint8_t, half) +#if __METAL_VERSION__ >= 220 +CAST(cast_u8_i64, cast_u8_i64_strided, uint8_t, int64_t) +#endif +#if defined(__HAVE_BFLOAT__) +CAST(cast_u8_bf16, cast_u8_bf16_strided, uint8_t, bfloat) +#endif + +// f16 +CAST(cast_f16_f32, cast_f16_f32_strided, half, float) +CAST(cast_f16_u8, cast_f16_u8_strided, half, uint8_t) +CAST(cast_f16_u32, cast_f16_u32_strided, half, uint32_t) +CAST(cast_f16_i64, cast_f16_i64_strided, half, int64_t) +#if defined(__HAVE_BFLOAT__) +CAST_THROUGH(cast_f16_bf16, cast_f16_bf16_strided, half, bfloat, float) +#endif + +// i64 +CAST(cast_i64_f32, cast_i64_f32_strided, int64_t, float) +CAST(cast_i64_u8, cast_i64_u8_strided, int64_t, uint8_t) +CAST(cast_i64_u32, cast_i64_u32_strided, int64_t, uint32_t) +CAST(cast_i64_f16, cast_i64_f16_strided, int64_t, half) +#if defined(__HAVE_BFLOAT__) +CAST_THROUGH(cast_i64_bf16, cast_i64_bf16_strided, int64_t, bfloat, float) +#endif + +// f32 +CAST(cast_f32_f16, cast_f32_f16_strided, float, half) +CAST(cast_f32_u32, cast_f32_u32_strided, float, uint32_t) +CAST(cast_f32_u8, cast_f32_u8_strided, float, uint8_t) +CAST(cast_f32_i64, cast_f32_i64_strided, float, int64_t) +#if defined(__HAVE_BFLOAT__) +CAST(cast_f32_bf16, cast_f32_bf16_strided, float, bfloat) +#endif + +// bf16 #if defined(__HAVE_BFLOAT__) CAST(cast_bf16_u32, cast_bf16_u32_strided, bfloat, uint32_t) +CAST(cast_bf16_i64, cast_bf16_i64_strided, bfloat, int64_t) CAST(cast_bf16_f32, cast_bf16_f32_strided, bfloat, float) -CAST(cast_u8_bf16, cast_u8_bf16_strided, uint8_t, bfloat) -CAST(cast_u32_bf16, cast_u32_bf16_strided, uint32_t, bfloat) -CAST(cast_f32_bf16, cast_f32_bf16_strided, float, bfloat) - CAST_THROUGH(cast_bf16_u8, cast_bf16_u8_strided, bfloat, uint8_t, float) CAST_THROUGH(cast_bf16_f16, cast_bf16_f16_strided, bfloat, half, float) -CAST_THROUGH(cast_f16_bf16, cast_f16_bf16_strided, half, bfloat, float) #endif \ No newline at end of file diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index b47fff6a..b2f1d723 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -292,7 +292,7 @@ fn binary_ops_bf16() { binary_op!(max, |x: bf16, y| x.max(y)); } -fn cast(v: &[T], name: &'static str) -> Vec { +fn run_cast(v: &[T], name: &'static str) -> Vec { let device = device(); let kernels = Kernels::new(); let command_queue = device.new_command_queue(); @@ -319,107 +319,189 @@ fn cast(v: &[T], name: &'static str) -> Vec { } #[test] -fn cast_u32_f32() { - let v = vec![1u32, 2, 3]; - let results = cast(&v, "cast_u32_f32"); - let expected: Vec<_> = v.iter().map(|&v| v as f32).collect(); - assert_eq!(approx(results, 4), vec![1.0f32, 2.0, 3.0]); - assert_eq!(approx(expected, 4), vec![1.0f32, 2.0, 3.0]); +fn cast_f32() { + let v_f64 = vec![1.0f64, 2.0, 3.0]; + let v_f32: Vec = v_f64.iter().map(|&v| v as f32).collect(); + let v_f16: Vec = v_f64.iter().map(|&v| f16::from_f32(v as f32)).collect(); + 
let v_bf16: Vec = v_f64.iter().map(|&v| bf16::from_f32(v as f32)).collect(); + let v_u32: Vec = v_f64.iter().map(|&v| v as u32).collect(); + let v_u8: Vec = v_f64.iter().map(|&v| v as u8).collect(); + let v_i64: Vec = v_f64.iter().map(|&v| v as i64).collect(); - let v = vec![1.0f32, 2.0, 3.0]; - let input: Vec = v.iter().map(|v| f16::from_f32(*v)).collect(); - let results: Vec = cast(&input, "cast_f16_f32"); - assert_eq!(results, vec![1.0f32, 2.0, 3.0]); + // f32 -> f16 + let results: Vec = run_cast(&v_f32, "cast_f32_f16"); + assert_eq!(results, v_f16); - let v = vec![1.0f32; 10_000]; - let input: Vec = v.iter().map(|v| f16::from_f32(*v)).collect(); - let results: Vec = cast(&input, "cast_f16_f32"); - assert_eq!(results.len(), 10_000); - assert_eq!(&results[..10], vec![1.0f32; 10]); - assert_eq!(results, vec![1.0f32; 10_000]); + // f32 -> bf16 + let results: Vec = run_cast(&v_f32, "cast_f32_bf16"); + assert_eq!(results, v_bf16); + + // f32 -> u32 + let results: Vec = run_cast(&v_f32, "cast_f32_u32"); + assert_eq!(results, v_u32); + + // f32 -> u8 + let results: Vec = run_cast(&v_f32, "cast_f32_u8"); + assert_eq!(results, v_u8); + + // f32 -> i64 + let results: Vec = run_cast(&v_f32, "cast_f32_i64"); + assert_eq!(results, v_i64); } #[test] -fn it_cast_bf16_u32() { - let input: Vec = (1..=3).map(|v| bf16::from_f32(v as f32)).collect(); +fn cast_f16() { + let v_f64 = vec![1.0f64, 2.0, 3.0]; + let v_f32: Vec = v_f64.iter().map(|&v| v as f32).collect(); + let v_f16: Vec = v_f64.iter().map(|&v| f16::from_f32(v as f32)).collect(); + let v_bf16: Vec = v_f64.iter().map(|&v| bf16::from_f32(v as f32)).collect(); + let v_u32: Vec = v_f64.iter().map(|&v| v as u32).collect(); + let v_u8: Vec = v_f64.iter().map(|&v| v as u8).collect(); + let v_i64: Vec = v_f64.iter().map(|&v| v as i64).collect(); - let output: Vec = cast(&input, "cast_bf16_u32"); - let expected: Vec = (1..=3).map(|v| v as u32).collect(); + // f16 -> f32 + let results: Vec = run_cast(&v_f16, "cast_f16_f32"); + assert_eq!(results, v_f32); - assert_eq!(output, expected); + // f16 -> bf16 + let results: Vec = run_cast(&v_f16, "cast_f16_bf16"); + assert_eq!(results, v_bf16); + + // f16 -> u32 + let results: Vec = run_cast(&v_f16, "cast_f16_u32"); + assert_eq!(results, v_u32); + + // f16 -> u8 + let results: Vec = run_cast(&v_f16, "cast_f16_u8"); + assert_eq!(results, v_u8); + + // f16 -> i64 + let results: Vec = run_cast(&v_f16, "cast_f16_i64"); + assert_eq!(results, v_i64); } #[test] -fn it_cast_bf16_f32() { - let input: Vec = (1..=3).map(|v| bf16::from_f32(v as f32)).collect(); +fn cast_bf16() { + let v_f64 = vec![1.0f64, 2.0, 3.0]; + let v_f32: Vec = v_f64.iter().map(|&v| v as f32).collect(); + let v_f16: Vec = v_f64.iter().map(|&v| f16::from_f32(v as f32)).collect(); + let v_bf16: Vec = v_f64.iter().map(|&v| bf16::from_f32(v as f32)).collect(); + let v_u32: Vec = v_f64.iter().map(|&v| v as u32).collect(); + let v_u8: Vec = v_f64.iter().map(|&v| v as u8).collect(); + let v_i64: Vec = v_f64.iter().map(|&v| v as i64).collect(); - let output: Vec = cast(&input, "cast_bf16_f32"); - let expected: Vec = (1..=3).map(|v| v as f32).collect(); + // bf16 -> f32 + let results: Vec = run_cast(&v_bf16, "cast_bf16_f32"); + assert_eq!(results, v_f32); - assert_eq!(output, expected); + // bf16 -> f16 + let results: Vec = run_cast(&v_bf16, "cast_bf16_f16"); + assert_eq!(results, v_f16); + + // bf16 -> u32 + let results: Vec = run_cast(&v_bf16, "cast_bf16_u32"); + assert_eq!(results, v_u32); + + // bf16 -> u8 + let results: Vec = run_cast(&v_bf16, 
"cast_bf16_u8"); + assert_eq!(results, v_u8); + + // bf16 -> i64 + let results: Vec = run_cast(&v_bf16, "cast_bf16_i64"); + assert_eq!(results, v_i64); } #[test] -fn it_cast_u8_bf16() { - let input: Vec = (1..=3).map(|v| v as u8).collect(); +fn cast_u32() { + let v_f64 = vec![1.0f64, 2.0, 3.0]; + let v_f32: Vec = v_f64.iter().map(|&v| v as f32).collect(); + let v_f16: Vec = v_f64.iter().map(|&v| f16::from_f32(v as f32)).collect(); + let v_bf16: Vec = v_f64.iter().map(|&v| bf16::from_f32(v as f32)).collect(); + let v_u32: Vec = v_f64.iter().map(|&v| v as u32).collect(); + let v_u8: Vec = v_f64.iter().map(|&v| v as u8).collect(); + let v_i64: Vec = v_f64.iter().map(|&v| v as i64).collect(); - let output: Vec = cast(&input, "cast_u8_bf16"); - let expected: Vec = input - .iter() - .map(|v| bf16::from_f32(*v as f32)) - .collect::>(); + // u32 -> f32 + let results: Vec = run_cast(&v_u32, "cast_u32_f32"); + assert_eq!(results, v_f32); - assert_eq!(output, expected); + // u32 -> f16 + let results: Vec = run_cast(&v_u32, "cast_u32_f16"); + assert_eq!(results, v_f16); + + // u32 -> bf16 + let results: Vec = run_cast(&v_u32, "cast_u32_bf16"); + assert_eq!(results, v_bf16); + + // u32 -> u8 + let results: Vec = run_cast(&v_u32, "cast_u32_u8"); + assert_eq!(results, v_u8); + + // u32 -> i64 + let results: Vec = run_cast(&v_u32, "cast_u32_i64"); + assert_eq!(results, v_i64); } #[test] -fn it_cast_u32_bf16() { - let input: Vec = (1..=3).map(|v| v as u32).collect(); +fn cast_u8() { + let v_f64 = vec![1.0f64, 2.0, 3.0]; + let v_f32: Vec = v_f64.iter().map(|&v| v as f32).collect(); + let v_f16: Vec = v_f64.iter().map(|&v| f16::from_f32(v as f32)).collect(); + let v_bf16: Vec = v_f64.iter().map(|&v| bf16::from_f32(v as f32)).collect(); + let v_u32: Vec = v_f64.iter().map(|&v| v as u32).collect(); + let v_u8: Vec = v_f64.iter().map(|&v| v as u8).collect(); + let v_i64: Vec = v_f64.iter().map(|&v| v as i64).collect(); - let output: Vec = cast(&input, "cast_u32_bf16"); - let expected: Vec = input.iter().map(|v| bf16::from_f32(*v as f32)).collect(); + // u8 -> f32 + let results: Vec = run_cast(&v_u8, "cast_u8_f32"); + assert_eq!(results, v_f32); - assert_eq!(output, expected); + // u8 -> f16 + let results: Vec = run_cast(&v_u8, "cast_u8_f16"); + assert_eq!(results, v_f16); + + // u8 -> bf16 + let results: Vec = run_cast(&v_u8, "cast_u8_bf16"); + assert_eq!(results, v_bf16); + + // u8 -> u32 + let results: Vec = run_cast(&v_u8, "cast_u8_u32"); + assert_eq!(results, v_u32); + + // u8 -> i64 + let results: Vec = run_cast(&v_u8, "cast_u8_i64"); + assert_eq!(results, v_i64); } #[test] -fn it_cast_f32_bf16() { - let input: Vec = (1..=3).map(|v| v as f32).collect(); +fn cast_i64() { + let v_f64 = vec![1.0f64, 2.0, 3.0]; + let v_f32: Vec = v_f64.iter().map(|&v| v as f32).collect(); + let v_f16: Vec = v_f64.iter().map(|&v| f16::from_f32(v as f32)).collect(); + let v_bf16: Vec = v_f64.iter().map(|&v| bf16::from_f32(v as f32)).collect(); + let v_u32: Vec = v_f64.iter().map(|&v| v as u32).collect(); + let v_u8: Vec = v_f64.iter().map(|&v| v as u8).collect(); + let v_i64: Vec = v_f64.iter().map(|&v| v as i64).collect(); - let output: Vec = cast(&input, "cast_f32_bf16"); - let expected: Vec = input.iter().map(|v| bf16::from_f32(*v as f32)).collect(); + // i64 -> f32 + let results: Vec = run_cast(&v_i64, "cast_i64_f32"); + assert_eq!(results, v_f32); - assert_eq!(output, expected); -} + // i64 -> f16 + let results: Vec = run_cast(&v_i64, "cast_i64_f16"); + assert_eq!(results, v_f16); -#[test] -fn it_cast_bf16_u8() { - let 
input: Vec = (1..=3).map(|v| bf16::from_f32(v as f32)).collect(); + // i64 -> bf16 + let results: Vec = run_cast(&v_i64, "cast_i64_bf16"); + assert_eq!(results, v_bf16); - let output: Vec = cast(&input, "cast_bf16_u8"); - let expected: Vec = input.iter().map(|v| v.to_f32() as u8).collect(); + // i64 -> u32 + let results: Vec = run_cast(&v_i64, "cast_i64_u32"); + assert_eq!(results, v_u32); - assert_eq!(output, expected); -} - -#[test] -fn it_cast_bf16_f16() { - let input: Vec = (1..=3).map(|v| bf16::from_f32(v as f32)).collect(); - - let output: Vec = cast(&input, "cast_bf16_f16"); - let expected: Vec = input.iter().map(|v| f16::from_f32(v.to_f32())).collect(); - - assert_eq!(output, expected); -} - -#[test] -fn it_cast_f16_bf16() { - let input: Vec = (1..=3).map(|v| f16::from_f32(v as f32)).collect(); - - let output: Vec = cast(&input, "cast_f16_bf16"); - let expected: Vec = input.iter().map(|v| bf16::from_f32(v.to_f32())).collect(); - - assert_eq!(output, expected); + // i64 -> u8 + let results: Vec = run_cast(&v_i64, "cast_i64_u8"); + assert_eq!(results, v_u8); } fn run_affine(v: &[T], mul: f64, add: f64) -> Vec { From a15f859ab4b220aea103d516baa209088737c346 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 17 Mar 2024 21:15:12 +0100 Subject: [PATCH 022/131] Fix for the encodec example. (#1861) --- candle-examples/examples/encodec/main.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/candle-examples/examples/encodec/main.rs b/candle-examples/examples/encodec/main.rs index f1c4a0ee..9d0d81d3 100644 --- a/candle-examples/examples/encodec/main.rs +++ b/candle-examples/examples/encodec/main.rs @@ -109,8 +109,7 @@ fn main() -> Result<()> { let codes = match args.action { Action::CodeToAudio => { let codes = candle::safetensors::load(args.in_file, &device)?; - let codes = codes.get("codes").expect("no codes in input file").i(0)?; - codes + codes.get("codes").expect("no codes in input file").clone() } Action::AudioToCode | Action::AudioToAudio => { let (pcm, sample_rate) = pcm_decode(args.in_file)?; From 184105792f1d5c70ac07da4832938f3963c740dc Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Sun, 17 Mar 2024 17:19:12 -0400 Subject: [PATCH 023/131] add test for index add and add missing match statements (#1862) --- candle-core/src/metal_backend.rs | 22 ++++- candle-metal-kernels/src/indexing.metal | 38 +++++--- candle-metal-kernels/src/tests.rs | 116 ++++++++++++++++++++++++ 3 files changed, 160 insertions(+), 16 deletions(-) diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index a6513b1c..3bee7657 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -1242,9 +1242,29 @@ impl BackendStorage for MetalStorage { None => Err(crate::Error::RequiresContiguous { op: "index-add" }.bt())?, }; let name = match (ids.dtype, self.dtype) { + (DType::I64, DType::BF16) => "ia_i64_bf16", + (DType::I64, DType::F16) => "ia_i64_f16", + (DType::I64, DType::F32) => "ia_i64_f32", + (DType::I64, DType::I64) => "ia_i64_i64", + (DType::I64, DType::U32) => "ia_i64_u32", + (DType::I64, DType::U8) => "ia_i64_u8", + + (DType::U32, DType::BF16) => "ia_u32_bf16", + (DType::U32, DType::F16) => "ia_u32_f16", (DType::U32, DType::F32) => "ia_u32_f32", + (DType::U32, DType::I64) => "ia_u32_i64", + (DType::U32, DType::U32) => "ia_u32_u32", + (DType::U32, DType::U8) => "ia_u32_u8", + + (DType::U8, DType::BF16) => "ia_u8_bf16", + (DType::U8, DType::F16) => "ia_u8_f16", + (DType::U8, DType::F32) => "ia_u8_f32", + (DType::U8, 
DType::I64) => "ia_u8_i64", + (DType::U8, DType::U32) => "ia_u8_u32", + (DType::U8, DType::U8) => "ia_u8_u8", + _ => Err(MetalError::UnexpectedDType { - msg: "index-add ids should be u32", + msg: "index-add ids should be u8/u32/i64", expected: DType::U32, got: ids.dtype(), })?, diff --git a/candle-metal-kernels/src/indexing.metal b/candle-metal-kernels/src/indexing.metal index f6b81be0..65491759 100644 --- a/candle-metal-kernels/src/indexing.metal +++ b/candle-metal-kernels/src/indexing.metal @@ -167,6 +167,10 @@ kernel void NAME( \ INDEX_OP(is_u32_f32, uint, float) INDEX_OP(is_u32_f16, uint, half) +#if defined(__HAVE_BFLOAT__) +INDEX_OP(is_u32_bf16, uint32_t, bfloat) +INDEX_OP(is_u8_bf16, uint8_t, bfloat) +#endif GATHER_OP(gather_u32_f32, uint, float) GATHER_OP(gather_u32_f16, uint, half) @@ -177,34 +181,38 @@ SCATTER_ADD_OP(sa_i64_f32, int64_t, float) SCATTER_ADD_OP(sa_u32_f16, uint32_t, half) SCATTER_ADD_OP(sa_u8_f16, uint8_t, half) SCATTER_ADD_OP(sa_i64_f16, int64_t, half) - #if defined(__HAVE_BFLOAT__) -INDEX_OP(is_u32_bf16, uint32_t, bfloat) -INDEX_OP(is_u8_bf16, uint8_t, bfloat) - -INDEX_ADD_OP(ia_i64_bf16, int64_t, bfloat) -INDEX_ADD_OP(ia_u32_bf16, uint32_t, bfloat) -INDEX_ADD_OP(ia_u8_bf16, uint8_t, bfloat) - SCATTER_ADD_OP(sa_u32_bf16, uint32_t, bfloat) SCATTER_ADD_OP(sa_u8_bf16, uint8_t, bfloat) SCATTER_ADD_OP(sa_i64_bf16, int64_t, bfloat) #endif -INDEX_ADD_OP(ia_u32_f16, uint32_t, half) -INDEX_ADD_OP(ia_u8_f16, uint8_t, half) - +// i64 +INDEX_ADD_OP(ia_i64_f16, int64_t, half) INDEX_ADD_OP(ia_i64_f32, int64_t, float) -INDEX_ADD_OP(ia_i64_u8, int64_t, uint8_t) INDEX_ADD_OP(ia_i64_i64, int64_t, int64_t) INDEX_ADD_OP(ia_i64_u32, int64_t, uint32_t) +INDEX_ADD_OP(ia_i64_u8, int64_t, uint8_t) +#if defined(__HAVE_BFLOAT__) +INDEX_ADD_OP(ia_i64_bf16, int64_t, bfloat) +#endif +// u32 +INDEX_ADD_OP(ia_u32_f16, uint32_t, half) INDEX_ADD_OP(ia_u32_f32, uint32_t, float) -INDEX_ADD_OP(ia_u32_u8, uint32_t, uint8_t) INDEX_ADD_OP(ia_u32_i64, uint32_t, int64_t) INDEX_ADD_OP(ia_u32_u32, uint32_t, uint32_t) +INDEX_ADD_OP(ia_u32_u8, uint32_t, uint8_t) +#if defined(__HAVE_BFLOAT__) +INDEX_ADD_OP(ia_u32_bf16, uint32_t, bfloat) +#endif +// u8 +INDEX_ADD_OP(ia_u8_f16, uint8_t, half) INDEX_ADD_OP(ia_u8_f32, uint8_t, float) -INDEX_ADD_OP(ia_u8_u8, uint8_t, uint8_t) -INDEX_ADD_OP(ia_u8_u32, uint8_t, uint32_t) INDEX_ADD_OP(ia_u8_i64, uint8_t, int64_t) +INDEX_ADD_OP(ia_u8_u32, uint8_t, uint32_t) +INDEX_ADD_OP(ia_u8_u8, uint8_t, uint8_t) +#if defined(__HAVE_BFLOAT__) +INDEX_ADD_OP(ia_u8_bf16, uint8_t, bfloat) +#endif \ No newline at end of file diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index b2f1d723..a34882d3 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -1252,3 +1252,119 @@ fn scatter_add() { } } } + +fn run_index_add( + left: &[T], + right: &[T], + indices: &[I], + shape: &[usize], + dim: usize, + name: &'static str, +) -> Vec { + let device = device(); + let kernels = Kernels::new(); + let command_queue = device.new_command_queue(); + let command_buffer = command_queue.new_command_buffer(); + let input_buffer = new_buffer(&device, right); + let output = new_buffer(&device, left); + let indices_buffer = new_buffer(&device, indices); + call_index_add( + &device, + command_buffer, + &kernels, + name, + shape, + shape, + shape, + dim, + &input_buffer, + 0, + &indices_buffer, + 0, + &output, + ) + .unwrap(); + command_buffer.commit(); + command_buffer.wait_until_completed(); + read_to_vec(&output, left.len()) +} + +#[test] 
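// Editor's note on the semantics exercised below (a reference sketch for the
// reader, not part of this diff): along the indexed dimension, `index_add`
// adds each row of `right` into `left` at the position named by `indices`,
// accumulating whenever an index repeats. For the 1-D case used by this test,
// a CPU equivalent would be:
//
//     fn index_add_ref(left: &mut [f32], right: &[f32], indices: &[u32]) {
//         for (r, &i) in right.iter().zip(indices) {
//             left[i as usize] += r; // repeated indices accumulate
//         }
//     }
//
// With left = [1, 2, 3, 4, 5, 6], right = [1; 6] and indices = [0, 1, 0, 1, 0, 1],
// slots 0 and 1 each receive three +1 contributions, hence the expected
// [4, 5, 3, 4, 5, 6] asserted in every dtype branch below.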
+fn index_add() { + let left = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0]; + let right = vec![1.0f32, 1.0, 1.0, 1.0, 1.0, 1.0]; + let indices = vec![0u32, 1, 0, 1, 0, 1]; + let shape = vec![6]; + + // u32, f32 + { + let results = run_index_add(&left, &right, &indices, &shape, 0, "ia_u32_f32"); + assert_eq!(results, vec![4.0, 5.0, 3.0, 4.0, 5.0, 6.0]); + } + + // u32, f16 + { + let left = left.iter().map(|v| f16::from_f32(*v)).collect::>(); + let right = right.iter().map(|v| f16::from_f32(*v)).collect::>(); + let results = run_index_add(&left, &right, &indices, &shape, 0, "ia_u32_f16"); + assert_eq!(approx_f16(results, 4), vec![4.0, 5.0, 3.0, 4.0, 5.0, 6.0]); + } + + // u32, bf16 + { + let left = left.iter().map(|v| bf16::from_f32(*v)).collect::>(); + let right = right.iter().map(|v| bf16::from_f32(*v)).collect::>(); + let results = run_index_add(&left, &right, &indices, &shape, 0, "ia_u32_bf16"); + assert_eq!(approx_bf16(results, 4), vec![4.0, 5.0, 3.0, 4.0, 5.0, 6.0]); + } + + // u8, f32 + { + let indices = indices.iter().map(|v| *v as u8).collect::>(); + let results = run_index_add(&left, &right, &indices, &shape, 0, "ia_u8_f32"); + assert_eq!(results, vec![4.0, 5.0, 3.0, 4.0, 5.0, 6.0]); + } + + // u8, f16 + { + let indices = indices.iter().map(|v| *v as u8).collect::>(); + let left = left.iter().map(|v| f16::from_f32(*v)).collect::>(); + let right = right.iter().map(|v| f16::from_f32(*v)).collect::>(); + let results = run_index_add(&left, &right, &indices, &shape, 0, "ia_u8_f16"); + assert_eq!(approx_f16(results, 4), vec![4.0, 5.0, 3.0, 4.0, 5.0, 6.0]); + } + + // u8, bf16 + { + let indices = indices.iter().map(|v| *v as u8).collect::>(); + let left = left.iter().map(|v| bf16::from_f32(*v)).collect::>(); + let right = right.iter().map(|v| bf16::from_f32(*v)).collect::>(); + let results = run_index_add(&left, &right, &indices, &shape, 0, "ia_u8_bf16"); + assert_eq!(approx_bf16(results, 4), vec![4.0, 5.0, 3.0, 4.0, 5.0, 6.0]); + } + + // i64, f32 + { + let indices = indices.iter().map(|v| *v as i64).collect::>(); + let results = run_index_add(&left, &right, &indices, &shape, 0, "ia_i64_f32"); + assert_eq!(results, vec![4.0, 5.0, 3.0, 4.0, 5.0, 6.0]); + } + + // i64, f16 + { + let indices = indices.iter().map(|v| *v as i64).collect::>(); + let left = left.iter().map(|v| f16::from_f32(*v)).collect::>(); + let right = right.iter().map(|v| f16::from_f32(*v)).collect::>(); + let results = run_index_add(&left, &right, &indices, &shape, 0, "ia_i64_f16"); + assert_eq!(approx_f16(results, 4), vec![4.0, 5.0, 3.0, 4.0, 5.0, 6.0]); + } + + // i64, bf16 + { + let indices = indices.iter().map(|v| *v as i64).collect::>(); + let left = left.iter().map(|v| bf16::from_f32(*v)).collect::>(); + let right = right.iter().map(|v| bf16::from_f32(*v)).collect::>(); + let results = run_index_add(&left, &right, &indices, &shape, 0, "ia_i64_bf16"); + assert_eq!(approx_bf16(results, 4), vec![4.0, 5.0, 3.0, 4.0, 5.0, 6.0]); + } +} From 754fa1e8134dd78c841c936eca746de9408e9ea7 Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Mon, 18 Mar 2024 03:33:30 -0400 Subject: [PATCH 024/131] Add support for max_pool2d for Metal backend (#1863) * first pass at implementation of maxpool2d * Add definitions for other dtypes * add tests for other dtypes * Cosmetic tweaks + re-enable maxpool2d tests for metal. 
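As a usage-level sketch (illustrative only, not part of this diff: it assumes a
Metal-enabled build of the `candle_core` crate and the existing pooling methods
on `Tensor`), the new kernel is reached through the public tensor API:

use candle_core::{Device, Tensor};

fn main() -> candle_core::Result<()> {
    let dev = Device::new_metal(0)?;
    // 1x1x4x4 NCHW input, the same layout the new tests exercise.
    let t = Tensor::arange(0f32, 16f32, &dev)?.reshape((1, 1, 4, 4))?;
    // Kernel size 2 (the stride defaults to the kernel size): each 2x2 window
    // keeps its maximum, giving [[5, 7], [13, 15]].
    let pooled = t.max_pool2d(2)?;
    println!("{pooled}");
    Ok(())
}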
--------- Co-authored-by: Laurent --- candle-core/src/metal_backend.rs | 44 ++++- candle-core/tests/pool_tests.rs | 3 - candle-metal-kernels/src/conv.metal | 82 ++++++++++ candle-metal-kernels/src/lib.rs | 33 ++++ candle-metal-kernels/src/tests.rs | 239 +++++++++++++++++++++++++++- 5 files changed, 394 insertions(+), 7 deletions(-) diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index 3bee7657..d77fbf3f 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -263,7 +263,7 @@ impl MetalDevice { } } } - return best_buffer.map(|b| b.clone()); + best_buffer.cloned() } fn drop_unused_buffers(&self) -> Result<()> { @@ -1048,8 +1048,46 @@ impl BackendStorage for MetalStorage { crate::bail!("Metal avg_pool2d not implemented") } - fn max_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result { - crate::bail!("Metal max_pool2d not implemented") + fn max_pool2d( + &self, + inp_l: &Layout, + (w_k, h_k): (usize, usize), + (w_stride, h_stride): (usize, usize), + ) -> Result { + let shape = inp_l.shape(); + let (b_size, channels, width, height) = shape.dims4()?; + let strides = inp_l.stride(); + let name = match self.dtype { + DType::F32 => "max_pool2d_f32", + DType::F16 => "max_pool2d_f16", + DType::BF16 => "max_pool2d_bf16", + DType::U8 => "max_pool2d_u8", + DType::U32 => "max_pool2d_u32", + dtype => crate::bail!("Metal upsample_nearest2d {dtype:?} not implemented"), + }; + let out_w = (width - w_k) / w_stride + 1; + let out_h = (height - h_k) / h_stride + 1; + let dst_el = out_w * out_h * b_size * channels; + let buffer = self.device.new_buffer(dst_el, self.dtype, "max_pool2d")?; + let command_buffers = self.device.command_buffer()?; + candle_metal_kernels::call_max_pool2d( + &self.device.device, + &command_buffers, + &self.device.kernels, + name, + inp_l.dims(), + strides, + out_w, + out_h, + w_k, + h_k, + w_stride, + h_stride, + &self.buffer, + &buffer, + ) + .map_err(MetalError::from)?; + Ok(Self::new(buffer, self.device.clone(), dst_el, self.dtype)) } fn upsample_nearest1d(&self, _: &Layout, _: usize) -> Result { diff --git a/candle-core/tests/pool_tests.rs b/candle-core/tests/pool_tests.rs index a6530e03..8e273fb9 100644 --- a/candle-core/tests/pool_tests.rs +++ b/candle-core/tests/pool_tests.rs @@ -22,9 +22,6 @@ fn avg_pool2d(dev: &Device) -> Result<()> { } fn max_pool2d(dev: &Device) -> Result<()> { - if dev.is_metal() { - return Ok(()); - } let data: Vec = vec![ 1., 2., 1., 3., 0., 0., 1., 1., 1., 1., 1., 1., 5., 1., 1., 1., ]; diff --git a/candle-metal-kernels/src/conv.metal b/candle-metal-kernels/src/conv.metal index dca53161..d7c23ddf 100644 --- a/candle-metal-kernels/src/conv.metal +++ b/candle-metal-kernels/src/conv.metal @@ -1,3 +1,9 @@ +#include + +using namespace metal; + +#define MAX(x, y) ((x) > (y) ? 
(x) : (y)) + template METAL_FUNC void im2col( constant size_t &dst_numel, @@ -200,6 +206,74 @@ kernel void FN_NAME( \ upsample_nearest2d(w_out, h_out, w_scale, h_scale, dims, strides, src, dst, tid); \ } \ +template +METAL_FUNC void max_pool2d( + constant size_t &w_k, + constant size_t &h_k, + constant size_t &w_stride, + constant size_t &h_stride, + constant size_t *src_dims, + constant size_t *src_strides, + device const T *src, + device T *dst, + uint tid [[ thread_position_in_grid ]] +) { + const size_t c = src_dims[1]; + const size_t w_in = src_dims[2]; + const size_t h_in = src_dims[3]; + + const size_t w_out = (w_in - w_k) / w_stride + 1; + const size_t h_out = (h_in - h_k) / h_stride + 1; + if (tid >= src_dims[0] * c * w_out * h_out) { + return; + } + + const size_t b_idx = tid / (w_out * h_out * c); + const size_t c_idx = (tid / (w_out * h_out)) % c; + const size_t dst_w = (tid / h_out) % w_out; + const size_t dst_h = tid % h_out; + + const size_t src_idx0 = b_idx * src_strides[0]; + T d = 0; + bool set = false; + for (size_t w_offset = 0; w_offset < w_k; ++w_offset) { + size_t src_w = w_stride * dst_w + w_offset; + if (src_w >= w_in){ + continue; + } + for (size_t h_offset = 0; h_offset < h_k; ++h_offset) { + size_t src_h = h_stride * dst_h + h_offset; + if (src_h >= h_in) { + continue; + } + const size_t src_idx = src_idx0 + c_idx * src_strides[1] + src_w * src_strides[2] + src_h * src_strides[3]; + if (set) { + d = MAX(d, src[src_idx]); + } + else { + d = src[src_idx]; + set = true; + } + } + } + dst[tid] = d; +} + +#define MAXPOOL2D_OP(TYPENAME, FN_NAME) \ +kernel void FN_NAME( \ + constant size_t &w_k, \ + constant size_t &h_k, \ + constant size_t &w_s, \ + constant size_t &h_s, \ + constant size_t *src_dims, \ + constant size_t *src_s, \ + device const TYPENAME *src, \ + device TYPENAME *dst, \ + uint tid [[ thread_position_in_grid ]] \ +) { \ + max_pool2d(w_k, h_k, w_s, h_s, src_dims, src_s, src, dst, tid); \ +} \ + IM2COL_OP(float, im2col_f32) IM2COL_OP(uint8_t, im2col_u8) IM2COL_OP(uint32_t, im2col_u32) @@ -211,3 +285,11 @@ IM2COL1D_OP(uint32_t, im2col1d_u32) UPSAMPLE_NEAREST2D_OP(float, upsample_nearest2d_f32) UPSAMPLE_NEAREST2D_OP(uint8_t, upsample_nearest2d_u8) UPSAMPLE_NEAREST2D_OP(uint32_t, upsample_nearest2d_u32) + +MAXPOOL2D_OP(float, max_pool2d_f32) +MAXPOOL2D_OP(half, max_pool2d_f16) +MAXPOOL2D_OP(uint32_t, max_pool2d_u32) +MAXPOOL2D_OP(uint8_t, max_pool2d_u8) +#if defined(__HAVE_BFLOAT__) +MAXPOOL2D_OP(bfloat, max_pool2d_bf16) +#endif \ No newline at end of file diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index a879c86a..b1830a25 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -1826,5 +1826,38 @@ fn divide(m: usize, b: usize) -> NSUInteger { ((m + b - 1) / b) as NSUInteger } +#[allow(clippy::too_many_arguments)] +pub fn call_max_pool2d( + device: &Device, + command_buffer: &CommandBufferRef, + kernels: &Kernels, + name: &'static str, + shape: &[usize], + strides: &[usize], + out_w: usize, + out_h: usize, + w_k: usize, + h_k: usize, + w_stride: usize, + h_stride: usize, + input: &Buffer, + output: &Buffer, +) -> Result<(), MetalKernelError> { + let dst_el = out_w * out_h * shape[0] * shape[1]; + let pipeline: ComputePipelineState = kernels.load_pipeline(device, Source::Conv, name)?; + let (thread_group_count, thread_group_size) = linear_split(&pipeline, dst_el); + let encoder = command_buffer.new_compute_command_encoder(); + encoder.set_compute_pipeline_state(&pipeline); + set_params!( + 
encoder, + (w_k, h_k, w_stride, h_stride, shape, strides, input, output) + ); + encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(output, metal::MTLResourceUsage::Write); + encoder.dispatch_thread_groups(thread_group_count, thread_group_size); + encoder.end_encoding(); + Ok(()) +} + #[cfg(test)] mod tests; diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index a34882d3..74721153 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -1,6 +1,6 @@ use super::*; use half::{bf16, f16}; -use metal::{Buffer, Device, MTLResourceOptions}; +use metal::MTLResourceOptions; fn read_to_vec(buffer: &Buffer, n: usize) -> Vec { let ptr = buffer.contents() as *const T; @@ -1368,3 +1368,240 @@ fn index_add() { assert_eq!(approx_bf16(results, 4), vec![4.0, 5.0, 3.0, 4.0, 5.0, 6.0]); } } + +fn run_max_pool2d( + v: &[T], + (w_k, h_k): (usize, usize), + (w_stride, h_stride): (usize, usize), + shape: &[usize], + strides: &[usize], + name: &'static str, +) -> Vec { + let device = device(); + let command_queue = device.new_command_queue(); + let command_buffer = command_queue.new_command_buffer(); + let out_w = (shape[2] - w_k) / w_stride + 1; + let out_h = (shape[3] - h_k) / h_stride + 1; + let dst_el = out_w * out_h * shape[0] * shape[1]; + let input = new_buffer(&device, v); + let output = new_buffer(&device, &vec![0.0f32; dst_el]); + let kernels = Kernels::new(); + call_max_pool2d( + &device, + command_buffer, + &kernels, + name, + shape, + strides, + out_w, + out_h, + w_k, + h_k, + w_stride, + h_stride, + &input, + &output, + ) + .unwrap(); + command_buffer.commit(); + command_buffer.wait_until_completed(); + + read_to_vec(&output, dst_el) +} + +#[test] +fn max_pool2d_f32() { + // kernel 2 stride 1 + let v: Vec = (0..16).map(|v| v as f32).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 1; + let results = run_max_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "max_pool2d_f32", + ); + let expected = vec![5.0, 6.0, 7.0, 9.0, 10.0, 11.0, 13.0, 14.0, 15.0]; + assert_eq!(results, expected); + + // kernel 2 stride 2 + let v: Vec = (0..16).map(|v| v as f32).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 2; + let results = run_max_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "max_pool2d_f32", + ); + let expected = vec![5.0, 7.0, 13.0, 15.0]; + assert_eq!(results, expected); +} + +#[test] +fn max_pool2d_f16() { + // kernel 2 stride 1 + let v: Vec = (0..16).map(|v| half::f16::from_f32(v as f32)).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 1; + let results = run_max_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "max_pool2d_f16", + ); + let expected = vec![5.0, 6.0, 7.0, 9.0, 10.0, 11.0, 13.0, 14.0, 15.0] + .iter() + .map(|v| half::f16::from_f32(*v)) + .collect::>(); + assert_eq!(results, expected); + + // kernel 2 stride 2 + let v: Vec = (0..16).map(|v| half::f16::from_f32(v as f32)).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 2; + let results = run_max_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "max_pool2d_f16", + ); + let expected = vec![5.0, 7.0, 13.0, 15.0] + .iter() + .map(|v| half::f16::from_f32(*v)) + .collect::>(); + 
assert_eq!(results, expected); +} + +#[test] +fn max_pool2d_bf16() { + // kernel 2 stride 1 + let v: Vec = (0..16).map(|v| half::bf16::from_f32(v as f32)).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 1; + let results = run_max_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "max_pool2d_bf16", + ); + let expected = vec![5.0, 6.0, 7.0, 9.0, 10.0, 11.0, 13.0, 14.0, 15.0] + .iter() + .map(|v| half::bf16::from_f32(*v)) + .collect::>(); + assert_eq!(results, expected); + + // kernel 2 stride 2 + let v: Vec = (0..16).map(|v| half::bf16::from_f32(v as f32)).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 2; + let results = run_max_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "max_pool2d_bf16", + ); + let expected = vec![5.0, 7.0, 13.0, 15.0] + .iter() + .map(|v| half::bf16::from_f32(*v)) + .collect::>(); + assert_eq!(results, expected); +} + +#[test] +fn max_pool2d_u8() { + // kernel 2 stride 1 + let v: Vec = (0..16).map(|v| v as u8).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 1; + let results = run_max_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "max_pool2d_u8", + ); + let expected = vec![5, 6, 7, 9, 10, 11, 13, 14, 15]; + assert_eq!(results, expected); + + // kernel 2 stride 2 + let v: Vec = (0..16).map(|v| v as u8).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 2; + let results = run_max_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "max_pool2d_u8", + ); + let expected = vec![5, 7, 13, 15]; + assert_eq!(results, expected); +} + +#[test] +fn max_pool2d_u32() { + // kernel 2 stride 1 + let v: Vec = (0..16).map(|v| v as u32).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 1; + let results = run_max_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "max_pool2d_u32", + ); + let expected = vec![5, 6, 7, 9, 10, 11, 13, 14, 15]; + assert_eq!(results, expected); + + // kernel 2 stride 2 + let v: Vec = (0..16).map(|v| v as u32).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 2; + let results = run_max_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "max_pool2d_u32", + ); + let expected = vec![5, 7, 13, 15]; + assert_eq!(results, expected); +} From d365ef32d90b3712da5a1f2f517b83e4197738c0 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 18 Mar 2024 10:09:40 +0100 Subject: [PATCH 025/131] Improve the encodec example: handle resampling. (#1865) * Improve the encodec example: handle resampling. * Play the audio directly. 
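In short, the decode path now normalizes everything to EnCodec's 24 kHz sample
rate before encoding; condensed from the diff below, the flow is:

let (pcm, sample_rate) = audio_io::pcm_decode(args.in_file)?;
let pcm = if sample_rate != 24_000 {
    // Anything that is not already 24 kHz goes through the rubato resampler.
    audio_io::resample(&pcm, sample_rate as usize, 24_000)?
} else {
    pcm
};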
--- candle-examples/Cargo.toml | 6 +- candle-examples/examples/encodec/README.md | 5 +- candle-examples/examples/encodec/audio_io.rs | 275 +++++++++++++++++++ candle-examples/examples/encodec/main.rs | 87 ++---- 4 files changed, 309 insertions(+), 64 deletions(-) create mode 100644 candle-examples/examples/encodec/audio_io.rs diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml index cb704f0c..108ca32a 100644 --- a/candle-examples/Cargo.toml +++ b/candle-examples/Cargo.toml @@ -27,6 +27,7 @@ intel-mkl-src = { workspace = true, optional = true } num-traits = { workspace = true } pyo3 = { version = "0.20.0", features = ["auto-initialize"], optional = true } rayon = { workspace = true } +rubato = { version = "0.15.0", optional = true } safetensors = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } @@ -63,6 +64,7 @@ nccl = ["cuda", "cudarc/nccl", "dep:half"] onnx = ["candle-onnx"] metal = ["candle/metal", "candle-nn/metal"] microphone = ["cpal"] +encodec = ["cpal", "symphonia", "rubato"] [[example]] name = "llama_multiprocess" @@ -98,6 +100,4 @@ required-features = ["candle-datasets"] [[example]] name = "encodec" -required-features = ["symphonia"] - - +required-features = ["encodec"] diff --git a/candle-examples/examples/encodec/README.md b/candle-examples/examples/encodec/README.md index 3028fb80..4a8eb0b6 100644 --- a/candle-examples/examples/encodec/README.md +++ b/candle-examples/examples/encodec/README.md @@ -13,8 +13,9 @@ cargo run --example encodec --features symphonia --release -- code-to-audio \ ``` This decodes the EnCodec tokens stored in `jfk-codes.safetensors` and generates -an output wav file containing the audio data. Instead of `code-to-audio` one -can use: +an output wav file containing the audio data. If the output file name is set to +`-`, the audio content directly gets played on the computer speakers if any. +Instead of `code-to-audio` one can use: - `audio-to-audio in.mp3 out.wav`: encodes the input audio file then decodes it to a wav file. - `audio-to-code in.mp3 out.safetensors`: generates a safetensors file containing EnCodec tokens for the input audio file. 
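For example, a round trip that re-encodes an existing file and plays the result
straight to the speakers could look like this (input file name hypothetical; the
`encodec` feature is the one introduced in the Cargo.toml change above):

cargo run --example encodec --features encodec --release -- audio-to-audio in.mp3 -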
diff --git a/candle-examples/examples/encodec/audio_io.rs b/candle-examples/examples/encodec/audio_io.rs new file mode 100644 index 00000000..2103dd4a --- /dev/null +++ b/candle-examples/examples/encodec/audio_io.rs @@ -0,0 +1,275 @@ +#![allow(unused)] +use anyhow::{Context, Result}; +use std::sync::{Arc, Mutex}; + +pub const SAMPLE_RATE: usize = 24_000; + +pub(crate) struct AudioOutputData_ { + resampled_data: std::collections::VecDeque, + resampler: rubato::FastFixedIn, + output_buffer: Vec, + input_buffer: Vec, + input_len: usize, +} + +impl AudioOutputData_ { + pub(crate) fn new(input_sample_rate: usize, output_sample_rate: usize) -> Result { + use rubato::Resampler; + + let resampled_data = std::collections::VecDeque::with_capacity(output_sample_rate * 10); + let resample_ratio = output_sample_rate as f64 / input_sample_rate as f64; + let resampler = rubato::FastFixedIn::new( + resample_ratio, + f64::max(resample_ratio, 1.0), + rubato::PolynomialDegree::Septic, + 1024, + 1, + )?; + let input_buffer = resampler.input_buffer_allocate(true).remove(0); + let output_buffer = resampler.output_buffer_allocate(true).remove(0); + Ok(Self { + resampled_data, + resampler, + input_buffer, + output_buffer, + input_len: 0, + }) + } + + pub fn reset(&mut self) { + use rubato::Resampler; + self.output_buffer.fill(0.); + self.input_buffer.fill(0.); + self.resampler.reset(); + self.resampled_data.clear(); + } + + pub(crate) fn take_all(&mut self) -> Vec { + let mut data = Vec::with_capacity(self.resampled_data.len()); + while let Some(elem) = self.resampled_data.pop_back() { + data.push(elem); + } + data + } + + pub(crate) fn is_empty(&self) -> bool { + self.resampled_data.is_empty() + } + + // Assumes that the input buffer is large enough. + fn push_input_buffer(&mut self, samples: &[f32]) { + self.input_buffer[self.input_len..self.input_len + samples.len()].copy_from_slice(samples); + self.input_len += samples.len() + } + + pub(crate) fn push_samples(&mut self, samples: &[f32]) -> Result<()> { + use rubato::Resampler; + + let mut pos_in = 0; + loop { + let rem = self.input_buffer.len() - self.input_len; + let pos_end = usize::min(pos_in + rem, samples.len()); + self.push_input_buffer(&samples[pos_in..pos_end]); + pos_in = pos_end; + if self.input_len < self.input_buffer.len() { + break; + } + let (_, out_len) = self.resampler.process_into_buffer( + &[&self.input_buffer], + &mut [&mut self.output_buffer], + None, + )?; + for &elem in self.output_buffer[..out_len].iter() { + self.resampled_data.push_front(elem) + } + self.input_len = 0; + } + Ok(()) + } +} + +type AudioOutputData = Arc>; + +pub(crate) fn setup_output_stream() -> Result<(cpal::Stream, AudioOutputData)> { + use cpal::traits::{DeviceTrait, HostTrait, StreamTrait}; + + println!("Setup audio output stream!"); + let host = cpal::default_host(); + let device = host + .default_output_device() + .context("no output device available")?; + let mut supported_configs_range = device.supported_output_configs()?; + let config_range = match supported_configs_range.find(|c| c.channels() == 1) { + // On macOS, it's commonly the case that there are only stereo outputs. + None => device + .supported_output_configs()? 
+ .next() + .context("no audio output available")?, + Some(config_range) => config_range, + }; + let sample_rate = cpal::SampleRate(SAMPLE_RATE as u32).clamp( + config_range.min_sample_rate(), + config_range.max_sample_rate(), + ); + let config: cpal::StreamConfig = config_range.with_sample_rate(sample_rate).into(); + let channels = config.channels as usize; + println!( + "cpal device: {} {} {config:?}", + device.name().unwrap_or_else(|_| "unk".to_string()), + config.sample_rate.0 + ); + let audio_data = Arc::new(Mutex::new(AudioOutputData_::new( + SAMPLE_RATE, + config.sample_rate.0 as usize, + )?)); + let ad = audio_data.clone(); + let stream = device.build_output_stream( + &config, + move |data: &mut [f32], _: &cpal::OutputCallbackInfo| { + data.fill(0.); + let mut ad = ad.lock().unwrap(); + let mut last_elem = 0f32; + for (idx, elem) in data.iter_mut().enumerate() { + if idx % channels == 0 { + match ad.resampled_data.pop_back() { + None => break, + Some(v) => { + last_elem = v; + *elem = v + } + } + } else { + *elem = last_elem + } + } + }, + move |err| eprintln!("cpal error: {err}"), + None, // None=blocking, Some(Duration)=timeout + )?; + stream.play()?; + Ok((stream, audio_data)) +} + +pub(crate) fn setup_input_stream() -> Result<(cpal::Stream, AudioOutputData)> { + use cpal::traits::{DeviceTrait, HostTrait, StreamTrait}; + + println!("Setup audio input stream!"); + let host = cpal::default_host(); + let device = host + .default_input_device() + .context("no input device available")?; + let mut supported_configs_range = device.supported_input_configs()?; + let config_range = supported_configs_range + .find(|c| c.channels() == 1) + .context("no audio input available")?; + let sample_rate = cpal::SampleRate(SAMPLE_RATE as u32).clamp( + config_range.min_sample_rate(), + config_range.max_sample_rate(), + ); + let config: cpal::StreamConfig = config_range.with_sample_rate(sample_rate).into(); + println!( + "cpal device: {} {} {config:?}", + device.name().unwrap_or_else(|_| "unk".to_string()), + config.sample_rate.0 + ); + let audio_data = Arc::new(Mutex::new(AudioOutputData_::new( + config.sample_rate.0 as usize, + SAMPLE_RATE, + )?)); + let ad = audio_data.clone(); + let stream = device.build_input_stream( + &config, + move |data: &[f32], _: &cpal::InputCallbackInfo| { + let mut ad = ad.lock().unwrap(); + if let Err(err) = ad.push_samples(data) { + eprintln!("error processing audio input {err:?}") + } + }, + move |err| eprintln!("cpal error: {err}"), + None, // None=blocking, Some(Duration)=timeout + )?; + stream.play()?; + Ok((stream, audio_data)) +} + +fn conv(samples: &mut Vec, data: std::borrow::Cow>) +where + T: symphonia::core::sample::Sample, + f32: symphonia::core::conv::FromSample, +{ + use symphonia::core::audio::Signal; + use symphonia::core::conv::FromSample; + samples.extend(data.chan(0).iter().map(|v| f32::from_sample(*v))) +} + +pub(crate) fn pcm_decode>(path: P) -> Result<(Vec, u32)> { + use symphonia::core::audio::{AudioBufferRef, Signal}; + + let src = std::fs::File::open(path)?; + let mss = symphonia::core::io::MediaSourceStream::new(Box::new(src), Default::default()); + let hint = symphonia::core::probe::Hint::new(); + let meta_opts: symphonia::core::meta::MetadataOptions = Default::default(); + let fmt_opts: symphonia::core::formats::FormatOptions = Default::default(); + let probed = symphonia::default::get_probe().format(&hint, mss, &fmt_opts, &meta_opts)?; + let mut format = probed.format; + let track = format + .tracks() + .iter() + .find(|t| 
t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL) + .expect("no supported audio tracks"); + let mut decoder = symphonia::default::get_codecs() + .make(&track.codec_params, &Default::default()) + .expect("unsupported codec"); + let track_id = track.id; + let sample_rate = track.codec_params.sample_rate.unwrap_or(0); + let mut pcm_data = Vec::new(); + while let Ok(packet) = format.next_packet() { + while !format.metadata().is_latest() { + format.metadata().pop(); + } + if packet.track_id() != track_id { + continue; + } + match decoder.decode(&packet)? { + AudioBufferRef::F32(buf) => pcm_data.extend(buf.chan(0)), + AudioBufferRef::U8(data) => conv(&mut pcm_data, data), + AudioBufferRef::U16(data) => conv(&mut pcm_data, data), + AudioBufferRef::U24(data) => conv(&mut pcm_data, data), + AudioBufferRef::U32(data) => conv(&mut pcm_data, data), + AudioBufferRef::S8(data) => conv(&mut pcm_data, data), + AudioBufferRef::S16(data) => conv(&mut pcm_data, data), + AudioBufferRef::S24(data) => conv(&mut pcm_data, data), + AudioBufferRef::S32(data) => conv(&mut pcm_data, data), + AudioBufferRef::F64(data) => conv(&mut pcm_data, data), + } + } + Ok((pcm_data, sample_rate)) +} + +pub(crate) fn resample(pcm_in: &[f32], sr_in: usize, sr_out: usize) -> Result> { + use rubato::Resampler; + + let mut pcm_out = + Vec::with_capacity((pcm_in.len() as f64 * sr_out as f64 / sr_in as f64) as usize + 1024); + + let mut resampler = rubato::FftFixedInOut::::new(sr_in, sr_out, 1024, 1)?; + let mut output_buffer = resampler.output_buffer_allocate(true); + let mut pos_in = 0; + while pos_in + resampler.input_frames_next() < pcm_in.len() { + let (in_len, out_len) = + resampler.process_into_buffer(&[&pcm_in[pos_in..]], &mut output_buffer, None)?; + pos_in += in_len; + pcm_out.extend_from_slice(&output_buffer[0][..out_len]); + } + + if pos_in < pcm_in.len() { + let (_in_len, out_len) = resampler.process_partial_into_buffer( + Some(&[&pcm_in[pos_in..]]), + &mut output_buffer, + None, + )?; + pcm_out.extend_from_slice(&output_buffer[0][..out_len]); + } + + Ok(pcm_out) +} diff --git a/candle-examples/examples/encodec/main.rs b/candle-examples/examples/encodec/main.rs index 9d0d81d3..b8728bdb 100644 --- a/candle-examples/examples/encodec/main.rs +++ b/candle-examples/examples/encodec/main.rs @@ -11,59 +11,7 @@ use candle_transformers::models::encodec::{Config, Model}; use clap::{Parser, ValueEnum}; use hf_hub::api::sync::Api; -fn conv(samples: &mut Vec, data: std::borrow::Cow>) -where - T: symphonia::core::sample::Sample, - f32: symphonia::core::conv::FromSample, -{ - use symphonia::core::audio::Signal; - use symphonia::core::conv::FromSample; - samples.extend(data.chan(0).iter().map(|v| f32::from_sample(*v))) -} - -fn pcm_decode>(path: P) -> anyhow::Result<(Vec, u32)> { - use symphonia::core::audio::{AudioBufferRef, Signal}; - - let src = std::fs::File::open(path)?; - let mss = symphonia::core::io::MediaSourceStream::new(Box::new(src), Default::default()); - let hint = symphonia::core::probe::Hint::new(); - let meta_opts: symphonia::core::meta::MetadataOptions = Default::default(); - let fmt_opts: symphonia::core::formats::FormatOptions = Default::default(); - let probed = symphonia::default::get_probe().format(&hint, mss, &fmt_opts, &meta_opts)?; - let mut format = probed.format; - let track = format - .tracks() - .iter() - .find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL) - .expect("no supported audio tracks"); - let mut decoder = symphonia::default::get_codecs() - 
.make(&track.codec_params, &Default::default()) - .expect("unsupported codec"); - let track_id = track.id; - let sample_rate = track.codec_params.sample_rate.unwrap_or(0); - let mut pcm_data = Vec::new(); - while let Ok(packet) = format.next_packet() { - while !format.metadata().is_latest() { - format.metadata().pop(); - } - if packet.track_id() != track_id { - continue; - } - match decoder.decode(&packet)? { - AudioBufferRef::F32(buf) => pcm_data.extend(buf.chan(0)), - AudioBufferRef::U8(data) => conv(&mut pcm_data, data), - AudioBufferRef::U16(data) => conv(&mut pcm_data, data), - AudioBufferRef::U24(data) => conv(&mut pcm_data, data), - AudioBufferRef::U32(data) => conv(&mut pcm_data, data), - AudioBufferRef::S8(data) => conv(&mut pcm_data, data), - AudioBufferRef::S16(data) => conv(&mut pcm_data, data), - AudioBufferRef::S24(data) => conv(&mut pcm_data, data), - AudioBufferRef::S32(data) => conv(&mut pcm_data, data), - AudioBufferRef::F64(data) => conv(&mut pcm_data, data), - } - } - Ok((pcm_data, sample_rate)) -} +mod audio_io; #[derive(Clone, Debug, Copy, PartialEq, Eq, ValueEnum)] enum Action { @@ -112,10 +60,13 @@ fn main() -> Result<()> { codes.get("codes").expect("no codes in input file").clone() } Action::AudioToCode | Action::AudioToAudio => { - let (pcm, sample_rate) = pcm_decode(args.in_file)?; - if sample_rate != 24_000 { - println!("WARNING: encodec uses a 24khz sample rate, input uses {sample_rate}") - } + let (pcm, sample_rate) = audio_io::pcm_decode(args.in_file)?; + let pcm = if sample_rate != 24_000 { + println!("WARNING: encodec uses a 24khz sample rate, input uses {sample_rate}, resampling..."); + audio_io::resample(&pcm, sample_rate as usize, 24_000)? + } else { + pcm + }; let pcm_len = pcm.len(); let pcm = Tensor::from_vec(pcm, (1, 1, pcm_len), &device)?; println!("input pcm shape: {:?}", pcm.shape()); @@ -134,8 +85,26 @@ fn main() -> Result<()> { let pcm = pcm.i(0)?.i(0)?; let pcm = candle_examples::audio::normalize_loudness(&pcm, 24_000, true)?; let pcm = pcm.to_vec1::()?; - let mut output = std::fs::File::create(&args.out_file)?; - candle_examples::wav::write_pcm_as_wav(&mut output, &pcm, 24_000)?; + if args.out_file == "-" { + let (stream, ad) = audio_io::setup_output_stream()?; + { + let mut ad = ad.lock().unwrap(); + ad.push_samples(&pcm)?; + } + loop { + let ad = ad.lock().unwrap(); + if ad.is_empty() { + break; + } + // That's very weird, calling thread::sleep here triggers the stream to stop + // playing (the callback doesn't seem to be called anymore). + // std::thread::sleep(std::time::Duration::from_millis(100)); + } + drop(stream) + } else { + let mut output = std::fs::File::create(&args.out_file)?; + candle_examples::wav::write_pcm_as_wav(&mut output, &pcm, 24_000)?; + } } } Ok(()) From 58605252e8c9355d6f2452f54918e9eb4b938b1f Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 18 Mar 2024 11:19:46 +0100 Subject: [PATCH 026/131] Microphone support for the encodec example. 
(#1866) --- candle-examples/examples/encodec/README.md | 8 ++++-- candle-examples/examples/encodec/main.rs | 30 ++++++++++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/candle-examples/examples/encodec/README.md b/candle-examples/examples/encodec/README.md index 4a8eb0b6..9de0d4ad 100644 --- a/candle-examples/examples/encodec/README.md +++ b/candle-examples/examples/encodec/README.md @@ -13,9 +13,13 @@ cargo run --example encodec --features symphonia --release -- code-to-audio \ ``` This decodes the EnCodec tokens stored in `jfk-codes.safetensors` and generates -an output wav file containing the audio data. If the output file name is set to -`-`, the audio content directly gets played on the computer speakers if any. +an output wav file containing the audio data. + Instead of `code-to-audio` one can use: - `audio-to-audio in.mp3 out.wav`: encodes the input audio file then decodes it to a wav file. - `audio-to-code in.mp3 out.safetensors`: generates a safetensors file containing EnCodec tokens for the input audio file. + +If the audio output file name is set to `-`, the audio content directly gets +played on default audio output device. If the audio input file is set to `-`, the audio +gets recorded from the default audio input. diff --git a/candle-examples/examples/encodec/main.rs b/candle-examples/examples/encodec/main.rs index b8728bdb..e77f98e7 100644 --- a/candle-examples/examples/encodec/main.rs +++ b/candle-examples/examples/encodec/main.rs @@ -60,12 +60,32 @@ fn main() -> Result<()> { codes.get("codes").expect("no codes in input file").clone() } Action::AudioToCode | Action::AudioToAudio => { - let (pcm, sample_rate) = audio_io::pcm_decode(args.in_file)?; - let pcm = if sample_rate != 24_000 { - println!("WARNING: encodec uses a 24khz sample rate, input uses {sample_rate}, resampling..."); - audio_io::resample(&pcm, sample_rate as usize, 24_000)? + let pcm = if args.in_file == "-" { + println!(">>>> RECORDING AUDIO, PRESS ENTER ONCE DONE <<<<"); + let (stream, input_audio) = audio_io::setup_input_stream()?; + let mut pcms = vec![]; + let stdin = std::thread::spawn(|| { + let mut s = String::new(); + std::io::stdin().read_line(&mut s) + }); + while !stdin.is_finished() { + let input = input_audio.lock().unwrap().take_all(); + if input.is_empty() { + std::thread::sleep(std::time::Duration::from_millis(100)); + continue; + } + pcms.push(input) + } + drop(stream); + pcms.concat() } else { - pcm + let (pcm, sample_rate) = audio_io::pcm_decode(args.in_file)?; + if sample_rate != 24_000 { + println!("WARNING: encodec uses a 24khz sample rate, input uses {sample_rate}, resampling..."); + audio_io::resample(&pcm, sample_rate as usize, 24_000)? 
+ } else { + pcm + } }; let pcm_len = pcm.len(); let pcm = Tensor::from_vec(pcm, (1, 1, pcm_len), &device)?; From 04a61a9c72a1f13546c8b7becd95055129fda22f Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Mon, 18 Mar 2024 13:50:14 -0400 Subject: [PATCH 027/131] Add avg_pool2d metal implementation for the metal backend (#1869) * implement metal avg pool 2d * fixX * add suggested precision workaround for the accumulator --- candle-core/src/metal_backend.rs | 46 +++++++++- candle-core/tests/pool_tests.rs | 3 - candle-metal-kernels/src/conv.metal | 69 ++++++++++++++ candle-metal-kernels/src/lib.rs | 2 +- candle-metal-kernels/src/tests.rs | 136 +++++++++++++++++++++++++--- 5 files changed, 236 insertions(+), 20 deletions(-) diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index d77fbf3f..cf354f45 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -1044,8 +1044,46 @@ impl BackendStorage for MetalStorage { crate::bail!("Metal conv_tranpose2d not implemented") } - fn avg_pool2d(&self, _: &Layout, _: (usize, usize), _: (usize, usize)) -> Result { - crate::bail!("Metal avg_pool2d not implemented") + fn avg_pool2d( + &self, + inp_l: &Layout, + (w_k, h_k): (usize, usize), + (w_stride, h_stride): (usize, usize), + ) -> Result { + let shape = inp_l.shape(); + let (b_size, channels, width, height) = shape.dims4()?; + let strides = inp_l.stride(); + let name = match self.dtype { + DType::F32 => "avg_pool2d_f32", + DType::F16 => "avg_pool2d_f16", + DType::BF16 => "avg_pool2d_bf16", + DType::U8 => "avg_pool2d_u8", + DType::U32 => "avg_pool2d_u32", + dtype => crate::bail!("Metal avg_pool2d {dtype:?} not implemented"), + }; + let out_w = (width - w_k) / w_stride + 1; + let out_h = (height - h_k) / h_stride + 1; + let dst_el = out_w * out_h * b_size * channels; + let buffer = self.device.new_buffer(dst_el, self.dtype, "avg_pool2d")?; + let command_buffers = self.device.command_buffer()?; + candle_metal_kernels::call_pool2d( + &self.device.device, + &command_buffers, + &self.device.kernels, + name, + inp_l.dims(), + strides, + out_w, + out_h, + w_k, + h_k, + w_stride, + h_stride, + &self.buffer, + &buffer, + ) + .map_err(MetalError::from)?; + Ok(Self::new(buffer, self.device.clone(), dst_el, self.dtype)) } fn max_pool2d( @@ -1063,14 +1101,14 @@ impl BackendStorage for MetalStorage { DType::BF16 => "max_pool2d_bf16", DType::U8 => "max_pool2d_u8", DType::U32 => "max_pool2d_u32", - dtype => crate::bail!("Metal upsample_nearest2d {dtype:?} not implemented"), + dtype => crate::bail!("Metal max_pool2d {dtype:?} not implemented"), }; let out_w = (width - w_k) / w_stride + 1; let out_h = (height - h_k) / h_stride + 1; let dst_el = out_w * out_h * b_size * channels; let buffer = self.device.new_buffer(dst_el, self.dtype, "max_pool2d")?; let command_buffers = self.device.command_buffer()?; - candle_metal_kernels::call_max_pool2d( + candle_metal_kernels::call_pool2d( &self.device.device, &command_buffers, &self.device.kernels, diff --git a/candle-core/tests/pool_tests.rs b/candle-core/tests/pool_tests.rs index 8e273fb9..1edb7d35 100644 --- a/candle-core/tests/pool_tests.rs +++ b/candle-core/tests/pool_tests.rs @@ -2,9 +2,6 @@ use candle_core::{test_device, test_utils, Device, IndexOp, Result, Tensor}; // https://github.com/huggingface/candle/issues/364 fn avg_pool2d(dev: &Device) -> Result<()> { - if dev.is_metal() { - return Ok(()); - } let data: Vec = vec![ 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., ]; diff --git 
a/candle-metal-kernels/src/conv.metal b/candle-metal-kernels/src/conv.metal index d7c23ddf..7f7a75cf 100644 --- a/candle-metal-kernels/src/conv.metal +++ b/candle-metal-kernels/src/conv.metal @@ -206,6 +206,67 @@ kernel void FN_NAME( \ upsample_nearest2d(w_out, h_out, w_scale, h_scale, dims, strides, src, dst, tid); \ } \ +template +METAL_FUNC void avg_pool2d( + constant size_t &w_k, + constant size_t &h_k, + constant size_t &w_stride, + constant size_t &h_stride, + constant size_t *src_dims, + constant size_t *src_strides, + device const T *src, + device T *dst, + uint tid [[ thread_position_in_grid ]] +) { + const size_t c = src_dims[1]; + const size_t w_in = src_dims[2]; + const size_t h_in = src_dims[3]; + + const size_t w_out = (w_in - w_k) / w_stride + 1; + const size_t h_out = (h_in - h_k) / h_stride + 1; + if (tid >= src_dims[0] * c * w_out * h_out) { + return; + } + + const size_t b_idx = tid / (w_out * h_out * c); + const size_t c_idx = (tid / (w_out * h_out)) % c; + const size_t dst_w = (tid / h_out) % w_out; + const size_t dst_h = tid % h_out; + + const size_t src_idx0 = b_idx * src_strides[0]; + A d = 0; + for (size_t w_offset = 0; w_offset < w_k; ++w_offset) { + size_t src_w = w_stride * dst_w + w_offset; + if (src_w >= w_in){ + continue; + } + for (size_t h_offset = 0; h_offset < h_k; ++h_offset) { + size_t src_h = h_stride * dst_h + h_offset; + if (src_h >= h_in) { + continue; + } + const size_t src_idx = src_idx0 + c_idx * src_strides[1] + src_w * src_strides[2] + src_h * src_strides[3]; + d += static_cast(src[src_idx]); + } + } + dst[tid] = static_cast(d / (w_k * h_k)); +} + +#define AVGPOOL2D_OP(TYPENAME, TYPEACC, FN_NAME) \ +kernel void FN_NAME( \ + constant size_t &w_k, \ + constant size_t &h_k, \ + constant size_t &w_s, \ + constant size_t &h_s, \ + constant size_t *src_dims, \ + constant size_t *src_s, \ + device const TYPENAME *src, \ + device TYPENAME *dst, \ + uint tid [[ thread_position_in_grid ]] \ +) { \ + avg_pool2d(w_k, h_k, w_s, h_s, src_dims, src_s, src, dst, tid); \ +} \ + template METAL_FUNC void max_pool2d( constant size_t &w_k, @@ -292,4 +353,12 @@ MAXPOOL2D_OP(uint32_t, max_pool2d_u32) MAXPOOL2D_OP(uint8_t, max_pool2d_u8) #if defined(__HAVE_BFLOAT__) MAXPOOL2D_OP(bfloat, max_pool2d_bf16) +#endif + +AVGPOOL2D_OP(float, float, avg_pool2d_f32) +AVGPOOL2D_OP(half, float, avg_pool2d_f16) +AVGPOOL2D_OP(uint32_t, uint32_t, avg_pool2d_u32) +AVGPOOL2D_OP(uint8_t, uint8_t, avg_pool2d_u8) +#if defined(__HAVE_BFLOAT__) +AVGPOOL2D_OP(bfloat, float, avg_pool2d_bf16) #endif \ No newline at end of file diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index b1830a25..1161501f 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -1827,7 +1827,7 @@ fn divide(m: usize, b: usize) -> NSUInteger { } #[allow(clippy::too_many_arguments)] -pub fn call_max_pool2d( +pub fn call_pool2d( device: &Device, command_buffer: &CommandBufferRef, kernels: &Kernels, diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index 74721153..19e160dd 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -1369,7 +1369,7 @@ fn index_add() { } } -fn run_max_pool2d( +fn run_pool2d( v: &[T], (w_k, h_k): (usize, usize), (w_stride, h_stride): (usize, usize), @@ -1386,7 +1386,7 @@ fn run_max_pool2d( let input = new_buffer(&device, v); let output = new_buffer(&device, &vec![0.0f32; dst_el]); let kernels = Kernels::new(); - call_max_pool2d( + call_pool2d( &device, command_buffer, 
&kernels, @@ -1417,7 +1417,7 @@ fn max_pool2d_f32() { let strides = vec![16, 16, 4, 1]; let kernel = 2; let stride = 1; - let results = run_max_pool2d( + let results = run_pool2d( &v, (kernel, kernel), (stride, stride), @@ -1434,7 +1434,7 @@ fn max_pool2d_f32() { let strides = vec![16, 16, 4, 1]; let kernel = 2; let stride = 2; - let results = run_max_pool2d( + let results = run_pool2d( &v, (kernel, kernel), (stride, stride), @@ -1454,7 +1454,7 @@ fn max_pool2d_f16() { let strides = vec![16, 16, 4, 1]; let kernel = 2; let stride = 1; - let results = run_max_pool2d( + let results = run_pool2d( &v, (kernel, kernel), (stride, stride), @@ -1474,7 +1474,7 @@ fn max_pool2d_f16() { let strides = vec![16, 16, 4, 1]; let kernel = 2; let stride = 2; - let results = run_max_pool2d( + let results = run_pool2d( &v, (kernel, kernel), (stride, stride), @@ -1497,7 +1497,7 @@ fn max_pool2d_bf16() { let strides = vec![16, 16, 4, 1]; let kernel = 2; let stride = 1; - let results = run_max_pool2d( + let results = run_pool2d( &v, (kernel, kernel), (stride, stride), @@ -1517,7 +1517,7 @@ fn max_pool2d_bf16() { let strides = vec![16, 16, 4, 1]; let kernel = 2; let stride = 2; - let results = run_max_pool2d( + let results = run_pool2d( &v, (kernel, kernel), (stride, stride), @@ -1540,7 +1540,7 @@ fn max_pool2d_u8() { let strides = vec![16, 16, 4, 1]; let kernel = 2; let stride = 1; - let results = run_max_pool2d( + let results = run_pool2d( &v, (kernel, kernel), (stride, stride), @@ -1557,7 +1557,7 @@ fn max_pool2d_u8() { let strides = vec![16, 16, 4, 1]; let kernel = 2; let stride = 2; - let results = run_max_pool2d( + let results = run_pool2d( &v, (kernel, kernel), (stride, stride), @@ -1577,7 +1577,7 @@ fn max_pool2d_u32() { let strides = vec![16, 16, 4, 1]; let kernel = 2; let stride = 1; - let results = run_max_pool2d( + let results = run_pool2d( &v, (kernel, kernel), (stride, stride), @@ -1594,7 +1594,7 @@ fn max_pool2d_u32() { let strides = vec![16, 16, 4, 1]; let kernel = 2; let stride = 2; - let results = run_max_pool2d( + let results = run_pool2d( &v, (kernel, kernel), (stride, stride), @@ -1605,3 +1605,115 @@ fn max_pool2d_u32() { let expected = vec![5, 7, 13, 15]; assert_eq!(results, expected); } + +#[test] +fn avg_pool2d_f32() { + // kernel 2 stride 1 + let v: Vec = (0..16).map(|v| v as f32).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 1; + let results = run_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "avg_pool2d_f32", + ); + let expected = vec![ + 2.5000, 3.5000, 4.5000, 6.5000, 7.5000, 8.5000, 10.5000, 11.5000, 12.5000, + ]; + assert_eq!(results, expected); +} + +#[test] +fn avg_pool2d_f16() { + // kernel 2 stride 1 + let v: Vec = (0..16).map(|v| f16::from_f32(v as f32)).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 1; + let results = run_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "avg_pool2d_f16", + ); + let expected = vec![ + 2.5000, 3.5000, 4.5000, 6.5000, 7.5000, 8.5000, 10.5000, 11.5000, 12.5000, + ] + .iter() + .map(|v| f16::from_f32(*v)) + .collect::>(); + assert_eq!(results, expected); +} + +#[test] +fn avg_pool2d_bf16() { + // kernel 2 stride 1 + let v: Vec = (0..16).map(|v| bf16::from_f32(v as f32)).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 1; + let results = run_pool2d( + &v, + (kernel, kernel), + (stride, stride), + 
&shape, + &strides, + "avg_pool2d_bf16", + ); + let expected = vec![ + 2.5000, 3.5000, 4.5000, 6.5000, 7.5000, 8.5000, 10.5000, 11.5000, 12.5000, + ] + .iter() + .map(|v| bf16::from_f32(*v)) + .collect::>(); + assert_eq!(results, expected); +} + +#[test] +fn avg_pool2d_u8() { + // kernel 2 stride 1 + let v: Vec = (0..16).map(|v| v as u8).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 1; + let results = run_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "avg_pool2d_u8", + ); + let expected = vec![2, 3, 4, 6, 7, 8, 10, 11, 12]; + assert_eq!(results, expected); +} + +#[test] +fn avg_pool2d_u32() { + // kernel 2 stride 1 + let v: Vec = (0..16).map(|v| v as u32).collect(); + let shape = vec![1, 1, 4, 4]; + let strides = vec![16, 16, 4, 1]; + let kernel = 2; + let stride = 1; + let results = run_pool2d( + &v, + (kernel, kernel), + (stride, stride), + &shape, + &strides, + "avg_pool2d_u32", + ); + let expected = vec![2, 3, 4, 6, 7, 8, 10, 11, 12]; + assert_eq!(results, expected); +} From 6a966cf9e0abee128f0b8b60f0063bfe5fdaff92 Mon Sep 17 00:00:00 2001 From: Gabriel <45515538+gabotechs@users.noreply.github.com> Date: Mon, 18 Mar 2024 21:22:53 +0100 Subject: [PATCH 028/131] Add a DQN example to the reinforcement-learning section (#1872) --- .../examples/reinforcement-learning/dqn.rs | 118 ++++++++++++++++++ .../examples/reinforcement-learning/main.rs | 3 + 2 files changed, 121 insertions(+) create mode 100644 candle-examples/examples/reinforcement-learning/dqn.rs diff --git a/candle-examples/examples/reinforcement-learning/dqn.rs b/candle-examples/examples/reinforcement-learning/dqn.rs new file mode 100644 index 00000000..83457810 --- /dev/null +++ b/candle-examples/examples/reinforcement-learning/dqn.rs @@ -0,0 +1,118 @@ +use std::collections::VecDeque; + +use rand::distributions::Uniform; +use rand::{thread_rng, Rng}; + +use candle::{DType, Device, Module, Result, Tensor}; +use candle_nn::loss::mse; +use candle_nn::{linear, seq, Activation, AdamW, Optimizer, VarBuilder, VarMap}; + +use crate::gym_env::GymEnv; + +const DEVICE: Device = Device::Cpu; +const EPISODES: usize = 200; +const BATCH_SIZE: usize = 64; +const GAMMA: f64 = 0.99; +const LEARNING_RATE: f64 = 0.01; + +pub fn run() -> Result<()> { + let env = GymEnv::new("CartPole-v1")?; + + // Build the model that predicts the estimated rewards given a specific state. + let var_map = VarMap::new(); + let vb = VarBuilder::from_varmap(&var_map, DType::F32, &DEVICE); + let observation_space = *env.observation_space().first().unwrap(); + + let model = seq() + .add(linear(observation_space, 64, vb.pp("linear_in"))?) + .add(Activation::Relu) + .add(linear(64, env.action_space(), vb.pp("linear_out"))?); + + let mut optimizer = AdamW::new_lr(var_map.all_vars(), LEARNING_RATE)?; + + // Initialize the model's memory. + let mut memory = VecDeque::with_capacity(10000); + + // Start the training loop. + let mut state = env.reset(0)?; + let mut episode = 0; + let mut accumulate_rewards = 0.0; + while episode < EPISODES { + // Given the current state, predict the estimated rewards, and take the + // action that is expected to return the most rewards. 
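// (Greedy action selection: a = argmax_a Q(s, a). The learning step further
// down builds the standard one-step target y = r + GAMMA * max_a' Q(s', a'),
// zeroing the bootstrap term on terminal transitions via `non_final_mask`.)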
+        let estimated_rewards = model.forward(&state.unsqueeze(0)?)?;
+        let action: u32 = estimated_rewards.squeeze(0)?.argmax(0)?.to_scalar()?;
+
+        // Take that action in the environment, and memorize the outcome:
+        // - the state for which the action was taken
+        // - the action taken
+        // - the new state resulting from taking that action
+        // - the actual rewards of taking that action
+        // - whether the environment reached a terminal state or not (e.g. game over)
+        let step = env.step(action)?;
+        accumulate_rewards += step.reward;
+        memory.push_back((
+            state,
+            action,
+            step.state.clone(),
+            step.reward,
+            step.terminated || step.truncated,
+        ));
+        state = step.state;
+
+        // If there are enough entries in the memory, perform a learning step, where
+        // BATCH_SIZE transitions will be sampled from the memory and will be
+        // fed to the model so that it performs a backward pass.
+        if memory.len() > BATCH_SIZE {
+            // Sample randomly from the memory.
+            let batch = thread_rng()
+                .sample_iter(Uniform::from(0..memory.len()))
+                .take(BATCH_SIZE)
+                .map(|i| memory.get(i).unwrap().clone())
+                .collect::<Vec<_>>();
+
+            // Group all the samples together into tensors with the appropriate shape.
+            let states: Vec<_> = batch.iter().map(|e| e.0.clone()).collect();
+            let states = Tensor::stack(&states, 0)?;
+
+            let actions = batch.iter().map(|e| e.1);
+            let actions = Tensor::from_iter(actions, &DEVICE)?.unsqueeze(1)?;
+
+            let next_states: Vec<_> = batch.iter().map(|e| e.2.clone()).collect();
+            let next_states = Tensor::stack(&next_states, 0)?;
+
+            let rewards = batch.iter().map(|e| e.3 as f32);
+            let rewards = Tensor::from_iter(rewards, &DEVICE)?.unsqueeze(1)?;
+
+            let non_final_mask = batch.iter().map(|e| !e.4 as u8 as f32);
+            let non_final_mask = Tensor::from_iter(non_final_mask, &DEVICE)?.unsqueeze(1)?;
+
+            // Get the estimated rewards for the actions that were taken at each step.
+            let estimated_rewards = model.forward(&states)?;
+            let x = estimated_rewards.gather(&actions, 1)?;
+
+            // Get the maximum expected rewards for the next state, apply a discount
+            // rate GAMMA to them, and add them to the rewards that were actually
+            // gathered on the current state. If the next state is a terminal state,
+            // just omit the maximum estimated rewards for that state.
+            let expected_rewards = model.forward(&next_states)?.detach();
+            let y = expected_rewards.max_keepdim(1)?;
+            let y = (y * GAMMA * non_final_mask + rewards)?;
+
+            // Compare the estimated rewards with the maximum expected rewards and
+            // perform the backward step.
+            let loss = mse(&x, &y)?;
+            optimizer.backward_step(&loss)?;
+        }
+
+        // If we are in a terminal state, reset the environment and log how it went.
+        if step.terminated || step.truncated {
+            episode += 1;
+            println!("Episode {episode} | Rewards {}", accumulate_rewards as i64);
+            state = env.reset(0)?;
+            accumulate_rewards = 0.0;
+        }
+    }
+
+    Ok(())
+}
diff --git a/candle-examples/examples/reinforcement-learning/main.rs b/candle-examples/examples/reinforcement-learning/main.rs
index e87afae2..61b5a6db 100644
--- a/candle-examples/examples/reinforcement-learning/main.rs
+++ b/candle-examples/examples/reinforcement-learning/main.rs
@@ -14,6 +14,7 @@ mod vec_gym_env;
 
 mod ddpg;
 mod policy_gradient;
+mod dqn;
 
 #[derive(Parser)]
 struct Args {
@@ -25,6 +26,7 @@ struct Args {
 enum Command {
     Pg,
     Ddpg,
+    Dqn
 }
 
 fn main() -> Result<()> {
@@ -32,6 +34,7 @@ fn main() -> Result<()> {
     match args.command {
         Command::Pg => policy_gradient::run()?,
         Command::Ddpg => ddpg::run()?,
+        Command::Dqn => dqn::run()?
} Ok(()) } From 90fc82211f29282991afc8fea33c78169e674db1 Mon Sep 17 00:00:00 2001 From: Jani Monoses Date: Mon, 18 Mar 2024 22:40:06 +0200 Subject: [PATCH 029/131] Use a common with_tracing::RmsNorm in a few models. (#1871) * Add RmsNorm with tracing. * Use with_tracing::RmsNorm in some models. --- candle-transformers/src/models/llama.rs | 27 +++---------------- candle-transformers/src/models/mistral.rs | 23 +--------------- candle-transformers/src/models/mixtral.rs | 23 +--------------- candle-transformers/src/models/qwen2.rs | 23 +--------------- .../src/models/with_tracing.rs | 21 +++++++++++++++ candle-transformers/src/models/yi.rs | 23 +--------------- 6 files changed, 29 insertions(+), 111 deletions(-) diff --git a/candle-transformers/src/models/llama.rs b/candle-transformers/src/models/llama.rs index a091d3eb..400351f3 100644 --- a/candle-transformers/src/models/llama.rs +++ b/candle-transformers/src/models/llama.rs @@ -1,4 +1,4 @@ -use super::with_tracing::{linear_no_bias as linear, Linear}; +use super::with_tracing::{linear_no_bias as linear, Linear, RmsNorm}; use candle::{DType, Device, IndexOp, Result, Tensor, D}; use candle_nn::{embedding, Embedding, Module, VarBuilder}; use std::collections::HashMap; @@ -133,25 +133,6 @@ impl Cache { } } -#[derive(Debug, Clone)] -struct RmsNorm { - inner: candle_nn::RmsNorm, - span: tracing::Span, -} - -impl RmsNorm { - fn load(size: usize, eps: f64, vb: VarBuilder) -> Result { - let span = tracing::span!(tracing::Level::TRACE, "rms-norm"); - let inner = candle_nn::rms_norm(size, eps, vb)?; - Ok(Self { inner, span }) - } - - fn forward(&self, x: &Tensor) -> Result { - let _enter = self.span.enter(); - self.inner.forward(x) - } -} - #[derive(Debug, Clone)] struct CausalSelfAttention { q_proj: Linear, @@ -377,8 +358,8 @@ impl Block { let span = tracing::span!(tracing::Level::TRACE, "block"); let attn = CausalSelfAttention::load(vb.pp("self_attn"), cfg)?; let mlp = Mlp::load(vb.pp("mlp"), cfg)?; - let rms_1 = RmsNorm::load(cfg.hidden_size, cfg.rms_norm_eps, vb.pp("input_layernorm"))?; - let rms_2 = RmsNorm::load( + let rms_1 = RmsNorm::new(cfg.hidden_size, cfg.rms_norm_eps, vb.pp("input_layernorm"))?; + let rms_2 = RmsNorm::new( cfg.hidden_size, cfg.rms_norm_eps, vb.pp("post_attention_layernorm"), @@ -417,7 +398,7 @@ impl Llama { pub fn load(vb: VarBuilder, cfg: &Config) -> Result { let wte = embedding(cfg.vocab_size, cfg.hidden_size, vb.pp("model.embed_tokens"))?; let lm_head = linear(cfg.hidden_size, cfg.vocab_size, vb.pp("lm_head"))?; - let ln_f = RmsNorm::load(cfg.hidden_size, cfg.rms_norm_eps, vb.pp("model.norm"))?; + let ln_f = RmsNorm::new(cfg.hidden_size, cfg.rms_norm_eps, vb.pp("model.norm"))?; let blocks: Vec<_> = (0..cfg.num_hidden_layers) .map(|i| Block::load(vb.pp(&format!("model.layers.{i}")), cfg).unwrap()) .collect(); diff --git a/candle-transformers/src/models/mistral.rs b/candle-transformers/src/models/mistral.rs index 2809ae0a..be84f824 100644 --- a/candle-transformers/src/models/mistral.rs +++ b/candle-transformers/src/models/mistral.rs @@ -1,4 +1,4 @@ -use crate::models::with_tracing::{linear_no_bias, Linear}; +use crate::models::with_tracing::{linear_no_bias, Linear, RmsNorm}; /// Mistral LLM, https://github.com/mistralai/mistral-src use candle::{DType, Device, Module, Result, Tensor, D}; use candle_nn::{Activation, VarBuilder}; @@ -77,27 +77,6 @@ impl Config { } } -#[derive(Debug, Clone)] -struct RmsNorm { - inner: candle_nn::RmsNorm, - span: tracing::Span, -} - -impl RmsNorm { - fn new(size: usize, eps: f64, vb: 
VarBuilder) -> Result { - let span = tracing::span!(tracing::Level::TRACE, "rms-norm"); - let inner = candle_nn::rms_norm(size, eps, vb)?; - Ok(Self { inner, span }) - } -} - -impl Module for RmsNorm { - fn forward(&self, x: &Tensor) -> Result { - let _enter = self.span.enter(); - self.inner.forward(x) - } -} - #[derive(Debug, Clone)] struct RotaryEmbedding { sin: Tensor, diff --git a/candle-transformers/src/models/mixtral.rs b/candle-transformers/src/models/mixtral.rs index ede74d3f..f69c68e3 100644 --- a/candle-transformers/src/models/mixtral.rs +++ b/candle-transformers/src/models/mixtral.rs @@ -1,4 +1,4 @@ -use crate::models::with_tracing::{linear_no_bias, Linear}; +use crate::models::with_tracing::{linear_no_bias, Linear, RmsNorm}; /// Mixtral Model /// https://github.com/huggingface/transformers/blob/main/src/transformers/models/mixtral/modeling_mixtral.py /// https://mistral.ai/news/mixtral-of-experts/ @@ -48,27 +48,6 @@ impl Config { } } -#[derive(Debug, Clone)] -struct RmsNorm { - inner: candle_nn::RmsNorm, - span: tracing::Span, -} - -impl RmsNorm { - fn new(size: usize, eps: f64, vb: VarBuilder) -> Result { - let span = tracing::span!(tracing::Level::TRACE, "rms-norm"); - let inner = candle_nn::rms_norm(size, eps, vb)?; - Ok(Self { inner, span }) - } -} - -impl Module for RmsNorm { - fn forward(&self, x: &Tensor) -> Result { - let _enter = self.span.enter(); - self.inner.forward(x) - } -} - #[derive(Debug, Clone)] struct RotaryEmbedding { sin: Tensor, diff --git a/candle-transformers/src/models/qwen2.rs b/candle-transformers/src/models/qwen2.rs index 26431b7d..9a12eba5 100644 --- a/candle-transformers/src/models/qwen2.rs +++ b/candle-transformers/src/models/qwen2.rs @@ -1,4 +1,4 @@ -use crate::models::with_tracing::{linear, linear_no_bias, Linear}; +use crate::models::with_tracing::{linear, linear_no_bias, Linear, RmsNorm}; use candle::{DType, Device, Module, Result, Tensor, D}; use candle_nn::{Activation, VarBuilder}; use std::sync::Arc; @@ -21,27 +21,6 @@ pub struct Config { pub hidden_act: Activation, } -#[derive(Debug, Clone)] -struct RmsNorm { - inner: candle_nn::RmsNorm, - span: tracing::Span, -} - -impl RmsNorm { - fn new(size: usize, eps: f64, vb: VarBuilder) -> Result { - let span = tracing::span!(tracing::Level::TRACE, "rms-norm"); - let inner = candle_nn::rms_norm(size, eps, vb)?; - Ok(Self { inner, span }) - } -} - -impl Module for RmsNorm { - fn forward(&self, x: &Tensor) -> Result { - let _enter = self.span.enter(); - self.inner.forward(x) - } -} - #[derive(Debug, Clone)] struct RotaryEmbedding { sin: Tensor, diff --git a/candle-transformers/src/models/with_tracing.rs b/candle-transformers/src/models/with_tracing.rs index 2ffec724..1c34bfa2 100644 --- a/candle-transformers/src/models/with_tracing.rs +++ b/candle-transformers/src/models/with_tracing.rs @@ -167,3 +167,24 @@ pub fn layer_norm>( let span = tracing::span!(tracing::Level::TRACE, "layer-norm"); Ok(LayerNorm { inner, span }) } + +#[derive(Debug, Clone)] +pub struct RmsNorm { + inner: candle_nn::RmsNorm, + span: tracing::Span, +} + +impl RmsNorm { + pub fn new(size: usize, eps: f64, vb: VarBuilder) -> Result { + let span = tracing::span!(tracing::Level::TRACE, "rms-norm"); + let inner = candle_nn::rms_norm(size, eps, vb)?; + Ok(Self { inner, span }) + } +} + +impl Module for RmsNorm { + fn forward(&self, x: &Tensor) -> Result { + let _enter = self.span.enter(); + self.inner.forward(x) + } +} diff --git a/candle-transformers/src/models/yi.rs b/candle-transformers/src/models/yi.rs index 14b6feeb..99d9de1b 
100644 --- a/candle-transformers/src/models/yi.rs +++ b/candle-transformers/src/models/yi.rs @@ -1,5 +1,5 @@ /// https://huggingface.co/01-ai/Yi-6B/blob/main/modeling_yi.py -use crate::models::with_tracing::{linear_no_bias, Linear}; +use crate::models::with_tracing::{linear_no_bias, Linear, RmsNorm}; use candle::{DType, Device, Module, Result, Tensor, D}; use candle_nn::{Activation, VarBuilder}; use std::sync::Arc; @@ -50,27 +50,6 @@ impl Config { } } -#[derive(Debug, Clone)] -struct RmsNorm { - inner: candle_nn::RmsNorm, - span: tracing::Span, -} - -impl RmsNorm { - fn new(size: usize, eps: f64, vb: VarBuilder) -> Result { - let span = tracing::span!(tracing::Level::TRACE, "rms-norm"); - let inner = candle_nn::rms_norm(size, eps, vb)?; - Ok(Self { inner, span }) - } -} - -impl Module for RmsNorm { - fn forward(&self, x: &Tensor) -> Result { - let _enter = self.span.enter(); - self.inner.forward(x) - } -} - #[derive(Debug, Clone)] struct RotaryEmbedding { sin: Tensor, From f115895b9e981698daa04d0be33555c03f7892ed Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 18 Mar 2024 21:43:31 +0100 Subject: [PATCH 030/131] Apply rustfmt. (#1873) --- candle-examples/examples/reinforcement-learning/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/candle-examples/examples/reinforcement-learning/main.rs b/candle-examples/examples/reinforcement-learning/main.rs index 61b5a6db..1a25cd93 100644 --- a/candle-examples/examples/reinforcement-learning/main.rs +++ b/candle-examples/examples/reinforcement-learning/main.rs @@ -13,8 +13,8 @@ mod gym_env; mod vec_gym_env; mod ddpg; -mod policy_gradient; mod dqn; +mod policy_gradient; #[derive(Parser)] struct Args { @@ -26,7 +26,7 @@ struct Args { enum Command { Pg, Ddpg, - Dqn + Dqn, } fn main() -> Result<()> { @@ -34,7 +34,7 @@ fn main() -> Result<()> { match args.command { Command::Pg => policy_gradient::run()?, Command::Ddpg => ddpg::run()?, - Command::Dqn => dqn::run()? + Command::Dqn => dqn::run()?, } Ok(()) } From 143c481c20abc3420e848eab075d1547a96cc447 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 18 Mar 2024 21:54:15 +0100 Subject: [PATCH 031/131] Expose candle gather op in pyo3. (#1870) --- candle-pyo3/py_src/candle/__init__.pyi | 6 ++++++ candle-pyo3/src/lib.rs | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/candle-pyo3/py_src/candle/__init__.pyi b/candle-pyo3/py_src/candle/__init__.pyi index aef0707d..b0f05de5 100644 --- a/candle-pyo3/py_src/candle/__init__.pyi +++ b/candle-pyo3/py_src/candle/__init__.pyi @@ -324,6 +324,12 @@ class Tensor: """ pass + def gather(self, index, dim): + """ + Gathers values along an axis specified by dim. + """ + pass + def get(self, index: int) -> Tensor: """ Gets the value at the specified index. diff --git a/candle-pyo3/src/lib.rs b/candle-pyo3/src/lib.rs index 7b9a7413..e0d3bf30 100644 --- a/candle-pyo3/src/lib.rs +++ b/candle-pyo3/src/lib.rs @@ -448,6 +448,12 @@ impl PyTensor { Ok(PyTensor(self.0.index_select(rhs, dim).map_err(wrap_err)?)) } + /// Gathers values along an axis specified by dim. + fn gather(&self, index: &Self, dim: i64) -> PyResult { + let dim = actual_dim(self, dim).map_err(wrap_err)?; + Ok(PyTensor(self.0.gather(index, dim).map_err(wrap_err)?)) + } + #[pyo3(text_signature = "(self, rhs:Tensor)")] /// Performs a matrix multiplication between the two tensors. 
/// &RETURNS&: Tensor From 2a8679509eb55232b37378442c4366343f6dcb11 Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Tue, 19 Mar 2024 03:46:58 -0400 Subject: [PATCH 032/131] Add support for conv_transpose1d for metal backend (#1874) * first attempt * progress * integrate into metal backend * finish and get test passing * add other dtype support * update transpose1d dtypes supported --- candle-core/src/metal_backend.rs | 52 ++++++- candle-core/tests/conv_tests.rs | 5 - candle-metal-kernels/src/conv.metal | 78 ++++++++++ candle-metal-kernels/src/lib.rs | 53 +++++++ candle-metal-kernels/src/tests.rs | 216 ++++++++++++++++++++++++++++ 5 files changed, 394 insertions(+), 10 deletions(-) diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index cf354f45..303d69ff 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -948,12 +948,54 @@ impl BackendStorage for MetalStorage { fn conv_transpose1d( &self, - _l: &Layout, - _kernel: &Self, - _kernel_l: &Layout, - _params: &ParamsConvTranspose1D, + layout: &Layout, + k: &Self, + k_layout: &Layout, + params: &ParamsConvTranspose1D, ) -> Result { - crate::bail!("Metal conv_transpose1d not implemented") + let device = self.device().clone(); + + let l_out = params.l_out(); + let dst_el = params.c_out * l_out * params.b_size; + + let dst_el = params.c_out * l_out * params.b_size; + let buffer = self + .device + .new_buffer(dst_el, self.dtype, "conv_transpose1d")?; + + let command_buffer = self.device.command_buffer()?; + let name = match self.dtype { + DType::F32 => "conv_transpose1d_f32", + DType::F16 => "conv_transpose1d_f16", + DType::BF16 => "conv_transpose1d_bf16", + DType::U32 => "conv_transpose1d_u32", + DType::U8 => "conv_transpose1d_u8", + dtype => crate::bail!("Metal conv_transpose1d {dtype:?} not implemented"), + }; + candle_metal_kernels::call_conv_transpose1d( + &self.device.device, + &command_buffer, + &self.device.kernels, + name, + params.dilation, + params.stride, + params.padding, + params.output_padding, + params.c_out, + l_out, + params.b_size, + layout.dims(), + layout.stride(), + k_layout.dims(), + k_layout.stride(), + &self.buffer, + layout.start_offset() * self.dtype.size_in_bytes(), + &k.buffer, + k_layout.start_offset() * k.dtype.size_in_bytes(), + &buffer, + ) + .map_err(MetalError::from)?; + Ok(Self::new(buffer, self.device.clone(), dst_el, self.dtype)) } fn conv2d( diff --git a/candle-core/tests/conv_tests.rs b/candle-core/tests/conv_tests.rs index ba60b778..71bf65be 100644 --- a/candle-core/tests/conv_tests.rs +++ b/candle-core/tests/conv_tests.rs @@ -54,11 +54,6 @@ fn conv1d(dev: &Device) -> Result<()> { [2.4509, 2.6357, -1.3336, 4.1393, 0.5657, 1.8091, -1.1784, 3.5675, 0.5069, 3.3352] ); - // conv-transposes are not implemented for metal. - if dev.is_metal() { - return Ok(()); - } - let w = w.transpose(0, 1)?; // The CPU kernels applied in the contiguous and non contiguous cases are different. for w in [w.clone(), w.contiguous()?] { diff --git a/candle-metal-kernels/src/conv.metal b/candle-metal-kernels/src/conv.metal index 7f7a75cf..a258ae58 100644 --- a/candle-metal-kernels/src/conv.metal +++ b/candle-metal-kernels/src/conv.metal @@ -335,6 +335,76 @@ kernel void FN_NAME( \ max_pool2d(w_k, h_k, w_s, h_s, src_dims, src_s, src, dst, tid); \ } \ + +// Naive implementation of conv_transpose1d. 
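+// The caller passes l_out explicitly; for this op it is expected to satisfy:
+//   l_out = (l_in - 1) * stride - 2 * padding + dilation * (l_k - 1) + out_padding + 1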
+template +METAL_FUNC void conv_transpose1d( + constant size_t &l_out, + constant size_t &stride, + constant size_t &padding, + constant size_t &out_padding, + constant size_t &dilation, + constant size_t *src_dims, + constant size_t *src_strides, + constant size_t *k_dims, + constant size_t *k_strides, + device const T *src, + device const T *k, + device T *dst, + uint tid [[ thread_position_in_grid ]] +) { + // src: (b_size, c_in, l_in) + // kernel: (c_in, c_out, l_k) + const size_t l_k = k_dims[2]; + const size_t c_out = k_dims[1]; + const size_t c_in = src_dims[1]; + const size_t l_in = src_dims[2]; + if (tid >= src_dims[0] * c_out * l_out) { + return; + } + + const size_t b_idx = tid / (l_out * c_out); + const size_t dst_c_idx = (tid / l_out) % c_out; + const size_t out_x = tid % l_out; + + const size_t src_idx0 = b_idx * src_strides[0]; + A d = 0; + for (int k_x = 0; k_x < (int)l_k; ++k_x) { + // let out_x = inp_x * p.stride + k_x * p.dilation - p.padding; + int inp_x_stride = (int)(out_x + padding) - k_x * dilation; + if (inp_x_stride < 0 || inp_x_stride % stride) { + continue; + } + int inp_x = inp_x_stride / stride; + if (inp_x >= l_in) continue; + for (size_t src_c_idx = 0; src_c_idx < c_in; ++src_c_idx) { + const size_t src_idx = src_idx0 + src_c_idx * src_strides[1] + inp_x * src_strides[2]; + const size_t k_idx = src_c_idx * k_strides[0] + dst_c_idx * k_strides[1] + k_x * k_strides[2]; + d += static_cast(src[src_idx]) * static_cast(k[k_idx]); + } + } + dst[tid] = static_cast(d); +} + +#define CONVT1D_OP(TYPENAME, TYPEACC, FN_NAME) \ +kernel void FN_NAME( \ + constant size_t &l_out, \ + constant size_t &stride, \ + constant size_t &padding, \ + constant size_t &out_padding, \ + constant size_t &dilation, \ + constant size_t *src_dims, \ + constant size_t *src_strides, \ + constant size_t *k_dims, \ + constant size_t *k_strides, \ + device const TYPENAME *src, \ + device const TYPENAME *k, \ + device TYPENAME *dst, \ + uint tid [[ thread_position_in_grid ]] \ +) { \ + conv_transpose1d(l_out, stride, padding, out_padding, dilation, src_dims, src_strides, k_dims, k_strides, src, k, dst, tid); \ +} \ + IM2COL_OP(float, im2col_f32) IM2COL_OP(uint8_t, im2col_u8) IM2COL_OP(uint32_t, im2col_u32) @@ -361,4 +431,12 @@ AVGPOOL2D_OP(uint32_t, uint32_t, avg_pool2d_u32) AVGPOOL2D_OP(uint8_t, uint8_t, avg_pool2d_u8) #if defined(__HAVE_BFLOAT__) AVGPOOL2D_OP(bfloat, float, avg_pool2d_bf16) +#endif + +CONVT1D_OP(float, float, conv_transpose1d_f32) +CONVT1D_OP(half, float, conv_transpose1d_f16) +CONVT1D_OP(uint8_t, uint8_t, conv_transpose1d_u8) +CONVT1D_OP(uint32_t, uint32_t, conv_transpose1d_u32) +#if defined(__HAVE_BFLOAT__) +CONVT1D_OP(bfloat, float, conv_transpose1d_bf16) #endif \ No newline at end of file diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index 1161501f..f12463a4 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -1859,5 +1859,58 @@ pub fn call_pool2d( Ok(()) } +#[allow(clippy::too_many_arguments)] +pub fn call_conv_transpose1d( + device: &Device, + command_buffer: &CommandBufferRef, + kernels: &Kernels, + name: &'static str, + dilation: usize, + stride: usize, + padding: usize, + out_padding: usize, + c_out: usize, + l_out: usize, + b_size: usize, + src_shape: &[usize], + src_strides: &[usize], + kernel_shape: &[usize], + kernel_strides: &[usize], + input: &Buffer, + input_offset: usize, + kernel: &Buffer, + kernel_offset: usize, + output: &Buffer, +) -> Result<(), MetalKernelError> { + let dst_el = c_out * 
l_out * b_size; + let pipeline: ComputePipelineState = kernels.load_pipeline(device, Source::Conv, name)?; + let (thread_group_count, thread_group_size) = linear_split(&pipeline, dst_el); + let encoder = command_buffer.new_compute_command_encoder(); + encoder.set_compute_pipeline_state(&pipeline); + set_params!( + encoder, + ( + l_out, + stride, + padding, + out_padding, + dilation, + src_shape, + src_strides, + kernel_shape, + kernel_strides, + (input, input_offset), + (kernel, kernel_offset), + output + ) + ); + encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(kernel, metal::MTLResourceUsage::Read); + encoder.use_resource(output, metal::MTLResourceUsage::Write); + encoder.dispatch_thread_groups(thread_group_count, thread_group_size); + encoder.end_encoding(); + Ok(()) +} + #[cfg(test)] mod tests; diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index 19e160dd..5045a4a3 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -1717,3 +1717,219 @@ fn avg_pool2d_u32() { let expected = vec![2, 3, 4, 6, 7, 8, 10, 11, 12]; assert_eq!(results, expected); } + +fn run_conv_transpose1d( + input: &[T], + input_shape: &[usize], + input_stride: &[usize], + kernel: &[T], + kernel_shape: &[usize], + kernel_stride: &[usize], + dilation: usize, + stride: usize, + padding: usize, + out_padding: usize, + name: &'static str, +) -> Vec { + let device = device(); + let command_queue = device.new_command_queue(); + let command_buffer = command_queue.new_command_buffer(); + + let c_out = kernel_shape[1]; + let k_size = kernel_shape[2]; + let b_size = input_shape[0]; + let l_in = input_shape[2]; + let l_out = (l_in - 1) * stride - 2 * padding + dilation * (k_size - 1) + out_padding + 1; + let dst_el = c_out * l_out * b_size; + + let input = new_buffer(&device, input); + let kernel = new_buffer(&device, kernel); + let output = new_buffer(&device, &vec![0.0f32; dst_el]); + let kernels = Kernels::new(); + + call_conv_transpose1d( + &device, + command_buffer, + &kernels, + name, + dilation, + stride, + padding, + out_padding, + c_out, + l_out, + b_size, + input_shape, + input_stride, + kernel_shape, + kernel_stride, + &input, + 0, + &kernel, + 0, + &output, + ) + .unwrap(); + command_buffer.commit(); + command_buffer.wait_until_completed(); + + read_to_vec(&output, dst_el) +} + +#[test] +fn conv_transpose1d_f32() { + let input = vec![1.0f32, 2.0, 3.0, 4.0]; + let input_shape = &[1, 1, 4]; + let input_stride = &[4, 4, 1]; + + let kernel = vec![1.0f32, 2.0, 3.0, 4.0]; + let kernel_shape = &[1, 1, 4]; + let kernel_stride = &[4, 4, 1]; + + let results = run_conv_transpose1d( + &input, + input_shape, + input_stride, + &kernel, + kernel_shape, + kernel_stride, + 1, + 1, + 0, + 0, + "conv_transpose1d_f32", + ); + + let expected = vec![1., 4., 10., 20., 25., 24., 16.]; + assert_eq!(results, expected); +} + +#[test] +fn conv_transpose1d_f16() { + let input: Vec = vec![1.0, 2.0, 3.0, 4.0] + .iter() + .map(|v| f16::from_f32(*v)) + .collect(); + let input_shape = &[1, 1, 4]; + let input_stride = &[4, 4, 1]; + + let kernel: Vec = vec![1.0, 2.0, 3.0, 4.0] + .iter() + .map(|v| f16::from_f32(*v)) + .collect(); + let kernel_shape = &[1, 1, 4]; + let kernel_stride = &[4, 4, 1]; + + let results = run_conv_transpose1d( + &input, + input_shape, + input_stride, + &kernel, + kernel_shape, + kernel_stride, + 1, + 1, + 0, + 0, + "conv_transpose1d_f16", + ); + + let expected = vec![1., 4., 10., 20., 25., 24., 16.] 
+ .iter() + .map(|v| f16::from_f32(*v)) + .collect::>(); + assert_eq!(results, expected); +} + +#[test] +fn conv_transpose1d_bf16() { + let input: Vec = vec![1.0, 2.0, 3.0, 4.0] + .iter() + .map(|v| bf16::from_f32(*v)) + .collect(); + let input_shape = &[1, 1, 4]; + let input_stride = &[4, 4, 1]; + + let kernel: Vec = vec![1.0, 2.0, 3.0, 4.0] + .iter() + .map(|v| bf16::from_f32(*v)) + .collect(); + let kernel_shape = &[1, 1, 4]; + let kernel_stride = &[4, 4, 1]; + + let results = run_conv_transpose1d( + &input, + input_shape, + input_stride, + &kernel, + kernel_shape, + kernel_stride, + 1, + 1, + 0, + 0, + "conv_transpose1d_bf16", + ); + + let expected = vec![1., 4., 10., 20., 25., 24., 16.] + .iter() + .map(|v| bf16::from_f32(*v)) + .collect::>(); + assert_eq!(results, expected); +} + +#[test] +fn conv_transpose1d_u8() { + let input: Vec = vec![1, 2, 3, 4]; + let input_shape = &[1, 1, 4]; + let input_stride = &[4, 4, 1]; + + let kernel: Vec = vec![1, 2, 3, 4]; + let kernel_shape = &[1, 1, 4]; + let kernel_stride = &[4, 4, 1]; + + let results = run_conv_transpose1d( + &input, + input_shape, + input_stride, + &kernel, + kernel_shape, + kernel_stride, + 1, + 1, + 0, + 0, + "conv_transpose1d_u8", + ); + + let expected = vec![1, 4, 10, 20, 25, 24, 16]; + assert_eq!(results, expected); +} + +#[test] +fn conv_transpose1d_u32() { + let input: Vec = vec![1, 2, 3, 4]; + let input_shape = &[1, 1, 4]; + let input_stride = &[4, 4, 1]; + + let kernel: Vec = vec![1, 2, 3, 4]; + let kernel_shape = &[1, 1, 4]; + let kernel_stride = &[4, 4, 1]; + + let results = run_conv_transpose1d( + &input, + input_shape, + input_stride, + &kernel, + kernel_shape, + kernel_stride, + 1, + 1, + 0, + 0, + "conv_transpose1d_u32", + ); + + let expected = vec![1, 4, 10, 20, 25, 24, 16]; + assert_eq!(results, expected); +} From 455c42aa729d8019fcb496106478e75dd3246c08 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 20 Mar 2024 13:04:36 +0100 Subject: [PATCH 033/131] Avoid copying the data on squeeze and unsqueeze. (#1884) * Avoid copying the data on squeeze and unsqueeze. * Fix the quantized llama example. * Unrelated fix for the quantized stable-lm example on cuda. * Fix for mamba on cuda (unrelated to the PR). 
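A note on why this is safe: squeezing or unsqueezing a size-1 dimension never needs
to touch the data, it only edits the dims/strides metadata while sharing the same
underlying storage. A minimal standalone sketch of the idea (hypothetical helper,
not part of the patch):

    // Dropping a size-1 dim is pure metadata surgery; the storage is shared.
    fn squeeze_dim(dims: &mut Vec<usize>, strides: &mut Vec<usize>, dim: usize) {
        assert_eq!(dims[dim], 1, "only size-1 dims can be squeezed");
        dims.remove(dim);
        strides.remove(dim);
    }

    // E.g. dims [2, 1, 3] with strides [3, 3, 1] become dims [2, 3] with
    // strides [3, 1]: every element keeps the same offset in the buffer.

For unsqueeze, any stride works for the new size-1 dimension since it is never
stepped over; the patch picks one that keeps the layout C-contiguous when possible.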
--- candle-core/src/tensor.rs | 29 +++++++++++++++++-- candle-core/tests/layout_tests.rs | 16 +++++++++- candle-examples/examples/stable-lm/main.rs | 6 ++-- candle-transformers/src/models/mamba.rs | 2 +- .../src/models/quantized_llama.rs | 2 +- 5 files changed, 47 insertions(+), 8 deletions(-) diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs index 22cd4950..feab30c8 100644 --- a/candle-core/src/tensor.rs +++ b/candle-core/src/tensor.rs @@ -2093,8 +2093,19 @@ impl Tensor { let dim = dim.to_index(self.shape(), "squeeze")?; if dims[dim] == 1 { let mut dims = dims.to_vec(); + let mut strides = self.stride().to_vec(); dims.remove(dim); - self.reshape(dims) + strides.remove(dim); + let tensor_ = Tensor_ { + id: TensorId::new(), + storage: self.storage.clone(), + layout: Layout::new(dims.into(), strides, self.layout.start_offset()), + op: BackpropOp::new1(self, Op::Reshape), + is_variable: false, + dtype: self.dtype, + device: self.device.clone(), + }; + Ok(Tensor(Arc::new(tensor_))) } else { Ok(self.clone()) } @@ -2115,10 +2126,24 @@ impl Tensor { /// ``` pub fn unsqueeze(&self, dim: D) -> Result { let mut dims = self.dims().to_vec(); + let mut strides = self.stride().to_vec(); let dim = dim.to_index_plus_one(self.shape(), "unsqueeze")?; // Cannot panic because to_index_plus_one already checks dimensions dims.insert(dim, 1); - self.reshape(dims) + // Any stride would work here, but we pick one so as to maximize the probability to remain + // C contiguous. + let stride = if dim < strides.len() { strides[dim] } else { 1 }; + strides.insert(dim, stride); + let tensor_ = Tensor_ { + id: TensorId::new(), + storage: self.storage.clone(), + layout: Layout::new(dims.into(), strides, self.layout.start_offset()), + op: BackpropOp::new1(self, Op::Reshape), + is_variable: false, + dtype: self.dtype, + device: self.device.clone(), + }; + Ok(Tensor(Arc::new(tensor_))) } /// Stacks two or more tensors along a particular dimension. diff --git a/candle-core/tests/layout_tests.rs b/candle-core/tests/layout_tests.rs index e0618850..bc67f7de 100644 --- a/candle-core/tests/layout_tests.rs +++ b/candle-core/tests/layout_tests.rs @@ -88,7 +88,7 @@ fn strided_blocks() -> Result<()> { } }; let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?; - let tensor = tensor.i((.., 1))?; + let tensor = tensor.i((.., 1))?.contiguous()?; match tensor.strided_blocks() { candle::StridedBlocks::SingleBlock { start_offset, len } => { assert_eq!(start_offset, 0); @@ -100,6 +100,20 @@ fn strided_blocks() -> Result<()> { } }; let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?; + let tensor = tensor.i((.., 1))?; + match tensor.strided_blocks() { + candle::StridedBlocks::SingleBlock { .. } => { + panic!("unexpected block structure") + } + candle::StridedBlocks::MultipleBlocks { + block_len, + block_start_index, + } => { + assert_eq!(block_len, 4); + assert_eq!(block_start_index.collect::>(), &[4, 16]) + } + }; + let tensor = Tensor::arange(0u32, 24u32, &Cpu)?.reshape((2, 3, 4))?; match tensor.t()?.strided_blocks() { candle::StridedBlocks::SingleBlock { .. 
} => { panic!("unexpected block structure") diff --git a/candle-examples/examples/stable-lm/main.rs b/candle-examples/examples/stable-lm/main.rs index f467903a..f0707010 100644 --- a/candle-examples/examples/stable-lm/main.rs +++ b/candle-examples/examples/stable-lm/main.rs @@ -288,12 +288,12 @@ fn main() -> Result<()> { }; let device = candle_examples::device(args.cpu)?; - let (model, device) = if args.quantized { + let model = if args.quantized { let filename = &filenames[0]; let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(filename, &device)?; let model = QStableLM::new(&config, vb)?; - (Model::Quantized(model), Device::Cpu) + Model::Quantized(model) } else { let dtype = if device.is_cuda() { DType::BF16 @@ -302,7 +302,7 @@ fn main() -> Result<()> { }; let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? }; let model = StableLM::new(&config, vb)?; - (Model::StableLM(model), device) + Model::StableLM(model) }; println!("loaded the model in {:?}", start.elapsed()); diff --git a/candle-transformers/src/models/mamba.rs b/candle-transformers/src/models/mamba.rs index 81828ad5..597dd2cd 100644 --- a/candle-transformers/src/models/mamba.rs +++ b/candle-transformers/src/models/mamba.rs @@ -121,7 +121,7 @@ impl MambaBlock { // Algorithm 3.2 on page 6, https://arxiv.org/pdf/2312.00752.pdf let x_proj = self.x_proj.forward(&proj_for_conv)?; - let delta = x_proj.narrow(D::Minus1, 0, self.dt_rank)?; + let delta = x_proj.narrow(D::Minus1, 0, self.dt_rank)?.contiguous()?; let b = x_proj.narrow(D::Minus1, self.dt_rank, D_STATE)?; let c = x_proj.narrow(D::Minus1, self.dt_rank + D_STATE, D_STATE)?; diff --git a/candle-transformers/src/models/quantized_llama.rs b/candle-transformers/src/models/quantized_llama.rs index 94324149..5ce2de59 100644 --- a/candle-transformers/src/models/quantized_llama.rs +++ b/candle-transformers/src/models/quantized_llama.rs @@ -512,7 +512,7 @@ impl ModelWeights { layer_in = x } let x = self.norm.forward(&layer_in)?; - let x = x.i((.., seq_len - 1, ..))?; + let x = x.i((.., seq_len - 1, ..))?.contiguous()?; let _enter = self.span_output.enter(); self.output.forward(&x) } From 469635a3ebc746adf04afaed6b9c07b9ab4303d4 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 20 Mar 2024 14:38:27 +0100 Subject: [PATCH 034/131] Minor cleanup. (#1885) --- candle-core/src/metal_backend.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index 303d69ff..acc6c445 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -953,11 +953,7 @@ impl BackendStorage for MetalStorage { k_layout: &Layout, params: &ParamsConvTranspose1D, ) -> Result { - let device = self.device().clone(); - let l_out = params.l_out(); - let dst_el = params.c_out * l_out * params.b_size; - let dst_el = params.c_out * l_out * params.b_size; let buffer = self .device From b219903d0f9ee52f70397c7e9aa4df323b89a700 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 20 Mar 2024 18:32:55 +0100 Subject: [PATCH 035/131] Cuda backend optimization (#1886) * Attempt at making the kernel faster. * Also adapt the cast kernels. * Also apply to binary ops. 
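The gist of the change: when a layout is contiguous, the kernels do not need the
dims/strides metadata at all, so the host can skip building and uploading that
buffer entirely and pass a null pointer instead, which the kernels now treat as
"contiguous". A simplified sketch of the host-side dispatch (trimmed-down types,
with an illustrative name; the real type below is SlicePtrOrNull):

    // Only strided layouts pay for building and copying the metadata buffer.
    enum InfoParam {
        Ptr(Vec<usize>), // dims followed by strides, uploaded to the device
        Null,            // contiguous layout: the kernel receives a nullptr
    }

    fn info_param(contiguous: bool, dims: &[usize], strides: &[usize]) -> InfoParam {
        if contiguous {
            InfoParam::Null
        } else {
            InfoParam::Ptr([dims, strides].concat())
        }
    }

This saves one host-to-device copy per launch in the common contiguous case, on
top of letting the kernels take their linear-indexing fast path.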
--- candle-core/src/cuda_backend.rs | 59 ++++++++++++++++++++----- candle-kernels/src/affine.cu | 2 +- candle-kernels/src/binary_op_macros.cuh | 4 +- candle-kernels/src/cast.cu | 4 +- candle-kernels/src/unary.cu | 4 +- 5 files changed, 54 insertions(+), 19 deletions(-) diff --git a/candle-core/src/cuda_backend.rs b/candle-core/src/cuda_backend.rs index 52d1b558..8954fc33 100644 --- a/candle-core/src/cuda_backend.rs +++ b/candle-core/src/cuda_backend.rs @@ -11,6 +11,31 @@ use cudarc::driver::{ use half::{bf16, f16}; use std::sync::{Arc, Mutex}; +enum SlicePtrOrNull { + Ptr(CudaSlice), + Null, +} + +unsafe impl DeviceRepr for &SlicePtrOrNull { + fn as_kernel_param(&self) -> *mut std::ffi::c_void { + match self { + SlicePtrOrNull::Ptr(slice) => slice.as_kernel_param(), + SlicePtrOrNull::Null => 0usize.as_kernel_param(), + } + } +} + +impl SlicePtrOrNull { + fn params_from_layout(dev: &CudaDevice, l: &Layout) -> Result { + let ds = if l.is_contiguous() { + SlicePtrOrNull::Null + } else { + SlicePtrOrNull::Ptr(dev.htod_copy([l.dims(), l.stride()].concat()).w()?) + }; + Ok(ds) + } +} + /// cudarc related errors #[derive(thiserror::Error, Debug)] pub enum CudaError { @@ -564,7 +589,7 @@ impl Map1 for Affine { let dims = shape.dims(); let el = shape.elem_count(); let cfg = LaunchConfig::for_num_elems(el as u32); - let ds = dev.htod_copy([dims, layout.stride()].concat()).w()?; + let ds = SlicePtrOrNull::params_from_layout(dev, layout)?; let src = &src.slice(layout.start_offset()..); let func = dev.get_or_load_func(&kernel_name::("affine"), kernels::AFFINE)?; // SAFETY: Set later by running the kernel. @@ -596,7 +621,7 @@ impl Map1 for Elu { let dims = shape.dims(); let el = shape.elem_count(); let cfg = LaunchConfig::for_num_elems(el as u32); - let ds = dev.htod_copy([dims, layout.stride()].concat()).w()?; + let ds = SlicePtrOrNull::params_from_layout(dev, layout)?; let src = &src.slice(layout.start_offset()..); let func = dev.get_or_load_func(&kernel_name::("uelu"), kernels::UNARY)?; // SAFETY: Set later by running the kernel. @@ -719,7 +744,7 @@ impl Map1 for Powf { let dims = shape.dims(); let el = shape.elem_count(); let cfg = LaunchConfig::for_num_elems(el as u32); - let ds = dev.htod_copy([dims, layout.stride()].concat()).w()?; + let ds = SlicePtrOrNull::params_from_layout(dev, layout)?; let src = &src.slice(layout.start_offset()..); let func = dev.get_or_load_func(&kernel_name::("upowf"), kernels::UNARY)?; // SAFETY: Set later by running the kernel. @@ -852,7 +877,7 @@ impl Map1 for U { let dims = shape.dims(); let el_count = shape.elem_count(); let cfg = LaunchConfig::for_num_elems(el_count as u32); - let ds = dev.htod_copy([dims, layout.stride()].concat()).w()?; + let ds = SlicePtrOrNull::params_from_layout(dev, layout)?; let src = &src.slice(layout.start_offset()..); let func = dev.get_or_load_func(&kernel_name::(U::KERNEL), kernels::UNARY)?; // SAFETY: Set later by running the kernel. 
@@ -1402,9 +1427,14 @@ impl Map2 for U { let dims = shape.dims(); let elem_count = shape.elem_count(); let cfg = LaunchConfig::for_num_elems(elem_count as u32); - let dims_and_strides = dev - .htod_copy([dims, lhs_l.stride(), rhs_l.stride()].concat()) - .w()?; + let dims_and_strides = if lhs_l.is_contiguous() && rhs_l.is_contiguous() { + SlicePtrOrNull::Null + } else { + SlicePtrOrNull::Ptr( + dev.htod_copy([dims, lhs_l.stride(), rhs_l.stride()].concat()) + .w()?, + ) + }; let lhs = &lhs.slice(lhs_l.start_offset()..); let rhs = &rhs.slice(rhs_l.start_offset()..); let func = dev.get_or_load_func(&kernel_name::(U::KERNEL), kernels::BINARY)?; @@ -1431,9 +1461,14 @@ impl Map2Any for Cmp { let dims = shape.dims(); let elem_count = shape.elem_count(); let cfg = LaunchConfig::for_num_elems(elem_count as u32); - let dims_and_strides = dev - .htod_copy([dims, lhs_l.stride(), rhs_l.stride()].concat()) - .w()?; + let dims_and_strides = if lhs_l.is_contiguous() && rhs_l.is_contiguous() { + SlicePtrOrNull::Null + } else { + SlicePtrOrNull::Ptr( + dev.htod_copy([dims, lhs_l.stride(), rhs_l.stride()].concat()) + .w()?, + ) + }; let lhs = &lhs.slice(lhs_l.start_offset()..); let rhs = &rhs.slice(rhs_l.start_offset()..); let name = match self.0 { @@ -1640,7 +1675,7 @@ impl BackendStorage for CudaStorage { let el = shape.elem_count(); let cfg = LaunchConfig::for_num_elems(el as u32); let dev = self.device(); - let ds = dev.htod_copy([dims, layout.stride()].concat()).w()?; + let ds = SlicePtrOrNull::params_from_layout(dev, layout)?; let start_o = layout.start_offset(); // This returns an i64 rather than a &i64, this is useful to get around some temporary // lifetime issue and is safe as long as self.slice does not go out of scope before inp @@ -2215,7 +2250,7 @@ impl BackendStorage for CudaStorage { } let cfg = LaunchConfig::for_num_elems(el_count as u32); let dev = &self.device; - let ds = dev.htod_copy([dims, src_l.stride()].concat()).w()?; + let ds = SlicePtrOrNull::params_from_layout(dev, src_l)?; match (&self.slice, &mut dst.slice) { (CudaStorageSlice::BF16(src), CudaStorageSlice::BF16(dst)) => { let (src, mut dst) = slice_src_and_dst(src, src_l, dst, dst_offset); diff --git a/candle-kernels/src/affine.cu b/candle-kernels/src/affine.cu index 152b9463..540d0819 100644 --- a/candle-kernels/src/affine.cu +++ b/candle-kernels/src/affine.cu @@ -13,7 +13,7 @@ extern "C" __global__ void FN_NAME( \ ) { \ const size_t *dims = info; \ const size_t *strides = info + num_dims; \ - if (is_contiguous(num_dims, dims, strides)) { \ + if (info == nullptr || is_contiguous(num_dims, dims, strides)) { \ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \ TYPENAME x = inp ? 
inp[i] : out[i]; \ out[i] = x * mul + add; \ diff --git a/candle-kernels/src/binary_op_macros.cuh b/candle-kernels/src/binary_op_macros.cuh index 05d0c3df..9cb00874 100644 --- a/candle-kernels/src/binary_op_macros.cuh +++ b/candle-kernels/src/binary_op_macros.cuh @@ -12,8 +12,8 @@ extern "C" __global__ void FN_NAME( \ const size_t *dims = dims_and_strides; \ const size_t *lhs_strides = dims_and_strides + 1 * num_dims; \ const size_t *rhs_strides = dims_and_strides + 2 * num_dims; \ - bool lhs_cont = is_contiguous(num_dims, dims, lhs_strides); \ - bool rhs_cont = is_contiguous(num_dims, dims, rhs_strides); \ + bool lhs_cont = dims_and_strides == nullptr || is_contiguous(num_dims, dims, lhs_strides); \ + bool rhs_cont = dims_and_strides == nullptr || is_contiguous(num_dims, dims, rhs_strides); \ if (lhs_cont && rhs_cont) { \ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \ TYPENAME x = lhs[i]; \ diff --git a/candle-kernels/src/cast.cu b/candle-kernels/src/cast.cu index 024642c6..2fe85e1c 100644 --- a/candle-kernels/src/cast.cu +++ b/candle-kernels/src/cast.cu @@ -11,7 +11,7 @@ __device__ void cast_( ) { const size_t *dims = info; const size_t *strides = info + num_dims; - if (is_contiguous(num_dims, dims, strides)) { + if (info == nullptr || is_contiguous(num_dims, dims, strides)) { for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { out[i] = inp[i]; } @@ -34,7 +34,7 @@ __device__ void cast_through( ) { const size_t *dims = info; const size_t *strides = info + num_dims; - if (is_contiguous(num_dims, dims, strides)) { + if (info == nullptr || is_contiguous(num_dims, dims, strides)) { for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { out[i] = static_cast(static_cast(inp[i])); } diff --git a/candle-kernels/src/unary.cu b/candle-kernels/src/unary.cu index 74ba1fac..13489897 100644 --- a/candle-kernels/src/unary.cu +++ b/candle-kernels/src/unary.cu @@ -13,7 +13,7 @@ extern "C" __global__ void FN_NAME( \ ) { \ const size_t *dims = info; \ const size_t *strides = info + num_dims; \ - if (is_contiguous(num_dims, dims, strides)) { \ + if (info == nullptr || is_contiguous(num_dims, dims, strides)) { \ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \ TYPENAME x = inp ? inp[i] : out[i]; \ out[i] = FUNC; \ @@ -71,7 +71,7 @@ extern "C" __global__ void FN_NAME( \ ) { \ const size_t *dims = info; \ const size_t *strides = info + num_dims; \ - if (is_contiguous(num_dims, dims, strides)) { \ + if (info == nullptr || is_contiguous(num_dims, dims, strides)) { \ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \ TYPENAME x = inp ? inp[i] : out[i]; \ out[i] = FUNC; \ From af7f8b87d35e2ee595cf871c3401beed4dc9b3d8 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 21 Mar 2024 06:36:28 +0100 Subject: [PATCH 036/131] Custom op for RmsNorm (#1890) * Trying out a custom RmsNorm cuda kernel. * CPU implementation for rms-norm. * Cuda wrappers. * Add some validation. * Add some testing. * More testing. 
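For reference, the operation implemented by both the fused kernel and the slow
path is, for each row x of length n with a per-channel scale alpha:

    y_i = alpha_i * x_i / sqrt(mean_j(x_j^2) + eps)

A scalar sketch of the same semantics (standalone illustration only, accumulating
in f32 the way the kernel does for f16/bf16 inputs):

    fn rms_norm_row(x: &[f32], alpha: &[f32], eps: f32) -> Vec<f32> {
        // Root mean square of the row, with eps added for numerical stability.
        let mean_sq = x.iter().map(|v| v * v).sum::<f32>() / x.len() as f32;
        let scale = 1.0 / (mean_sq + eps).sqrt();
        x.iter().zip(alpha).map(|(v, a)| v * scale * a).collect()
    }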
--- candle-kernels/src/reduce.cu | 65 +++++++++++++ candle-nn/src/ops.rs | 171 ++++++++++++++++++++++++++++++++++- candle-nn/tests/ops.rs | 34 ++++++- 3 files changed, 262 insertions(+), 8 deletions(-) diff --git a/candle-kernels/src/reduce.cu b/candle-kernels/src/reduce.cu index fca6865e..19fb213a 100644 --- a/candle-kernels/src/reduce.cu +++ b/candle-kernels/src/reduce.cu @@ -2,6 +2,7 @@ #include #include +#define WARP_SIZE 32 const int BLOCK_SIZE = 1024; // TODO: Maybe add some fast_sum_f16_f32 variant that not only accumulate in f32 @@ -49,6 +50,59 @@ fast_sum(const size_t src_numel, const size_t el_to_sum_per_block, dst[dst_id] = shr[0]; } +static __device__ __forceinline__ float warp_reduce_sum(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, mask, 32); + } + return x; +} + +// RmsNorm implementation adapted from ggml, accumulation is made using f32. +// https://github.com/ggerganov/llama.cpp/blob/d59bd97065cd7ded6c4ecab54b1d5e0b1b11e318/ggml-cuda.cu#L523 +template +__device__ void rmsnorm(const T * x, T * dst, const T * alpha, const int ncols, const float eps) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + const int block_size = blockDim.x; + + float tmp = 0.0f; // partial sum for thread in warp + + for (int col = tid; col < ncols; col += block_size) { + const float xi = static_cast(x[row*ncols + col]); + tmp += xi * xi; + } + + // sum up partial sums + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + __shared__ float s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + __syncthreads(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + const float mean = tmp / ncols; + const float scale = rsqrtf(mean + eps); + + if (alpha == nullptr) { + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = static_cast(scale * static_cast(x[row*ncols + col])); + } + } + else { + for (int col = tid; col < ncols; col += block_size) { + float a = static_cast(alpha[col]); + dst[row*ncols + col] = static_cast(scale * static_cast(x[row*ncols + col]) * a); + } + } +} + // Softmax implementation adapted from ggml. 
// https://github.com/ggerganov/llama.cpp/blob/d59bd97065cd7ded6c4ecab54b1d5e0b1b11e318/ggml-cuda.cu#L4159 template @@ -341,14 +395,23 @@ fast_argmax(const size_t src_numel, const size_t el_to_sum_per_block, softmax(src, dst, n_cols); \ } \ +#define RMSNORM_OP(TYPENAME, FN_NAME) \ + extern "C" __global__ void FN_NAME( \ + const TYPENAME *src, TYPENAME *dst, const TYPENAME *alpha, \ + const int n_cols, const float eps) { \ + rmsnorm(src, dst, alpha, n_cols, eps); \ + } \ + #if __CUDA_ARCH__ >= 800 SOFTMAX_OP(__nv_bfloat16, float, softmax_bf16) +RMSNORM_OP(__nv_bfloat16, rmsnorm_bf16) SUM_OP(__nv_bfloat16, sum_bf16) FAST_OP(__nv_bfloat16, fast_min_bf16, fast_max_bf16, fast_argmin_bf16, fast_argmax_bf16, fast_sum_bf16) #endif #if __CUDA_ARCH__ >= 530 SOFTMAX_OP(__half, float, softmax_f16) +RMSNORM_OP(__half, rmsnorm_f16) SUM_OP(__half, sum_f16) FAST_OP(__half, fast_min_f16, fast_max_f16, fast_argmin_f16, fast_argmax_f16, fast_sum_f16) #endif @@ -358,6 +421,8 @@ SUM_OP(double, sum_f64) SUM_OP(uint32_t, sum_u32) SOFTMAX_OP(float, float, softmax_f32) SOFTMAX_OP(double, double, softmax_f64) +RMSNORM_OP(float, rmsnorm_f32) +RMSNORM_OP(double, rmsnorm_f64) FAST_OP(float, fast_min_f32, fast_max_f32, fast_argmin_f32, fast_argmax_f32, fast_sum_f32) FAST_OP(double, fast_min_f64, fast_max_f64, fast_argmin_f64, fast_argmax_f64, fast_sum_f64) diff --git a/candle-nn/src/ops.rs b/candle-nn/src/ops.rs index 88d1b3d6..d725bdc2 100644 --- a/candle-nn/src/ops.rs +++ b/candle-nn/src/ops.rs @@ -1,4 +1,4 @@ -use candle::{CpuStorage, Layout, Result, Shape, Tensor}; +use candle::{CpuStorage, DType, Layout, Result, Shape, Tensor}; use rayon::prelude::*; /// Applies the softmax function to the input tensor, rescaling the element so that elements on @@ -180,11 +180,10 @@ impl candle::CustomOp1 for SoftmaxLastDim { block_dim: (1, 32, 1), shared_mem_bytes: 0, }; - let src = &src.slice(layout.start_offset()..); let func = dev.get_or_load_func(&kernel_name::("softmax"), kernels::REDUCE)?; // SAFETY: Set later by running the kernel. let dst = unsafe { dev.alloc::(el) }.w()?; - let params = (src, &dst, n_cols as i32); + let params = (&src, &dst, n_cols as i32); // SAFETY: ffi. 
unsafe { func.launch(cfg, params) }.w()?; Ok(dst) @@ -207,7 +206,7 @@ impl candle::CustomOp1 for SoftmaxLastDim { storage: &candle::MetalStorage, layout: &Layout, ) -> Result<(candle::MetalStorage, Shape)> { - use candle::{backend::BackendStorage, DType}; + use candle::backend::BackendStorage; let device = storage.device(); let command_buffer = device.command_buffer()?; let kernels = device.kernels(); @@ -248,6 +247,170 @@ pub fn softmax_last_dim(xs: &Tensor) -> Result { xs.apply_op1_no_bwd(&SoftmaxLastDim) } +#[derive(Debug, Clone)] +struct RmsNorm { + eps: f32, +} + +impl candle::CustomOp2 for RmsNorm { + fn name(&self) -> &'static str { + "rms-norm" + } + + fn cpu_fwd( + &self, + s1: &CpuStorage, + l1: &Layout, + s2: &CpuStorage, + l2: &Layout, + ) -> Result<(CpuStorage, Shape)> { + use candle::backend::BackendStorage; + + let eps = self.eps; + fn inner< + T: candle::WithDType + + num_traits::Float + + num_traits::AsPrimitive + + num_traits::FromPrimitive, + >( + src: &[T], + layout: &Layout, + alpha: &[T], + alpha_layout: &Layout, + eps: f32, + ) -> Result<(CpuStorage, Shape)> { + let src = match layout.contiguous_offsets() { + None => candle::bail!("input has to be contiguous"), + Some((o1, o2)) => &src[o1..o2], + }; + let alpha = match alpha_layout.contiguous_offsets() { + None => candle::bail!("alpha has to be contiguous"), + Some((o1, o2)) => &alpha[o1..o2], + }; + let el_count = layout.shape().elem_count(); + let dims = layout.shape().dims(); + let dim_m1 = dims[dims.len() - 1]; + let mut dst = vec![T::zero(); el_count]; + src.par_chunks(dim_m1) + .zip(dst.par_chunks_mut(dim_m1)) + .for_each(|(src, dst)| { + let sum2 = src + .iter() + .map(|&v| { + let v = v.as_(); + v * v + }) + .sum::(); + let m = (sum2 / dim_m1 as f32 + eps).sqrt(); + let m = T::from_f32(m).unwrap_or_else(T::nan); + for ((d, s), alpha) in dst.iter_mut().zip(src.iter()).zip(alpha) { + *d = *s / m * *alpha + } + }); + let storage = candle::WithDType::to_cpu_storage_owned(dst); + Ok((storage, Shape::from_dims(dims))) + } + + use CpuStorage as C; + match (s1, s2) { + (C::BF16(s1), C::BF16(s2)) => inner::(s1, l1, s2, l2, eps), + (C::F16(s1), C::F16(s2)) => inner::(s1, l1, s2, l2, eps), + (C::F32(s1), C::F32(s2)) => inner::(s1, l1, s2, l2, eps), + _ => candle::bail!("unsupported dtype for rmsnorm {:?}", s1.dtype()), + } + } + + #[cfg(feature = "cuda")] + fn cuda_fwd( + &self, + s1: &candle::CudaStorage, + l1: &Layout, + s2: &candle::CudaStorage, + l2: &Layout, + ) -> Result<(candle::CudaStorage, Shape)> { + use candle::cuda_backend::cudarc::driver::{ + CudaSlice, DeviceRepr, LaunchAsync, LaunchConfig, + }; + use candle::cuda_backend::{kernel_name, kernels, Map2, WrapErr}; + use candle::{CudaDevice, WithDType}; + + struct S { + eps: f32, + } + impl Map2 for S { + fn f( + &self, + src: &CudaSlice, + layout: &Layout, + alpha: &CudaSlice, + alpha_layout: &Layout, + dev: &CudaDevice, + ) -> Result> { + let src = match layout.contiguous_offsets() { + None => candle::bail!("input has to be contiguous"), + Some((o1, o2)) => src.slice(o1..o2), + }; + let alpha = match alpha_layout.contiguous_offsets() { + None => candle::bail!("alpha has to be contiguous"), + Some((o1, o2)) => alpha.slice(o1..o2), + }; + let el = layout.shape().elem_count(); + let dims = layout.shape().dims(); + let dim_m1 = dims[dims.len() - 1]; + let (n_rows, n_cols) = (el / dim_m1, dim_m1); + + let cfg = LaunchConfig { + grid_dim: (n_rows as u32, 1, 1), + block_dim: (1024, 1, 1), + shared_mem_bytes: 0, + }; + let func = 
dev.get_or_load_func(&kernel_name::("rmsnorm"), kernels::REDUCE)?; + // SAFETY: Set later by running the kernel. + let dst = unsafe { dev.alloc::(el) }.w()?; + let params = (&src, &dst, &alpha, n_cols as i32, self.eps); + // SAFETY: ffi. + unsafe { func.launch(cfg, params) }.w()?; + Ok(dst) + } + } + + use candle::backend::BackendStorage; + let dev = s1.device(); + let slice = S { eps: self.eps }.map(&s1.slice, l1, &s2.slice, l2, dev)?; + let dst = candle::cuda_backend::CudaStorage { + slice, + device: dev.clone(), + }; + Ok((dst, l1.shape().clone())) + } +} + +pub fn rms_norm_slow(x: &Tensor, alpha: &Tensor, eps: f32) -> Result { + let x_dtype = x.dtype(); + let internal_dtype = match x_dtype { + DType::F16 | DType::BF16 => DType::F32, + d => d, + }; + let hidden_size = x.dim(candle::D::Minus1)?; + let x = x.to_dtype(internal_dtype)?; + let norm_x = (x.sqr()?.sum_keepdim(candle::D::Minus1)? / hidden_size as f64)?; + let x_normed = x.broadcast_div(&(norm_x + eps as f64)?.sqrt()?)?; + x_normed.to_dtype(x_dtype)?.broadcast_mul(alpha) +} + +pub fn rms_norm(xs: &Tensor, alpha: &Tensor, eps: f32) -> Result { + let hidden_size_xs = xs.dim(candle::D::Minus1)?; + let hidden_size_alpha = alpha.dims1()?; + if hidden_size_xs != hidden_size_alpha { + candle::bail!( + "shape mismatch in rms-norm {:?} {:?}", + xs.shape(), + alpha.shape() + ) + } + xs.apply_op2_no_bwd(alpha, &RmsNorm { eps }) +} + // https://pytorch.org/docs/stable/generated/torch.nn.PixelShuffle.html pub fn pixel_shuffle(xs: &Tensor, upscale_factor: usize) -> Result { let (b_size, c, h, w) = xs.dims4()?; diff --git a/candle-nn/tests/ops.rs b/candle-nn/tests/ops.rs index 5ca01b37..c1e3031f 100644 --- a/candle-nn/tests/ops.rs +++ b/candle-nn/tests/ops.rs @@ -4,11 +4,9 @@ extern crate intel_mkl_src; #[cfg(feature = "accelerate")] extern crate accelerate_src; -use candle::{test_utils::to_vec3_round, Device, Result, Tensor}; +use candle::{test_device, test_utils::to_vec3_round, Device, Result, Tensor}; -#[test] -fn softmax() -> Result<()> { - let device = &Device::Cpu; +fn softmax(device: &Device) -> Result<()> { let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]]; let tensor = Tensor::new(data, device)?; let t0 = candle_nn::ops::softmax(&tensor.log()?, 0)?; @@ -54,6 +52,31 @@ fn softmax() -> Result<()> { Ok(()) } +fn rms_norm(device: &Device) -> Result<()> { + let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]]; + let tensor = Tensor::new(data, device)?; + let alpha = Tensor::new(&[1f32, 2f32, 3f32], device)?; + let t = candle_nn::ops::rms_norm(&tensor, &alpha, 1e-5)?; + assert_eq!( + to_vec3_round(&t, 4)?, + &[ + [[1.019, 0.6794, 4.0762], [0.1674, 1.6744, 4.521]], + [[0.4714, 0.4714, 4.9497], [1.206, 0.603, 3.6181]] + ] + ); + let t2 = candle_nn::ops::rms_norm_slow(&tensor, &alpha, 1e-5)?; + assert_eq!( + to_vec3_round(&t2, 4)?, + &[ + [[1.019, 0.6794, 4.0762], [0.1674, 1.6744, 4.521]], + [[0.4714, 0.4714, 4.9497], [1.206, 0.603, 3.6181]] + ] + ); + let diff = (t - t2)?.abs()?.sum_all()?.to_vec0::()?; + assert!(diff < 1e-5); + Ok(()) +} + #[test] fn softmax_numerical_stability() -> Result<()> { let dev = &Device::Cpu; @@ -62,3 +85,6 @@ fn softmax_numerical_stability() -> Result<()> { assert_eq!(softmax.to_vec1::()?, &[1f32, 0.]); Ok(()) } + +test_device!(softmax, softmax_cpu, softmax_gpu, softmax_metal); +test_device!(rms_norm, rms_norm_cpu, rms_norm_gpu, rms_norm_metal); From 74b7f59261c72010e329fd8eb467c088673671f5 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 21 Mar 2024 07:02:20 
+0100 Subject: [PATCH 037/131] Prepare for the custom-op extension. (#1892) --- candle-core/src/custom_op.rs | 244 +++++++++++++++++++++++++++++++++++ candle-core/src/lib.rs | 3 +- candle-core/src/op.rs | 159 +---------------------- candle-core/src/storage.rs | 3 +- candle-core/src/tensor.rs | 94 +------------- 5 files changed, 256 insertions(+), 247 deletions(-) create mode 100644 candle-core/src/custom_op.rs diff --git a/candle-core/src/custom_op.rs b/candle-core/src/custom_op.rs new file mode 100644 index 00000000..3dc66e94 --- /dev/null +++ b/candle-core/src/custom_op.rs @@ -0,0 +1,244 @@ +use crate::op::{BackpropOp, Op}; +use crate::tensor::from_storage; +use crate::{CpuStorage, CudaStorage, Layout, MetalStorage, Result, Shape, Tensor}; +use std::sync::Arc; + +/// Unary ops that can be defined in user-land. +pub trait CustomOp1 { + // Box does not support const yet, so use a function to get the name. + fn name(&self) -> &'static str; + + /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)>; + + /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result<(CudaStorage, Shape)> { + Err(crate::Error::Cuda( + format!("no cuda implementation for {}", self.name()).into(), + )) + } + + /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn metal_fwd( + &self, + _storage: &MetalStorage, + _layout: &Layout, + ) -> Result<(MetalStorage, Shape)> { + Err(crate::Error::Metal( + format!("no metal implementation for {}", self.name()).into(), + )) + } + + /// This function takes as argument the argument `arg` used in the forward pass, the result + /// produced by the forward operation `res` and the gradient of the result `grad_res`. + /// The function should return the gradient of the argument. + fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Result> { + Err(crate::Error::BackwardNotSupported { op: self.name() }) + } +} + +pub trait CustomOp2 { + fn name(&self) -> &'static str; + + /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn cpu_fwd( + &self, + s1: &CpuStorage, + l1: &Layout, + s2: &CpuStorage, + l2: &Layout, + ) -> Result<(CpuStorage, Shape)>; + + /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn cuda_fwd( + &self, + _: &CudaStorage, + _: &Layout, + _: &CudaStorage, + _: &Layout, + ) -> Result<(CudaStorage, Shape)> { + Err(crate::Error::Cuda( + format!("no cuda implementation for {}", self.name()).into(), + )) + } + + /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. 
+ fn metal_fwd( + &self, + _: &MetalStorage, + _: &Layout, + _: &MetalStorage, + _: &Layout, + ) -> Result<(MetalStorage, Shape)> { + Err(crate::Error::Metal( + format!("no metal implementation for {}", self.name()).into(), + )) + } + + fn bwd( + &self, + _arg1: &Tensor, + _arg2: &Tensor, + _res: &Tensor, + _grad_res: &Tensor, + ) -> Result<(Option, Option)> { + Err(crate::Error::BackwardNotSupported { op: self.name() }) + } +} + +pub trait CustomOp3 { + fn name(&self) -> &'static str; + + /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn cpu_fwd( + &self, + s1: &CpuStorage, + l1: &Layout, + s2: &CpuStorage, + l2: &Layout, + s3: &CpuStorage, + l3: &Layout, + ) -> Result<(CpuStorage, Shape)>; + + /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn cuda_fwd( + &self, + _: &CudaStorage, + _: &Layout, + _: &CudaStorage, + _: &Layout, + _: &CudaStorage, + _: &Layout, + ) -> Result<(CudaStorage, Shape)> { + Err(crate::Error::Cuda( + format!("no cuda implementation for {}", self.name()).into(), + )) + } + + /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn metal_fwd( + &self, + _: &MetalStorage, + _: &Layout, + _: &MetalStorage, + _: &Layout, + _: &MetalStorage, + _: &Layout, + ) -> Result<(MetalStorage, Shape)> { + Err(crate::Error::Metal( + format!("no metal implementation for {}", self.name()).into(), + )) + } + + fn bwd( + &self, + _arg1: &Tensor, + _arg2: &Tensor, + _arg3: &Tensor, + _res: &Tensor, + _grad_res: &Tensor, + ) -> Result<(Option, Option, Option)> { + Err(crate::Error::BackwardNotSupported { op: self.name() }) + } +} + +impl Tensor { + /// Applies a unary custom op without backward support + pub fn apply_op1_no_bwd(&self, c: &C) -> Result { + let (storage, shape) = self.storage().apply_op1(self.layout(), c)?; + Ok(from_storage(storage, shape, BackpropOp::none(), false)) + } + + /// Applies a binary custom op without backward support + pub fn apply_op2_no_bwd(&self, rhs: &Self, c: &C) -> Result { + let (storage, shape) = + self.storage() + .apply_op2(self.layout(), &rhs.storage(), rhs.layout(), c)?; + Ok(from_storage(storage, shape, BackpropOp::none(), false)) + } + + /// Applies a ternary custom op without backward support + pub fn apply_op3_no_bwd(&self, t2: &Self, t3: &Self, c: &C) -> Result { + let (storage, shape) = self.storage().apply_op3( + self.layout(), + &t2.storage(), + t2.layout(), + &t3.storage(), + t3.layout(), + c, + )?; + Ok(from_storage(storage, shape, BackpropOp::none(), false)) + } + + /// Applies a unary custom op. + pub fn apply_op1_arc(&self, c: Arc>) -> Result { + let (storage, shape) = self + .storage() + .apply_op1(self.layout(), c.as_ref().as_ref())?; + let op = BackpropOp::new1(self, |s| Op::CustomOp1(s, c.clone())); + Ok(from_storage(storage, shape, op, false)) + } + + pub fn apply_op1(&self, c: C) -> Result { + self.apply_op1_arc(Arc::new(Box::new(c))) + } + + /// Applies a binary custom op. 
+ pub fn apply_op2_arc( + &self, + rhs: &Self, + c: Arc>, + ) -> Result { + let (storage, shape) = self.storage().apply_op2( + self.layout(), + &rhs.storage(), + rhs.layout(), + c.as_ref().as_ref(), + )?; + let op = BackpropOp::new2(self, rhs, |t1, t2| Op::CustomOp2(t1, t2, c.clone())); + Ok(from_storage(storage, shape, op, false)) + } + + pub fn apply_op2(&self, r: &Self, c: C) -> Result { + self.apply_op2_arc(r, Arc::new(Box::new(c))) + } + + /// Applies a ternary custom op. + pub fn apply_op3_arc( + &self, + t2: &Self, + t3: &Self, + c: Arc>, + ) -> Result { + let (storage, shape) = self.storage().apply_op3( + self.layout(), + &t2.storage(), + t2.layout(), + &t3.storage(), + t3.layout(), + c.as_ref().as_ref(), + )?; + let op = BackpropOp::new3(self, t2, t3, |t1, t2, t3| { + Op::CustomOp3(t1, t2, t3, c.clone()) + }); + Ok(from_storage(storage, shape, op, false)) + } + + pub fn apply_op3( + &self, + t2: &Self, + t3: &Self, + c: C, + ) -> Result { + self.apply_op3_arc(t2, t3, Arc::new(Box::new(c))) + } +} diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs index 31ef1169..87e60d9b 100644 --- a/candle-core/src/lib.rs +++ b/candle-core/src/lib.rs @@ -45,6 +45,7 @@ pub mod cpu_backend; pub mod cuda_backend; #[cfg(feature = "cudnn")] pub mod cudnn; +mod custom_op; mod device; pub mod display; mod dtype; @@ -73,12 +74,12 @@ pub mod utils; mod variable; pub use cpu_backend::CpuStorage; +pub use custom_op::{CustomOp1, CustomOp2, CustomOp3}; pub use device::{Device, DeviceLocation, NdArray}; pub use dtype::{DType, FloatDType, IntDType, WithDType}; pub use error::{Error, Result}; pub use indexer::IndexOp; pub use layout::Layout; -pub use op::{CustomOp1, CustomOp2, CustomOp3}; pub use shape::{Shape, D}; pub use storage::Storage; pub use strided_index::{StridedBlocks, StridedIndex}; diff --git a/candle-core/src/op.rs b/candle-core/src/op.rs index 022b4fc3..3b34eb75 100644 --- a/candle-core/src/op.rs +++ b/candle-core/src/op.rs @@ -1,5 +1,5 @@ #![allow(clippy::redundant_closure_call)] -use crate::{CpuStorage, CudaStorage, Layout, MetalStorage, Result, Shape, Tensor}; +use crate::Tensor; use half::{bf16, f16}; use num_traits::float::Float; @@ -161,168 +161,23 @@ pub enum Op { Permute(Tensor, Vec), Elu(Tensor, f64), Powf(Tensor, f64), - CustomOp1(Tensor, std::sync::Arc>), + CustomOp1( + Tensor, + std::sync::Arc>, + ), CustomOp2( Tensor, Tensor, - std::sync::Arc>, + std::sync::Arc>, ), CustomOp3( Tensor, Tensor, Tensor, - std::sync::Arc>, + std::sync::Arc>, ), } -/// Unary ops that can be defined in user-land. -pub trait CustomOp1 { - // Box does not support const yet, so use a function to get the name. - fn name(&self) -> &'static str; - - /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, - /// offsets etc so the associated layout should be used to access it. - fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)>; - - /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, - /// offsets etc so the associated layout should be used to access it. - fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result<(CudaStorage, Shape)> { - Err(crate::Error::Cuda( - format!("no cuda implementation for {}", self.name()).into(), - )) - } - - /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, - /// offsets etc so the associated layout should be used to access it. 
- fn metal_fwd( - &self, - _storage: &MetalStorage, - _layout: &Layout, - ) -> Result<(MetalStorage, Shape)> { - Err(crate::Error::Metal( - format!("no metal implementation for {}", self.name()).into(), - )) - } - - /// This function takes as argument the argument `arg` used in the forward pass, the result - /// produced by the forward operation `res` and the gradient of the result `grad_res`. - /// The function should return the gradient of the argument. - fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Result> { - Err(crate::Error::BackwardNotSupported { op: self.name() }) - } -} - -pub trait CustomOp2 { - fn name(&self) -> &'static str; - - /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, - /// offsets etc so the associated layout should be used to access it. - fn cpu_fwd( - &self, - s1: &CpuStorage, - l1: &Layout, - s2: &CpuStorage, - l2: &Layout, - ) -> Result<(CpuStorage, Shape)>; - - /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, - /// offsets etc so the associated layout should be used to access it. - fn cuda_fwd( - &self, - _: &CudaStorage, - _: &Layout, - _: &CudaStorage, - _: &Layout, - ) -> Result<(CudaStorage, Shape)> { - Err(crate::Error::Cuda( - format!("no cuda implementation for {}", self.name()).into(), - )) - } - - /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, - /// offsets etc so the associated layout should be used to access it. - fn metal_fwd( - &self, - _: &MetalStorage, - _: &Layout, - _: &MetalStorage, - _: &Layout, - ) -> Result<(MetalStorage, Shape)> { - Err(crate::Error::Metal( - format!("no metal implementation for {}", self.name()).into(), - )) - } - - fn bwd( - &self, - _arg1: &Tensor, - _arg2: &Tensor, - _res: &Tensor, - _grad_res: &Tensor, - ) -> Result<(Option, Option)> { - Err(crate::Error::BackwardNotSupported { op: self.name() }) - } -} - -pub trait CustomOp3 { - fn name(&self) -> &'static str; - - /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, - /// offsets etc so the associated layout should be used to access it. - fn cpu_fwd( - &self, - s1: &CpuStorage, - l1: &Layout, - s2: &CpuStorage, - l2: &Layout, - s3: &CpuStorage, - l3: &Layout, - ) -> Result<(CpuStorage, Shape)>; - - /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, - /// offsets etc so the associated layout should be used to access it. - fn cuda_fwd( - &self, - _: &CudaStorage, - _: &Layout, - _: &CudaStorage, - _: &Layout, - _: &CudaStorage, - _: &Layout, - ) -> Result<(CudaStorage, Shape)> { - Err(crate::Error::Cuda( - format!("no cuda implementation for {}", self.name()).into(), - )) - } - - /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, - /// offsets etc so the associated layout should be used to access it. 
- fn metal_fwd( - &self, - _: &MetalStorage, - _: &Layout, - _: &MetalStorage, - _: &Layout, - _: &MetalStorage, - _: &Layout, - ) -> Result<(MetalStorage, Shape)> { - Err(crate::Error::Metal( - format!("no metal implementation for {}", self.name()).into(), - )) - } - - fn bwd( - &self, - _arg1: &Tensor, - _arg2: &Tensor, - _arg3: &Tensor, - _res: &Tensor, - _grad_res: &Tensor, - ) -> Result<(Option, Option, Option)> { - Err(crate::Error::BackwardNotSupported { op: self.name() }) - } -} - pub trait UnaryOpT { const NAME: &'static str; const KERNEL: &'static str; diff --git a/candle-core/src/storage.rs b/candle-core/src/storage.rs index 3bd4b022..ffc75188 100644 --- a/candle-core/src/storage.rs +++ b/candle-core/src/storage.rs @@ -1,6 +1,7 @@ use crate::backend::BackendStorage; -use crate::op::{self, CmpOp, CustomOp1, CustomOp2, CustomOp3, ReduceOp}; +use crate::op::{self, CmpOp, ReduceOp}; use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage, Result, Shape}; +use crate::{CustomOp1, CustomOp2, CustomOp3}; // We do not want to implement Clone on Storage as cloning may fail because of // out of memory. Instead try_clone should be used. diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs index feab30c8..d7c2ed66 100644 --- a/candle-core/src/tensor.rs +++ b/candle-core/src/tensor.rs @@ -1,9 +1,7 @@ //! Tensors are N-dimensional matrixes of elements using a single data type. #![allow(clippy::redundant_closure_call)] use crate::backend::{BackendDevice, BackendStorage}; -use crate::op::{ - BackpropOp, BinaryOp, CmpOp, CustomOp1, CustomOp2, CustomOp3, Op, ReduceOp, UnaryOp, -}; +use crate::op::{BackpropOp, BinaryOp, CmpOp, Op, ReduceOp, UnaryOp}; use crate::scalar::TensorOrScalar; use crate::shape::{Dim, Dims}; use crate::{bail, storage::Storage, DType, Device, Error, Layout, Result, Shape}; @@ -2277,96 +2275,6 @@ impl Tensor { std::ptr::eq(lhs, rhs) } - /// Applies a unary custom op without backward support - pub fn apply_op1_no_bwd(&self, c: &C) -> Result { - let (storage, shape) = self.storage().apply_op1(self.layout(), c)?; - Ok(from_storage(storage, shape, BackpropOp::none(), false)) - } - - /// Applies a binary custom op without backward support - pub fn apply_op2_no_bwd(&self, rhs: &Self, c: &C) -> Result { - let (storage, shape) = - self.storage() - .apply_op2(self.layout(), &rhs.storage(), rhs.layout(), c)?; - Ok(from_storage(storage, shape, BackpropOp::none(), false)) - } - - /// Applies a ternary custom op without backward support - pub fn apply_op3_no_bwd(&self, t2: &Self, t3: &Self, c: &C) -> Result { - let (storage, shape) = self.storage().apply_op3( - self.layout(), - &t2.storage(), - t2.layout(), - &t3.storage(), - t3.layout(), - c, - )?; - Ok(from_storage(storage, shape, BackpropOp::none(), false)) - } - - /// Applies a unary custom op. - pub fn apply_op1_arc(&self, c: Arc>) -> Result { - let (storage, shape) = self - .storage() - .apply_op1(self.layout(), c.as_ref().as_ref())?; - let op = BackpropOp::new1(self, |s| Op::CustomOp1(s, c.clone())); - Ok(from_storage(storage, shape, op, false)) - } - - pub fn apply_op1(&self, c: C) -> Result { - self.apply_op1_arc(Arc::new(Box::new(c))) - } - - /// Applies a binary custom op. 
- pub fn apply_op2_arc( - &self, - rhs: &Self, - c: Arc>, - ) -> Result { - let (storage, shape) = self.storage().apply_op2( - self.layout(), - &rhs.storage(), - rhs.layout(), - c.as_ref().as_ref(), - )?; - let op = BackpropOp::new2(self, rhs, |t1, t2| Op::CustomOp2(t1, t2, c.clone())); - Ok(from_storage(storage, shape, op, false)) - } - - pub fn apply_op2(&self, r: &Self, c: C) -> Result { - self.apply_op2_arc(r, Arc::new(Box::new(c))) - } - - /// Applies a ternary custom op. - pub fn apply_op3_arc( - &self, - t2: &Self, - t3: &Self, - c: Arc>, - ) -> Result { - let (storage, shape) = self.storage().apply_op3( - self.layout(), - &t2.storage(), - t2.layout(), - &t3.storage(), - t3.layout(), - c.as_ref().as_ref(), - )?; - let op = BackpropOp::new3(self, t2, t3, |t1, t2, t3| { - Op::CustomOp3(t1, t2, t3, c.clone()) - }); - Ok(from_storage(storage, shape, op, false)) - } - - pub fn apply_op3( - &self, - t2: &Self, - t3: &Self, - c: C, - ) -> Result { - self.apply_op3_arc(t2, t3, Arc::new(Box::new(c))) - } - /// Normalize a 'relative' axis value: positive values are kept, negative /// values means counting the dimensions from the back. pub fn normalize_axis(&self, axis: i64) -> Result { From 0fddec762e3c17c56be5b6356478b9565dd628bb Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 21 Mar 2024 09:48:56 +0100 Subject: [PATCH 038/131] RmsNorm kernel for metal. (#1895) * RmsNorm kernel for metal. * Wrapper for the metal kernel. * Get the ops to actually work. * Fix, get the tests to pass. --- candle-metal-kernels/src/lib.rs | 58 +++++++++++++++++++++++++++ candle-metal-kernels/src/reduce.metal | 56 ++++++++++++++++++++++++++ candle-nn/src/ops.rs | 47 +++++++++++++++++++++- 3 files changed, 160 insertions(+), 1 deletion(-) diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index f12463a4..bab44a05 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -750,6 +750,64 @@ pub fn call_last_softmax( Ok(()) } +#[allow(clippy::too_many_arguments)] +pub fn call_rms_norm( + device: &Device, + command_buffer: &CommandBufferRef, + kernels: &Kernels, + kernel_name: &'static str, + length: usize, + elements_to_sum: usize, + eps: f32, + input: &Buffer, + input_offset: usize, + alpha: &Buffer, + alpha_offset: usize, + output: &Buffer, +) -> Result<(), MetalKernelError> { + let pipeline = kernels.load_pipeline(device, Source::Reduce, kernel_name)?; + let encoder = command_buffer.new_compute_command_encoder(); + encoder.set_compute_pipeline_state(&pipeline); + + set_params!( + encoder, + ( + length, + elements_to_sum, + (input, input_offset), + output, + (alpha, alpha_offset), + eps + ) + ); + + let out_length = length / elements_to_sum; + + let thread_group_count = MTLSize { + width: out_length as u64, + height: 1, + depth: 1, + }; + + let width = std::cmp::min( + pipeline.max_total_threads_per_threadgroup(), + elements_to_sum as u64, + ) + .next_power_of_two(); + + let thread_group_size = MTLSize { + width, + height: 1, + depth: 1, + }; + + encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(output, metal::MTLResourceUsage::Write); + encoder.dispatch_thread_groups(thread_group_count, thread_group_size); + encoder.end_encoding(); + Ok(()) +} + #[allow(clippy::too_many_arguments)] pub fn call_affine( device: &Device, diff --git a/candle-metal-kernels/src/reduce.metal b/candle-metal-kernels/src/reduce.metal index 93dac662..3c3cbc14 100644 --- a/candle-metal-kernels/src/reduce.metal +++ 
b/candle-metal-kernels/src/reduce.metal @@ -260,6 +260,59 @@ kernel void NAME( } \ } \ +#define RMSNORM(NAME, T) \ +kernel void NAME( \ + constant size_t &src_numel, \ + constant size_t &el_to_sum_per_block, \ + device const T *src, \ + device T *dst, \ + device const T *alpha, \ + constant float &eps, \ + \ + uint id [[ thread_position_in_grid ]], \ + uint tid [[ thread_index_in_threadgroup ]], \ + uint dst_id [[ threadgroup_position_in_grid ]], \ + uint block_dim [[ threads_per_threadgroup ]] \ +) { \ + threadgroup float shared_memory[THREADGROUP_SIZE]; \ + shared_memory[tid] = 0; \ + size_t start_idx = dst_id * el_to_sum_per_block; \ + size_t stop_idx = min(start_idx + el_to_sum_per_block, src_numel); \ + size_t idx = start_idx + tid; \ + \ + \ + float tmp = 0; \ + while (idx < stop_idx) { \ + tmp = tmp + float(src[idx]) * float(src[idx]); \ + idx += block_dim; \ + } \ + shared_memory[tid] = tmp; \ + \ + threadgroup_barrier(mem_flags::mem_threadgroup); \ + \ + for (uint s = block_dim / 2; s > 0; s >>= 1) { \ + if (tid < s) { \ + shared_memory[tid] = shared_memory[tid] + shared_memory[tid + s]; \ + } \ + threadgroup_barrier(mem_flags::mem_threadgroup); \ + } \ + \ + /* wait for shared_memory[0] to be filled */ \ + threadgroup_barrier(mem_flags::mem_threadgroup); \ + \ + float norm = sqrt(shared_memory[0] / float(el_to_sum_per_block) + eps); \ + float inv_norm = 1.0f / norm; \ + idx = start_idx + tid; \ + while (idx < stop_idx) { \ + float val = float(src[idx]) * inv_norm; \ + if (alpha != nullptr) { \ + val *= float(alpha[idx - start_idx]); \ + } \ + dst[idx] = T(val); \ + idx += block_dim; \ + } \ +} \ + REDUCE(x + y, fast_sum_f32_strided, float, 0) REDUCE(x + y, fast_sum_u32_strided, uint, 0) REDUCE(x + y, fast_sum_f16_strided, half, 0) @@ -286,6 +339,8 @@ ARGMAX(fast_argmax_u8_strided, uint8_t, 0) SOFTMAX(softmax_f32, float) SOFTMAX(softmax_f16, half) +RMSNORM(rmsnorm_f32, float) +RMSNORM(rmsnorm_f16, half) #if __METAL_VERSION__ >= 220 REDUCE(x + y, fast_sum_i64_strided, int64_t, 0) @@ -303,4 +358,5 @@ REDUCE(MIN(x, y), fast_min_bf16, bfloat, HUGE_VALBF) ARGMIN(fast_argmin_bf16, bfloat, HUGE_VALBF) ARGMAX(fast_argmax_bf16, bfloat, -HUGE_VALBF) SOFTMAX(softmax_bf16, bfloat) +RMSNORM(rmsnorm_bf16, bfloat) #endif diff --git a/candle-nn/src/ops.rs b/candle-nn/src/ops.rs index d725bdc2..1dac8c3b 100644 --- a/candle-nn/src/ops.rs +++ b/candle-nn/src/ops.rs @@ -236,7 +236,7 @@ impl candle::CustomOp1 for SoftmaxLastDim { layout.start_offset() * storage.dtype().size_in_bytes(), &output, ) - .unwrap(); + .map_err(candle::Error::wrap)?; let newstorage = candle::MetalStorage::new(output, device.clone(), elem_count, storage.dtype()); Ok((newstorage, layout.shape().clone())) @@ -383,6 +383,51 @@ impl candle::CustomOp2 for RmsNorm { }; Ok((dst, l1.shape().clone())) } + + #[cfg(feature = "metal")] + fn metal_fwd( + &self, + s1: &candle::MetalStorage, + l1: &Layout, + s2: &candle::MetalStorage, + l2: &Layout, + ) -> Result<(candle::MetalStorage, Shape)> { + use candle::backend::BackendStorage; + let device = s1.device(); + let command_buffer = device.command_buffer()?; + let kernels = device.kernels(); + let name = match (s1.dtype(), s2.dtype()) { + (DType::F32, DType::F32) => "rmsnorm_f32", + (DType::F16, DType::F16) => "rmsnorm_f16", + (DType::BF16, DType::BF16) => "rmsnorm_bf16", + (dt1, dt2) => candle::bail!("rmsnorm is not implemented for {dt1:?} {dt2:?}"), + }; + + if !(l1.is_contiguous() && l2.is_contiguous()) { + candle::bail!("Non contiguous rmsnorm is not implemented"); + } + + let last_dim 
= l1.dims()[l1.shape().rank() - 1]; + let elem_count = l1.shape().elem_count(); + let output = device.new_buffer(elem_count, s1.dtype(), "rmsnorm")?; + candle_metal_kernels::call_rms_norm( + device.metal_device(), + &command_buffer, + kernels, + name, + elem_count, + last_dim, + self.eps, + s1.buffer(), + l1.start_offset() * s1.dtype().size_in_bytes(), + s2.buffer(), + l2.start_offset() * s2.dtype().size_in_bytes(), + &output, + ) + .map_err(candle::Error::wrap)?; + let newstorage = candle::MetalStorage::new(output, device.clone(), elem_count, s1.dtype()); + Ok((newstorage, l1.shape().clone())) + } } pub fn rms_norm_slow(x: &Tensor, alpha: &Tensor, eps: f32) -> Result { From 18036c6ccbd216e0397fa6469bb50a8b8ea47892 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 21 Mar 2024 10:56:41 +0100 Subject: [PATCH 039/131] Update the image crate + use the re-exported version. (#1893) * Update the image crate + use the re-exported version. * Update to using ab_glyph. --- Cargo.toml | 6 +-- candle-examples/Cargo.toml | 2 +- candle-examples/examples/segformer/main.rs | 2 +- candle-examples/examples/yolo-v8/main.rs | 45 +++++++++++----------- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1a3e1983..1f0067b7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ categories = ["science"] license = "MIT OR Apache-2.0" [workspace.dependencies] +ab_glyph = "0.2.23" accelerate-src = { version = "0.3.2" } anyhow = { version = "1", features = ["backtrace"] } byteorder = "1.4.3" @@ -46,8 +47,8 @@ fancy-regex = "0.13.0" gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] } hf-hub = "0.3.0" half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] } -image = { version = "0.24.7", default-features = false, features = ["jpeg", "png"] } -imageproc = { version = "0.23.0", default-features = false } +image = { version = "0.25.0", default-features = false, features = ["jpeg", "png"] } +imageproc = { version = "0.24.0", default-features = false } intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] } libc = { version = "0.2.147" } log = "0.4" @@ -58,7 +59,6 @@ parquet = { version = "50.0.0" } rand = "0.8.5" rand_distr = "0.4.3" rayon = "1.7.0" -rusttype = { version = "0.9", default-features = false } safetensors = "0.4.1" serde = { version = "1.0.171", features = ["derive"] } serde_plain = "1.0.2" diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml index 108ca32a..864d2f6a 100644 --- a/candle-examples/Cargo.toml +++ b/candle-examples/Cargo.toml @@ -42,7 +42,7 @@ clap = { workspace = true } imageproc = { workspace = true } memmap2 = { workspace = true } rand = { workspace = true } -rusttype = { workspace = true } +ab_glyph = { workspace = true } tracing = { workspace = true } tracing-chrome = { workspace = true } tracing-subscriber = { workspace = true } diff --git a/candle-examples/examples/segformer/main.rs b/candle-examples/examples/segformer/main.rs index 76c9f30e..16db62fc 100644 --- a/candle-examples/examples/segformer/main.rs +++ b/candle-examples/examples/segformer/main.rs @@ -5,7 +5,7 @@ use candle_transformers::models::segformer::{ Config, ImageClassificationModel, SemanticSegmentationModel, }; use clap::{Args, Parser, Subcommand}; -use image::Rgb; +use imageproc::image::Rgb; use imageproc::integral_image::ArrayData; use std::collections::HashMap; use std::path::PathBuf; diff --git a/candle-examples/examples/yolo-v8/main.rs b/candle-examples/examples/yolo-v8/main.rs index 
c65a5ca1..eb338647 100644 --- a/candle-examples/examples/yolo-v8/main.rs +++ b/candle-examples/examples/yolo-v8/main.rs @@ -99,7 +99,7 @@ pub fn report_detect( let h_ratio = initial_h as f32 / h as f32; let mut img = img.to_rgb8(); let font = Vec::from(include_bytes!("roboto-mono-stripped.ttf") as &[u8]); - let font = rusttype::Font::try_from_vec(font); + let font = ab_glyph::FontRef::try_from_slice(&font).map_err(candle::Error::wrap)?; for (class_index, bboxes_for_class) in bboxes.iter().enumerate() { for b in bboxes_for_class.iter() { println!( @@ -119,27 +119,28 @@ pub fn report_detect( ); } if legend_size > 0 { - if let Some(font) = font.as_ref() { - imageproc::drawing::draw_filled_rect_mut( - &mut img, - imageproc::rect::Rect::at(xmin, ymin).of_size(dx as u32, legend_size), - image::Rgb([170, 0, 0]), - ); - let legend = format!( - "{} {:.0}%", - candle_examples::coco_classes::NAMES[class_index], - 100. * b.confidence - ); - imageproc::drawing::draw_text_mut( - &mut img, - image::Rgb([255, 255, 255]), - xmin, - ymin, - rusttype::Scale::uniform(legend_size as f32 - 1.), - font, - &legend, - ) - } + imageproc::drawing::draw_filled_rect_mut( + &mut img, + imageproc::rect::Rect::at(xmin, ymin).of_size(dx as u32, legend_size), + image::Rgb([170, 0, 0]), + ); + let legend = format!( + "{} {:.0}%", + candle_examples::coco_classes::NAMES[class_index], + 100. * b.confidence + ); + imageproc::drawing::draw_text_mut( + &mut img, + image::Rgb([255, 255, 255]), + xmin, + ymin, + ab_glyph::PxScale { + x: legend_size as f32 - 1., + y: legend_size as f32 - 1., + }, + &font, + &legend, + ) } } } From 0c11e055be1e156153b71cff8ea714d5829b9371 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Thu, 21 Mar 2024 16:16:49 +0530 Subject: [PATCH 040/131] support distil-large-v3 (#1898) --- candle-examples/examples/whisper/main.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/candle-examples/examples/whisper/main.rs b/candle-examples/examples/whisper/main.rs index da8c73ae..ecd5ff84 100644 --- a/candle-examples/examples/whisper/main.rs +++ b/candle-examples/examples/whisper/main.rs @@ -374,6 +374,8 @@ enum WhichModel { DistilMediumEn, #[value(name = "distil-large-v2")] DistilLargeV2, + #[value(name = "distil-large-v3")] + DistilLargeV3, } impl WhichModel { @@ -386,7 +388,8 @@ impl WhichModel { | Self::Large | Self::LargeV2 | Self::LargeV3 - | Self::DistilLargeV2 => true, + | Self::DistilLargeV2 + | Self::DistilLargeV3 => true, Self::TinyEn | Self::BaseEn | Self::SmallEn | Self::MediumEn | Self::DistilMediumEn => { false } @@ -408,6 +411,7 @@ impl WhichModel { Self::LargeV3 => ("openai/whisper-large-v3", "main"), Self::DistilMediumEn => ("distil-whisper/distil-medium.en", "main"), Self::DistilLargeV2 => ("distil-whisper/distil-large-v2", "main"), + Self::DistilLargeV3 => ("distil-whisper/distil-large-v3", "main"), } } } From bb3ee48039ed040da48def94f57a6cf1eb0e7911 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Thu, 21 Mar 2024 17:24:09 +0530 Subject: [PATCH 041/131] whisper readme (#1899) --- candle-examples/examples/whisper/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/candle-examples/examples/whisper/README.md b/candle-examples/examples/whisper/README.md index 124cd182..a7dd4081 100644 --- a/candle-examples/examples/whisper/README.md +++ b/candle-examples/examples/whisper/README.md @@ -34,6 +34,7 @@ from the hub. 
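With the distilled checkpoint wired into the example, it can be selected like any other model via the flag documented below; a representative invocation, with audio input flags omitted for brevity:

    cargo run --example whisper --release -- --model distil-large-v3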
- `--timestamps`: enable the timestamp mode where some timestamps are reported for each recognized audio extracts. - `--model`: the model to be used. Models that do not end with `-en` are - multilingual models, other ones are English only models. The supported models - are `tiny`, `tiny.en`, `base`, `base.en`, `small`, `small.en`, `medium`, - `medium.en`, `large`, and `large-v2`. + multilingual models, other ones are English only models. The supported OpenAI + Whisper models are `tiny`, `tiny.en`, `base`, `base.en`, `small`, `small.en`, + `medium`, `medium.en`, `large`, `large-v2` and `large-v3`. The supported + Distil-Whisper models are `distil-medium.en`, `distil-large-v2` and `distil-large-v3`. From ec97c98e81707c8f66db6be22d2df7c8791c55b8 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 21 Mar 2024 13:09:42 +0100 Subject: [PATCH 042/131] Async tensor copying. (#1900) --- candle-core/src/backend.rs | 2 ++ candle-core/src/cpu_backend.rs | 4 +++ candle-core/src/cuda_backend.rs | 37 ++++++++++++++++++++++++++ candle-core/src/device.rs | 8 +++--- candle-core/src/dummy_cuda_backend.rs | 4 +++ candle-core/src/dummy_metal_backend.rs | 4 +++ candle-core/src/metal_backend.rs | 4 +++ 7 files changed, 59 insertions(+), 4 deletions(-) diff --git a/candle-core/src/backend.rs b/candle-core/src/backend.rs index ea1ac1a9..c63aad54 100644 --- a/candle-core/src/backend.rs +++ b/candle-core/src/backend.rs @@ -129,6 +129,8 @@ pub trait BackendDevice: Sized + std::fmt::Debug + Clone { fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result; + fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result; + fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result; fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result; diff --git a/candle-core/src/cpu_backend.rs b/candle-core/src/cpu_backend.rs index 1504d5b8..fa48577c 100644 --- a/candle-core/src/cpu_backend.rs +++ b/candle-core/src/cpu_backend.rs @@ -2814,6 +2814,10 @@ impl BackendDevice for CpuDevice { Ok(s.clone()) } + fn storage_from_cpu_storage_owned(&self, s: CpuStorage) -> Result { + Ok(s) + } + fn new(_: usize) -> Result { Ok(Self) } diff --git a/candle-core/src/cuda_backend.rs b/candle-core/src/cuda_backend.rs index 8954fc33..fec37c39 100644 --- a/candle-core/src/cuda_backend.rs +++ b/candle-core/src/cuda_backend.rs @@ -420,6 +420,43 @@ impl BackendDevice for CudaDevice { device: self.clone(), }) } + + fn storage_from_cpu_storage_owned(&self, storage: CpuStorage) -> Result { + let slice = match storage { + CpuStorage::U8(storage) => { + let data = self.htod_copy(storage).w()?; + CudaStorageSlice::U8(data) + } + CpuStorage::U32(storage) => { + let data = self.htod_copy(storage).w()?; + CudaStorageSlice::U32(data) + } + CpuStorage::I64(storage) => { + let data = self.htod_copy(storage).w()?; + CudaStorageSlice::I64(data) + } + CpuStorage::BF16(storage) => { + let data = self.htod_copy(storage).w()?; + CudaStorageSlice::BF16(data) + } + CpuStorage::F16(storage) => { + let data = self.htod_copy(storage).w()?; + CudaStorageSlice::F16(data) + } + CpuStorage::F32(storage) => { + let data = self.htod_copy(storage).w()?; + CudaStorageSlice::F32(data) + } + CpuStorage::F64(storage) => { + let data = self.htod_copy(storage).w()?; + CudaStorageSlice::F64(data) + } + }; + Ok(CudaStorage { + slice, + device: self.clone(), + }) + } } #[derive(Debug)] diff --git a/candle-core/src/device.rs b/candle-core/src/device.rs index 1e33021b..9c39d27a 100644 --- a/candle-core/src/device.rs +++ b/candle-core/src/device.rs @@ 
-294,12 +294,12 @@ impl Device { Device::Cpu => Ok(Storage::Cpu(array.to_cpu_storage())), Device::Cuda(device) => { let storage = array.to_cpu_storage(); - let storage = device.storage_from_cpu_storage(&storage)?; + let storage = device.storage_from_cpu_storage_owned(storage)?; Ok(Storage::Cuda(storage)) } Device::Metal(device) => { let storage = array.to_cpu_storage(); - let storage = device.storage_from_cpu_storage(&storage)?; + let storage = device.storage_from_cpu_storage_owned(storage)?; Ok(Storage::Metal(storage)) } } @@ -310,12 +310,12 @@ impl Device { Device::Cpu => Ok(Storage::Cpu(S::to_cpu_storage_owned(data))), Device::Cuda(device) => { let storage = S::to_cpu_storage_owned(data); - let storage = device.storage_from_cpu_storage(&storage)?; + let storage = device.storage_from_cpu_storage_owned(storage)?; Ok(Storage::Cuda(storage)) } Device::Metal(device) => { let storage = S::to_cpu_storage_owned(data); - let storage = device.storage_from_cpu_storage(&storage)?; + let storage = device.storage_from_cpu_storage_owned(storage)?; Ok(Storage::Metal(storage)) } } diff --git a/candle-core/src/dummy_cuda_backend.rs b/candle-core/src/dummy_cuda_backend.rs index 43d55fa4..d4887f19 100644 --- a/candle-core/src/dummy_cuda_backend.rs +++ b/candle-core/src/dummy_cuda_backend.rs @@ -214,6 +214,10 @@ impl crate::backend::BackendDevice for CudaDevice { Err(Error::NotCompiledWithCudaSupport) } + fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result { + Err(Error::NotCompiledWithCudaSupport) + } + fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result { Err(Error::NotCompiledWithCudaSupport) } diff --git a/candle-core/src/dummy_metal_backend.rs b/candle-core/src/dummy_metal_backend.rs index 791ec153..33c6c9fe 100644 --- a/candle-core/src/dummy_metal_backend.rs +++ b/candle-core/src/dummy_metal_backend.rs @@ -226,6 +226,10 @@ impl crate::backend::BackendDevice for MetalDevice { Err(Error::NotCompiledWithMetalSupport) } + fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result { + Err(Error::NotCompiledWithMetalSupport) + } + fn rand_uniform(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result { Err(Error::NotCompiledWithMetalSupport) } diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index acc6c445..c4245652 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -1867,6 +1867,10 @@ impl BackendDevice for MetalDevice { )) } + fn storage_from_cpu_storage_owned(&self, storage: CpuStorage) -> Result { + self.storage_from_cpu_storage(&storage) + } + fn rand_uniform( &self, shape: &Shape, From 9563a5fee42f8fef754c238e28ca79725813cea1 Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Thu, 21 Mar 2024 13:08:45 -0400 Subject: [PATCH 043/131] Add support for conv_transpose2d on Metal backend (#1903) * add support for conv transpose 2d and add bench mark for float types * update bench calculation * enable testing all conv operations on metal --- candle-core/benches/bench_main.rs | 3 +- .../benches/benchmarks/conv_transpose2d.rs | 59 +++++++++ candle-core/benches/benchmarks/mod.rs | 1 + candle-core/src/metal_backend.rs | 66 +++++++++- candle-core/tests/conv_tests.rs | 124 ++++++++---------- candle-metal-kernels/src/conv.metal | 86 ++++++++++++ candle-metal-kernels/src/lib.rs | 58 ++++++++ 7 files changed, 321 insertions(+), 76 deletions(-) create mode 100644 candle-core/benches/benchmarks/conv_transpose2d.rs diff --git a/candle-core/benches/bench_main.rs b/candle-core/benches/bench_main.rs index 
162e3f2b..9f94b252 100644 --- a/candle-core/benches/bench_main.rs +++ b/candle-core/benches/bench_main.rs @@ -5,5 +5,6 @@ criterion_main!( benchmarks::affine::benches, benchmarks::matmul::benches, benchmarks::random::benches, - benchmarks::where_cond::benches + benchmarks::where_cond::benches, + benchmarks::conv_transpose2d::benches, ); diff --git a/candle-core/benches/benchmarks/conv_transpose2d.rs b/candle-core/benches/benchmarks/conv_transpose2d.rs new file mode 100644 index 00000000..7b252ec6 --- /dev/null +++ b/candle-core/benches/benchmarks/conv_transpose2d.rs @@ -0,0 +1,59 @@ +use crate::benchmarks::{BenchDevice, BenchDeviceHandler}; +use candle_core::{DType, Device, Tensor}; +use criterion::{black_box, criterion_group, Criterion, Throughput}; +use std::time::Instant; + +fn run( + x: &Tensor, + k: &Tensor, + padding: usize, + output_padding: usize, + stride: usize, + dilation: usize, +) { + x.conv_transpose2d(k, padding, output_padding, stride, dilation) + .unwrap(); +} + +fn run_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) { + let t = Tensor::arange(0.0f32, 10000.0, device) + .unwrap() + .reshape((1, 4, 50, 50)) + .unwrap() + .to_dtype(dtype) + .unwrap(); + + let kernel = Tensor::arange(0.0f32, 100.0, device) + .unwrap() + .reshape((4, 1, 5, 5)) + .unwrap() + .to_dtype(dtype) + .unwrap(); + + let flops = t.dims().iter().product::() * dtype.size_in_bytes(); + + let mut group = c.benchmark_group(device.bench_name(name)); + group.throughput(Throughput::Bytes(flops as u64)); + group.bench_function("iter", move |b| { + b.iter_custom(|iters| { + let start = Instant::now(); + for _i in 0..iters { + run(black_box(&t), black_box(&kernel), 1, 0, 1, 2); + } + device.sync().unwrap(); + start.elapsed() + }) + }); + group.finish(); +} + +fn criterion_benchmark(c: &mut Criterion) { + let handler = BenchDeviceHandler::new().unwrap(); + for device in handler.devices { + run_benchmark(c, &device, DType::F32, "conv_transpose2d_f32"); + run_benchmark(c, &device, DType::F16, "conv_transpose2d_f16"); + run_benchmark(c, &device, DType::BF16, "conv_transpose2d_bf16"); + } +} + +criterion_group!(benches, criterion_benchmark); diff --git a/candle-core/benches/benchmarks/mod.rs b/candle-core/benches/benchmarks/mod.rs index c45effee..a0ffa3eb 100644 --- a/candle-core/benches/benchmarks/mod.rs +++ b/candle-core/benches/benchmarks/mod.rs @@ -1,4 +1,5 @@ pub(crate) mod affine; +pub(crate) mod conv_transpose2d; pub(crate) mod matmul; pub(crate) mod random; pub(crate) mod where_cond; diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index c4245652..4f4162e2 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -2,8 +2,8 @@ use crate::backend::{BackendDevice, BackendStorage}; use crate::conv::{ParamsConv1D, ParamsConv2D, ParamsConvTranspose1D, ParamsConvTranspose2D}; use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT}; use crate::{CpuStorage, DType, Layout, Result, Shape}; -use candle_metal_kernels; use candle_metal_kernels::Kernels; +use candle_metal_kernels::{self, CallConvTranspose2dCfg}; use metal; use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger}; use std::collections::HashMap; @@ -1074,12 +1074,66 @@ impl BackendStorage for MetalStorage { fn conv_transpose2d( &self, - _l: &Layout, - _kernel: &Self, - _kernel_l: &Layout, - _params: &ParamsConvTranspose2D, + l: &Layout, + kernel: &Self, + kernel_l: &Layout, + params: &ParamsConvTranspose2D, ) -> Result { - crate::bail!("Metal 
conv_tranpose2d not implemented") + // Kernel shape: (c_in_k, c_out, h_k, w_k) + // Input shape: (b_size, c_in, h_in, w_in) + let (out_w, out_h) = (params.out_w(), params.out_h()); + let dst_el = params.c_out * out_w * out_h * params.b_size; + + let dims = l.dims(); + if dims.len() != 4 { + crate::bail!("unexpected input shape for conv_transpose2d {dims:?}, expected 4") + } + + let k_dims = kernel_l.dims(); + if k_dims.len() != 4 { + crate::bail!("unexpected kernel shape for conv_transpose2d {k_dims:?}, expected 4") + } + + let buffer = self + .device + .new_buffer(dst_el, self.dtype, "conv_transpose2d")?; + + let command_buffer = self.device.command_buffer()?; + + let name = match self.dtype { + DType::F32 => "conv_transpose2d_f32", + DType::F16 => "conv_transpose2d_f16", + DType::BF16 => "conv_transpose2d_bf16", + dtype => crate::bail!("Metal conv_transpose2d {dtype:?} not implemented"), + }; + + candle_metal_kernels::call_conv_transpose2d( + &self.device.device, + &command_buffer, + &self.device.kernels, + name, + CallConvTranspose2dCfg { + dilation: params.dilation, + stride: params.stride, + padding: params.padding, + output_padding: params.output_padding, + c_out: params.c_out, + out_h: out_h, + out_w: out_w, + b_size: params.b_size, + input_dims: l.dims(), + input_stride: l.stride(), + kernel_dims: kernel_l.dims(), + kernel_stride: kernel_l.stride(), + input_offset: l.start_offset() * self.dtype.size_in_bytes(), + kernel_offset: kernel_l.start_offset() * kernel.dtype.size_in_bytes(), + }, + &self.buffer, + &kernel.buffer, + &buffer, + ) + .map_err(MetalError::from)?; + Ok(Self::new(buffer, self.device.clone(), dst_el, self.dtype)) } fn avg_pool2d( diff --git a/candle-core/tests/conv_tests.rs b/candle-core/tests/conv_tests.rs index 71bf65be..6cc48ec7 100644 --- a/candle-core/tests/conv_tests.rs +++ b/candle-core/tests/conv_tests.rs @@ -163,33 +163,34 @@ fn conv2d(dev: &Device) -> Result<()> { 10.389, 3.6023, -4.2808, 0.2672, 5.3646, -5.2023, -2.1955, -9.4075 ] ); - if !dev.is_metal() { - let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?; - assert_eq!(res.dims(), [1, 2, 7, 7]); - assert_eq!( - test_utils::to_vec3_round(&res.i(0)?, 4)?, + + let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?; + + assert_eq!(res.dims(), [1, 2, 7, 7]); + assert_eq!( + test_utils::to_vec3_round(&res.i(0)?, 4)?, + [ [ - [ - [-1.9918, 2.6797, -0.4599, -1.6037, 1.4131, -2.4012, 2.9277], - [1.8016, -3.5361, 1.0757, 3.5395, -8.2168, -3.2023, 0.5375], - [0.8243, 1.8675, 7.8929, -4.0746, -6.4415, 5.1139, 1.6889], - [0.2722, 8.9679, 3.3477, 1.8514, -4.2896, -3.8228, -7.5632], - [-8.5412, -5.8142, -7.1587, -1.6095, 0.4651, 0.2748, -2.0985], - [2.0833, -0.6482, -12.1692, -4.1284, -2.9765, -0.0656, -4.5114], - [5.307, 2.6957, 2.3087, 1.0478, 0.7808, -1.1519, -0.9579] - ], - [ - [1.089, 0.1872, -0.6408, -0.9897, 0.8503, 1.1019, -0.9211], - [-0.1741, -0.2915, 4.2472, 1.9417, 1.65, 0.6303, -4.7131], - [1.6555, 2.4026, -2.9293, 2.9953, 0.5328, 3.5873, -0.9621], - [-1.4289, -3.2787, 4.1747, -6.0341, -4.6341, -5.7945, 4.142], - [7.5973, 6.4431, 5.9872, 2.1639, -8.6566, 3.3143, -3.4059], - [-0.8775, -3.048, 11.6543, 0.6442, 2.3218, -0.4765, 1.1516], - [-5.5423, -2.5188, 1.0754, -0.0563, -2.9386, -1.1504, 1.0171] - ] + [-1.9918, 2.6797, -0.4599, -1.6037, 1.4131, -2.4012, 2.9277], + [1.8016, -3.5361, 1.0757, 3.5395, -8.2168, -3.2023, 0.5375], + [0.8243, 1.8675, 7.8929, -4.0746, -6.4415, 5.1139, 1.6889], + [0.2722, 8.9679, 3.3477, 1.8514, -4.2896, -3.8228, -7.5632], + [-8.5412, -5.8142, -7.1587, 
-1.6095, 0.4651, 0.2748, -2.0985], + [2.0833, -0.6482, -12.1692, -4.1284, -2.9765, -0.0656, -4.5114], + [5.307, 2.6957, 2.3087, 1.0478, 0.7808, -1.1519, -0.9579] + ], + [ + [1.089, 0.1872, -0.6408, -0.9897, 0.8503, 1.1019, -0.9211], + [-0.1741, -0.2915, 4.2472, 1.9417, 1.65, 0.6303, -4.7131], + [1.6555, 2.4026, -2.9293, 2.9953, 0.5328, 3.5873, -0.9621], + [-1.4289, -3.2787, 4.1747, -6.0341, -4.6341, -5.7945, 4.142], + [7.5973, 6.4431, 5.9872, 2.1639, -8.6566, 3.3143, -3.4059], + [-0.8775, -3.048, 11.6543, 0.6442, 2.3218, -0.4765, 1.1516], + [-5.5423, -2.5188, 1.0754, -0.0563, -2.9386, -1.1504, 1.0171] ] - ); - } + ] + ); + // Dilations. let res = t.conv2d(&w, 0, 1, 2, 1)?; assert_eq!(res.dims(), [1, 2, 1, 1]); @@ -198,44 +199,37 @@ fn conv2d(dev: &Device) -> Result<()> { [2.45, -2.3504], ); - if !dev.is_metal() { - // Transpose and dilations. - let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 2)?; - assert_eq!(res.dims(), [1, 2, 9, 9]); - assert_eq!( - test_utils::to_vec3_round(&res.i(0)?, 4)?, + // Transpose and dilations. + let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 2)?; + assert_eq!(res.dims(), [1, 2, 9, 9]); + assert_eq!( + test_utils::to_vec3_round(&res.i(0)?, 4)?, + [ [ - [ - [-1.9918, 3.1652, -0.6778, -4.3442, 4.4351, 0.6652, -3.0124, -0.6031, 2.9277], - [2.7036, -1.7156, -0.3969, 1.0516, 1.6381, -2.8886, -0.205, 2.4682, -1.0499], - [-0.9459, 3.1631, 3.707, -4.8369, -8.5166, -1.4496, -2.7559, -3.2698, 1.4376], - [-0.2157, 3.7786, -2.0252, -4.2633, 3.6731, -1.5142, 5.9391, -0.2622, -0.141], - [-6.8121, -3.1744, 1.5945, 3.0637, -9.6088, 1.4446, 2.9489, -3.0082, -7.3822], - [0.2371, 3.3303, 0.3861, 2.2646, -4.6784, 4.1235, -0.0109, 0.3176, -0.03], - [ - -2.5339, -2.9564, -3.4518, -4.4594, -9.1873, -1.9709, -0.4676, 0.51, - -3.5024 - ], - [4.007, 0.3067, -2.2954, 1.1105, -0.1992, 1.6372, -2.9268, 0.2807, -1.2787], - [5.307, 1.1317, 1.3518, 0.9049, 3.8116, -0.4075, -0.8874, -0.2241, -0.9579] - ], - [ - [1.089, -0.6483, 0.0726, -0.4752, -1.3283, 1.7103, 1.0703, 0.1076, -0.9211], - [-0.8629, 0.1376, 0.3202, 2.0955, 0.9696, 2.8988, -1.0012, 1.5049, -0.1278], - [1.9286, -1.5255, -2.9563, 2.4589, 3.3611, -0.6951, 0.3525, -1.7724, -5.9861], - [1.1226, 2.1561, 3.6417, 4.7546, -0.692, 4.4126, -5.1902, 6.0805, 2.3185], - [1.0111, 0.3604, 0.6432, -3.6605, 7.9517, -9.2955, -5.2988, -3.7803, -2.0642], - [3.3172, -1.7967, -3.6576, -2.0942, 1.3158, 0.112, -1.7405, 2.9167, 0.7957], - [5.1001, 1.8995, -1.8639, 1.1262, 9.9629, 2.683, -3.6319, -1.1607, 0.5856], - [-4.8445, -0.5642, 4.2317, 0.0856, 1.2267, -0.5712, 1.736, 1.0997, 0.6908], - [ - -5.5423, -1.1831, -1.2176, 0.0843, 0.0446, -0.7545, -2.4798, -0.0827, - 1.0171 - ] - ] + [-1.9918, 3.1652, -0.6778, -4.3442, 4.4351, 0.6652, -3.0124, -0.6031, 2.9277], + [2.7036, -1.7156, -0.3969, 1.0516, 1.6381, -2.8886, -0.205, 2.4682, -1.0499], + [-0.9459, 3.1631, 3.707, -4.8369, -8.5166, -1.4496, -2.7559, -3.2698, 1.4376], + [-0.2157, 3.7786, -2.0252, -4.2633, 3.6731, -1.5142, 5.9391, -0.2622, -0.141], + [-6.8121, -3.1744, 1.5945, 3.0637, -9.6088, 1.4446, 2.9489, -3.0082, -7.3822], + [0.2371, 3.3303, 0.3861, 2.2646, -4.6784, 4.1235, -0.0109, 0.3176, -0.03], + [-2.5339, -2.9564, -3.4518, -4.4594, -9.1873, -1.9709, -0.4676, 0.51, -3.5024], + [4.007, 0.3067, -2.2954, 1.1105, -0.1992, 1.6372, -2.9268, 0.2807, -1.2787], + [5.307, 1.1317, 1.3518, 0.9049, 3.8116, -0.4075, -0.8874, -0.2241, -0.9579] + ], + [ + [1.089, -0.6483, 0.0726, -0.4752, -1.3283, 1.7103, 1.0703, 0.1076, -0.9211], + [-0.8629, 0.1376, 0.3202, 2.0955, 0.9696, 2.8988, 
-1.0012, 1.5049, -0.1278], + [1.9286, -1.5255, -2.9563, 2.4589, 3.3611, -0.6951, 0.3525, -1.7724, -5.9861], + [1.1226, 2.1561, 3.6417, 4.7546, -0.692, 4.4126, -5.1902, 6.0805, 2.3185], + [1.0111, 0.3604, 0.6432, -3.6605, 7.9517, -9.2955, -5.2988, -3.7803, -2.0642], + [3.3172, -1.7967, -3.6576, -2.0942, 1.3158, 0.112, -1.7405, 2.9167, 0.7957], + [5.1001, 1.8995, -1.8639, 1.1262, 9.9629, 2.683, -3.6319, -1.1607, 0.5856], + [-4.8445, -0.5642, 4.2317, 0.0856, 1.2267, -0.5712, 1.736, 1.0997, 0.6908], + [-5.5423, -1.1831, -1.2176, 0.0843, 0.0446, -0.7545, -2.4798, -0.0827, 1.0171] ] - ); - } + ] + ); + Ok(()) } @@ -290,11 +284,6 @@ fn conv2d_small(dev: &Device) -> Result<()> { ] ); - // conv-transposes are not implemented for metal - if dev.is_metal() { - return Ok(()); - } - let res = t.conv_transpose2d(&w.transpose(0, 1)?, 0, 0, 1, 1)?; assert_eq!(res.dims(), [1, 1, 3, 3]); assert_eq!( @@ -397,9 +386,6 @@ print(w.grad[0]) */ fn conv2d_grad(dev: &Device) -> Result<()> { // conv-transposes are not implemented for metal - if dev.is_metal() { - return Ok(()); - } use candle_core::Var; let t = Var::from_slice( &[ diff --git a/candle-metal-kernels/src/conv.metal b/candle-metal-kernels/src/conv.metal index a258ae58..e28ac6b3 100644 --- a/candle-metal-kernels/src/conv.metal +++ b/candle-metal-kernels/src/conv.metal @@ -405,6 +405,86 @@ kernel void FN_NAME( \ conv_transpose1d(l_out, stride, padding, out_padding, dilation, src_dims, src_strides, k_dims, k_strides, src, k, dst, tid); \ } \ +template +METAL_FUNC void conv_transpose2d( + constant size_t &w_out, + constant size_t &h_out, + constant size_t &stride, + constant size_t &padding, + constant size_t &out_padding, + constant size_t &dilation, + constant size_t *input_dims, + constant size_t *input_stride, + constant size_t *k_dims, + constant size_t *k_stride, + device const T *src, + device const T *k, + device T *dst, + uint tid [[ thread_position_in_grid ]] +) { + const size_t h_k = k_dims[2]; + const size_t w_k = k_dims[3]; + const size_t c_out = k_dims[1]; + const size_t c_in = input_dims[1]; + const size_t h_in = input_dims[2]; + const size_t w_in = input_dims[3]; + + if (tid >= input_dims[0] * c_out * w_out * h_out) { + return; + } + + const size_t b_idx = tid / (w_out * h_out * c_out); + const size_t dst_c_idx = (tid / (w_out * h_out)) % c_out; + const size_t out_y = (tid / w_out) % h_out; + const size_t out_x = tid % w_out; + + const size_t src_idx0 = b_idx * input_stride[0]; + + A d = 0; + for (int k_x = 0; k_x < (int)w_k; ++k_x) { + const int inp_x_stride = (int)(out_x + padding) - k_x * dilation; + if (inp_x_stride < 0 || inp_x_stride % stride) { + continue; + } + const int inp_x = inp_x_stride / stride; + if (inp_x >= w_in) continue; + for (int k_y = 0; k_y < (int)h_k; ++k_y) { + const int inp_y_stride = (int)(out_y + padding) - k_y * dilation; + if (inp_y_stride < 0 || inp_y_stride % stride) { + continue; + } + const int inp_y = inp_y_stride / stride; + if (inp_y >= h_in) continue; + for (size_t src_c_idx = 0; src_c_idx < c_in; ++src_c_idx) { + const size_t src_idx = src_idx0 + src_c_idx * input_stride[1] + inp_y * input_stride[2] + inp_x * input_stride[3]; + const size_t k_idx = src_c_idx * k_stride[0] + dst_c_idx * k_stride[1] + k_y * k_stride[2] + k_x * k_stride[3]; + d += static_cast(src[src_idx]) * static_cast(k[k_idx]); + } + } + } + dst[tid] = static_cast(d); +} + +#define CONVT2D_OP(TYPENAME, TYPEACC, FN_NAME) \ +kernel void FN_NAME( \ + constant size_t &w_out, \ + constant size_t &h_out, \ + constant size_t &stride, \ + 
constant size_t &padding, \ + constant size_t &out_padding, \ + constant size_t &dilation, \ + constant size_t *input_dims, \ + constant size_t *input_stride, \ + constant size_t *k_dims, \ + constant size_t *k_stride, \ + device const TYPENAME *src, \ + device const TYPENAME *k, \ + device TYPENAME *dst, \ + uint tid [[ thread_position_in_grid ]] \ +) { \ + conv_transpose2d(w_out, h_out, stride, padding, out_padding, dilation, input_dims, input_stride, k_dims, k_stride, src, k, dst, tid); \ +} \ + IM2COL_OP(float, im2col_f32) IM2COL_OP(uint8_t, im2col_u8) IM2COL_OP(uint32_t, im2col_u32) @@ -439,4 +519,10 @@ CONVT1D_OP(uint8_t, uint8_t, conv_transpose1d_u8) CONVT1D_OP(uint32_t, uint32_t, conv_transpose1d_u32) #if defined(__HAVE_BFLOAT__) CONVT1D_OP(bfloat, float, conv_transpose1d_bf16) +#endif + +CONVT2D_OP(float, float, conv_transpose2d_f32) +CONVT2D_OP(half, float, conv_transpose2d_f16) +#if defined(__HAVE_BFLOAT__) +CONVT1D_OP(bfloat, float, conv_transpose2d_bf16) #endif \ No newline at end of file diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index bab44a05..f2c9c7fe 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -1970,5 +1970,63 @@ pub fn call_conv_transpose1d( Ok(()) } +pub struct CallConvTranspose2dCfg<'a> { + pub dilation: usize, + pub stride: usize, + pub padding: usize, + pub output_padding: usize, + pub c_out: usize, + pub out_w: usize, + pub out_h: usize, + pub b_size: usize, + pub input_dims: &'a [usize], + pub input_stride: &'a [usize], + pub kernel_dims: &'a [usize], + pub kernel_stride: &'a [usize], + pub input_offset: usize, + pub kernel_offset: usize, +} + +pub fn call_conv_transpose2d( + device: &Device, + command_buffer: &CommandBufferRef, + kernels: &Kernels, + name: &'static str, + cfg: CallConvTranspose2dCfg, + input: &Buffer, + kernel: &Buffer, + output: &Buffer, +) -> Result<(), MetalKernelError> { + let dst_el = cfg.c_out * cfg.out_w * cfg.out_h * cfg.b_size; + let pipeline: ComputePipelineState = kernels.load_pipeline(device, Source::Conv, name)?; + let (thread_group_count, thread_group_size) = linear_split(&pipeline, dst_el); + let encoder = command_buffer.new_compute_command_encoder(); + encoder.set_compute_pipeline_state(&pipeline); + set_params!( + encoder, + ( + cfg.out_w, + cfg.out_h, + cfg.stride, + cfg.padding, + cfg.output_padding, + cfg.dilation, + cfg.input_dims, + cfg.input_stride, + cfg.kernel_dims, + cfg.kernel_stride, + (input, cfg.input_offset), + (kernel, cfg.kernel_offset), + output + ) + ); + encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(kernel, metal::MTLResourceUsage::Read); + encoder.use_resource(output, metal::MTLResourceUsage::Write); + encoder.dispatch_thread_groups(thread_group_count, thread_group_size); + encoder.end_encoding(); + Ok(()) +} + #[cfg(test)] mod tests; From c0bdd9c7a613682ed1f2e7010374bb03621c4153 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 21 Mar 2024 18:49:35 +0100 Subject: [PATCH 044/131] Use the fast RmsNorm in the quantized model. 
(#1904) --- .../src/models/quantized_llama.rs | 35 +++++-------------- .../src/models/quantized_mistral.rs | 1 + candle-transformers/src/quantized_nn.rs | 20 ++++++----- 3 files changed, 21 insertions(+), 35 deletions(-) diff --git a/candle-transformers/src/models/quantized_llama.rs b/candle-transformers/src/models/quantized_llama.rs index 5ce2de59..ee50c092 100644 --- a/candle-transformers/src/models/quantized_llama.rs +++ b/candle-transformers/src/models/quantized_llama.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; +use crate::quantized_nn::RmsNorm; use candle::quantized::QTensor; use candle::quantized::{ggml_file, gguf_file}; use candle::{DType, Device, IndexOp, Result, Tensor, D}; @@ -7,26 +8,6 @@ use candle_nn::{Embedding, Module}; pub const MAX_SEQ_LEN: usize = 4096; -#[derive(Debug, Clone)] -struct RmsNorm { - inner: candle_nn::LayerNorm, - span: tracing::Span, -} - -impl RmsNorm { - fn new(scale: QTensor, eps: f32) -> Result { - let span = tracing::span!(tracing::Level::TRACE, "rms-norm"); - let scale = scale.dequantize(&scale.device())?; - let inner = candle_nn::LayerNorm::rms_norm(scale, eps as f64); - Ok(Self { inner, span }) - } - - fn forward(&self, x: &Tensor) -> Result { - let _enter = self.span.enter(); - self.inner.forward(x) - } -} - // QMatMul wrapper adding some tracing. #[derive(Debug, Clone)] struct QMatMul { @@ -301,7 +282,7 @@ impl ModelWeights { let neg_inf = Tensor::new(f32::NEG_INFINITY, &ct.device)?; let tok_embeddings = ct.remove("tok_embeddings.weight")?; let tok_embeddings = tok_embeddings.dequantize(&ct.device)?; - let norm = RmsNorm::new(ct.remove("norm.weight")?, 1e-5)?; + let norm = RmsNorm::from_qtensor(ct.remove("norm.weight")?, 1e-5)?; let output = ct.remove("output.weight")?; let mut layers = Vec::with_capacity(ct.hparams.n_layer as usize); for layer_idx in 0..ct.hparams.n_layer { @@ -330,9 +311,9 @@ impl ModelWeights { attention_wk: QMatMul::from_qtensor(attention_wk)?, attention_wv: QMatMul::from_qtensor(attention_wv)?, attention_wo: QMatMul::from_qtensor(attention_wo)?, - attention_norm: RmsNorm::new(attention_norm, 1e-5)?, + attention_norm: RmsNorm::from_qtensor(attention_norm, 1e-5)?, mlp_or_moe, - ffn_norm: RmsNorm::new(ffn_norm, 1e-5)?, + ffn_norm: RmsNorm::from_qtensor(ffn_norm, 1e-5)?, n_head: ct.hparams.n_head as usize, n_kv_head: ct.hparams.n_head as usize / gqa, head_dim: (ct.hparams.n_embd / ct.hparams.n_head) as usize, @@ -381,7 +362,7 @@ impl ModelWeights { let embedding_length = md_get("llama.embedding_length")?.to_u32()? as usize; let rope_dim = md_get("llama.rope.dimension_count")?.to_u32()? as usize; // Strangely this value is generally 1e-6 in GGUF file but used to be 1e-5 by default. 
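The fused operator these RmsNorm::from_qtensor call sites ultimately dispatch to is candle_nn::ops::rms_norm, backed by the metal kernel added earlier in this series. A small sketch of calling it directly, assuming the usual candle crate alias for candle-core and purely illustrative shapes:

    use candle::{DType, Device, Result, Tensor};

    fn main() -> Result<()> {
        // A (batch, hidden) activation and a per-channel scale.
        let xs = Tensor::randn(0f32, 1., (2, 8), &Device::Cpu)?;
        let alpha = Tensor::ones(8, DType::F32, &Device::Cpu)?;
        // Fused path; on metal both inputs must be contiguous.
        let ys = candle_nn::ops::rms_norm(&xs, &alpha, 1e-5)?;
        println!("{ys}");
        Ok(())
    }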
- let rms_norm_eps = md_get("llama.attention.layer_norm_rms_epsilon")?.to_f32()?; + let rms_norm_eps = md_get("llama.attention.layer_norm_rms_epsilon")?.to_f64()?; let rope_freq_base = md_get("llama.rope.freq_base") .and_then(|m| m.to_f32()) @@ -391,7 +372,7 @@ impl ModelWeights { let tok_embeddings = ct.tensor(reader, "token_embd.weight", device)?; let tok_embeddings = tok_embeddings.dequantize(device)?; - let norm = RmsNorm::new( + let norm = RmsNorm::from_qtensor( ct.tensor(reader, "output_norm.weight", device)?, rms_norm_eps, )?; @@ -450,9 +431,9 @@ impl ModelWeights { attention_wk: QMatMul::from_qtensor(attention_wk)?, attention_wv: QMatMul::from_qtensor(attention_wv)?, attention_wo: QMatMul::from_qtensor(attention_wo)?, - attention_norm: RmsNorm::new(attention_norm, rms_norm_eps)?, + attention_norm: RmsNorm::from_qtensor(attention_norm, rms_norm_eps)?, mlp_or_moe, - ffn_norm: RmsNorm::new(ffn_norm, rms_norm_eps)?, + ffn_norm: RmsNorm::from_qtensor(ffn_norm, rms_norm_eps)?, n_head: head_count, n_kv_head: head_count_kv, head_dim: embedding_length / head_count, diff --git a/candle-transformers/src/models/quantized_mistral.rs b/candle-transformers/src/models/quantized_mistral.rs index f2cb3b27..77de7b75 100644 --- a/candle-transformers/src/models/quantized_mistral.rs +++ b/candle-transformers/src/models/quantized_mistral.rs @@ -327,6 +327,7 @@ impl Model { xs = layer.forward(&xs, attention_mask.as_ref(), seqlen_offset)? } xs.narrow(1, seq_len - 1, 1)? + .contiguous()? .apply(&self.norm)? .apply(&self.lm_head) } diff --git a/candle-transformers/src/quantized_nn.rs b/candle-transformers/src/quantized_nn.rs index bb0a8641..9298b80e 100644 --- a/candle-transformers/src/quantized_nn.rs +++ b/candle-transformers/src/quantized_nn.rs @@ -1,5 +1,6 @@ use crate::models::with_tracing::QMatMul; use crate::quantized_var_builder::VarBuilder; +use candle::quantized::QTensor; use candle::{Module, Result, Tensor}; #[derive(Debug, Clone)] @@ -35,10 +36,7 @@ pub struct Linear { } impl Linear { - pub fn from_arc( - weight: std::sync::Arc, - bias: Option, - ) -> Result { + pub fn from_arc(weight: std::sync::Arc, bias: Option) -> Result { let weight = QMatMul::from_weights(weight)?; Ok(Self { weight, bias }) } @@ -95,7 +93,8 @@ pub fn linear_no_bias(in_dim: usize, out_dim: usize, vb: VarBuilder) -> Result Result { let span = tracing::span!(tracing::Level::TRACE, "rms-norm"); let weight = vb.get(size, "weight")?.dequantize(vb.device())?; - let inner = candle_nn::RmsNorm::new(weight, eps); - Ok(Self { inner, span }) + Ok(Self { weight, eps, span }) + } + + pub fn from_qtensor(weight: QTensor, eps: f64) -> Result { + let span = tracing::span!(tracing::Level::TRACE, "rms-norm"); + let weight = weight.dequantize(&weight.device())?; + Ok(Self { weight, eps, span }) } } impl Module for RmsNorm { fn forward(&self, x: &Tensor) -> Result { let _enter = self.span.enter(); - self.inner.forward(x) + candle_nn::ops::rms_norm(x, &self.weight, self.eps as f32) } } From c07e4057ab18c51d73c740a5950c02fd19d56cde Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 21 Mar 2024 19:36:10 +0100 Subject: [PATCH 045/131] Fix for the llama model. 
(#1906)

---
 candle-transformers/src/models/llama.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/candle-transformers/src/models/llama.rs b/candle-transformers/src/models/llama.rs
index 400351f3..c311d4c4 100644
--- a/candle-transformers/src/models/llama.rs
+++ b/candle-transformers/src/models/llama.rs
@@ -390,7 +390,7 @@ impl Llama {
             x = block.forward(&x, index_pos, block_idx, cache)?;
         }
         let x = self.ln_f.forward(&x)?;
-        let x = x.i((.., seq_len - 1, ..))?;
+        let x = x.i((.., seq_len - 1, ..))?.contiguous()?;
         let logits = self.lm_head.forward(&x)?;
         logits.to_dtype(DType::F32)
     }

From a00e24d752f3f62978c878859a01a4246244d4bc Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Thu, 21 Mar 2024 21:08:07 +0100
Subject: [PATCH 046/131] Improve the error message on overlong prompts. (#1908)

---
 candle-examples/examples/stable-diffusion/main.rs | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/candle-examples/examples/stable-diffusion/main.rs b/candle-examples/examples/stable-diffusion/main.rs
index 14642e9a..0e39902b 100644
--- a/candle-examples/examples/stable-diffusion/main.rs
+++ b/candle-examples/examples/stable-diffusion/main.rs
@@ -292,6 +292,13 @@ fn text_embeddings(
         .map_err(E::msg)?
         .get_ids()
         .to_vec();
+    if tokens.len() > sd_config.clip.max_position_embeddings {
+        anyhow::bail!(
+            "the prompt is too long, {} > max-tokens ({})",
+            tokens.len(),
+            sd_config.clip.max_position_embeddings
+        )
+    }
     while tokens.len() < sd_config.clip.max_position_embeddings {
         tokens.push(pad_id)
     }
@@ -319,6 +326,13 @@ fn text_embeddings(
             .map_err(E::msg)?
             .get_ids()
             .to_vec();
+        if uncond_tokens.len() > sd_config.clip.max_position_embeddings {
+            anyhow::bail!(
+                "the negative prompt is too long, {} > max-tokens ({})",
+                uncond_tokens.len(),
+                sd_config.clip.max_position_embeddings
+            )
+        }
         while uncond_tokens.len() < sd_config.clip.max_position_embeddings {
             uncond_tokens.push(pad_id)
         }

From 6708870e633af636660c556c19703c38cbe2af8d Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Fri, 22 Mar 2024 07:25:23 +0100
Subject: [PATCH 047/131] Add the alloc_uninit function. (#1901)

* Add the alloc_uninit function.

* Dummy metal fix.

* Lazy initialization.
---
 candle-core/src/backend.rs             |  6 +++
 candle-core/src/cpu_backend.rs         | 61 ++++++++++++++++++++++++--
 candle-core/src/cuda_backend.rs        | 56 ++++++++++++++++++++---
 candle-core/src/device.rs              | 17 +++++++
 candle-core/src/dummy_cuda_backend.rs  |  4 ++
 candle-core/src/dummy_metal_backend.rs |  4 ++
 candle-core/src/metal_backend.rs       | 10 +++++
 candle-core/src/tensor.rs              |  8 ++--
 candle-core/src/tensor_cat.rs          |  4 +-
 9 files changed, 154 insertions(+), 16 deletions(-)

diff --git a/candle-core/src/backend.rs b/candle-core/src/backend.rs
index c63aad54..27ffe934 100644
--- a/candle-core/src/backend.rs
+++ b/candle-core/src/backend.rs
@@ -127,6 +127,12 @@ pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
 
     fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<Self::Storage>;
 
+    /// # Safety
+    /// This function is unsafe as it doesn't initialize the underlying data store.
+    /// The caller should ensure that the data is properly initialized as early as possible
+    /// after this call.
+ unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result; + fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result; fn storage_from_cpu_storage_owned(&self, _: CpuStorage) -> Result; diff --git a/candle-core/src/cpu_backend.rs b/candle-core/src/cpu_backend.rs index fa48577c..6d2ba361 100644 --- a/candle-core/src/cpu_backend.rs +++ b/candle-core/src/cpu_backend.rs @@ -2582,7 +2582,10 @@ impl BackendStorage for CpuStorage { col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)? } else { // Make the kernel contiguous if not already the case. - let mut kernel_c = self.device().zeros_impl(kernel_l.shape(), kernel.dtype())?; + let mut kernel_c = unsafe { + self.device() + .alloc_uninit(kernel_l.shape(), kernel.dtype())? + }; kernel.copy_strided_src(&mut kernel_c, 0, kernel_l)?; let kernel_l = Layout::contiguous_with_offset((1, n, k), kernel_l.start_offset()) .transpose(1, 2)? @@ -2590,7 +2593,7 @@ impl BackendStorage for CpuStorage { col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)? }; let res_l = Layout::contiguous((b, l_out, params.c_out)).transpose(1, 2)?; - let mut res_t = self.device().zeros_impl(res_l.shape(), res.dtype())?; + let mut res_t = unsafe { self.device().alloc_uninit(res_l.shape(), res.dtype())? }; res.copy_strided_src(&mut res_t, 0, &res_l)?; Ok(res_t) } @@ -2681,7 +2684,10 @@ impl BackendStorage for CpuStorage { col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)? } else { // Make the kernel contiguous if not already the case. - let mut kernel_c = self.device().zeros_impl(kernel_l.shape(), kernel.dtype())?; + let mut kernel_c = unsafe { + self.device() + .alloc_uninit(kernel_l.shape(), kernel.dtype())? + }; kernel.copy_strided_src(&mut kernel_c, 0, kernel_l)?; let kernel_l = Layout::contiguous_with_offset((1, n, k), kernel_l.start_offset()) .transpose(1, 2)? @@ -2691,7 +2697,7 @@ impl BackendStorage for CpuStorage { let res_l = Layout::contiguous((b, h_out, w_out, params.c_out)) .transpose(1, 2)? .transpose(1, 3)?; - let mut res_t = self.device().zeros_impl(res_l.shape(), res.dtype())?; + let mut res_t = unsafe { self.device().alloc_uninit(res_l.shape(), res.dtype())? }; res.copy_strided_src(&mut res_t, 0, &res_l)?; Ok(res_t) } @@ -2919,6 +2925,53 @@ impl BackendDevice for CpuDevice { } } + #[allow(clippy::uninit_vec)] + unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result { + let elem_count = shape.elem_count(); + // The code below is highly unsafe but hopefully not directly unsound as we only consider + // types that are Copy, not Drop, and for which all bit patterns are proper values. 
+ // It's still pretty risky, see the following for more details: + // https://github.com/rust-lang/rust-clippy/issues/4483 + let storage = match dtype { + DType::U8 => { + let mut v = Vec::with_capacity(elem_count); + v.set_len(elem_count); + CpuStorage::U8(v) + } + DType::U32 => { + let mut v = Vec::with_capacity(elem_count); + v.set_len(elem_count); + CpuStorage::U32(v) + } + DType::I64 => { + let mut v = Vec::with_capacity(elem_count); + v.set_len(elem_count); + CpuStorage::I64(v) + } + DType::BF16 => { + let mut v = Vec::with_capacity(elem_count); + v.set_len(elem_count); + CpuStorage::BF16(v) + } + DType::F16 => { + let mut v = Vec::with_capacity(elem_count); + v.set_len(elem_count); + CpuStorage::F16(v) + } + DType::F32 => { + let mut v = Vec::with_capacity(elem_count); + v.set_len(elem_count); + CpuStorage::F32(v) + } + DType::F64 => { + let mut v = Vec::with_capacity(elem_count); + v.set_len(elem_count); + CpuStorage::F64(v) + } + }; + Ok(storage) + } + fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result { let elem_count = shape.elem_count(); let storage = match dtype { diff --git a/candle-core/src/cuda_backend.rs b/candle-core/src/cuda_backend.rs index fec37c39..f0f03053 100644 --- a/candle-core/src/cuda_backend.rs +++ b/candle-core/src/cuda_backend.rs @@ -384,6 +384,44 @@ impl BackendDevice for CudaDevice { self.const_impl(1., shape, dtype) } + unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result { + let elem_count = shape.elem_count(); + let slice = match dtype { + DType::U8 => { + let data = self.alloc::(elem_count).w()?; + CudaStorageSlice::U8(data) + } + DType::U32 => { + let data = self.alloc::(elem_count).w()?; + CudaStorageSlice::U32(data) + } + DType::I64 => { + let data = self.alloc::(elem_count).w()?; + CudaStorageSlice::I64(data) + } + DType::BF16 => { + let data = self.alloc::(elem_count).w()?; + CudaStorageSlice::BF16(data) + } + DType::F16 => { + let data = self.alloc::(elem_count).w()?; + CudaStorageSlice::F16(data) + } + DType::F32 => { + let data = self.alloc::(elem_count).w()?; + CudaStorageSlice::F32(data) + } + DType::F64 => { + let data = self.alloc::(elem_count).w()?; + CudaStorageSlice::F64(data) + } + }; + Ok(CudaStorage { + slice, + device: self.clone(), + }) + } + fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result { let slice = match storage { CpuStorage::U8(storage) => { @@ -1916,7 +1954,10 @@ impl BackendStorage for CudaStorage { col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)? } else { // Make the kernel contiguous if not already the case. - let mut kernel_c = self.device().zeros_impl(kernel_l.shape(), kernel.dtype())?; + let mut kernel_c = unsafe { + self.device() + .alloc_uninit(kernel_l.shape(), kernel.dtype())? + }; kernel.copy_strided_src(&mut kernel_c, 0, kernel_l)?; let kernel_l = Layout::contiguous_with_offset((1, n, k), kernel_l.start_offset()) .transpose(1, 2)? @@ -1924,7 +1965,7 @@ impl BackendStorage for CudaStorage { col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)? }; let res_l = Layout::contiguous((b, l_out, n)).transpose(1, 2)?; - let mut res_t = self.device().zeros_impl(res_l.shape(), res.dtype())?; + let mut res_t = unsafe { self.device().alloc_uninit(res_l.shape(), res.dtype())? }; res.copy_strided_src(&mut res_t, 0, &res_l)?; Ok(res_t) } @@ -1981,7 +2022,10 @@ impl BackendStorage for CudaStorage { col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)? } else { // Make the kernel contiguous if not already the case. 
- let mut kernel_c = self.device().zeros_impl(kernel_l.shape(), kernel.dtype())?; + let mut kernel_c = unsafe { + self.device() + .alloc_uninit(kernel_l.shape(), kernel.dtype())? + }; kernel.copy_strided_src(&mut kernel_c, 0, kernel_l)?; let kernel_l = Layout::contiguous_with_offset((1, n, k), kernel_l.start_offset()) .transpose(1, 2)? @@ -1991,7 +2035,7 @@ impl BackendStorage for CudaStorage { let res_l = Layout::contiguous((b, h_out, w_out, n)) .transpose(1, 2)? .transpose(1, 3)?; - let mut res_t = self.device().zeros_impl(res_l.shape(), res.dtype())?; + let mut res_t = unsafe { self.device().alloc_uninit(res_l.shape(), res.dtype())? }; res.copy_strided_src(&mut res_t, 0, &res_l)?; Ok(res_t) } @@ -2128,7 +2172,7 @@ impl BackendStorage for CudaStorage { dim: usize, ) -> Result { let device = self.device().clone(); - let mut acc = device.zeros_impl(l.shape(), self.dtype())?; + let mut acc = unsafe { device.alloc_uninit(l.shape(), self.dtype())? }; self.copy_strided_src(&mut acc, 0, l)?; ScatterAdd(ids, ids_l, dim).map(&mut acc.slice, l.shape(), &src.slice, src_l, &device)?; Ok(acc) @@ -2143,7 +2187,7 @@ impl BackendStorage for CudaStorage { dim: usize, ) -> Result { let device = self.device().clone(); - let mut acc = device.zeros_impl(l.shape(), self.dtype())?; + let mut acc = unsafe { device.alloc_uninit(l.shape(), self.dtype())? }; self.copy_strided_src(&mut acc, 0, l)?; IndexAdd(ids, ids_l, dim).map(&mut acc.slice, l.shape(), &src.slice, src_l, &device)?; Ok(acc) diff --git a/candle-core/src/device.rs b/candle-core/src/device.rs index 9c39d27a..846c62ce 100644 --- a/candle-core/src/device.rs +++ b/candle-core/src/device.rs @@ -289,6 +289,23 @@ impl Device { } } + pub(crate) unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result { + match self { + Device::Cpu => { + let storage = CpuDevice.alloc_uninit(shape, dtype)?; + Ok(Storage::Cpu(storage)) + } + Device::Cuda(device) => { + let storage = device.alloc_uninit(shape, dtype)?; + Ok(Storage::Cuda(storage)) + } + Device::Metal(device) => { + let storage = device.alloc_uninit(shape, dtype)?; + Ok(Storage::Metal(storage)) + } + } + } + pub(crate) fn storage(&self, array: A) -> Result { match self { Device::Cpu => Ok(Storage::Cpu(array.to_cpu_storage())), diff --git a/candle-core/src/dummy_cuda_backend.rs b/candle-core/src/dummy_cuda_backend.rs index d4887f19..5348233c 100644 --- a/candle-core/src/dummy_cuda_backend.rs +++ b/candle-core/src/dummy_cuda_backend.rs @@ -210,6 +210,10 @@ impl crate::backend::BackendDevice for CudaDevice { Err(Error::NotCompiledWithCudaSupport) } + unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result { + Err(Error::NotCompiledWithCudaSupport) + } + fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result { Err(Error::NotCompiledWithCudaSupport) } diff --git a/candle-core/src/dummy_metal_backend.rs b/candle-core/src/dummy_metal_backend.rs index 33c6c9fe..322f81d2 100644 --- a/candle-core/src/dummy_metal_backend.rs +++ b/candle-core/src/dummy_metal_backend.rs @@ -222,6 +222,10 @@ impl crate::backend::BackendDevice for MetalDevice { Err(Error::NotCompiledWithMetalSupport) } + unsafe fn alloc_uninit(&self, _shape: &Shape, _dtype: DType) -> Result { + Err(Error::NotCompiledWithMetalSupport) + } + fn storage_from_cpu_storage(&self, _: &CpuStorage) -> Result { Err(Error::NotCompiledWithMetalSupport) } diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index 4f4162e2..ef044fc8 100644 --- a/candle-core/src/metal_backend.rs +++ 
b/candle-core/src/metal_backend.rs @@ -1886,6 +1886,16 @@ impl BackendDevice for MetalDevice { self.device.registry_id() == rhs.device.registry_id() } + unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result { + let buffer = self.new_buffer(shape.elem_count(), dtype, "alloc-uninit")?; + Ok(MetalStorage::new( + buffer, + self.clone(), + shape.elem_count(), + dtype, + )) + } + fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result { let size = shape.elem_count() * dtype.size_in_bytes(); let buffer = self.allocate_zeros(size)?; diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs index d7c2ed66..6b5aed96 100644 --- a/candle-core/src/tensor.rs +++ b/candle-core/src/tensor.rs @@ -1349,7 +1349,7 @@ impl Tensor { } .bt())? } - let mut storage = self.device().zeros(self.shape(), self.dtype())?; + let mut storage = unsafe { self.device().alloc_uninit(self.shape(), self.dtype())? }; self.storage() .copy_strided_src(&mut storage, 0, self.layout())?; let offset = start * src.dims()[1..].iter().product::(); @@ -1999,7 +1999,7 @@ impl Tensor { Ok(self.clone()) } else { let shape = self.shape(); - let mut storage = self.device().zeros(shape, self.dtype())?; + let mut storage = unsafe { self.device().alloc_uninit(shape, self.dtype())? }; self.storage() .copy_strided_src(&mut storage, 0, self.layout())?; let op = BackpropOp::new1(self, Op::Copy); @@ -2011,7 +2011,7 @@ impl Tensor { /// copied. pub(crate) fn make_var(&self) -> Result { let shape = self.shape().clone(); - let mut storage = self.device().zeros(&shape, self.dtype())?; + let mut storage = unsafe { self.device().alloc_uninit(&shape, self.dtype())? }; self.storage() .copy_strided_src(&mut storage, 0, self.layout())?; Ok(from_storage(storage, shape, BackpropOp::none(), true)) @@ -2064,7 +2064,7 @@ impl Tensor { }; Ok(Tensor(Arc::new(tensor_))) } else { - let mut storage = self.device().zeros(&shape, self.dtype())?; + let mut storage = unsafe { self.device().alloc_uninit(&shape, self.dtype())? }; self.storage() .copy_strided_src(&mut storage, 0, self.layout())?; Ok(from_storage(storage, shape, op, false)) diff --git a/candle-core/src/tensor_cat.rs b/candle-core/src/tensor_cat.rs index 25acc80e..31cc8503 100644 --- a/candle-core/src/tensor_cat.rs +++ b/candle-core/src/tensor_cat.rs @@ -141,7 +141,7 @@ impl Tensor { } let shape = Shape::from(cat_dims); let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, 0)); - let mut storage = device.zeros(&shape, dtype)?; + let mut storage = unsafe { device.alloc_uninit(&shape, dtype)? }; for (arg, &offset) in args.iter().zip(offsets.iter()) { let arg = arg.as_ref(); arg.storage() @@ -215,7 +215,7 @@ impl Tensor { let block_size: usize = cat_dims.iter().skip(1 + dim).product(); let shape = Shape::from(cat_dims); let op = crate::op::BackpropOp::new(args, |args| crate::op::Op::Cat(args, dim)); - let mut storage = device.zeros(&shape, dtype)?; + let mut storage = unsafe { device.alloc_uninit(&shape, dtype)? }; let mut dst_o = 0; for arg in args.iter() { let arg = arg.as_ref(); From fee33b45c2b635d83fa2ca0955ae453fe26374ea Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Fri, 22 Mar 2024 02:30:02 -0400 Subject: [PATCH 048/131] Add support for strided index-select on Metal (#1909) * initial implementation * use correct index, but still not breaking like it should have... 
* fix test --- candle-core/src/metal_backend.rs | 18 +++--- candle-metal-kernels/src/indexing.metal | 41 ++++++++++--- candle-metal-kernels/src/lib.rs | 12 +++- candle-metal-kernels/src/tests.rs | 81 +++++++++++++++++++++++-- 4 files changed, 129 insertions(+), 23 deletions(-) diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index ef044fc8..73a141ea 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -2,9 +2,8 @@ use crate::backend::{BackendDevice, BackendStorage}; use crate::conv::{ParamsConv1D, ParamsConv2D, ParamsConvTranspose1D, ParamsConvTranspose2D}; use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT}; use crate::{CpuStorage, DType, Layout, Result, Shape}; +use candle_metal_kernels::CallConvTranspose2dCfg; use candle_metal_kernels::Kernels; -use candle_metal_kernels::{self, CallConvTranspose2dCfg}; -use metal; use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger}; use std::collections::HashMap; use std::ffi::c_void; @@ -1348,12 +1347,8 @@ impl BackendStorage for MetalStorage { } fn index_select(&self, ids: &Self, src_l: &Layout, ids_l: &Layout, dim: usize) -> Result { - if !(src_l.is_contiguous() - && src_l.start_offset() == 0 - && ids_l.is_contiguous() - && ids_l.start_offset() == 0) - { - crate::bail!("Metal strided index_select not implemented"); + if !ids_l.is_contiguous() { + crate::bail!("Metal index_select requires contiguous ids") } let left_size: usize = src_l.dims()[..dim].iter().product(); let right_size: usize = src_l.dims()[dim + 1..].iter().product(); @@ -1364,6 +1359,8 @@ impl BackendStorage for MetalStorage { let buffer = device.new_buffer(dst_el, dtype, "index_select")?; let name = match (ids.dtype, self.dtype) { (DType::U8, DType::BF16) => "is_u8_bf16", + (DType::U8, DType::F32) => "is_u8_f32", + (DType::U8, DType::F16) => "is_u8_f16", (DType::U32, DType::F32) => "is_u32_f32", (DType::U32, DType::F16) => "is_u32_f16", @@ -1382,8 +1379,13 @@ impl BackendStorage for MetalStorage { src_l.dims(), ids_el, dim, + src_l.is_contiguous(), + src_l.dims(), + src_l.stride(), &self.buffer, + src_l.start_offset() * dtype.size_in_bytes(), &ids.buffer, + ids_l.start_offset() * ids.dtype.size_in_bytes(), &buffer, ) .map_err(MetalError::from)?; diff --git a/candle-metal-kernels/src/indexing.metal b/candle-metal-kernels/src/indexing.metal index 65491759..ad4a8605 100644 --- a/candle-metal-kernels/src/indexing.metal +++ b/candle-metal-kernels/src/indexing.metal @@ -1,20 +1,38 @@ #include using namespace metal; +METAL_FUNC uint get_strided_index( + uint idx, + constant size_t &num_dims, + constant size_t *dims, + constant size_t *strides +) { + uint strided_i = 0; + for (uint d = 0; d < num_dims; d++) { + uint dim_idx = num_dims - 1 - d; + strided_i += (idx % dims[dim_idx]) * strides[dim_idx]; + idx /= dims[dim_idx]; + } + return strided_i; +} + template METAL_FUNC void index( constant size_t &dst_size, constant size_t &left_size, constant size_t &src_dim_size, constant size_t &right_size, - constant size_t &ids_size, - const device TYPENAME *input, + constant size_t &ids_size, + constant bool &contiguous, + constant size_t *src_dims, + constant size_t *src_strides, + const device TYPENAME *input, const device INDEX_TYPENAME *input_ids, device TYPENAME *output, uint tid [[ thread_position_in_grid ]] ) { if (tid >= dst_size) { - return; + return; } const size_t id_i = (tid / right_size) % ids_size; const INDEX_TYPENAME input_i = min(input_ids[id_i], (INDEX_TYPENAME)(src_dim_size - 1)); @@ 
-26,7 +44,8 @@ METAL_FUNC void index( // No need to check for zero we're only allowing unsized. */ const size_t src_i = left_rank_i * src_dim_size * right_size + input_i * right_size + right_rank_i; - output[tid] = input[src_i]; + const size_t strided_src_i = contiguous ? src_i : get_strided_index(src_i, src_dim_size, src_dims, src_strides); + output[tid] = input[strided_src_i]; } # define INDEX_OP(NAME, INDEX_TYPENAME, TYPENAME) \ @@ -36,12 +55,15 @@ kernel void NAME( \ constant size_t &src_dim_size, \ constant size_t &right_size, \ constant size_t &ids_size, \ + constant bool &contiguous, \ + constant size_t *src_dims, \ + constant size_t *src_strides, \ const device TYPENAME *input, \ const device INDEX_TYPENAME *input_ids, \ device TYPENAME *output, \ uint tid [[ thread_position_in_grid ]] \ ) { \ - index(dst_size, left_size, src_dim_size, right_size, ids_size, input, input_ids, output, tid); \ + index(dst_size, left_size, src_dim_size, right_size, ids_size, contiguous, src_dims, src_strides, input, input_ids, output, tid); \ } @@ -165,10 +187,15 @@ kernel void NAME( \ } -INDEX_OP(is_u32_f32, uint, float) -INDEX_OP(is_u32_f16, uint, half) +INDEX_OP(is_u32_f32, uint32_t, float) +INDEX_OP(is_u32_f16, uint32_t, half) #if defined(__HAVE_BFLOAT__) INDEX_OP(is_u32_bf16, uint32_t, bfloat) +#endif + +INDEX_OP(is_u8_f32, uint8_t, float) +INDEX_OP(is_u8_f16, uint8_t, half) +#if defined(__HAVE_BFLOAT__) INDEX_OP(is_u8_bf16, uint8_t, bfloat) #endif diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index f2c9c7fe..e17365a0 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -1067,8 +1067,13 @@ pub fn call_index_select( shape: &[usize], ids_size: usize, dim: usize, + contiguous: bool, + src_dims: &[usize], + src_strides: &[usize], input: &Buffer, + src_offset: usize, ids: &Buffer, + ids_offset: usize, output: &Buffer, ) -> Result<(), MetalKernelError> { let left_size: usize = shape[..dim].iter().product(); @@ -1090,8 +1095,11 @@ pub fn call_index_select( src_dim_size, right_size, ids_size, - input, - ids, + contiguous, + src_dims, + src_strides, + (input, src_offset), + (ids, ids_offset), output ) ); diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index 5045a4a3..b15d9b36 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -600,22 +600,35 @@ fn affine_strided() { fn index_select() { let embedding = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]; let shape = [5, 2]; + let stride = [2, 1]; let ids = [0u32, 4, 2]; let dim = 0; - let result = run_index_select(&embedding, &shape, &ids, dim, "is_u32_f32"); + let result = run_index_select(&embedding, &shape, &stride, &ids, dim, "is_u32_f32"); assert_eq!(result, vec![1.0f32, 2.0, 9.0, 10.0, 5.0, 6.0]); let embedding = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]; let shape = [2, 5]; + let stride = [1, 2]; let ids = [0u32, 1, 0]; let dim = 0; - let result = run_index_select(&embedding, &shape, &ids, dim, "is_u32_f32"); + let result = run_index_select(&embedding, &shape, &stride, &ids, dim, "is_u32_f32"); assert_eq!( result, vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 1.0f32, 2.0, 3.0, 4.0, 5.0] ); } +#[test] +fn index_select_strided() { + let embedding = (0..16).map(|x| x as f32).collect::>(); + let shape = [2, 2]; + let stride = [2, 4]; + let ids = [0u32]; + let dim = 0; + let result = run_index_select_strided(&embedding, &shape, &stride, &ids, dim, "is_u32_f32"); + assert_eq!(result, 
vec![0.0, 4.0]); +} + #[test] fn index_select_f16() { let embedding: Vec<_> = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] @@ -623,9 +636,10 @@ fn index_select_f16() { .map(|x| f16::from_f32(x)) .collect(); let shape = [5, 2]; + let stride = [2, 1]; let ids = [0u32, 4, 2]; let dim = 0; - let result = run_index_select(&embedding, &shape, &ids, dim, "is_u32_f16"); + let result = run_index_select(&embedding, &shape, &stride, &ids, dim, "is_u32_f16"); assert_eq!( approx_f16(result, 4), vec![1.0f32, 2.0, 9.0, 10.0, 5.0, 6.0] @@ -636,9 +650,10 @@ fn index_select_f16() { fn index_select_is_u32_bf16() { let embedding: Vec = (1..=10).map(|x| bf16::from_f32(x as f32)).collect(); let shape = [5, 2]; + let stride = [2, 1]; let ids = [0u32, 4, 2]; let dim = 0; - let result = run_index_select(&embedding, &shape, &ids, dim, "is_u32_bf16"); + let result = run_index_select(&embedding, &shape, &stride, &ids, dim, "is_u32_bf16"); assert_eq!( approx_bf16(result, 4), vec![1.0f32, 2.0, 9.0, 10.0, 5.0, 6.0] @@ -649,9 +664,10 @@ fn index_select_is_u32_bf16() { fn index_select_is_u8_bf16() { let embedding: Vec = (1..=10).map(|x| bf16::from_f32(x as f32)).collect(); let shape = [5, 2]; + let stride = [2, 1]; let ids = [0u8, 4, 2]; let dim = 0; - let result = run_index_select(&embedding, &shape, &ids, dim, "is_u8_bf16"); + let result = run_index_select(&embedding, &shape, &stride, &ids, dim, "is_u8_bf16"); assert_eq!( approx_bf16(result, 4), vec![1.0f32, 2.0, 9.0, 10.0, 5.0, 6.0] @@ -662,9 +678,10 @@ fn index_select_is_u8_bf16() { fn index_select_dim1() { let embedding = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]; let shape = [5, 2]; + let stride = [2, 1]; let ids = [0u32, 1, 0]; let dim = 1; - let result = run_index_select(&embedding, &shape, &ids, dim, "is_u32_f32"); + let result = run_index_select(&embedding, &shape, &stride, &ids, dim, "is_u32_f32"); assert_eq!( result, vec![1.0f32, 2.0, 1.0, 3.0, 4.0, 3.0, 5.0, 6.0, 5.0, 7.0, 8.0f32, 7.0, 9.0, 10.0, 9.0] @@ -674,6 +691,7 @@ fn index_select_dim1() { fn run_index_select( embeddings: &[T], shape: &[usize], + stride: &[usize], ids: &[I], dim: usize, name: &'static str, @@ -699,8 +717,59 @@ fn run_index_select( shape, ids.len(), dim, + true, + shape, + stride, &embeddings_buffer, + 0, &ids_buffer, + 0, + &dst_buffer, + ) + .unwrap(); + + command_buffer.commit(); + command_buffer.wait_until_completed(); + + read_to_vec(&dst_buffer, dst_el) +} + +fn run_index_select_strided( + embeddings: &[T], + shape: &[usize], + stride: &[usize], + ids: &[I], + dim: usize, + name: &'static str, +) -> Vec { + let device = Device::system_default().expect("no device found"); + + let command_queue = device.new_command_queue(); + let command_buffer = command_queue.new_command_buffer(); + let embeddings_buffer = new_buffer(&device, &embeddings); + let ids_buffer = new_buffer(&device, &ids); + + let left_size: usize = shape[..dim].iter().product(); + let right_size: usize = shape[dim + 1..].iter().product(); + let dst_el = ids.len() * left_size * right_size; + let dst_buffer = new_buffer(&device, &vec![0.0f32; dst_el]); + + let kernels = Kernels::new(); + call_index_select( + &device, + &command_buffer, + &kernels, + name, + shape, + ids.len(), + dim, + false, + shape, + stride, + &embeddings_buffer, + 0, + &ids_buffer, + 0, &dst_buffer, ) .unwrap(); From 32f567bac491aa0f52dfbe1001ea4d6187bb4301 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Fri, 22 Mar 2024 10:28:38 +0100 Subject: [PATCH 049/131] Fix loading the gguf files. 
(#1913) --- candle-transformers/src/models/quantized_llama.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/candle-transformers/src/models/quantized_llama.rs b/candle-transformers/src/models/quantized_llama.rs index ee50c092..717e6771 100644 --- a/candle-transformers/src/models/quantized_llama.rs +++ b/candle-transformers/src/models/quantized_llama.rs @@ -362,7 +362,7 @@ impl ModelWeights { let embedding_length = md_get("llama.embedding_length")?.to_u32()? as usize; let rope_dim = md_get("llama.rope.dimension_count")?.to_u32()? as usize; // Strangely this value is generally 1e-6 in GGUF file but used to be 1e-5 by default. - let rms_norm_eps = md_get("llama.attention.layer_norm_rms_epsilon")?.to_f64()?; + let rms_norm_eps = md_get("llama.attention.layer_norm_rms_epsilon")?.to_f32()? as f64; let rope_freq_base = md_get("llama.rope.freq_base") .and_then(|m| m.to_f32()) From fc1fe5e45b046771589126c355fdfb4d3bb49fe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 22 Mar 2024 11:51:41 +0100 Subject: [PATCH 050/131] Support scatter/index_add with i64 indices for f16 (#1915) --- candle-kernels/src/indexing.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/candle-kernels/src/indexing.cu b/candle-kernels/src/indexing.cu index 8fc69363..8af2954d 100644 --- a/candle-kernels/src/indexing.cu +++ b/candle-kernels/src/indexing.cu @@ -168,8 +168,10 @@ IS_OP(__half, uint8_t, is_u8_f16) GATHER_OP(__half, int64_t, gather_i64_f16) GATHER_OP(__half, uint32_t, gather_u32_f16) GATHER_OP(__half, uint8_t, gather_u8_f16) +IA_OP(__half, int64_t, ia_i64_f16) IA_OP(__half, uint32_t, ia_u32_f16) IA_OP(__half, uint8_t, ia_u8_f16) +SA_OP(__half, int64_t, sa_i64_f16) SA_OP(__half, uint32_t, sa_u32_f16) SA_OP(__half, uint8_t, sa_u8_f16) #endif From cc856db9ce2541e09731165f88cdd7aae37f558e Mon Sep 17 00:00:00 2001 From: Kirpal Grewal <45569241+KGrewal1@users.noreply.github.com> Date: Sat, 23 Mar 2024 06:05:55 +0000 Subject: [PATCH 051/131] Backwards for ConvTranspose2D (#1910) * add documentation for backprop * add backwards for ConvTranspose2D * add test python code to test --- candle-core/src/backprop.rs | 38 +++++++- candle-core/tests/conv_tests.rs | 161 ++++++++++++++++++++++++++++++-- 2 files changed, 189 insertions(+), 10 deletions(-) diff --git a/candle-core/src/backprop.rs b/candle-core/src/backprop.rs index 2a1db58a..f39eedbb 100644 --- a/candle-core/src/backprop.rs +++ b/candle-core/src/backprop.rs @@ -1,3 +1,4 @@ +/// Methods for backpropagation of gradients. use crate::op::{BinaryOp, Op, ReduceOp, UnaryOp}; use crate::{Error, Result, Tensor, TensorId}; use std::collections::HashMap; @@ -310,9 +311,32 @@ impl Tensor { Op::ConvTranspose1D { .. } => Err(Error::BackwardNotSupported { op: "conv-transpose1d", })?, - Op::ConvTranspose2D { .. } => Err(Error::BackwardNotSupported { - op: "conv-transpose2d", - })?, + Op::ConvTranspose2D { + arg, + kernel, + padding, + stride, + dilation, + output_padding: _output_padding, + } => { + let grad_arg = grad.conv2d(kernel, *padding, *dilation, *stride, 1)?; + let sum_grad = grads.or_insert(arg)?; + *sum_grad = sum_grad.add(&grad_arg)?; + + let grad_kernel = grad + .transpose(0, 1)? + .conv2d(&arg.transpose(0, 1)?, *padding, *stride, *dilation, 1)? + .transpose(0, 1)?; + let sum_grad = grads.or_insert(kernel)?; + let (_, _, k0, k1) = kernel.dims4()?; + let (_, _, g_k0, g_k1) = grad_kernel.dims4()?; + let grad_kernel = if g_k0 != k0 || g_k1 != k1 { + grad_kernel.narrow(2, 0, k0)?.narrow(3, 0, k1)?
+ } else { + grad_kernel + }; + *sum_grad = sum_grad.add(&grad_kernel)?; + } Op::AvgPool2D { arg, kernel_size, @@ -690,30 +714,38 @@ impl Tensor { } } +/// A store for gradients, associating a tensor id to the corresponding gradient tensor, used for back propagation. #[derive(Debug)] pub struct GradStore(HashMap); impl GradStore { + /// Create a new gradient store fn new() -> Self { GradStore(HashMap::new()) } + /// Get the gradient tensor corresponding to the given tensor id pub fn get_id(&self, id: TensorId) -> Option<&Tensor> { self.0.get(&id) } + /// Get the gradient tensor associated with the given tensor pub fn get(&self, tensor: &Tensor) -> Option<&Tensor> { self.0.get(&tensor.id()) } + /// Remove the gradient tensor associated with the given tensor, returning it if it exists pub fn remove(&mut self, tensor: &Tensor) -> Option { self.0.remove(&tensor.id()) } + /// Insert a gradient tensor associated with the given tensor, returning the previous gradient tensor if it existed pub fn insert(&mut self, tensor: &Tensor, grad: Tensor) -> Option { self.0.insert(tensor.id(), grad) } + /// Get the gradient tensor associated with the given tensor, or, if it does not exist, + /// insert a tensor of zeroes, with the same shape and type as the given tensors and return it fn or_insert(&mut self, tensor: &Tensor) -> Result<&mut Tensor> { use std::collections::hash_map::Entry; let grad = match self.0.entry(tensor.id()) { diff --git a/candle-core/tests/conv_tests.rs b/candle-core/tests/conv_tests.rs index 6cc48ec7..3762e02f 100644 --- a/candle-core/tests/conv_tests.rs +++ b/candle-core/tests/conv_tests.rs @@ -135,7 +135,7 @@ fn conv2d(dev: &Device) -> Result<()> { 0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712, 0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790, -0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006, - -0.8000, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085, + -0.8, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085, ], dev, )?; @@ -276,11 +276,10 @@ fn conv2d_small(dev: &Device) -> Result<()> { assert_eq!( test_utils::to_vec1_round(&res.flatten_all()?, 4)?, [ - 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, - 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1640, -0.0111, -0.1742, 0.0000, 0.0000, - 0.0000, 0.0000, 2.6437, -2.0268, 1.1823, 0.0000, 0.0000, 0.0000, 0.0000, 3.2855, - -1.0324, 0.2539, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, - 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000 + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1640, + -0.0111, -0.1742, 0.0, 0.0, 0.0, 0.0, 2.6437, -2.0268, 1.1823, 0.0, 0.0, 0.0, 0.0, + 3.2855, -1.0324, 0.2539, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0 ] ); @@ -398,7 +397,7 @@ fn conv2d_grad(dev: &Device) -> Result<()> { 0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, -1.3712, 0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, 0.3790, -0.4431, -0.4720, -0.7890, 0.2620, 0.7875, 0.5377, -0.6779, -0.8088, 1.9098, 1.2006, - -0.8000, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085, + -0.8, -0.4983, 1.5480, 0.8265, -0.1025, 0.5138, 0.5748, 0.3821, -0.4607, 0.0085, ], (1, 4, 5, 5), dev, @@ -583,6 +582,154 @@ fn conv2d_grad(dev: &Device) -> Result<()> { ] ); + // Conv Transpose 
2d Test + //tested against following python + + // import torch + // torch.manual_seed(4242) + // padding = 4 + // outpadding = 2 + // dilation = 3 + // stride = 3 + // input = torch.randn((1, 4, 7, 5), requires_grad=True) + // kernel = torch.randn((4, 2, 3, 5), requires_grad=True) + // print("input", input.flatten()) + // print("kernel", kernel.flatten()) + // res = torch.nn.functional.conv_transpose2d( + // input, + // kernel, + // stride=stride, + // padding=padding, + // dilation=dilation, + // output_padding=outpadding, + // ) + // res.retain_grad() + // print(res.shape) + // loss = (res**2).sum() + // print(loss) + // loss.backward() + // print(input.grad.shape) + // print("input grad", torch.round(input.grad, decimals=1)) + // print(kernel.grad.shape) + // print("kernel grad", torch.round(kernel.grad.flatten(), decimals=1)) + + let padding = 4; + let outpadding = 2; + let dilation = 3; + let stride = 3; + + let t = Var::from_slice( + &[ + 0.4056_f32, -0.8689, -0.0773, -1.5630, -2.8012, -1.5059, 0.3972, 1.0852, 0.4997, + 3.0616, 1.6541, 0.0964, -0.8338, -1.6523, -0.8323, -0.1699, 0.0823, 0.3526, 0.6843, + 0.2395, 1.2279, -0.9287, -1.7030, 0.1370, 0.6047, 0.3770, -0.6266, 0.3529, 2.2013, + -0.6836, 0.2477, 1.3127, -0.2260, 0.2622, -1.2974, -0.8140, -0.8404, -0.3490, 0.0130, + 1.3123, 1.7569, -0.3956, -1.8255, 0.1727, -0.3538, 2.6941, 1.0529, 0.4219, -0.2071, + 1.1586, 0.4717, 0.3865, -0.5690, -0.5010, -0.1310, 0.7796, 0.6630, -0.2021, 2.6090, + 0.2049, 0.6466, -0.5042, -0.0603, -1.6538, -1.2429, 1.8357, 1.6052, -1.3844, 0.3323, + -1.3712, 0.9634, -0.4799, -0.6451, -0.0840, -1.4247, 0.5512, -0.1747, -0.5509, -0.3742, + 0.3790, -0.4431, -0.4720, -0.7890, 0.2620, 0.5411, -1.1715, -2.4997, 2.3249, -0.8912, + -0.4733, -0.5701, -2.8888, -1.4112, -0.5471, -0.9234, -1.1660, 0.4189, -0.7465, + -0.6473, 0.1402, 0.7875, 0.5377, -0.6779, -0.8088, -0.4864, -0.2312, 0.9279, 0.1264, + 1.5480, 0.8265, -0.1025, 0.5138, -0.2512, 0.1576, 1.2705, 0.3641, -0.9325, 0.6451, + -0.8537, 0.2378, 0.1794, 0.2752, -0.3687, -1.1149, -0.1410, -0.5829, -0.0892, 1.4258, + -2.2789, 0.5270, 0.1825, 1.7007, -0.5263, -0.2954, 0.4440, 0.5537, 0.3492, 0.6186, + 1.6475, 0.2219, + ], + (1, 4, 7, 5), + dev, + )?; + + #[rustfmt::skip] + let w = Var::from_slice( + &[ + -1.1744_f32, 0.3266, 2.5893, 1.0142, 0.1763, 0.7752, 0.6604, 0.2029, -0.2145, 0.7234, + -0.3441, -1.5400, -0.6333, 0.6613, 0.2083, 0.6230, -1.7002, 0.3393, 0.4049, 1.0762, + 0.2723, 1.4181, 0.0029, -0.2122, 1.7668, 1.4168, 0.3320, -0.2719, 0.7932, -0.7204, + 0.4447, 0.1211, 0.5908, 1.0089, -0.1646, 1.8033, -0.6286, 0.2016, -0.3370, 1.2555, + 0.8009, -0.6488, -0.4652, -1.5685, 1.5860, 0.5583, 0.4623, 0.6026, 0.8828, 2.4990, + 0.6811, -0.3369, 1.3320, 1.7669, -1.1067, 1.2958, -0.9415, -0.9655, -0.4462, 0.7181, + 0.5181, -1.1658, -1.8467, -0.7763, 1.2769, 0.8651, 0.9890, 1.5092, 0.7207, -0.8481, + 0.7417, 0.3375, -1.2685, 1.4572, 1.0915, 0.1093, -0.8550, -0.5831, -0.6309, -0.2509, + 0.5220, -0.0914, 0.7900, 0.1096, 0.3258, 0.2723, -1.0942, -0.3393, -0.1653, 0.5732, + -0.8014, 1.8194, -1.9023, 0.2127, 1.8636, -0.8979, 0.1927, -0.2778, 0.3105, 0.0071, + -1.1823, 0.2476, -0.7178, -1.3821, 1.0769, -0.4376, -0.9967, -0.1227, 1.6197, -1.0604, + 0.1372, 0.8141, -0.6163, 0.7304, -0.8285, 2.0636, -0.7176, 0.2495, -0.2581, -0.4478, + ], + (4, 2, 3, 5), + dev, + )?; + let res = t.conv_transpose2d(&w, padding, outpadding, stride, dilation)?; + let loss = res.sqr()?.sum_all()?; + assert_eq!(test_utils::to_vec0_round(&loss, 0)?, 2904.0); + let grads = loss.backward()?; + + let 
grad_t = grads.get(&t).unwrap(); + let grad_w = grads.get(&w).unwrap(); + assert_eq!(grad_t.dims(), [1, 4, 7, 5]); + assert_eq!(grad_w.dims(), [4, 2, 3, 5]); + + assert_eq!( + test_utils::to_vec1_round(&grad_w.flatten_all()?, 1)?, + [ + // torch gets 89.1 + -89.0, -135.3, 136.7, 102.0, -53.4, 117.9, 118.6, -43.9, -218.0, -58.5, -114.3, -150.0, + -15.6, 172.1, 66.3, -64.3, -27.9, -19.8, 31.7, 62.1, 5.5, 92.6, 28.2, -29.6, 55.9, + 52.7, -72.7, -119.8, 53.8, -25.5, 128.8, 19.3, 68.0, 190.9, -64.1, -86.2, -111.2, + 106.6, -67.7, 37.8, 115.9, 50.4, -77.7, -54.9, 22.3, -4.6, 89.8, 61.7, 122.4, 192.6, + -27.8, -104.6, 57.0, 166.4, 27.1, 6.1, 18.7, -93.2, 31.5, 168.2, -3.7, -99.5, -55.5, + -10.8, 17.5, 20.8, 16.9, 43.8, 42.0, -89.2, 18.8, -9.6, -84.1, 212.6, 19.7, -50.0, + -52.0, -40.0, -166.6, -73.2, -10.8, -73.3, 31.5, -23.4, -79.3, -27.0, -84.4, -42.9, + -20.3, 51.8, -16.7, 76.3, -120.5, -65.8, 96.5, -10.7, -45.9, -88.1, 65.4, -7.0, -1.5, + 92.8, -25.1, -114.2, -5.8, -14.8, -51.2, -20.7, 54.2, -79.8, 47.7, -29.2, -8.8, 53.5, + -28.4, 85.0, -18.3, 107.0, 28.3, -71.8 + ] + ); + + assert_eq!( + test_utils::to_vec3_round(&grad_t.i(0)?, 1)?, + [ + [ + [32.3, -41.6, -24.0, 14.1, 17.6], + [-11.8, 72.5, 87.6, 46.4, 61.5], + [115.0, 108.5, -48.6, -63.4, -50.0], + [51.3, 5.4, 31.3, 91.1, -30.9], + [52.7, 92.8, -68.0, -47.0, 83.0], + // pytorch gets -107.1 + [-10.2, -107.0, -5.4, 213.1, -31.4], + [-2.4, 65.1, 9.2, -146.2, -24.2] + ], + [ + [-72.6, -63.9, -61.9, 45.3, 33.0], + [79.3, -0.5, -26.2, 78.2, 42.7], + [90.9, 141.6, 40.1, -62.7, 37.0], + [32.8, 198.2, -0.8, -31.1, 27.3], + // torch gets 48.0 + [34.5, 34.9, -47.9, 127.6, -12.3], + [-61.4, -3.2, -2.9, -10.9, -16.6], + [74.6, 60.1, -68.9, 34.5, -50.4] + ], + [ + [37.5, -56.9, -43.6, -13.5, -9.9], + [40.0, 97.3, 28.6, 14.2, -30.1], + [-22.3, -126.3, -68.8, -8.2, 26.1], + [-32.9, 37.3, 108.5, -54.8, 29.6], + [34.9, -176.9, -125.0, -28.3, -13.9], + [-54.9, 142.6, 62.1, -80.4, -65.6], + [7.4, -91.1, -67.6, 35.0, 39.7] + ], + [ + [-57.2, -40.9, -10.1, 32.6, 29.4], + [18.7, -18.0, 29.5, -1.2, 59.2], + [-14.0, -74.4, 19.8, -117.0, 58.2], + [-21.8, 163.5, -71.1, -99.0, 80.9], + [-58.9, -10.9, 93.8, -139.6, 98.0], + // torch gets 54.5 + [-54.4, 135.3, 6.0, -79.1, 134.6], + [27.5, -76.0, 43.4, -2.8, -7.8] + ] + ] + ); Ok(()) } From 6f877592a7d5b5023462e0b8d241a2ba5ad83648 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sat, 23 Mar 2024 13:08:53 +0100 Subject: [PATCH 052/131] Avoid broadcasting on the batch dimension for the attention mask. (#1920) --- candle-transformers/src/models/mistral.rs | 7 +++---- candle-transformers/src/models/quantized_mistral.rs | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/candle-transformers/src/models/mistral.rs b/candle-transformers/src/models/mistral.rs index be84f824..e40ae3ad 100644 --- a/candle-transformers/src/models/mistral.rs +++ b/candle-transformers/src/models/mistral.rs @@ -385,7 +385,6 @@ impl Model { fn prepare_decoder_attention_mask( &self, - b_size: usize, tgt_len: usize, seqlen_offset: usize, ) -> Result { @@ -408,16 +407,16 @@ impl Model { } else { mask }; - mask.expand((b_size, 1, tgt_len, tgt_len + seqlen_offset))? + mask.expand((1, 1, tgt_len, tgt_len + seqlen_offset))? 
.to_dtype(self.dtype) } pub fn forward(&mut self, input_ids: &Tensor, seqlen_offset: usize) -> Result { - let (b_size, seq_len) = input_ids.dims2()?; + let (_b_size, seq_len) = input_ids.dims2()?; let attention_mask = if seq_len <= 1 { None } else { - let mask = self.prepare_decoder_attention_mask(b_size, seq_len, seqlen_offset)?; + let mask = self.prepare_decoder_attention_mask(seq_len, seqlen_offset)?; Some(mask) }; let mut xs = self.embed_tokens.forward(input_ids)?; diff --git a/candle-transformers/src/models/quantized_mistral.rs b/candle-transformers/src/models/quantized_mistral.rs index 77de7b75..5f026f2b 100644 --- a/candle-transformers/src/models/quantized_mistral.rs +++ b/candle-transformers/src/models/quantized_mistral.rs @@ -287,7 +287,6 @@ impl Model { fn prepare_decoder_attention_mask( &self, - b_size: usize, tgt_len: usize, seqlen_offset: usize, ) -> Result { @@ -310,16 +309,16 @@ impl Model { } else { mask }; - mask.expand((b_size, 1, tgt_len, tgt_len + seqlen_offset))? + mask.expand((1, 1, tgt_len, tgt_len + seqlen_offset))? .to_dtype(DType::F32) } pub fn forward(&mut self, input_ids: &Tensor, seqlen_offset: usize) -> Result { - let (b_size, seq_len) = input_ids.dims2()?; + let (_b_size, seq_len) = input_ids.dims2()?; let attention_mask = if seq_len <= 1 { None } else { - let mask = self.prepare_decoder_attention_mask(b_size, seq_len, seqlen_offset)?; + let mask = self.prepare_decoder_attention_mask(seq_len, seqlen_offset)?; Some(mask) }; let mut xs = self.embed_tokens.forward(input_ids)?; From 790037390ca81a9cf32f35c03f514452d1366e4f Mon Sep 17 00:00:00 2001 From: yinqiwen Date: Sat, 23 Mar 2024 20:44:10 +0800 Subject: [PATCH 053/131] Add cast_bf16_x/cast_x_bf16 when CUDA_ARCH<800 but CUDA_VERSION >= 11000 (#1919) - it make possible to load bf16 models on T4(sm75) --- candle-kernels/src/cast.cu | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/candle-kernels/src/cast.cu b/candle-kernels/src/cast.cu index 2fe85e1c..90f5e7ba 100644 --- a/candle-kernels/src/cast.cu +++ b/candle-kernels/src/cast.cu @@ -83,6 +83,18 @@ CAST_OP(double, __nv_bfloat16, cast_f64_bf16) CAST_THROUGH_OP(__nv_bfloat16, uint8_t, float, cast_bf16_u8) CAST_THROUGH_OP(__nv_bfloat16, __half, float, cast_bf16_f16) CAST_THROUGH_OP(__half, __nv_bfloat16, float, cast_f16_bf16) +#else +#include +#if CUDA_VERSION >= 11000 +CAST_OP(__nv_bfloat16, float, cast_bf16_f32) +CAST_OP(float, __nv_bfloat16, cast_f32_bf16) +CAST_THROUGH_OP(__nv_bfloat16, uint8_t, float, cast_bf16_u8) +CAST_THROUGH_OP(__nv_bfloat16, __half, float, cast_bf16_f16) +CAST_THROUGH_OP(__nv_bfloat16, double, float, cast_bf16_f64) +CAST_THROUGH_OP(__half, __nv_bfloat16, float, cast_f16_bf16) +CAST_THROUGH_OP(double, __nv_bfloat16, float, cast_f64_bf16) +CAST_THROUGH_OP(uint8_t, __nv_bfloat16, float, cast_u8_bf16) +#endif #endif #if __CUDA_ARCH__ >= 530 From fdfe8fd129a0f755f380d4a38f11207c28fc8ee4 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sat, 23 Mar 2024 14:16:19 +0100 Subject: [PATCH 054/131] Preliminary support for inplace ops. (#1921) * Preliminary support for inplace ops. * Add a test. 
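A note on why the trait docs in the diff below say back-prop is unsupported: the backward pass generally needs the op's original input, and an in-place op destroys it. A minimal sketch of the problem, in plain Rust and not part of the patch itself:

```rust
// For y = x * x the gradient dy/dx = 2x needs the original x; squaring in
// place overwrites it, so there is nothing left to differentiate against.
fn square_inplace(xs: &mut [f32]) {
    for x in xs.iter_mut() {
        *x *= *x;
    }
}

fn main() {
    let mut xs = vec![3.0f32];
    square_inplace(&mut xs);
    println!("{xs:?}"); // [9.0] -- the 3.0 needed for 2x is gone
}
```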
--- candle-core/src/custom_op.rs | 133 +++++++++++++++++++++++++++ candle-core/src/lib.rs | 2 +- candle-core/src/storage.rs | 47 +++++++++- candle-core/src/tensor.rs | 4 + candle-core/tests/custom_op_tests.rs | 31 +++++++ 5 files changed, 215 insertions(+), 2 deletions(-) diff --git a/candle-core/src/custom_op.rs b/candle-core/src/custom_op.rs index 3dc66e94..3a85dba9 100644 --- a/candle-core/src/custom_op.rs +++ b/candle-core/src/custom_op.rs @@ -242,3 +242,136 @@ impl Tensor { self.apply_op3_arc(t2, t3, Arc::new(Box::new(c))) } } + +// In place ops. + +/// Unary ops that can be defined in user-land. +/// These ops work in place and as such back-prop is unsupported. +pub trait InplaceOp1 { + // Box does not support const yet, so use a function to get the name. + fn name(&self) -> &'static str; + + /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn cpu_fwd(&self, storage: &mut CpuStorage, layout: &Layout) -> Result<()>; + + /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn cuda_fwd(&self, _storage: &mut CudaStorage, _layout: &Layout) -> Result<()> { + Err(crate::Error::Cuda( + format!("no cuda implementation for {}", self.name()).into(), + )) + } + + /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn metal_fwd(&self, _storage: &mut MetalStorage, _layout: &Layout) -> Result<()> { + Err(crate::Error::Metal( + format!("no metal implementation for {}", self.name()).into(), + )) + } +} + +pub trait InplaceOp2 { + fn name(&self) -> &'static str; + + /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn cpu_fwd(&self, s1: &mut CpuStorage, l1: &Layout, s2: &CpuStorage, l2: &Layout) + -> Result<()>; + + /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn cuda_fwd(&self, _: &mut CudaStorage, _: &Layout, _: &CudaStorage, _: &Layout) -> Result<()> { + Err(crate::Error::Cuda( + format!("no cuda implementation for {}", self.name()).into(), + )) + } + + /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn metal_fwd( + &self, + _: &mut MetalStorage, + _: &Layout, + _: &MetalStorage, + _: &Layout, + ) -> Result<()> { + Err(crate::Error::Metal( + format!("no metal implementation for {}", self.name()).into(), + )) + } +} + +pub trait InplaceOp3 { + fn name(&self) -> &'static str; + + /// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn cpu_fwd( + &self, + s1: &mut CpuStorage, + l1: &Layout, + s2: &CpuStorage, + l2: &Layout, + s3: &CpuStorage, + l3: &Layout, + ) -> Result<()>; + + /// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. 
+ fn cuda_fwd( + &self, + _: &mut CudaStorage, + _: &Layout, + _: &CudaStorage, + _: &Layout, + _: &CudaStorage, + _: &Layout, + ) -> Result<()> { + Err(crate::Error::Cuda( + format!("no cuda implementation for {}", self.name()).into(), + )) + } + + /// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides, + /// offsets etc so the associated layout should be used to access it. + fn metal_fwd( + &self, + _: &mut MetalStorage, + _: &Layout, + _: &MetalStorage, + _: &Layout, + _: &MetalStorage, + _: &Layout, + ) -> Result<()> { + Err(crate::Error::Metal( + format!("no metal implementation for {}", self.name()).into(), + )) + } +} + +impl Tensor { + /// Applies a unary custom op in place. + pub fn inplace_op1(&self, c: &C) -> Result<()> { + self.storage_mut().inplace_op1(self.layout(), c) + } + + /// Applies a unary custom op in place (for the first tensor). + pub fn inplace_op2(&self, rhs: &Self, c: &C) -> Result<()> { + self.storage_mut() + .inplace_op2(self.layout(), &rhs.storage(), rhs.layout(), c) + } + + /// Applies a ternary custom op in place (for the first tensor). + pub fn inplace_op3(&self, t2: &Self, t3: &Self, c: &C) -> Result<()> { + self.storage_mut().inplace_op3( + self.layout(), + &t2.storage(), + t2.layout(), + &t3.storage(), + t3.layout(), + c, + ) + } +} diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs index 87e60d9b..1508b9c0 100644 --- a/candle-core/src/lib.rs +++ b/candle-core/src/lib.rs @@ -74,7 +74,7 @@ pub mod utils; mod variable; pub use cpu_backend::CpuStorage; -pub use custom_op::{CustomOp1, CustomOp2, CustomOp3}; +pub use custom_op::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3}; pub use device::{Device, DeviceLocation, NdArray}; pub use dtype::{DType, FloatDType, IntDType, WithDType}; pub use error::{Error, Result}; diff --git a/candle-core/src/storage.rs b/candle-core/src/storage.rs index ffc75188..36620dd9 100644 --- a/candle-core/src/storage.rs +++ b/candle-core/src/storage.rs @@ -1,7 +1,7 @@ use crate::backend::BackendStorage; use crate::op::{self, CmpOp, ReduceOp}; use crate::{CpuStorage, CudaStorage, DType, Device, Error, Layout, MetalStorage, Result, Shape}; -use crate::{CustomOp1, CustomOp2, CustomOp3}; +use crate::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3}; // We do not want to implement Clone on Storage as cloning may fail because of // out of memory. Instead try_clone should be used. 
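To make the new API concrete before the storage-level dispatch below, here is a minimal user-defined op against the `InplaceOp1` trait above. This is an illustrative sketch only: the `Scale` type is hypothetical, it covers just two dtypes, and like the `Elu` test later in this patch it ignores the layout, so it is only correct for contiguous tensors:

```rust
use candle_core::{CpuStorage, InplaceOp1, Layout, Result};

// Multiply every element by a constant, in place, CPU only.
struct Scale(f32);

impl InplaceOp1 for Scale {
    fn name(&self) -> &'static str {
        "scale"
    }

    fn cpu_fwd(&self, storage: &mut CpuStorage, _layout: &Layout) -> Result<()> {
        match storage {
            CpuStorage::F32(vs) => vs.iter_mut().for_each(|v| *v *= self.0),
            CpuStorage::F64(vs) => vs.iter_mut().for_each(|v| *v *= self.0 as f64),
            _ => candle_core::bail!("unsupported dtype for scale"),
        }
        Ok(())
    }
}
```

Calling `t.inplace_op1(&Scale(2.0))?` then mutates the storage of `t` directly; the `cuda_fwd`/`metal_fwd` defaults above return an error unless overridden.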
@@ -253,6 +253,51 @@ impl Storage { } } + pub(crate) fn inplace_op1(&mut self, l: &Layout, c: &dyn InplaceOp1) -> Result<()> { + match self { + Self::Cpu(storage) => c.cpu_fwd(storage, l), + Self::Cuda(storage) => c.cuda_fwd(storage, l), + Self::Metal(storage) => c.metal_fwd(storage, l), + } + } + + pub(crate) fn inplace_op2( + &mut self, + l1: &Layout, + t2: &Self, + l2: &Layout, + c: &dyn InplaceOp2, + ) -> Result<()> { + self.same_device(t2, c.name())?; + match (self, t2) { + (Self::Cpu(s1), Self::Cpu(s2)) => c.cpu_fwd(s1, l1, s2, l2), + (Self::Cuda(s1), Self::Cuda(s2)) => c.cuda_fwd(s1, l1, s2, l2), + (Self::Metal(s1), Self::Metal(s2)) => c.metal_fwd(s1, l1, s2, l2), + _ => unreachable!(), + } + } + + pub(crate) fn inplace_op3( + &mut self, + l1: &Layout, + t2: &Self, + l2: &Layout, + t3: &Self, + l3: &Layout, + c: &dyn InplaceOp3, + ) -> Result<()> { + self.same_device(t2, c.name())?; + self.same_device(t3, c.name())?; + match (self, t2, t3) { + (Self::Cpu(s1), Self::Cpu(s2), Self::Cpu(s3)) => c.cpu_fwd(s1, l1, s2, l2, s3, l3), + (Self::Cuda(s1), Self::Cuda(s2), Self::Cuda(s3)) => c.cuda_fwd(s1, l1, s2, l2, s3, l3), + (Self::Metal(s1), Self::Metal(s2), Self::Metal(s3)) => { + c.metal_fwd(s1, l1, s2, l2, s3, l3) + } + _ => unreachable!(), + } + } + pub(crate) fn unary_impl(&self, layout: &Layout) -> Result { match self { Storage::Cpu(storage) => { diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs index 6b5aed96..92c931eb 100644 --- a/candle-core/src/tensor.rs +++ b/candle-core/src/tensor.rs @@ -2254,6 +2254,10 @@ impl Tensor { self.storage.read().unwrap() } + pub(crate) fn storage_mut(&self) -> std::sync::RwLockWriteGuard<'_, Storage> { + self.storage.write().unwrap() + } + // If we extend the visibility of this function to be usable outside of this crate, we should // make it unsafe. pub(crate) fn storage_mut_and_layout( diff --git a/candle-core/tests/custom_op_tests.rs b/candle-core/tests/custom_op_tests.rs index cff0aebe..be59e0c0 100644 --- a/candle-core/tests/custom_op_tests.rs +++ b/candle-core/tests/custom_op_tests.rs @@ -112,3 +112,34 @@ fn custom_op1_with_backward() -> Result<()> { Ok(()) } + +impl candle_core::InplaceOp1 for Elu { + fn name(&self) -> &'static str { + "elu" + } + + fn cpu_fwd(&self, s: &mut CpuStorage, _l: &Layout) -> Result<()> { + let alpha = self.alpha; + match s { + CpuStorage::BF16(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)), + CpuStorage::F16(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)), + CpuStorage::F32(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)), + CpuStorage::F64(s) => s.iter_mut().for_each(|v| *v = fwd(*v, alpha)), + _ => candle_core::bail!("unsupported dtype for inplace elu"), + } + Ok(()) + } +} + +#[test] +fn inplace_op1() -> Result<()> { + let cpu = &Device::Cpu; + let t = Tensor::arange(0u32, 12u32, cpu)?.to_dtype(DType::F32)?; + let t = (t - 5.)?; + t.inplace_op1(&Elu { alpha: 1. })?; + assert_eq!( + to_vec1_round(&t, 4)?, + &[-0.9933, -0.9817, -0.9502, -0.8647, -0.6321, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + ); + Ok(()) +} From a62a97340c3f11fc7d804d8c6138e3da7e9d7648 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sat, 23 Mar 2024 15:26:09 +0100 Subject: [PATCH 055/131] Add topk sampling. 
(#1923) --- candle-transformers/src/generation/mod.rs | 85 +++++++++++++------ candle-transformers/tests/generation_tests.rs | 27 ++++++ 2 files changed, 88 insertions(+), 24 deletions(-) diff --git a/candle-transformers/src/generation/mod.rs b/candle-transformers/src/generation/mod.rs index b1a567c3..530a6b48 100644 --- a/candle-transformers/src/generation/mod.rs +++ b/candle-transformers/src/generation/mod.rs @@ -1,24 +1,35 @@ use candle::{DType, Error, Result, Tensor}; use rand::{distributions::Distribution, SeedableRng}; +#[derive(Clone, PartialEq, Debug)] +pub enum Sampling { + ArgMax, + All { temperature: f64 }, + TopK { k: usize, temperature: f64 }, + TopP { p: f64, temperature: f64 }, +} + pub struct LogitsProcessor { rng: rand::rngs::StdRng, - temperature: Option, - top_p: Option, + sampling: Sampling, } impl LogitsProcessor { + pub fn from_sampling(seed: u64, sampling: Sampling) -> Self { + let rng = rand::rngs::StdRng::seed_from_u64(seed); + Self { rng, sampling } + } + pub fn new(seed: u64, temperature: Option, top_p: Option) -> Self { - let temperature = if temperature.map_or(true, |v| v < 1e-7) { - None - } else { - temperature + let temperature = temperature.and_then(|v| if v < 1e-7 { None } else { Some(v) }); + let sampling = match temperature { + None => Sampling::ArgMax, + Some(temperature) => match top_p { + None => Sampling::All { temperature }, + Some(p) => Sampling::TopP { p, temperature }, + }, }; - Self { - rng: rand::rngs::StdRng::seed_from_u64(seed), - temperature, - top_p, - } + Self::from_sampling(seed, sampling) } fn sample_argmax(&mut self, logits: Tensor) -> Result { @@ -38,14 +49,14 @@ impl LogitsProcessor { Ok(next_token) } + /// top-p sampling (or "nucleus sampling") samples from the smallest set of tokens that exceed + /// probability top_p. This way we never sample tokens that have very low probabilities and are + /// less likely to go "off the rails". fn sample_topp(&mut self, prs: &mut Vec, top_p: f32) -> Result { - // top-p sampling (or "nucleus sampling") samples from the smallest set of - // tokens that exceed probability top_p. This way we never sample tokens that - // have very low probabilities and are less likely to go "off the rails". let mut argsort_indices = (0..prs.len()).collect::>(); // Sort by descending probability. - argsort_indices.sort_by(|&i, &j| prs[j].partial_cmp(&prs[i]).unwrap()); + argsort_indices.sort_by(|&i, &j| prs[j].total_cmp(&prs[i])); // Clamp smaller probabilities to zero. let mut cumsum = 0.; @@ -60,23 +71,49 @@ impl LogitsProcessor { self.sample_multinomial(prs) } + // top-k sampling samples from the k tokens with the largest probabilities. + fn sample_topk(&mut self, prs: &mut Vec, top_k: usize) -> Result { + if top_k >= prs.len() { + self.sample_multinomial(prs) + } else { + let mut argsort_indices = (0..prs.len()).collect::>(); + // Sort by descending probability. 
+ let (indices, _, _) = + argsort_indices.select_nth_unstable_by(top_k, |&i, &j| prs[j].total_cmp(&prs[i])); + let prs = indices.iter().map(|&i| prs[i]).collect::>(); + let index = self.sample_multinomial(&prs)?; + Ok(indices[index as usize] as u32) + } + } + pub fn sample(&mut self, logits: &Tensor) -> Result { let logits = logits.to_dtype(DType::F32)?; - let next_token = match self.temperature { - None => self.sample_argmax(logits)?, - Some(temperature) => { - let logits = &(&logits / temperature)?; - let prs = candle_nn::ops::softmax_last_dim(logits)?; - let mut prs: Vec = prs.to_vec1()?; - let top_p = self.top_p.unwrap_or(1.); - if top_p <= 0.0 || top_p >= 1.0 { + let prs = |temperature: f64| -> Result> { + let logits = (&logits / temperature)?; + let prs = candle_nn::ops::softmax_last_dim(&logits)?; + prs.to_vec1() + }; + + let next_token = match &self.sampling { + Sampling::ArgMax => self.sample_argmax(logits)?, + Sampling::All { temperature } => { + let prs = prs(*temperature)?; + self.sample_multinomial(&prs)? + } + Sampling::TopP { p, temperature } => { + let mut prs = prs(*temperature)?; + if *p <= 0.0 || *p >= 1.0 { // simply sample from the predicted probability distribution self.sample_multinomial(&prs)? } else { // top-p (nucleus) sampling, clamping the least likely tokens to zero - self.sample_topp(&mut prs, top_p as f32)? + self.sample_topp(&mut prs, *p as f32)? } } + Sampling::TopK { k, temperature } => { + let mut prs = prs(*temperature)?; + self.sample_topk(&mut prs, *k)? + } }; Ok(next_token) } diff --git a/candle-transformers/tests/generation_tests.rs b/candle-transformers/tests/generation_tests.rs index 76f994d0..cc499a44 100644 --- a/candle-transformers/tests/generation_tests.rs +++ b/candle-transformers/tests/generation_tests.rs @@ -27,3 +27,30 @@ fn sample_with_top_p() -> Result<()> { assert_eq!(token, 2); Ok(()) } + +#[test] +fn sample_with_top_k() -> Result<()> { + let mut logits_process = LogitsProcessor::from_sampling( + 42, + candle_transformers::generation::Sampling::TopK { + k: 1, + temperature: 1.0, + }, + ); + let logits = Tensor::new(&[0.1, 0.2, 0.3, 0.4], &Device::Cpu)?; + let token = logits_process.sample(&logits)?; + assert_eq!(token, 3); + let mut logits_process = LogitsProcessor::from_sampling( + 42, + candle_transformers::generation::Sampling::TopK { + k: 2, + temperature: 1.0, + }, + ); + let logits = Tensor::new(&[0.1, 0.2, 0.3, 0.4], &Device::Cpu)?; + let token = logits_process.sample(&logits)?; + assert_eq!(token, 3); + let token = logits_process.sample(&logits)?; + assert_eq!(token, 2); + Ok(()) +} From 5e70821dd0dacc1b1e1e44d8ec03d0e4a25d41dc Mon Sep 17 00:00:00 2001 From: laurent Date: Sat, 23 Mar 2024 15:47:39 +0100 Subject: [PATCH 056/131] Allow for arbitrary temperature modifications. 
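Two remarks on this area. The `sample_topk` added above relies on `select_nth_unstable_by`, which partitions in O(n) so the k most probable entries land in the first k slots without a full sort, cheaper than the sort used by top-p. And the `sample_f` hook introduced below lets a caller adjust the probabilities before the draw; a hypothetical use follows (the helper name is invented, and the hook only takes effect on the temperature-based paths, not `ArgMax`):

```rust
use candle::{Result, Tensor};
use candle_transformers::generation::LogitsProcessor;

// Suppress one token id before sampling by zeroing its probability; the
// multinomial draw accepts unnormalized weights, so no renormalization
// is needed afterwards.
fn sample_without(lp: &mut LogitsProcessor, logits: &Tensor, banned: usize) -> Result<u32> {
    lp.sample_f(logits, |prs| {
        if banned < prs.len() {
            prs[banned] = 0.0;
        }
    })
}
```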
--- candle-transformers/src/generation/mod.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/candle-transformers/src/generation/mod.rs b/candle-transformers/src/generation/mod.rs index 530a6b48..257d9171 100644 --- a/candle-transformers/src/generation/mod.rs +++ b/candle-transformers/src/generation/mod.rs @@ -87,11 +87,17 @@ impl LogitsProcessor { } pub fn sample(&mut self, logits: &Tensor) -> Result { + self.sample_f(logits, |_| {}) + } + + pub fn sample_f(&mut self, logits: &Tensor, f: impl FnOnce(&mut [f32])) -> Result { let logits = logits.to_dtype(DType::F32)?; let prs = |temperature: f64| -> Result> { let logits = (&logits / temperature)?; let prs = candle_nn::ops::softmax_last_dim(&logits)?; - prs.to_vec1() + let mut prs = prs.to_vec1()?; + f(&mut prs); + Ok(prs) }; let next_token = match &self.sampling { From e2b4829531bb053c48e8124580695996b910ec00 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 24 Mar 2024 08:04:04 +0100 Subject: [PATCH 057/131] Support more mistral models. (#1927) * Support more mistral models. * Use the appropriate rope parameter. --- candle-examples/examples/mistral/main.rs | 41 ++++++++++++++++- candle-transformers/src/models/mistral.rs | 46 +++++++++++-------- .../src/models/quantized_mistral.rs | 9 ++-- 3 files changed, 70 insertions(+), 26 deletions(-) diff --git a/candle-examples/examples/mistral/main.rs b/candle-examples/examples/mistral/main.rs index 1cf4107c..a972279c 100644 --- a/candle-examples/examples/mistral/main.rs +++ b/candle-examples/examples/mistral/main.rs @@ -122,6 +122,18 @@ impl TextGeneration { } } +#[derive(Clone, Debug, Copy, PartialEq, Eq, clap::ValueEnum)] +enum Which { + #[value(name = "7b-v0.1")] + Mistral7bV01, + #[value(name = "7b-v0.2")] + Mistral7bV02, + #[value(name = "7b-instruct-v0.1")] + Mistral7bInstructV01, + #[value(name = "7b-instruct-v0.2")] + Mistral7bInstructV02, +} + #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] struct Args { @@ -155,6 +167,10 @@ struct Args { #[arg(long, short = 'n', default_value_t = 10000)] sample_len: usize, + /// The model size to use. + #[arg(long, default_value = "7b-v0.1")] + which: Which, + #[arg(long)] model_id: Option, @@ -164,6 +180,9 @@ struct Args { #[arg(long)] tokenizer_file: Option, + #[arg(long)] + config_file: Option, + #[arg(long)] weight_files: Option, @@ -211,9 +230,17 @@ fn main() -> Result<()> { Some(model_id) => model_id, None => { if args.quantized { + if args.which != Which::Mistral7bV01 { + anyhow::bail!("only 7b-v0.1 is available as a quantized model for now") + } "lmz/candle-mistral".to_string() } else { - "mistralai/Mistral-7B-v0.1".to_string() + match args.which { + Which::Mistral7bV01 => "mistralai/Mistral-7B-v0.1".to_string(), + Which::Mistral7bV02 => "mistralai/Mistral-7B-v0.2".to_string(), + Which::Mistral7bInstructV01 => "mistralai/Mistral-7B-Instruct-v0.1".to_string(), + Which::Mistral7bInstructV02 => "mistralai/Mistral-7B-Instruct-v0.2".to_string(), + } } } }; @@ -243,7 +270,17 @@ fn main() -> Result<()> { let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?; let start = std::time::Instant::now(); - let config = Config::config_7b_v0_1(args.use_flash_attn); + let config = match args.config_file { + Some(config_file) => serde_json::from_slice(&std::fs::read(config_file)?)?, + None => { + if args.quantized { + Config::config_7b_v0_1(args.use_flash_attn) + } else { + let config_file = repo.get("config.json")?; + serde_json::from_slice(&std::fs::read(config_file)?)? 
+ } + } + }; let device = candle_examples::device(args.cpu)?; let (model, device) = if args.quantized { let filename = &filenames[0]; diff --git a/candle-transformers/src/models/mistral.rs b/candle-transformers/src/models/mistral.rs index e40ae3ad..0e6200f5 100644 --- a/candle-transformers/src/models/mistral.rs +++ b/candle-transformers/src/models/mistral.rs @@ -4,20 +4,25 @@ use candle::{DType, Device, Module, Result, Tensor, D}; use candle_nn::{Activation, VarBuilder}; use std::sync::Arc; -#[derive(Debug, Clone, PartialEq)] +fn default_use_flash_attn() -> bool { + false +} + +#[derive(Debug, Clone, PartialEq, serde::Deserialize)] pub struct Config { - pub(crate) vocab_size: usize, - pub(crate) hidden_size: usize, - pub(crate) intermediate_size: usize, - pub(crate) num_hidden_layers: usize, - pub(crate) num_attention_heads: usize, - pub(crate) num_key_value_heads: usize, - pub(crate) hidden_act: Activation, - pub(crate) max_position_embeddings: usize, - pub(crate) rms_norm_eps: f64, - pub(crate) rope_theta: f64, - pub(crate) sliding_window: usize, - pub(crate) use_flash_attn: bool, + pub vocab_size: usize, + pub hidden_size: usize, + pub intermediate_size: usize, + pub num_hidden_layers: usize, + pub num_attention_heads: usize, + pub num_key_value_heads: usize, + pub hidden_act: Activation, + pub max_position_embeddings: usize, + pub rms_norm_eps: f64, + pub rope_theta: f64, + pub sliding_window: Option, + #[serde(default = "default_use_flash_attn")] + pub use_flash_attn: bool, } impl Config { @@ -34,7 +39,7 @@ impl Config { max_position_embeddings: 32768, rms_norm_eps: 1e-5, rope_theta: 10_000., - sliding_window: 4096, + sliding_window: Some(4096), use_flash_attn, } } @@ -53,7 +58,7 @@ impl Config { max_position_embeddings: 32768, rms_norm_eps: 1e-5, rope_theta: 10_000., - sliding_window: 4096, + sliding_window: Some(4096), use_flash_attn, } } @@ -71,7 +76,7 @@ impl Config { max_position_embeddings: 32768, rms_norm_eps: 1e-5, rope_theta: 10_000., - sliding_window: 4096, + sliding_window: Some(4096), use_flash_attn, } } @@ -92,11 +97,12 @@ fn rotate_half(xs: &Tensor) -> Result { impl RotaryEmbedding { fn new(dtype: DType, cfg: &Config, dev: &Device) -> Result { + let rope_theta = cfg.rope_theta as f32; let dim = cfg.hidden_size / cfg.num_attention_heads; let max_seq_len = cfg.max_position_embeddings; let inv_freq: Vec<_> = (0..dim) .step_by(2) - .map(|i| 1f32 / 10000f32.powf(i as f32 / dim as f32)) + .map(|i| 1f32 / rope_theta.powf(i as f32 / dim as f32)) .collect(); let inv_freq_len = inv_freq.len(); let inv_freq = Tensor::from_vec(inv_freq, (1, inv_freq_len), dev)?.to_dtype(dtype)?; @@ -353,7 +359,7 @@ pub struct Model { layers: Vec, norm: RmsNorm, lm_head: Linear, - sliding_window: usize, + sliding_window: Option, device: Device, dtype: DType, } @@ -388,11 +394,11 @@ impl Model { tgt_len: usize, seqlen_offset: usize, ) -> Result { - // Sliding window mask? + let sliding_window = self.sliding_window.unwrap_or(tgt_len + 1); let mask: Vec<_> = (0..tgt_len) .flat_map(|i| { (0..tgt_len).map(move |j| { - if i < j || j + self.sliding_window < i { + if i < j || j + sliding_window < i { f32::NEG_INFINITY } else { 0. 
diff --git a/candle-transformers/src/models/quantized_mistral.rs b/candle-transformers/src/models/quantized_mistral.rs index 5f026f2b..2c5b7f74 100644 --- a/candle-transformers/src/models/quantized_mistral.rs +++ b/candle-transformers/src/models/quantized_mistral.rs @@ -21,11 +21,12 @@ fn rotate_half(xs: &Tensor) -> Result { impl RotaryEmbedding { fn new(cfg: &Config, dev: &Device) -> Result { + let rope_theta = cfg.rope_theta as f32; let dim = cfg.hidden_size / cfg.num_attention_heads; let max_seq_len = cfg.max_position_embeddings; let inv_freq: Vec<_> = (0..dim) .step_by(2) - .map(|i| 1f32 / 10000f32.powf(i as f32 / dim as f32)) + .map(|i| 1f32 / rope_theta.powf(i as f32 / dim as f32)) .collect(); let inv_freq_len = inv_freq.len(); let inv_freq = Tensor::from_vec(inv_freq, (1, inv_freq_len), dev)?; @@ -257,7 +258,7 @@ pub struct Model { layers: Vec, norm: RmsNorm, lm_head: Linear, - sliding_window: usize, + sliding_window: Option, device: Device, } @@ -290,11 +291,11 @@ impl Model { tgt_len: usize, seqlen_offset: usize, ) -> Result { - // Sliding window mask? + let sliding_window = self.sliding_window.unwrap_or(tgt_len + 1); let mask: Vec<_> = (0..tgt_len) .flat_map(|i| { (0..tgt_len).map(move |j| { - if i < j || j + self.sliding_window < i { + if i < j || j + sliding_window < i { f32::NEG_INFINITY } else { 0. From 8c0db879924ca48c4f820dc1c5c88ad534cb62b0 Mon Sep 17 00:00:00 2001 From: laurent Date: Sun, 24 Mar 2024 18:55:56 +0100 Subject: [PATCH 058/131] Avoid using the attn mask when not necessary. --- .../src/models/quantized_llama.rs | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/candle-transformers/src/models/quantized_llama.rs b/candle-transformers/src/models/quantized_llama.rs index 717e6771..22ddfd2f 100644 --- a/candle-transformers/src/models/quantized_llama.rs +++ b/candle-transformers/src/models/quantized_llama.rs @@ -181,7 +181,12 @@ impl LayerWeights { Ok(rope) } - fn forward_attn(&mut self, x: &Tensor, mask: &Tensor, index_pos: usize) -> Result { + fn forward_attn( + &mut self, + x: &Tensor, + mask: Option<&Tensor>, + index_pos: usize, + ) -> Result { let _enter = self.span_attn.enter(); let (b_sz, seq_len, n_embd) = x.dims3()?; let q = self.attention_wq.forward(x)?; @@ -220,8 +225,13 @@ impl LayerWeights { let v = self.repeat_kv(v)?; let att = (q.matmul(&k.t()?)? / (self.head_dim as f64).sqrt())?; - let mask = mask.broadcast_as(att.shape())?; - let att = masked_fill(&att, &mask, &self.neg_inf)?; + let att = match mask { + None => att, + Some(mask) => { + let mask = mask.broadcast_as(att.shape())?; + masked_fill(&att, &mask, &self.neg_inf)? + } + }; let att = candle_nn::ops::softmax_last_dim(&att)?; // Convert to contiguous as matmul doesn't support strided vs for now. let y = att.matmul(&v.contiguous()?)?; @@ -474,14 +484,18 @@ impl ModelWeights { pub fn forward(&mut self, x: &Tensor, index_pos: usize) -> Result { let (_b_sz, seq_len) = x.dims2()?; - let mask = self.mask(seq_len, x.device())?; + let mask = if seq_len == 1 { + None + } else { + Some(self.mask(seq_len, x.device())?) 
+ }; let _enter = self.span.enter(); let mut layer_in = self.tok_embeddings.forward(x)?; for layer in self.layers.iter_mut() { let x = layer_in; let residual = &x; let x = layer.attention_norm.forward(&x)?; - let attn = layer.forward_attn(&x, &mask, index_pos)?; + let attn = layer.forward_attn(&x, mask.as_ref(), index_pos)?; let x = (attn + residual)?; // MLP From cf7d7fcf2f20c24aae633483c3a107c1219a7f9a Mon Sep 17 00:00:00 2001 From: laurent Date: Sun, 24 Mar 2024 19:04:32 +0100 Subject: [PATCH 059/131] Also avoid the mask in the llama example. --- candle-transformers/src/models/llama.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/candle-transformers/src/models/llama.rs b/candle-transformers/src/models/llama.rs index c311d4c4..73671cdc 100644 --- a/candle-transformers/src/models/llama.rs +++ b/candle-transformers/src/models/llama.rs @@ -240,8 +240,12 @@ impl CausalSelfAttention { let k = k.to_dtype(DType::F32)?; let v = v.to_dtype(DType::F32)?; let att = (q.matmul(&k.t()?)? / (self.head_dim as f64).sqrt())?; - let mask = cache.mask(seq_len)?.broadcast_as(att.shape())?; - let att = masked_fill(&att, &mask, f32::NEG_INFINITY)?; + let att = if seq_len == 1 { + att + } else { + let mask = cache.mask(seq_len)?.broadcast_as(att.shape())?; + masked_fill(&att, &mask, f32::NEG_INFINITY)? + }; let att = candle_nn::ops::softmax(&att, D::Minus1)?; // Convert to contiguous as matmul doesn't support strided vs for now. att.matmul(&v.contiguous()?)?.to_dtype(in_dtype)? From 1b98f84a2baa23192b97e36131011da658bfa1c2 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 24 Mar 2024 22:48:52 +0100 Subject: [PATCH 060/131] Fast kernels for rotary embeddings. (#1928) * Fast kernels for rotary embeddings. * Add a test for the fast CPU kernel. * Rope cuda bindings. * Cuda kernel. * Metal kernel (part 1). * Cuda kernels. * Finish the metal kernel. * Use the new kernels in the quantized example. * Fix warning. 
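For reference, the interleaved ("rope_i") layout rotates adjacent element pairs along
head_dim, with one precomputed angle per pair. A scalar Rust sketch of the math for a
single (seq_len, head_dim) slice, assuming cos and sin each hold seq_len * head_dim / 2
values (this is a reference for the kernels below, not the kernels themselves):

    // Pairwise rotation: element 2i rotates with element 2i+1 using angle i.
    // The GPU kernels compute the same thing with one thread per (x0, x1) pair.
    fn rope_i_ref(src: &[f32], cos: &[f32], sin: &[f32], dst: &mut [f32]) {
        for i in 0..src.len() / 2 {
            let (x0, x1) = (src[2 * i], src[2 * i + 1]);
            dst[2 * i] = x0 * cos[i] - x1 * sin[i];
            dst[2 * i + 1] = x0 * sin[i] + x1 * cos[i];
        }
    }

In the actual kernels the cos/sin index is taken modulo t * d / 2, so the same table is
reused across the batch and head dimensions.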
--- candle-kernels/src/reduce.cu | 29 ++ candle-metal-kernels/src/lib.rs | 41 +++ candle-metal-kernels/src/reduce.metal | 23 ++ candle-nn/Cargo.toml | 1 + candle-nn/src/lib.rs | 1 + candle-nn/src/rotary_emb.rs | 247 ++++++++++++++++++ candle-nn/tests/ops.rs | 28 ++ .../src/models/quantized_llama.rs | 31 +-- 8 files changed, 375 insertions(+), 26 deletions(-) create mode 100644 candle-nn/src/rotary_emb.rs diff --git a/candle-kernels/src/reduce.cu b/candle-kernels/src/reduce.cu index 19fb213a..e6002a6f 100644 --- a/candle-kernels/src/reduce.cu +++ b/candle-kernels/src/reduce.cu @@ -147,6 +147,20 @@ __device__ void softmax(const T * x, T * dst, const int ncols) { } } +template +__device__ void ropei(const T * src, const T * cos, const T * sin, T * dst, const uint32_t bh, const uint32_t td) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (2 * idx > bh * td) return; + + uint32_t rope_idx = idx % (td / 2); + T c = cos[rope_idx]; + T s = sin[rope_idx]; + + dst[2 * idx] = src[2 * idx] * c - src[2 * idx + 1] * s; + dst[2 * idx + 1] = src[2 * idx] * s + src[2 * idx + 1] * c; +} + + template __device__ void fast_max(const size_t src_numel, const size_t el_to_sum_per_block, @@ -402,9 +416,21 @@ fast_argmax(const size_t src_numel, const size_t el_to_sum_per_block, rmsnorm(src, dst, alpha, n_cols, eps); \ } \ +#define ROPEI_OP(TYPENAME, FN_NAME) \ + extern "C" __global__ void FN_NAME( \ + const TYPENAME *src, \ + const TYPENAME *cos, \ + const TYPENAME *sin, \ + TYPENAME *dst, \ + const uint32_t bh, \ + const uint32_t td) { \ + ropei(src, cos, sin, dst, bh, td); \ + } \ + #if __CUDA_ARCH__ >= 800 SOFTMAX_OP(__nv_bfloat16, float, softmax_bf16) RMSNORM_OP(__nv_bfloat16, rmsnorm_bf16) +ROPEI_OP(__nv_bfloat16, rope_i_bf16) SUM_OP(__nv_bfloat16, sum_bf16) FAST_OP(__nv_bfloat16, fast_min_bf16, fast_max_bf16, fast_argmin_bf16, fast_argmax_bf16, fast_sum_bf16) #endif @@ -412,6 +438,7 @@ FAST_OP(__nv_bfloat16, fast_min_bf16, fast_max_bf16, fast_argmin_bf16, fast_argm #if __CUDA_ARCH__ >= 530 SOFTMAX_OP(__half, float, softmax_f16) RMSNORM_OP(__half, rmsnorm_f16) +ROPEI_OP(__half, rope_i_f16) SUM_OP(__half, sum_f16) FAST_OP(__half, fast_min_f16, fast_max_f16, fast_argmin_f16, fast_argmax_f16, fast_sum_f16) #endif @@ -423,6 +450,8 @@ SOFTMAX_OP(float, float, softmax_f32) SOFTMAX_OP(double, double, softmax_f64) RMSNORM_OP(float, rmsnorm_f32) RMSNORM_OP(double, rmsnorm_f64) +ROPEI_OP(float, rope_i_f32) +ROPEI_OP(double, rope_i_f64) FAST_OP(float, fast_min_f32, fast_max_f32, fast_argmin_f32, fast_argmax_f32, fast_sum_f32) FAST_OP(double, fast_min_f64, fast_max_f64, fast_argmin_f64, fast_argmax_f64, fast_sum_f64) diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index e17365a0..e83814a8 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -808,6 +808,47 @@ pub fn call_rms_norm( Ok(()) } +#[allow(clippy::too_many_arguments)] +pub fn call_rope_i( + device: &Device, + command_buffer: &CommandBufferRef, + kernels: &Kernels, + kernel_name: &'static str, + bh: usize, + td: usize, + src: &Buffer, + src_offset: usize, + cos: &Buffer, + cos_offset: usize, + sin: &Buffer, + sin_offset: usize, + output: &Buffer, +) -> Result<(), MetalKernelError> { + let pipeline = kernels.load_pipeline(device, Source::Reduce, kernel_name)?; + let encoder = command_buffer.new_compute_command_encoder(); + encoder.set_compute_pipeline_state(&pipeline); + + set_params!( + encoder, + ( + bh, + td, + (src, src_offset), + (cos, cos_offset), + (sin, sin_offset), + output + ) + 
); + let (thread_group_count, thread_group_size) = linear_split(&pipeline, (bh * td) / 2); + encoder.use_resource(src, metal::MTLResourceUsage::Read); + encoder.use_resource(cos, metal::MTLResourceUsage::Read); + encoder.use_resource(sin, metal::MTLResourceUsage::Read); + encoder.use_resource(output, metal::MTLResourceUsage::Write); + encoder.dispatch_thread_groups(thread_group_count, thread_group_size); + encoder.end_encoding(); + Ok(()) +} + #[allow(clippy::too_many_arguments)] pub fn call_affine( device: &Device, diff --git a/candle-metal-kernels/src/reduce.metal b/candle-metal-kernels/src/reduce.metal index 3c3cbc14..fa980dea 100644 --- a/candle-metal-kernels/src/reduce.metal +++ b/candle-metal-kernels/src/reduce.metal @@ -313,6 +313,26 @@ kernel void NAME( } \ } \ +#define ROPEI(FN_NAME, TYPENAME) \ +kernel void FN_NAME( \ + constant size_t &bh, \ + constant size_t &td, \ + device const TYPENAME *src, \ + device const TYPENAME *cos, \ + device const TYPENAME *sin, \ + device TYPENAME *dst, \ + uint tid [[ thread_position_in_grid ]] \ +) { \ + if (2 * tid >= bh * td) { \ + return; \ + } \ + size_t rope_idx = tid % (td / 2); \ + TYPENAME c = cos[rope_idx]; \ + TYPENAME s = sin[rope_idx]; \ + dst[2 * tid] = src[2 * tid] * c - src[2 * tid + 1] * s; \ + dst[2 * tid + 1] = src[2 * tid] * s + src[2 * tid + 1] * c; \ +}\ + REDUCE(x + y, fast_sum_f32_strided, float, 0) REDUCE(x + y, fast_sum_u32_strided, uint, 0) REDUCE(x + y, fast_sum_f16_strided, half, 0) @@ -341,6 +361,8 @@ SOFTMAX(softmax_f32, float) SOFTMAX(softmax_f16, half) RMSNORM(rmsnorm_f32, float) RMSNORM(rmsnorm_f16, half) +ROPEI(rope_i_f32, float) +ROPEI(rope_i_f16, half) #if __METAL_VERSION__ >= 220 REDUCE(x + y, fast_sum_i64_strided, int64_t, 0) @@ -359,4 +381,5 @@ ARGMIN(fast_argmin_bf16, bfloat, HUGE_VALBF) ARGMAX(fast_argmax_bf16, bfloat, -HUGE_VALBF) SOFTMAX(softmax_bf16, bfloat) RMSNORM(rmsnorm_bf16, bfloat) +ROPEI(rope_i_bf16, bfloat) #endif diff --git a/candle-nn/Cargo.toml b/candle-nn/Cargo.toml index 214e8a59..3408dae3 100644 --- a/candle-nn/Cargo.toml +++ b/candle-nn/Cargo.toml @@ -25,6 +25,7 @@ candle-metal-kernels = { workspace = true, optional = true } [dev-dependencies] anyhow = { workspace = true } clap = { workspace = true } +rand = { workspace = true } [features] default = [] diff --git a/candle-nn/src/lib.rs b/candle-nn/src/lib.rs index 1bcb78d9..5c0fbb37 100644 --- a/candle-nn/src/lib.rs +++ b/candle-nn/src/lib.rs @@ -12,6 +12,7 @@ pub mod loss; pub mod ops; pub mod optim; pub mod rnn; +pub mod rotary_emb; pub mod sequential; pub mod var_builder; pub mod var_map; diff --git a/candle-nn/src/rotary_emb.rs b/candle-nn/src/rotary_emb.rs new file mode 100644 index 00000000..20545b8d --- /dev/null +++ b/candle-nn/src/rotary_emb.rs @@ -0,0 +1,247 @@ +use candle::{CpuStorage, Layout, Result, Shape, Tensor, D}; +use rayon::prelude::*; + +/// Interleaved variant of rotary embeddings. +/// The x0 and x1 value are interleaved on the n_embd (= head_dim) dimension. 
+/// The resulting y0 and y1 are also interleaved with: +/// y0 = x0*cos - x1*sin +/// y1 = x0*sin + x1*cos +#[derive(Debug, Clone)] +struct RotaryEmbI; + +impl candle::CustomOp3 for RotaryEmbI { + fn name(&self) -> &'static str { + "rotary-emb-int" + } + + fn cpu_fwd( + &self, + s1: &CpuStorage, + l1: &Layout, + s2: &CpuStorage, + l2: &Layout, + s3: &CpuStorage, + l3: &Layout, + ) -> Result<(CpuStorage, Shape)> { + fn inner( + src: &[T], + l_src: &Layout, + cos: &[T], + l_cos: &Layout, + sin: &[T], + l_sin: &Layout, + ) -> Result<(CpuStorage, Shape)> { + let src = match l_src.contiguous_offsets() { + None => candle::bail!("input src has to be contiguous"), + Some((o1, o2)) => &src[o1..o2], + }; + let cos = match l_cos.contiguous_offsets() { + None => candle::bail!("input cos has to be contiguous"), + Some((o1, o2)) => &cos[o1..o2], + }; + let sin = match l_sin.contiguous_offsets() { + None => candle::bail!("input sin has to be contiguous"), + Some((o1, o2)) => &sin[o1..o2], + }; + let (b, h, t, d) = l_src.shape().dims4()?; + let el_count = b * h * t * d; + let mut dst = vec![T::zero(); el_count]; + src.par_chunks(t * d) + .zip(dst.par_chunks_mut(t * d)) + .for_each(|(src, dst)| { + for i_over_2 in 0..t * d / 2 { + let i = 2 * i_over_2; + dst[i] = src[i] * cos[i_over_2] - src[i + 1] * sin[i_over_2]; + dst[i + 1] = src[i] * sin[i_over_2] + src[i + 1] * cos[i_over_2]; + } + }); + let storage = candle::WithDType::to_cpu_storage_owned(dst); + Ok((storage, (b, h, t, d).into())) + } + + use candle::backend::BackendStorage; + use CpuStorage::{BF16, F16, F32, F64}; + match (s1, s2, s3) { + (BF16(s1), BF16(s2), BF16(s3)) => inner(s1, l1, s2, l2, s3, l3), + (F16(s1), F16(s2), F16(s3)) => inner(s1, l1, s2, l2, s3, l3), + (F32(s1), F32(s2), F32(s3)) => inner(s1, l1, s2, l2, s3, l3), + (F64(s1), F64(s2), F64(s3)) => inner(s1, l1, s2, l2, s3, l3), + _ => candle::bail!( + "unsupported dtype for rope {:?} {:?} {:?}", + s1.dtype(), + s2.dtype(), + s3.dtype() + ), + } + } + + #[cfg(feature = "cuda")] + fn cuda_fwd( + &self, + s1: &candle::CudaStorage, + l1: &Layout, + s2: &candle::CudaStorage, + l2: &Layout, + s3: &candle::CudaStorage, + l3: &Layout, + ) -> Result<(candle::CudaStorage, Shape)> { + use candle::cuda_backend::cudarc::driver::{ + CudaSlice, DeviceRepr, LaunchAsync, LaunchConfig, + }; + use candle::cuda_backend::{kernel_name, kernels, WrapErr}; + use candle::{CudaDevice, WithDType}; + + fn inner( + src: &CudaSlice, + l_src: &Layout, + cos: &CudaSlice, + l_cos: &Layout, + sin: &CudaSlice, + l_sin: &Layout, + dev: &CudaDevice, + ) -> Result> { + let src = match l_src.contiguous_offsets() { + None => candle::bail!("src input has to be contiguous"), + Some((o1, o2)) => src.slice(o1..o2), + }; + let cos = match l_cos.contiguous_offsets() { + None => candle::bail!("cos input has to be contiguous"), + Some((o1, o2)) => cos.slice(o1..o2), + }; + let sin = match l_sin.contiguous_offsets() { + None => candle::bail!("sin input has to be contiguous"), + Some((o1, o2)) => sin.slice(o1..o2), + }; + let (b, h, t, d) = l_src.shape().dims4()?; + let el = b * h * t * d; + let cfg = LaunchConfig::for_num_elems((el / 2) as u32); + let func = dev.get_or_load_func(&kernel_name::("rope_i"), kernels::REDUCE)?; + // SAFETY: Set later by running the kernel. + let dst = unsafe { dev.alloc::(el) }.w()?; + let params = (&src, &cos, &sin, &dst, (b * h) as u32, (t * d) as u32); + // SAFETY: ffi. 
+ unsafe { func.launch(cfg, params) }.w()?; + Ok(dst) + } + + use candle::backend::BackendStorage; + use candle::cuda_backend::CudaStorageSlice::{BF16, F16, F32, F64}; + let dev = s1.device(); + let slice = match (&s1.slice, &s2.slice, &s3.slice) { + (BF16(s1), BF16(s2), BF16(s3)) => BF16(inner(s1, l1, s2, l2, s3, l3, dev)?), + (F16(s1), F16(s2), F16(s3)) => F16(inner(s1, l1, s2, l2, s3, l3, dev)?), + (F32(s1), F32(s2), F32(s3)) => F32(inner(s1, l1, s2, l2, s3, l3, dev)?), + (F64(s1), F64(s2), F64(s3)) => F64(inner(s1, l1, s2, l2, s3, l3, dev)?), + _ => candle::bail!( + "unsupported dtype for rope {:?} {:?} {:?}", + s1.dtype(), + s2.dtype(), + s3.dtype() + ), + }; + let dst = candle::cuda_backend::CudaStorage { + slice, + device: dev.clone(), + }; + Ok((dst, l1.shape().clone())) + } + + #[cfg(feature = "metal")] + fn metal_fwd( + &self, + src: &candle::MetalStorage, + l_src: &Layout, + cos: &candle::MetalStorage, + l_cos: &Layout, + sin: &candle::MetalStorage, + l_sin: &Layout, + ) -> Result<(candle::MetalStorage, Shape)> { + use candle::backend::BackendStorage; + let device = src.device(); + let command_buffer = device.command_buffer()?; + let kernels = device.kernels(); + if cos.dtype() != src.dtype() || sin.dtype() != src.dtype() { + candle::bail!( + "dtype mismatch in rope-i {:?} {:?} {:?}", + src.dtype(), + cos.dtype(), + sin.dtype() + ) + } + let name = match src.dtype() { + candle::DType::F32 => "rope_i_f32", + candle::DType::F16 => "rope_i_f16", + candle::DType::BF16 => "rope_i_bf16", + dtype => candle::bail!("rope-i is not implemented for {dtype:?}"), + }; + let (b, h, t, d) = l_src.shape().dims4()?; + let el = b * h * t * d; + let output = device.new_buffer(el, src.dtype(), "rope-i")?; + candle_metal_kernels::call_rope_i( + device.metal_device(), + &command_buffer, + kernels, + name, + b * h, + t * d, + src.buffer(), + l_src.start_offset() * src.dtype().size_in_bytes(), + cos.buffer(), + l_cos.start_offset() * cos.dtype().size_in_bytes(), + sin.buffer(), + l_sin.start_offset() * sin.dtype().size_in_bytes(), + &output, + ) + .map_err(candle::Error::wrap)?; + let out = candle::MetalStorage::new(output, device.clone(), el, src.dtype()); + Ok((out, l_src.shape().clone())) + } +} + +pub fn rope_i(xs: &Tensor, cos: &Tensor, sin: &Tensor) -> Result { + let (_b_sz, _n_head, seq_len, n_embd) = xs.dims4()?; + let (cos_seq_len, cos_n_embd) = cos.dims2()?; + let (sin_seq_len, sin_n_embd) = cos.dims2()?; + if cos_n_embd * 2 != n_embd + || sin_n_embd * 2 != n_embd + || seq_len > cos_seq_len + || seq_len > sin_seq_len + { + candle::bail!( + "inconsistent last dim size in rope {:?} {:?} {:?}", + xs.shape(), + cos.shape(), + sin.shape() + ) + } + if !xs.is_contiguous() { + candle::bail!("xs has to be contiguous in rope") + } + if !cos.is_contiguous() { + candle::bail!("cos has to be contiguous in rope") + } + if !sin.is_contiguous() { + candle::bail!("sin has to be contiguous in rope") + } + xs.apply_op3_no_bwd(cos, sin, &RotaryEmbI) +} + +pub fn rope_i_slow(x: &Tensor, cos: &Tensor, sin: &Tensor) -> Result { + let (b_sz, n_head, seq_len, n_embd) = x.dims4()?; + let cos = cos + .narrow(0, 0, seq_len)? + .reshape((seq_len, n_embd / 2, 1))?; + let sin = sin + .narrow(0, 0, seq_len)? 
+ .reshape((seq_len, n_embd / 2, 1))?; + let cos = cos.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?; + let sin = sin.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?; + let x = x.reshape((b_sz, n_head, seq_len, n_embd / 2, 2))?; + let x0 = x.narrow(D::Minus1, 0, 1)?; + let x1 = x.narrow(D::Minus1, 1, 1)?; + let y0 = (x0.broadcast_mul(&cos)? - x1.broadcast_mul(&sin)?)?; + let y1 = (x0.broadcast_mul(&sin)? + x1.broadcast_mul(&cos)?)?; + let rope = Tensor::cat(&[y0, y1], D::Minus1)?; + let rope = rope.flatten_from(D::Minus2)?; + Ok(rope) +} diff --git a/candle-nn/tests/ops.rs b/candle-nn/tests/ops.rs index c1e3031f..af883b85 100644 --- a/candle-nn/tests/ops.rs +++ b/candle-nn/tests/ops.rs @@ -86,5 +86,33 @@ fn softmax_numerical_stability() -> Result<()> { Ok(()) } +fn rope(device: &Device) -> Result<()> { + use rand::{rngs::StdRng, Rng, SeedableRng}; + + let (b_size, num_head, seq_len, head_dim) = (2, 5, 10, 16); + let el_count = b_size * num_head * seq_len * head_dim; + let mut rng = StdRng::seed_from_u64(299792458); + let src: Vec = (0..el_count).map(|_| rng.gen::()).collect(); + let cos: Vec = (0..seq_len * head_dim / 2) + .map(|_| rng.gen::()) + .collect(); + let sin: Vec = (0..seq_len * head_dim / 2) + .map(|_| rng.gen::()) + .collect(); + let src = Tensor::from_vec(src, (b_size, num_head, seq_len, head_dim), device)?; + let cos = Tensor::from_vec(cos, (seq_len, head_dim / 2), device)?; + let sin = Tensor::from_vec(sin, (seq_len, head_dim / 2), device)?; + let rope1 = candle_nn::rotary_emb::rope_i(&src, &cos, &sin)?; + let rope2 = candle_nn::rotary_emb::rope_i_slow(&src, &cos, &sin)?; + let sum_diff = (rope1 - rope2)?.abs()?.sum_all()?.to_vec0::()?; + if device.is_cpu() { + assert_eq!(sum_diff, 0.); + } else if device.is_cuda() { + assert!(sum_diff < 1e-4); + } + Ok(()) +} + +test_device!(rope, rope_cpu, rope_gpu, rope_metal); test_device!(softmax, softmax_cpu, softmax_gpu, softmax_metal); test_device!(rms_norm, rms_norm_cpu, rms_norm_gpu, rms_norm_metal); diff --git a/candle-transformers/src/models/quantized_llama.rs b/candle-transformers/src/models/quantized_llama.rs index 22ddfd2f..9898d872 100644 --- a/candle-transformers/src/models/quantized_llama.rs +++ b/candle-transformers/src/models/quantized_llama.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use crate::quantized_nn::RmsNorm; use candle::quantized::QTensor; use candle::quantized::{ggml_file, gguf_file}; -use candle::{DType, Device, IndexOp, Result, Tensor, D}; +use candle::{DType, Device, IndexOp, Result, Tensor}; use candle_nn::{Embedding, Module}; pub const MAX_SEQ_LEN: usize = 4096; @@ -154,31 +154,10 @@ fn masked_fill(on_false: &Tensor, mask: &Tensor, on_true: &Tensor) -> Result Result { let _enter = self.span_rot.enter(); - let (b_sz, n_head, seq_len, n_embd) = x.dims4()?; - let cos = self - .cos - .narrow(0, index_pos, seq_len)? - .reshape((seq_len, n_embd / 2, 1))?; - let sin = self - .sin - .narrow(0, index_pos, seq_len)? - .reshape((seq_len, n_embd / 2, 1))?; - let cos = cos.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?; - let sin = sin.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?; - // This mimics the llama.cpp behavior. - // https://github.com/ggerganov/llama.cpp/blob/1f0bccb27929e261744c979bc75114955da49e98/ggml.c#L12104-L12105 - // The x0 and x1 value are interleaved on the n_embd (= head_dim) dimension. 
- // The resulting y0 and y1 are also interleaved with: - // y0 = x0*cos - x1*sin - // y1 = x0*sin + x1*cos - let x = x.reshape((b_sz, n_head, seq_len, n_embd / 2, 2))?; - let x0 = x.narrow(D::Minus1, 0, 1)?; - let x1 = x.narrow(D::Minus1, 1, 1)?; - let y0 = (x0.broadcast_mul(&cos)? - x1.broadcast_mul(&sin)?)?; - let y1 = (x0.broadcast_mul(&sin)? + x1.broadcast_mul(&cos)?)?; - let rope = Tensor::cat(&[y0, y1], D::Minus1)?; - let rope = rope.flatten_from(D::Minus2)?; - Ok(rope) + let (_b_sz, _n_head, seq_len, _n_embd) = x.dims4()?; + let cos = self.cos.narrow(0, index_pos, seq_len)?; + let sin = self.sin.narrow(0, index_pos, seq_len)?; + candle_nn::rotary_emb::rope_i(&x.contiguous()?, &cos, &sin) } fn forward_attn( From e7f8e72588b963843546fa8a18ca5db9707a8637 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 25 Mar 2024 09:11:20 +0100 Subject: [PATCH 061/131] Contiguous variant of the rope kernel. (#1929) * Contiguous variant of the rope kernel. * Add the cuda kernel. * Metal kernel. --- candle-kernels/src/reduce.cu | 40 +++- candle-metal-kernels/src/lib.rs | 43 +++++ candle-metal-kernels/src/reduce.metal | 35 +++- candle-nn/src/rotary_emb.rs | 252 ++++++++++++++++++++++++++ candle-nn/tests/ops.rs | 32 +++- 5 files changed, 389 insertions(+), 13 deletions(-) diff --git a/candle-kernels/src/reduce.cu b/candle-kernels/src/reduce.cu index e6002a6f..48bbcd83 100644 --- a/candle-kernels/src/reduce.cu +++ b/candle-kernels/src/reduce.cu @@ -160,6 +160,24 @@ __device__ void ropei(const T * src, const T * cos, const T * sin, T * dst, cons dst[2 * idx + 1] = src[2 * idx] * s + src[2 * idx + 1] * c; } +template +__device__ void rope(const T * src, const T * cos, const T * sin, T * dst, const uint32_t bh, const uint32_t td, const uint32_t d) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (2 * idx > bh * td) return; + + uint32_t i_bh = idx / (td / 2); + uint32_t i_td = idx - (td / 2) * i_bh; + uint32_t i_t = i_td / (d / 2); + uint32_t i_d = i_td - (d / 2) * i_t; + uint32_t i1 = i_bh * td + i_t * d + i_d; + uint32_t i2 = i1 + d / 2; + uint32_t i_cs = i_t * (d / 2) + i_d; + T c = cos[i_cs]; + T s = sin[i_cs]; + + dst[i1] = src[i1] * c - src[i2] * s; + dst[i2] = src[i1] * s + src[i2] * c; +} template __device__ void @@ -416,8 +434,8 @@ fast_argmax(const size_t src_numel, const size_t el_to_sum_per_block, rmsnorm(src, dst, alpha, n_cols, eps); \ } \ -#define ROPEI_OP(TYPENAME, FN_NAME) \ - extern "C" __global__ void FN_NAME( \ +#define ROPE_OP(TYPENAME, FN_NAME, FN_NAME_I) \ + extern "C" __global__ void FN_NAME_I( \ const TYPENAME *src, \ const TYPENAME *cos, \ const TYPENAME *sin, \ @@ -426,11 +444,21 @@ fast_argmax(const size_t src_numel, const size_t el_to_sum_per_block, const uint32_t td) { \ ropei(src, cos, sin, dst, bh, td); \ } \ + extern "C" __global__ void FN_NAME( \ + const TYPENAME *src, \ + const TYPENAME *cos, \ + const TYPENAME *sin, \ + TYPENAME *dst, \ + const uint32_t bh, \ + const uint32_t td, \ + const uint32_t d) { \ + rope(src, cos, sin, dst, bh, td, d); \ + } \ #if __CUDA_ARCH__ >= 800 SOFTMAX_OP(__nv_bfloat16, float, softmax_bf16) RMSNORM_OP(__nv_bfloat16, rmsnorm_bf16) -ROPEI_OP(__nv_bfloat16, rope_i_bf16) +ROPE_OP(__nv_bfloat16, rope_bf16, rope_i_bf16) SUM_OP(__nv_bfloat16, sum_bf16) FAST_OP(__nv_bfloat16, fast_min_bf16, fast_max_bf16, fast_argmin_bf16, fast_argmax_bf16, fast_sum_bf16) #endif @@ -438,7 +466,7 @@ FAST_OP(__nv_bfloat16, fast_min_bf16, fast_max_bf16, fast_argmin_bf16, fast_argm #if __CUDA_ARCH__ >= 530 SOFTMAX_OP(__half, float, softmax_f16) 
RMSNORM_OP(__half, rmsnorm_f16) -ROPEI_OP(__half, rope_i_f16) +ROPE_OP(__half, rope_f16, rope_i_f16) SUM_OP(__half, sum_f16) FAST_OP(__half, fast_min_f16, fast_max_f16, fast_argmin_f16, fast_argmax_f16, fast_sum_f16) #endif @@ -450,8 +478,8 @@ SOFTMAX_OP(float, float, softmax_f32) SOFTMAX_OP(double, double, softmax_f64) RMSNORM_OP(float, rmsnorm_f32) RMSNORM_OP(double, rmsnorm_f64) -ROPEI_OP(float, rope_i_f32) -ROPEI_OP(double, rope_i_f64) +ROPE_OP(float, rope_f32, rope_i_f32) +ROPE_OP(double, rope_f64, rope_i_f64) FAST_OP(float, fast_min_f32, fast_max_f32, fast_argmin_f32, fast_argmax_f32, fast_sum_f32) FAST_OP(double, fast_min_f64, fast_max_f64, fast_argmin_f64, fast_argmax_f64, fast_sum_f64) diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index e83814a8..449bef8f 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -849,6 +849,49 @@ pub fn call_rope_i( Ok(()) } +#[allow(clippy::too_many_arguments)] +pub fn call_rope( + device: &Device, + command_buffer: &CommandBufferRef, + kernels: &Kernels, + kernel_name: &'static str, + bh: usize, + td: usize, + d: usize, + src: &Buffer, + src_offset: usize, + cos: &Buffer, + cos_offset: usize, + sin: &Buffer, + sin_offset: usize, + output: &Buffer, +) -> Result<(), MetalKernelError> { + let pipeline = kernels.load_pipeline(device, Source::Reduce, kernel_name)?; + let encoder = command_buffer.new_compute_command_encoder(); + encoder.set_compute_pipeline_state(&pipeline); + + set_params!( + encoder, + ( + bh, + td, + d, + (src, src_offset), + (cos, cos_offset), + (sin, sin_offset), + output + ) + ); + let (thread_group_count, thread_group_size) = linear_split(&pipeline, (bh * td) / 2); + encoder.use_resource(src, metal::MTLResourceUsage::Read); + encoder.use_resource(cos, metal::MTLResourceUsage::Read); + encoder.use_resource(sin, metal::MTLResourceUsage::Read); + encoder.use_resource(output, metal::MTLResourceUsage::Write); + encoder.dispatch_thread_groups(thread_group_count, thread_group_size); + encoder.end_encoding(); + Ok(()) +} + #[allow(clippy::too_many_arguments)] pub fn call_affine( device: &Device, diff --git a/candle-metal-kernels/src/reduce.metal b/candle-metal-kernels/src/reduce.metal index fa980dea..be5a0921 100644 --- a/candle-metal-kernels/src/reduce.metal +++ b/candle-metal-kernels/src/reduce.metal @@ -313,8 +313,8 @@ kernel void NAME( } \ } \ -#define ROPEI(FN_NAME, TYPENAME) \ -kernel void FN_NAME( \ +#define ROPEI(FN_NAME, FN_NAME_I, TYPENAME) \ +kernel void FN_NAME_I( \ constant size_t &bh, \ constant size_t &td, \ device const TYPENAME *src, \ @@ -332,6 +332,31 @@ kernel void FN_NAME( \ dst[2 * tid] = src[2 * tid] * c - src[2 * tid + 1] * s; \ dst[2 * tid + 1] = src[2 * tid] * s + src[2 * tid + 1] * c; \ }\ +kernel void FN_NAME( \ + constant size_t &bh, \ + constant size_t &td, \ + constant size_t &d, \ + device const TYPENAME *src, \ + device const TYPENAME *cos, \ + device const TYPENAME *sin, \ + device TYPENAME *dst, \ + uint idx [[ thread_position_in_grid ]] \ +) { \ + if (2 * idx >= bh * td) { \ + return; \ + } \ + size_t i_bh = idx / (td / 2); \ + size_t i_td = idx - (td / 2) * i_bh; \ + size_t i_t = i_td / (d / 2); \ + size_t i_d = i_td - (d / 2) * i_t; \ + size_t i1 = i_bh * td + i_t * d + i_d; \ + size_t i2 = i1 + d / 2; \ + size_t i_cs = i_t * (d / 2) + i_d; \ + TYPENAME c = cos[i_cs]; \ + TYPENAME s = sin[i_cs]; \ + dst[i1] = src[i1] * c - src[i2] * s; \ + dst[i2] = src[i1] * s + src[i2] * c; \ +}\ REDUCE(x + y, fast_sum_f32_strided, float, 0) REDUCE(x + 
y, fast_sum_u32_strided, uint, 0) @@ -361,8 +386,8 @@ SOFTMAX(softmax_f32, float) SOFTMAX(softmax_f16, half) RMSNORM(rmsnorm_f32, float) RMSNORM(rmsnorm_f16, half) -ROPEI(rope_i_f32, float) -ROPEI(rope_i_f16, half) +ROPEI(rope_f32, rope_i_f32, float) +ROPEI(rope_f16, rope_i_f16, half) #if __METAL_VERSION__ >= 220 REDUCE(x + y, fast_sum_i64_strided, int64_t, 0) @@ -381,5 +406,5 @@ ARGMIN(fast_argmin_bf16, bfloat, HUGE_VALBF) ARGMAX(fast_argmax_bf16, bfloat, -HUGE_VALBF) SOFTMAX(softmax_bf16, bfloat) RMSNORM(rmsnorm_bf16, bfloat) -ROPEI(rope_i_bf16, bfloat) +ROPEI(rope_bf16, rope_i_bf16, bfloat) #endif diff --git a/candle-nn/src/rotary_emb.rs b/candle-nn/src/rotary_emb.rs index 20545b8d..9c5543fb 100644 --- a/candle-nn/src/rotary_emb.rs +++ b/candle-nn/src/rotary_emb.rs @@ -245,3 +245,255 @@ pub fn rope_i_slow(x: &Tensor, cos: &Tensor, sin: &Tensor) -> Result { let rope = rope.flatten_from(D::Minus2)?; Ok(rope) } + +/// Contiguous variant of rope embeddings. +#[derive(Debug, Clone)] +struct RotaryEmb; + +impl candle::CustomOp3 for RotaryEmb { + fn name(&self) -> &'static str { + "rotary-emb" + } + + fn cpu_fwd( + &self, + s1: &CpuStorage, + l1: &Layout, + s2: &CpuStorage, + l2: &Layout, + s3: &CpuStorage, + l3: &Layout, + ) -> Result<(CpuStorage, Shape)> { + fn inner( + src: &[T], + l_src: &Layout, + cos: &[T], + l_cos: &Layout, + sin: &[T], + l_sin: &Layout, + ) -> Result<(CpuStorage, Shape)> { + let src = match l_src.contiguous_offsets() { + None => candle::bail!("input src has to be contiguous"), + Some((o1, o2)) => &src[o1..o2], + }; + let cos = match l_cos.contiguous_offsets() { + None => candle::bail!("input cos has to be contiguous"), + Some((o1, o2)) => &cos[o1..o2], + }; + let sin = match l_sin.contiguous_offsets() { + None => candle::bail!("input sin has to be contiguous"), + Some((o1, o2)) => &sin[o1..o2], + }; + let (b, h, t, d) = l_src.shape().dims4()?; + let el_count = b * h * t * d; + let mut dst = vec![T::zero(); el_count]; + src.par_chunks(t * d) + .zip(dst.par_chunks_mut(t * d)) + .for_each(|(src, dst)| { + for i_t in 0..t { + for i_d in 0..d / 2 { + let i1 = i_t * d + i_d; + let i2 = i1 + d / 2; + let i_cs = i_t * (d / 2) + i_d; + dst[i1] = src[i1] * cos[i_cs] - src[i2] * sin[i_cs]; + dst[i2] = src[i1] * sin[i_cs] + src[i2] * cos[i_cs]; + } + } + }); + let storage = candle::WithDType::to_cpu_storage_owned(dst); + Ok((storage, (b, h, t, d).into())) + } + + use candle::backend::BackendStorage; + use CpuStorage::{BF16, F16, F32, F64}; + match (s1, s2, s3) { + (BF16(s1), BF16(s2), BF16(s3)) => inner(s1, l1, s2, l2, s3, l3), + (F16(s1), F16(s2), F16(s3)) => inner(s1, l1, s2, l2, s3, l3), + (F32(s1), F32(s2), F32(s3)) => inner(s1, l1, s2, l2, s3, l3), + (F64(s1), F64(s2), F64(s3)) => inner(s1, l1, s2, l2, s3, l3), + _ => candle::bail!( + "unsupported dtype for rope {:?} {:?} {:?}", + s1.dtype(), + s2.dtype(), + s3.dtype() + ), + } + } + + #[cfg(feature = "cuda")] + fn cuda_fwd( + &self, + s1: &candle::CudaStorage, + l1: &Layout, + s2: &candle::CudaStorage, + l2: &Layout, + s3: &candle::CudaStorage, + l3: &Layout, + ) -> Result<(candle::CudaStorage, Shape)> { + use candle::cuda_backend::cudarc::driver::{ + CudaSlice, DeviceRepr, LaunchAsync, LaunchConfig, + }; + use candle::cuda_backend::{kernel_name, kernels, WrapErr}; + use candle::{CudaDevice, WithDType}; + + fn inner( + src: &CudaSlice, + l_src: &Layout, + cos: &CudaSlice, + l_cos: &Layout, + sin: &CudaSlice, + l_sin: &Layout, + dev: &CudaDevice, + ) -> Result> { + let src = match l_src.contiguous_offsets() { + None => 
candle::bail!("src input has to be contiguous"), + Some((o1, o2)) => src.slice(o1..o2), + }; + let cos = match l_cos.contiguous_offsets() { + None => candle::bail!("cos input has to be contiguous"), + Some((o1, o2)) => cos.slice(o1..o2), + }; + let sin = match l_sin.contiguous_offsets() { + None => candle::bail!("sin input has to be contiguous"), + Some((o1, o2)) => sin.slice(o1..o2), + }; + let (b, h, t, d) = l_src.shape().dims4()?; + let el = b * h * t * d; + let cfg = LaunchConfig::for_num_elems((el / 2) as u32); + let func = dev.get_or_load_func(&kernel_name::("rope"), kernels::REDUCE)?; + // SAFETY: Set later by running the kernel. + let dst = unsafe { dev.alloc::(el) }.w()?; + let params = ( + &src, + &cos, + &sin, + &dst, + (b * h) as u32, + (t * d) as u32, + d as u32, + ); + // SAFETY: ffi. + unsafe { func.launch(cfg, params) }.w()?; + Ok(dst) + } + + use candle::backend::BackendStorage; + use candle::cuda_backend::CudaStorageSlice::{BF16, F16, F32, F64}; + let dev = s1.device(); + let slice = match (&s1.slice, &s2.slice, &s3.slice) { + (BF16(s1), BF16(s2), BF16(s3)) => BF16(inner(s1, l1, s2, l2, s3, l3, dev)?), + (F16(s1), F16(s2), F16(s3)) => F16(inner(s1, l1, s2, l2, s3, l3, dev)?), + (F32(s1), F32(s2), F32(s3)) => F32(inner(s1, l1, s2, l2, s3, l3, dev)?), + (F64(s1), F64(s2), F64(s3)) => F64(inner(s1, l1, s2, l2, s3, l3, dev)?), + _ => candle::bail!( + "unsupported dtype for rope {:?} {:?} {:?}", + s1.dtype(), + s2.dtype(), + s3.dtype() + ), + }; + let dst = candle::cuda_backend::CudaStorage { + slice, + device: dev.clone(), + }; + Ok((dst, l1.shape().clone())) + } + + #[cfg(feature = "metal")] + fn metal_fwd( + &self, + src: &candle::MetalStorage, + l_src: &Layout, + cos: &candle::MetalStorage, + l_cos: &Layout, + sin: &candle::MetalStorage, + l_sin: &Layout, + ) -> Result<(candle::MetalStorage, Shape)> { + use candle::backend::BackendStorage; + let device = src.device(); + let command_buffer = device.command_buffer()?; + let kernels = device.kernels(); + if cos.dtype() != src.dtype() || sin.dtype() != src.dtype() { + candle::bail!( + "dtype mismatch in rope {:?} {:?} {:?}", + src.dtype(), + cos.dtype(), + sin.dtype() + ) + } + let name = match src.dtype() { + candle::DType::F32 => "rope_f32", + candle::DType::F16 => "rope_f16", + candle::DType::BF16 => "rope_bf16", + dtype => candle::bail!("rope is not implemented for {dtype:?}"), + }; + let (b, h, t, d) = l_src.shape().dims4()?; + let el = b * h * t * d; + let output = device.new_buffer(el, src.dtype(), "rope-i")?; + candle_metal_kernels::call_rope( + device.metal_device(), + &command_buffer, + kernels, + name, + b * h, + t * d, + d, + src.buffer(), + l_src.start_offset() * src.dtype().size_in_bytes(), + cos.buffer(), + l_cos.start_offset() * cos.dtype().size_in_bytes(), + sin.buffer(), + l_sin.start_offset() * sin.dtype().size_in_bytes(), + &output, + ) + .map_err(candle::Error::wrap)?; + let out = candle::MetalStorage::new(output, device.clone(), el, src.dtype()); + Ok((out, l_src.shape().clone())) + } +} + +pub fn rope(xs: &Tensor, cos: &Tensor, sin: &Tensor) -> Result { + let (_b_sz, _n_head, seq_len, n_embd) = xs.dims4()?; + let (cos_seq_len, cos_n_embd) = cos.dims2()?; + let (sin_seq_len, sin_n_embd) = cos.dims2()?; + if cos_n_embd * 2 != n_embd + || sin_n_embd * 2 != n_embd + || seq_len > cos_seq_len + || seq_len > sin_seq_len + { + candle::bail!( + "inconsistent last dim size in rope {:?} {:?} {:?}", + xs.shape(), + cos.shape(), + sin.shape() + ) + } + if !xs.is_contiguous() { + candle::bail!("xs has to be contiguous 
in rope") + } + if !cos.is_contiguous() { + candle::bail!("cos has to be contiguous in rope") + } + if !sin.is_contiguous() { + candle::bail!("sin has to be contiguous in rope") + } + xs.apply_op3_no_bwd(cos, sin, &RotaryEmb) +} + +fn rotate_half(xs: &Tensor) -> Result { + let last_dim = xs.dim(D::Minus1)?; + let xs1 = xs.narrow(D::Minus1, 0, last_dim / 2)?; + let xs2 = xs.narrow(D::Minus1, last_dim / 2, last_dim - last_dim / 2)?; + Tensor::cat(&[&xs2.neg()?, &xs1], D::Minus1) +} + +pub fn rope_slow(x: &Tensor, cos: &Tensor, sin: &Tensor) -> Result { + let (_b_sz, _h, seq_len, _n_embd) = x.dims4()?; + let cos = Tensor::cat(&[cos, cos], D::Minus1)?; + let sin = Tensor::cat(&[sin, sin], D::Minus1)?; + let cos = cos.narrow(0, 0, seq_len)?; + let sin = sin.narrow(0, 0, seq_len)?; + let cos = cos.unsqueeze(0)?.unsqueeze(0)?; + let sin = sin.unsqueeze(0)?.unsqueeze(0)?; + x.broadcast_mul(&cos)? + rotate_half(x)?.broadcast_mul(&sin)? +} diff --git a/candle-nn/tests/ops.rs b/candle-nn/tests/ops.rs index af883b85..20a66e75 100644 --- a/candle-nn/tests/ops.rs +++ b/candle-nn/tests/ops.rs @@ -86,7 +86,7 @@ fn softmax_numerical_stability() -> Result<()> { Ok(()) } -fn rope(device: &Device) -> Result<()> { +fn ropei(device: &Device) -> Result<()> { use rand::{rngs::StdRng, Rng, SeedableRng}; let (b_size, num_head, seq_len, head_dim) = (2, 5, 10, 16); @@ -107,12 +107,40 @@ fn rope(device: &Device) -> Result<()> { let sum_diff = (rope1 - rope2)?.abs()?.sum_all()?.to_vec0::()?; if device.is_cpu() { assert_eq!(sum_diff, 0.); - } else if device.is_cuda() { + } else { assert!(sum_diff < 1e-4); } Ok(()) } +fn rope(device: &Device) -> Result<()> { + use rand::{rngs::StdRng, Rng, SeedableRng}; + + let (b_size, num_head, seq_len, head_dim) = (2, 5, 10, 16); + let el_count = b_size * num_head * seq_len * head_dim; + let mut rng = StdRng::seed_from_u64(299792458); + let src: Vec = (0..el_count).map(|_| rng.gen::()).collect(); + let cos: Vec = (0..seq_len * head_dim / 2) + .map(|_| rng.gen::()) + .collect(); + let sin: Vec = (0..seq_len * head_dim / 2) + .map(|_| rng.gen::()) + .collect(); + let src = Tensor::from_vec(src, (b_size, num_head, seq_len, head_dim), device)?; + let cos = Tensor::from_vec(cos, (seq_len, head_dim / 2), device)?; + let sin = Tensor::from_vec(sin, (seq_len, head_dim / 2), device)?; + let rope1 = candle_nn::rotary_emb::rope(&src, &cos, &sin)?; + let rope2 = candle_nn::rotary_emb::rope_slow(&src, &cos, &sin)?; + let sum_diff = (rope1 - rope2)?.abs()?.sum_all()?.to_vec0::()?; + if device.is_cpu() { + assert_eq!(sum_diff, 0.); + } else { + assert!(sum_diff < 1e-4); + } + Ok(()) +} + +test_device!(ropei, ropei_cpu, ropei_gpu, ropei_metal); test_device!(rope, rope_cpu, rope_gpu, rope_metal); test_device!(softmax, softmax_cpu, softmax_gpu, softmax_metal); test_device!(rms_norm, rms_norm_cpu, rms_norm_gpu, rms_norm_metal); From cd254074f354c4066bc73e1c5cc5ecc84d25a2db Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 25 Mar 2024 11:48:16 +0100 Subject: [PATCH 062/131] Really unique identifier for metal device ids. (#1932) * Really unique identifier for metal device ids. * Same device. 
--- candle-core/src/metal_backend.rs | 30 ++++++++++++++++++++++++------ candle-core/src/storage.rs | 16 +++++++++++++--- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index 73a141ea..bf501e24 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -10,6 +10,19 @@ use std::ffi::c_void; use std::path::Path; use std::sync::{Arc, Mutex, RwLock, RwLockWriteGuard, TryLockError}; +/// Unique identifier for cuda devices. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct DeviceId(usize); + +impl DeviceId { + fn new() -> Self { + // https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805 + use std::sync::atomic; + static COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::new(1); + Self(COUNTER.fetch_add(1, atomic::Ordering::Relaxed)) + } +} + /// Simple way to catch lock error without /// depending on T #[derive(thiserror::Error, Debug)] @@ -64,6 +77,10 @@ type AllocatedBuffers = Arc>; #[derive(Clone)] pub struct MetalDevice { + /// Unique identifier, the registryID is not sufficient as it identifies the GPU rather than + /// the device itself. + id: DeviceId, + /// Raw metal device: device: metal::Device, @@ -108,7 +125,7 @@ pub struct MetalDevice { impl std::fmt::Debug for MetalDevice { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "MetalDevice({:?})", self.device.registry_id()) + write!(f, "MetalDevice({:?})", self.id) } } @@ -121,8 +138,8 @@ impl std::ops::Deref for MetalDevice { } impl MetalDevice { - pub fn id(&self) -> NSUInteger { - self.registry_id() + pub fn id(&self) -> DeviceId { + self.id } pub fn metal_device(&self) -> &metal::Device { @@ -1117,8 +1134,8 @@ impl BackendStorage for MetalStorage { padding: params.padding, output_padding: params.output_padding, c_out: params.c_out, - out_h: out_h, - out_w: out_w, + out_h, + out_w, b_size: params.b_size, input_dims: l.dims(), input_stride: l.stride(), @@ -1867,6 +1884,7 @@ impl BackendDevice for MetalDevice { MTLResourceOptions::StorageModeManaged, ))); Ok(Self { + id: DeviceId::new(), device, command_queue, command_buffer, @@ -1885,7 +1903,7 @@ impl BackendDevice for MetalDevice { } fn same_device(&self, rhs: &Self) -> bool { - self.device.registry_id() == rhs.device.registry_id() + self.id == rhs.id } unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result { diff --git a/candle-core/src/storage.rs b/candle-core/src/storage.rs index 36620dd9..8a0637e3 100644 --- a/candle-core/src/storage.rs +++ b/candle-core/src/storage.rs @@ -44,9 +44,19 @@ impl Storage { } pub(crate) fn same_device(&self, rhs: &Self, op: &'static str) -> Result<()> { - let lhs = self.device().location(); - let rhs = rhs.device().location(); - if lhs != rhs { + let lhs_device = self.device(); + let rhs_device = rhs.device(); + let lhs = lhs_device.location(); + let rhs = rhs_device.location(); + let same_device = if self.device().is_metal() { + // On metal, we require the device to be exactly the same rather than + // having the same location. In cuda this is not necessary as all CudaDevice on the + // same GPU will use the same cuda stream. 
+ lhs_device.same_device(&rhs_device) + } else { + lhs == rhs + }; + if !same_device { Err(Error::DeviceMismatchBinaryOp { lhs, rhs, op }.bt()) } else { Ok(()) From d3a8d291d5f2ff5addb9ff97cf881307afbd7b6a Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 25 Mar 2024 15:31:04 +0100 Subject: [PATCH 063/131] Avoid the attention mask where possible. (#1933) --- candle-transformers/src/models/falcon.rs | 32 ++++++++++++------- candle-transformers/src/models/llama2_c.rs | 8 +++-- .../src/models/quantized_llama2_c.rs | 8 +++-- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/candle-transformers/src/models/falcon.rs b/candle-transformers/src/models/falcon.rs index 86cf8451..24fd3c46 100644 --- a/candle-transformers/src/models/falcon.rs +++ b/candle-transformers/src/models/falcon.rs @@ -247,7 +247,7 @@ impl FalconAttention { } } - fn forward(&mut self, x: &Tensor, mask: &Tensor, past_kv_len: usize) -> Result { + fn forward(&mut self, x: &Tensor, mask: Option<&Tensor>, past_kv_len: usize) -> Result { let fused_qkv = self.query_key_value.forward(x)?; let head_dim = self.head_dim; let (query, key, value) = self.split_heads(&fused_qkv)?; @@ -267,7 +267,6 @@ impl FalconAttention { (query, key) }; let (mut key, mut value) = (key, value); - let mask = masked_fill(&mask.to_dtype(DType::F32)?, mask, -1e9)?.to_dtype(query.dtype())?; if self.use_cache { if let Some((cache_k, cache_v)) = &self.kv_cache { // TODO: we could trim the tensors to MAX_SEQ_LEN so that this would work for @@ -293,13 +292,18 @@ impl FalconAttention { // Only handle the case where alibi is None here, and non-flash attention. let attention_scores = (query.matmul(&key.t()?)? * self.inv_norm_factor)?; - let attention_scores = candle_nn::ops::softmax( - &attention_scores - .broadcast_add(&mask.squeeze(1)?)? - .to_dtype(DType::F32)?, - D::Minus1, - )? - .to_dtype(x.dtype())?; + let attention_scores = match mask { + None => attention_scores, + Some(mask) => { + let mask = masked_fill(&mask.to_dtype(DType::F32)?, mask, -1e9)? + .to_dtype(query.dtype())?; + attention_scores.broadcast_add(&mask.squeeze(1)?)? + } + }; + + let attention_scores = + candle_nn::ops::softmax(&attention_scores.to_dtype(DType::F32)?, D::Minus1)? + .to_dtype(x.dtype())?; let attn_output = attention_scores .matmul(&value)? .reshape((b_sz, self.num_heads, seq_len, head_dim))? @@ -372,7 +376,7 @@ impl FalconDecoderLayer { }) } - fn forward(&mut self, x: &Tensor, mask: &Tensor, past_kv_len: usize) -> Result { + fn forward(&mut self, x: &Tensor, mask: Option<&Tensor>, past_kv_len: usize) -> Result { let residual = x.clone(); let ln_attn = self.inp_layernorm.forward(x)?; let attn_output = self.self_attention.forward(&ln_attn, mask, past_kv_len)?; @@ -457,9 +461,13 @@ impl Falcon { Some((k, _)) => k.dim(1)?, None => 0, }; - let causal_mask = prepare_attn_mask(b_sz, seq_len)?.to_device(input_ids.device())?; + let causal_mask = if seq_len <= 1 { + None + } else { + Some(prepare_attn_mask(b_sz, seq_len)?.to_device(input_ids.device())?) 
+ }; for block in self.blocks.iter_mut() { - hidden_state = block.forward(&hidden_state, &causal_mask, past_kv_len)?; + hidden_state = block.forward(&hidden_state, causal_mask.as_ref(), past_kv_len)?; } let hidden_state = self.ln_f.forward(&hidden_state)?; let hidden_state = hidden_state.narrow(1, seq_len - 1, 1)?; diff --git a/candle-transformers/src/models/llama2_c.rs b/candle-transformers/src/models/llama2_c.rs index 7b4f120b..bba8b666 100644 --- a/candle-transformers/src/models/llama2_c.rs +++ b/candle-transformers/src/models/llama2_c.rs @@ -194,8 +194,12 @@ impl CausalSelfAttention { let v = v.transpose(1, 2)?.contiguous()?; let att = (q.matmul(&k.t()?)? / (self.head_dim as f64).sqrt())?; - let mask = cache.mask(seq_len)?.broadcast_as(att.shape())?; - let att = masked_fill(&att, &mask, f32::NEG_INFINITY)?; + let att = if seq_len <= 1 { + att + } else { + let mask = cache.mask(seq_len)?.broadcast_as(att.shape())?; + masked_fill(&att, &mask, f32::NEG_INFINITY)? + }; let att = candle_nn::ops::softmax(&att, D::Minus1)?; // Convert to contiguous as matmul doesn't support strided vs for now. let y = att.matmul(&v.contiguous()?)?; diff --git a/candle-transformers/src/models/quantized_llama2_c.rs b/candle-transformers/src/models/quantized_llama2_c.rs index b43ca9ff..cbb8aad8 100644 --- a/candle-transformers/src/models/quantized_llama2_c.rs +++ b/candle-transformers/src/models/quantized_llama2_c.rs @@ -71,8 +71,12 @@ impl CausalSelfAttention { let v = v.transpose(1, 2)?.contiguous()?; let att = (q.matmul(&k.t()?)? / (self.head_dim as f64).sqrt())?; - let mask = cache.mask(seq_len)?.broadcast_as(att.shape())?; - let att = masked_fill(&att, &mask, f32::NEG_INFINITY)?; + let att = if seq_len <= 1 { + att + } else { + let mask = cache.mask(seq_len)?.broadcast_as(att.shape())?; + masked_fill(&att, &mask, f32::NEG_INFINITY)? + }; let att = candle_nn::ops::softmax(&att, D::Minus1)?; // Convert to contiguous as matmul doesn't support strided vs for now. let y = att.matmul(&v.contiguous()?)?; From 60676780a9436fd0de43b1e8ff99445ab863c066 Mon Sep 17 00:00:00 2001 From: Hugo Abonizio Date: Mon, 25 Mar 2024 14:20:09 -0300 Subject: [PATCH 064/131] Fix detail in new RoPE implementation (#1935) --- candle-nn/src/rotary_emb.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/candle-nn/src/rotary_emb.rs b/candle-nn/src/rotary_emb.rs index 9c5543fb..c2b41482 100644 --- a/candle-nn/src/rotary_emb.rs +++ b/candle-nn/src/rotary_emb.rs @@ -455,7 +455,7 @@ impl candle::CustomOp3 for RotaryEmb { pub fn rope(xs: &Tensor, cos: &Tensor, sin: &Tensor) -> Result { let (_b_sz, _n_head, seq_len, n_embd) = xs.dims4()?; let (cos_seq_len, cos_n_embd) = cos.dims2()?; - let (sin_seq_len, sin_n_embd) = cos.dims2()?; + let (sin_seq_len, sin_n_embd) = sin.dims2()?; if cos_n_embd * 2 != n_embd || sin_n_embd * 2 != n_embd || seq_len > cos_seq_len From 196765e995f7f4bd3b9610a22f8ef5b009437a4e Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 25 Mar 2024 23:26:05 +0100 Subject: [PATCH 065/131] Use the new rope kernel in mistral. (#1937) * Use the new rope kernel in mistral. * Compute the cos and sin with full precision. * Bugfix. 
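Two details are worth spelling out. The kernel guards change from 2 * idx > bh * td to
>=: each thread writes the pair (2 * idx, 2 * idx + 1), so the thread with
2 * idx == bh * td is already past the end and the old test let it write one pair out of
bounds. Separately, the fused rope kernel bails on strided inputs, so the call sites gain
a .contiguous() after the transpose. A sketch of the resulting pattern, with b/t/h/d
standing in for the usual batch/seq/heads/head_dim sizes (apply_rope is a hypothetical
helper, not code from this patch):

    use candle::{Result, Tensor};

    fn apply_rope(
        q: &Tensor,
        cos: &Tensor,
        sin: &Tensor,
        (b, t, h, d): (usize, usize, usize, usize),
    ) -> Result<Tensor> {
        // transpose() yields a strided view; rope() rejects non-contiguous
        // inputs, hence the explicit copy before the fused kernel runs.
        let q = q.reshape((b, t, h, d))?.transpose(1, 2)?.contiguous()?;
        candle_nn::rotary_emb::rope(&q, cos, sin)
    }

The duplicated cos/sin tables also go away: rope expects half-width
(seq_len, head_dim / 2) tables, so the Tensor::cat of freqs with itself is no longer
needed.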
--- candle-kernels/src/reduce.cu | 4 ++-- candle-transformers/src/models/mistral.rs | 20 ++++++------------- .../src/models/quantized_mistral.rs | 20 ++++++------------- 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/candle-kernels/src/reduce.cu b/candle-kernels/src/reduce.cu index 48bbcd83..2af81c42 100644 --- a/candle-kernels/src/reduce.cu +++ b/candle-kernels/src/reduce.cu @@ -150,7 +150,7 @@ __device__ void softmax(const T * x, T * dst, const int ncols) { template __device__ void ropei(const T * src, const T * cos, const T * sin, T * dst, const uint32_t bh, const uint32_t td) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (2 * idx > bh * td) return; + if (2 * idx >= bh * td) return; uint32_t rope_idx = idx % (td / 2); T c = cos[rope_idx]; @@ -163,7 +163,7 @@ __device__ void ropei(const T * src, const T * cos, const T * sin, T * dst, cons template __device__ void rope(const T * src, const T * cos, const T * sin, T * dst, const uint32_t bh, const uint32_t td, const uint32_t d) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (2 * idx > bh * td) return; + if (2 * idx >= bh * td) return; uint32_t i_bh = idx / (td / 2); uint32_t i_td = idx - (td / 2) * i_bh; diff --git a/candle-transformers/src/models/mistral.rs b/candle-transformers/src/models/mistral.rs index 0e6200f5..d899c712 100644 --- a/candle-transformers/src/models/mistral.rs +++ b/candle-transformers/src/models/mistral.rs @@ -88,13 +88,6 @@ struct RotaryEmbedding { cos: Tensor, } -fn rotate_half(xs: &Tensor) -> Result { - let last_dim = xs.dim(D::Minus1)?; - let xs1 = xs.narrow(D::Minus1, 0, last_dim / 2)?; - let xs2 = xs.narrow(D::Minus1, last_dim / 2, last_dim - last_dim / 2)?; - Tensor::cat(&[&xs2.neg()?, &xs1], D::Minus1) -} - impl RotaryEmbedding { fn new(dtype: DType, cfg: &Config, dev: &Device) -> Result { let rope_theta = cfg.rope_theta as f32; @@ -110,7 +103,6 @@ impl RotaryEmbedding { .to_dtype(dtype)? .reshape((max_seq_len, 1))?; let freqs = t.matmul(&inv_freq)?; - let freqs = Tensor::cat(&[&freqs, &freqs], D::Minus1)?; Ok(Self { sin: freqs.sin()?, cos: freqs.cos()?, @@ -126,10 +118,8 @@ impl RotaryEmbedding { let (_b_sz, _h, seq_len, _n_embd) = q.dims4()?; let cos = self.cos.narrow(0, seqlen_offset, seq_len)?; let sin = self.sin.narrow(0, seqlen_offset, seq_len)?; - let cos = cos.unsqueeze(0)?.unsqueeze(0)?; // (1, 1, seq_len, dim) - let sin = sin.unsqueeze(0)?.unsqueeze(0)?; // (1, 1, seq_len, dim) - let q_embed = (q.broadcast_mul(&cos)? + rotate_half(q)?.broadcast_mul(&sin))?; - let k_embed = (k.broadcast_mul(&cos)? + rotate_half(k)?.broadcast_mul(&sin))?; + let q_embed = candle_nn::rotary_emb::rope(q, &cos, &sin)?; + let k_embed = candle_nn::rotary_emb::rope(k, &cos, &sin)?; Ok((q_embed, k_embed)) } } @@ -252,10 +242,12 @@ impl Attention { let query_states = query_states .reshape((b_sz, q_len, self.num_heads, self.head_dim))? - .transpose(1, 2)?; + .transpose(1, 2)? + .contiguous()?; let key_states = key_states .reshape((b_sz, q_len, self.num_kv_heads, self.head_dim))? - .transpose(1, 2)?; + .transpose(1, 2)? + .contiguous()?; let value_states = value_states .reshape((b_sz, q_len, self.num_kv_heads, self.head_dim))? 
.transpose(1, 2)?; diff --git a/candle-transformers/src/models/quantized_mistral.rs b/candle-transformers/src/models/quantized_mistral.rs index 2c5b7f74..e37785de 100644 --- a/candle-transformers/src/models/quantized_mistral.rs +++ b/candle-transformers/src/models/quantized_mistral.rs @@ -12,13 +12,6 @@ struct RotaryEmbedding { cos: Tensor, } -fn rotate_half(xs: &Tensor) -> Result { - let last_dim = xs.dim(D::Minus1)?; - let xs1 = xs.narrow(D::Minus1, 0, last_dim / 2)?; - let xs2 = xs.narrow(D::Minus1, last_dim / 2, last_dim - last_dim / 2)?; - Tensor::cat(&[&xs2.neg()?, &xs1], D::Minus1) -} - impl RotaryEmbedding { fn new(cfg: &Config, dev: &Device) -> Result { let rope_theta = cfg.rope_theta as f32; @@ -34,7 +27,6 @@ impl RotaryEmbedding { .to_dtype(DType::F32)? .reshape((max_seq_len, 1))?; let freqs = t.matmul(&inv_freq)?; - let freqs = Tensor::cat(&[&freqs, &freqs], D::Minus1)?; Ok(Self { sin: freqs.sin()?, cos: freqs.cos()?, @@ -50,10 +42,8 @@ impl RotaryEmbedding { let (_b_sz, _h, seq_len, _n_embd) = q.dims4()?; let cos = self.cos.narrow(0, seqlen_offset, seq_len)?; let sin = self.sin.narrow(0, seqlen_offset, seq_len)?; - let cos = cos.unsqueeze(0)?.unsqueeze(0)?; // (1, 1, seq_len, dim) - let sin = sin.unsqueeze(0)?.unsqueeze(0)?; // (1, 1, seq_len, dim) - let q_embed = (q.broadcast_mul(&cos)? + rotate_half(q)?.broadcast_mul(&sin))?; - let k_embed = (k.broadcast_mul(&cos)? + rotate_half(k)?.broadcast_mul(&sin))?; + let q_embed = candle_nn::rotary_emb::rope(q, &cos, &sin)?; + let k_embed = candle_nn::rotary_emb::rope(k, &cos, &sin)?; Ok((q_embed, k_embed)) } } @@ -158,10 +148,12 @@ impl Attention { let query_states = query_states .reshape((b_sz, q_len, self.num_heads, self.head_dim))? - .transpose(1, 2)?; + .transpose(1, 2)? + .contiguous()?; let key_states = key_states .reshape((b_sz, q_len, self.num_kv_heads, self.head_dim))? - .transpose(1, 2)?; + .transpose(1, 2)? + .contiguous()?; let value_states = value_states .reshape((b_sz, q_len, self.num_kv_heads, self.head_dim))? 
.transpose(1, 2)?; From f5dfe883d768e55208b325b3838474f8fe58e12f Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Tue, 26 Mar 2024 01:48:56 -0400 Subject: [PATCH 066/131] Extend supported dtypes for metal (im2col & upsample_2d) (#1938) * update im2col dtype implementations * update dtypes for upsample --- candle-core/src/metal_backend.rs | 8 ++++++++ candle-metal-kernels/src/conv.metal | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index bf501e24..b9e761f6 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -1038,6 +1038,10 @@ impl BackendStorage for MetalStorage { let command_buffer = self.device.command_buffer()?; let name = match self.dtype { DType::F32 => "im2col_f32", + DType::F16 => "im2col_f16", + DType::BF16 => "im2col_bf16", + DType::U8 => "im2col_u8", + DType::U32 => "im2col_u32", dtype => crate::bail!("Metal conv2d {dtype:?} not implemented"), }; candle_metal_kernels::call_im2col_strided( @@ -1250,6 +1254,10 @@ impl BackendStorage for MetalStorage { } let name = match self.dtype { DType::F32 => "upsample_nearest2d_f32", + DType::F16 => "upsample_nearest2d_f16", + DType::BF16 => "upsample_nearest2d_bf16", + DType::U8 => "upsample_nearest2d_u8", + DType::U32 => "upsample_nearest2d_u32", dtype => crate::bail!("Metal upsample_nearest2d {dtype:?} not implemented"), }; diff --git a/candle-metal-kernels/src/conv.metal b/candle-metal-kernels/src/conv.metal index e28ac6b3..8fdd0e5f 100644 --- a/candle-metal-kernels/src/conv.metal +++ b/candle-metal-kernels/src/conv.metal @@ -486,16 +486,24 @@ kernel void FN_NAME( \ } \ IM2COL_OP(float, im2col_f32) +IM2COL_OP(half, im2col_f16) IM2COL_OP(uint8_t, im2col_u8) IM2COL_OP(uint32_t, im2col_u32) +#if defined(__HAVE_BFLOAT__) +IM2COL_OP(bfloat, im2col_bf16) +#endif IM2COL1D_OP(float, im2col1d_f32) IM2COL1D_OP(uint8_t, im2col1d_u8) IM2COL1D_OP(uint32_t, im2col1d_u32) UPSAMPLE_NEAREST2D_OP(float, upsample_nearest2d_f32) +UPSAMPLE_NEAREST2D_OP(half, upsample_nearest2d_f16) UPSAMPLE_NEAREST2D_OP(uint8_t, upsample_nearest2d_u8) UPSAMPLE_NEAREST2D_OP(uint32_t, upsample_nearest2d_u32) +#if defined(__HAVE_BFLOAT__) +UPSAMPLE_NEAREST2D_OP(bfloat, upsample_nearest2d_bf16) +#endif MAXPOOL2D_OP(float, max_pool2d_f32) MAXPOOL2D_OP(half, max_pool2d_f16) From 4523ecfb2a3a3690aa3b037a5a096354f2ca612b Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Tue, 26 Mar 2024 11:31:20 +0100 Subject: [PATCH 067/131] Faster repeat penalty (#1940) * Avoid the attention mask where possible. * Faster repeat penalty. --- candle-transformers/src/utils.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/candle-transformers/src/utils.rs b/candle-transformers/src/utils.rs index 50d3b707..3cbcac5c 100644 --- a/candle-transformers/src/utils.rs +++ b/candle-transformers/src/utils.rs @@ -3,9 +3,13 @@ use candle::{Result, Tensor}; pub fn apply_repeat_penalty(logits: &Tensor, penalty: f32, context: &[u32]) -> Result { let device = logits.device(); let mut logits = logits.to_vec1::()?; - let context: std::collections::HashSet<_> = context.iter().collect(); - for (token_id, logit) in logits.iter_mut().enumerate() { - if context.contains(&(token_id as u32)) { + let mut already_seen = std::collections::HashSet::new(); + for token_id in context { + if already_seen.contains(token_id) { + continue; + } + already_seen.insert(token_id); + if let Some(logit) = logits.get_mut(*token_id as usize) { if *logit >= 0. 
{ *logit /= penalty } else { From 66f0a4eeea02f069838903a18dd6402821e43271 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Tue, 26 Mar 2024 17:05:26 +0100 Subject: [PATCH 068/131] Another fix for squeezing. (#1943) --- candle-core/src/shape.rs | 4 ++-- candle-transformers/src/models/dinov2.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/candle-core/src/shape.rs b/candle-core/src/shape.rs index 32ebb23f..567a711b 100644 --- a/candle-core/src/shape.rs +++ b/candle-core/src/shape.rs @@ -171,7 +171,7 @@ impl Shape { } let mut acc = 1; for (&stride, &dim) in stride.iter().zip(self.0.iter()).rev() { - if stride != acc { + if dim > 1 && stride != acc { return false; } acc *= dim; @@ -186,7 +186,7 @@ impl Shape { } let mut acc = 1; for (&stride, &dim) in stride.iter().zip(self.0.iter()) { - if stride != acc { + if dim > 1 && stride != acc { return false; } acc *= dim; diff --git a/candle-transformers/src/models/dinov2.rs b/candle-transformers/src/models/dinov2.rs index 0edc8494..757aa88a 100644 --- a/candle-transformers/src/models/dinov2.rs +++ b/candle-transformers/src/models/dinov2.rs @@ -52,8 +52,8 @@ impl Module for Attention { .transpose(0, 1)? // 20134 .transpose(2, 3)?; // 20314 let q = (qkv.i(0)? * self.scale)?; - let k = qkv.i(1)?; - let v = qkv.i(2)?; + let k = qkv.i(1)?.contiguous()?; + let v = qkv.i(2)?.contiguous()?; let attn = candle_nn::ops::softmax(&q.matmul(&k.t()?)?, D::Minus1)?; let attn = attn.matmul(&v)?.transpose(1, 2)?.reshape((b, n, c))?; self.proj.forward(&attn) From 75b6d4b0da4e7fef82d9f61e274b49af55777acf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Ant=C3=B3nio?= Date: Wed, 27 Mar 2024 06:47:23 +0000 Subject: [PATCH 069/131] add config for mamba 2.8b model parameter (#1946) * first commit * Make the mamba config public. --------- Co-authored-by: laurent --- candle-transformers/src/models/mamba.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/candle-transformers/src/models/mamba.rs b/candle-transformers/src/models/mamba.rs index 597dd2cd..836327ee 100644 --- a/candle-transformers/src/models/mamba.rs +++ b/candle-transformers/src/models/mamba.rs @@ -10,10 +10,10 @@ const D_STATE: usize = 16; #[derive(Debug, Clone, serde::Deserialize)] pub struct Config { - d_model: usize, - n_layer: usize, - vocab_size: usize, - pad_vocab_size_multiple: usize, + pub d_model: usize, + pub n_layer: usize, + pub vocab_size: usize, + pub pad_vocab_size_multiple: usize, } impl Config { From a9abde5f930914ef7ef2d504728f742f80468961 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 27 Mar 2024 10:59:05 +0100 Subject: [PATCH 070/131] More flexible matmul contiguity checks. (#1949) * More flexible matmul contiguity checks. * Also relax the checks on the metal side. 
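As a minimal sketch of what this relaxation enables (modeled on the `squeeze_mm` regression test added by this patch; the `candle_core` import path and CPU device are assumptions for illustration), slicing a row out of a batch leaves a stride layout that the stricter check used to reject:

```rust
use candle_core::{DType, Device, IndexOp, Result, Tensor};

// Slicing one position out of a (1, seq_len, 16) tensor keeps the original
// strides on a dimension that now holds a single element; the follow-up
// matmul against a transposed weight view previously tripped the CUDA/Metal
// contiguity check.
fn squeeze_then_matmul() -> Result<Tensor> {
    let dev = Device::Cpu;
    let a = Tensor::zeros((1, 8, 16), DType::F32, &dev)?;
    let x = a.i((.., 7, ..))?; // shape (1, 16), inherits the sliced strides
    let w = Tensor::zeros((32, 16), DType::F32, &dev)?.t()?; // transposed view, (16, 32)
    x.matmul(&w) // expected shape (1, 32)
}
```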
--- candle-core/src/cuda_backend.rs | 12 ++++++++---- candle-core/src/tensor.rs | 10 ++++++++++ candle-core/tests/tensor_tests.rs | 25 +++++++++++++++++++++++++ candle-metal-kernels/src/lib.rs | 12 ++++++++---- 4 files changed, 51 insertions(+), 8 deletions(-) diff --git a/candle-core/src/cuda_backend.rs b/candle-core/src/cuda_backend.rs index f0f03053..97dc346e 100644 --- a/candle-core/src/cuda_backend.rs +++ b/candle-core/src/cuda_backend.rs @@ -1651,9 +1651,11 @@ fn gemm_config( let lhs_m1 = lhs_stride[lhs_stride.len() - 1]; let lhs_m2 = lhs_stride[lhs_stride.len() - 2]; // The a tensor has dims batching, k, n (rhs) - let (lda, transa) = if rhs_m1 == 1 && rhs_m2 == n { + // We also allow for the case where the stride on the minor dimension is not as expected but + // there is a single element. + let (lda, transa) = if rhs_m1 == 1 && (rhs_m2 == n || b * k == 1) { (n as i32, cublasOperation_t::CUBLAS_OP_N) - } else if rhs_m1 == k && rhs_m2 == 1 { + } else if (rhs_m1 == k || b * n == 1) && rhs_m2 == 1 { (k as i32, cublasOperation_t::CUBLAS_OP_T) } else { Err(CudaError::MatMulNonContiguous { @@ -1663,9 +1665,11 @@ fn gemm_config( })? }; // The b tensor has dims batching, m, k (lhs) - let (ldb, transb) = if lhs_m1 == 1 && lhs_m2 == k { + // We also allow for the case where the stride on the minor dimension is not as expected but + // there is a single element. + let (ldb, transb) = if lhs_m1 == 1 && (lhs_m2 == k || b * m == 1) { (k as i32, cublasOperation_t::CUBLAS_OP_N) - } else if lhs_m1 == m && lhs_m2 == 1 { + } else if (lhs_m1 == m || b * k == 1) && lhs_m2 == 1 { (m as i32, cublasOperation_t::CUBLAS_OP_T) } else { Err(CudaError::MatMulNonContiguous { diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs index 92c931eb..b53b0419 100644 --- a/candle-core/src/tensor.rs +++ b/candle-core/src/tensor.rs @@ -2007,6 +2007,16 @@ impl Tensor { } } + /// Returns a tensor that is in row major order. This always makes a copy. + pub fn force_contiguous(&self) -> Result { + let shape = self.shape(); + let mut storage = unsafe { self.device().alloc_uninit(shape, self.dtype())? }; + self.storage() + .copy_strided_src(&mut storage, 0, self.layout())?; + let op = BackpropOp::new1(self, Op::Copy); + Ok(from_storage(storage, shape.clone(), op, false)) + } + /// Create a variable based on the values currently stored in a tensor. The storage is always /// copied. 
pub(crate) fn make_var(&self) -> Result { diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs index b2475adc..af28c1c1 100644 --- a/candle-core/tests/tensor_tests.rs +++ b/candle-core/tests/tensor_tests.rs @@ -1135,6 +1135,30 @@ fn randn(device: &Device) -> Result<()> { Ok(()) } +// https://github.com/huggingface/candle/issues/1948 +fn squeeze_mm(device: &Device) -> Result<()> { + let seq_len = 8_usize; + let a = Tensor::zeros((1, seq_len, 16), DType::F32, device)?; + let x = a.i((.., seq_len - 1, ..))?; + println!( + "x shape:{:?}, stride:{:?}, is_contiguous:{}", + x.shape(), + x.stride(), + x.is_contiguous() + ); + + let w = Tensor::zeros((32, 16), DType::F32, device)?.t()?; + println!( + "w shape:{:?}, stride:{:?}, is_contiguous:{}", + w.shape(), + w.stride(), + w.is_contiguous() + ); + let x = x.matmul(&w)?; + assert_eq!(x.dims(), &[1, 32]); + Ok(()) +} + test_device!(zeros, zeros_cpu, zeros_gpu, zeros_metal); test_device!(ones, ones_cpu, ones_gpu, ones_metal); test_device!(full, full_cpu, full_gpu, full_metal); @@ -1190,6 +1214,7 @@ test_device!( test_device!(randn, randn_cpu, randn_gpu, randn_metal); test_device!(clamp, clamp_cpu, clamp_gpu, clamp_metal); test_device!(var, var_cpu, var_gpu, var_metal); +test_device!(squeeze_mm, squeeze_mm_cpu, squeeze_mm_gpu, squeeze_mm_metal); // There was originally a bug on the CPU implementation for randn // https://github.com/huggingface/candle/issues/381 diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index 449bef8f..3f452331 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -1451,9 +1451,12 @@ pub fn call_gemm( let rhs_m2 = rhs_stride[rhs_stride.len() - 2]; let lhs_m1 = lhs_stride[lhs_stride.len() - 1]; let lhs_m2 = lhs_stride[lhs_stride.len() - 2]; - let a_trans = if lhs_m1 == 1 && lhs_m2 == k { + // lhs has shape b, m, k + // We also allow for the case where the stride on the minor dimension is not as expected but + // there is a single element. + let a_trans = if lhs_m1 == 1 && (lhs_m2 == k || b * m == 1) { false - } else if lhs_m1 == m && lhs_m2 == 1 { + } else if (lhs_m1 == m || b * k == 1) && lhs_m2 == 1 { true } else { return Err(MetalKernelError::MatMulNonContiguous { @@ -1462,9 +1465,10 @@ pub fn call_gemm( mnk: (m, n, k), })?; }; - let b_trans = if rhs_m1 == 1 && rhs_m2 == n { + // rhs has shape b, k, n + let b_trans = if rhs_m1 == 1 && (rhs_m2 == n || b * k == 1) { false - } else if rhs_m1 == k && rhs_m2 == 1 { + } else if (rhs_m1 == k || b * n == 1) && rhs_m2 == 1 { true } else { return Err(MetalKernelError::MatMulNonContiguous { From ab86cd37c8fd944df351d8c7ca0e93376634a332 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 27 Mar 2024 16:30:07 +0100 Subject: [PATCH 071/131] Support i64 in index-select on metal. (#1951) * Support i64 in index-select on metal. * Add some testing of index-select for all dtypes. 
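For usage, a short sketch (assuming the standard `candle_core` API; running it on a Metal device is what exercises the new `is_i64_*` kernels):

```rust
use candle_core::{Device, Result, Tensor};

// index_select now also accepts i64 index tensors on Metal, matching the
// existing u8/u32 paths.
fn gather_rows(device: &Device) -> Result<Tensor> {
    let t = Tensor::arange(0f32, 6f32, device)?.reshape((3, 2))?;
    let ids = Tensor::new(&[2i64, 0i64], device)?;
    t.index_select(&ids, 0) // picks rows 2 and 0, result shape (2, 2)
}
```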
--- candle-core/src/metal_backend.rs | 4 ++ candle-core/tests/tensor_tests.rs | 79 +++++++++++++------------ candle-metal-kernels/src/indexing.metal | 8 ++- 3 files changed, 53 insertions(+), 38 deletions(-) diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs index b9e761f6..fed7db13 100644 --- a/candle-core/src/metal_backend.rs +++ b/candle-core/src/metal_backend.rs @@ -1391,6 +1391,10 @@ impl BackendStorage for MetalStorage { (DType::U32, DType::F16) => "is_u32_f16", (DType::U32, DType::BF16) => "is_u32_bf16", + (DType::I64, DType::F32) => "is_i64_f32", + (DType::I64, DType::F16) => "is_i64_f16", + (DType::I64, DType::BF16) => "is_i64_bf16", + (left, right) => { crate::bail!("Metal contiguous index_select {left:?} {right:?} not implemented") } diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs index af28c1c1..8aacc05d 100644 --- a/candle-core/tests/tensor_tests.rs +++ b/candle-core/tests/tensor_tests.rs @@ -707,6 +707,8 @@ fn embeddings(device: &Device) -> Result<()> { assert_eq!(hs.to_vec2::()?, &[[0.0, 1.0], [4.0, 5.0], [2.0, 3.0]]); let hs = t.index_select(&ids, 0)?; assert_eq!(hs.to_vec2::()?, &[[0.0, 1.0], [4.0, 5.0], [2.0, 3.0]]); + let hs = t.index_select(&ids.to_dtype(DType::I64)?, 0)?; + assert_eq!(hs.to_vec2::()?, &[[0.0, 1.0], [4.0, 5.0], [2.0, 3.0]]); Ok(()) } @@ -734,44 +736,47 @@ fn index_select(device: &Device) -> Result<()> { [9.0, 10.0, 11.0] ] ); - let hs = t.index_select(&ids, 1)?; - assert_eq!( - hs.to_vec2::()?, - &[ - [0.0, 2.0, 1.0], - [3.0, 5.0, 4.0], - [6.0, 8.0, 7.0], - [9.0, 11.0, 10.0] - ] - ); - let hs = t.index_select(&ids, 0)?; - assert_eq!( - hs.to_vec2::()?, - &[[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]] - ); - // Prior to https://github.com/huggingface/candle/pull/1022 - // There would be a bug where the last values in the result tensor would be set to 0. - let ids = Tensor::new(&[0u32, 2u32, 1u32, 0u32, 2u32, 1u32], device)?; - let hs = t.index_select(&ids, 0)?; - assert_eq!( - hs.to_vec2::()?, - &[ - [0.0, 1.0, 2.0], - [6.0, 7.0, 8.0], - [3.0, 4.0, 5.0], - [0.0, 1.0, 2.0], - [6.0, 7.0, 8.0], - [3.0, 4.0, 5.0], - ] - ); + for dtype in [DType::U8, DType::U32, DType::I64] { + let ids = ids.to_dtype(dtype)?; + let hs = t.index_select(&ids, 1)?; + assert_eq!( + hs.to_vec2::()?, + &[ + [0.0, 2.0, 1.0], + [3.0, 5.0, 4.0], + [6.0, 8.0, 7.0], + [9.0, 11.0, 10.0] + ] + ); + let hs = t.index_select(&ids, 0)?; + assert_eq!( + hs.to_vec2::()?, + &[[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]] + ); + // Prior to https://github.com/huggingface/candle/pull/1022 + // There would be a bug where the last values in the result tensor would be set to 0. + let ids = Tensor::new(&[0u32, 2u32, 1u32, 0u32, 2u32, 1u32], device)?; + let hs = t.index_select(&ids, 0)?; + assert_eq!( + hs.to_vec2::()?, + &[ + [0.0, 1.0, 2.0], + [6.0, 7.0, 8.0], + [3.0, 4.0, 5.0], + [0.0, 1.0, 2.0], + [6.0, 7.0, 8.0], + [3.0, 4.0, 5.0], + ] + ); - // Test when selecting dim > 0 with ids size different from elem count of - // target dim in source/input. - let ids = Tensor::new(&[1u32, 0u32, 1u32], device)?; - let t = Tensor::arange(1f32, 5f32, device)?.reshape((2, 2))?; - assert_eq!(t.to_vec2::()?, &[[1.0, 2.0], [3.0, 4.0]]); - let hs = t.index_select(&ids, 1)?; - assert_eq!(hs.to_vec2::()?, &[[2.0, 1.0, 2.0], [4.0, 3.0, 4.0]]); + // Test when selecting dim > 0 with ids size different from elem count of + // target dim in source/input. 
+ let ids = Tensor::new(&[1u32, 0u32, 1u32], device)?; + let t = Tensor::arange(1f32, 5f32, device)?.reshape((2, 2))?; + assert_eq!(t.to_vec2::()?, &[[1.0, 2.0], [3.0, 4.0]]); + let hs = t.index_select(&ids, 1)?; + assert_eq!(hs.to_vec2::()?, &[[2.0, 1.0, 2.0], [4.0, 3.0, 4.0]]); + } Ok(()) } diff --git a/candle-metal-kernels/src/indexing.metal b/candle-metal-kernels/src/indexing.metal index ad4a8605..762b42be 100644 --- a/candle-metal-kernels/src/indexing.metal +++ b/candle-metal-kernels/src/indexing.metal @@ -187,6 +187,12 @@ kernel void NAME( \ } +INDEX_OP(is_i64_f32, int64_t, float) +INDEX_OP(is_i64_f16, int64_t, half) +#if defined(__HAVE_BFLOAT__) +INDEX_OP(is_i64_bf16, int64_t, bfloat) +#endif + INDEX_OP(is_u32_f32, uint32_t, float) INDEX_OP(is_u32_f16, uint32_t, half) #if defined(__HAVE_BFLOAT__) @@ -242,4 +248,4 @@ INDEX_ADD_OP(ia_u8_u32, uint8_t, uint32_t) INDEX_ADD_OP(ia_u8_u8, uint8_t, uint8_t) #if defined(__HAVE_BFLOAT__) INDEX_ADD_OP(ia_u8_bf16, uint8_t, bfloat) -#endif \ No newline at end of file +#endif From 13ae5a34c76af5c8226da09819d31c4cd70c49c9 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 28 Mar 2024 06:56:48 +0100 Subject: [PATCH 072/131] Ensure that the kernels get rebuilt on cuh changes. (#1954) --- candle-kernels/build.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/candle-kernels/build.rs b/candle-kernels/build.rs index 63d744ca..c28abd97 100644 --- a/candle-kernels/build.rs +++ b/candle-kernels/build.rs @@ -1,5 +1,8 @@ fn main() { println!("cargo:rerun-if-changed=build.rs"); + println!("cargo:rerun-if-changed=src/compatibility.cuh"); + println!("cargo:rerun-if-changed=src/cuda_utils.cuh"); + println!("cargo:rerun-if-changed=src/binary_op_macros.cuh"); let builder = bindgen_cuda::Builder::default(); println!("cargo:info={builder:?}"); From ada5d7c096b530fd29b071d798660f3843945e2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Ant=C3=B3nio?= Date: Thu, 28 Mar 2024 09:03:00 +0000 Subject: [PATCH 073/131] add send and sync trait bounds for scheduler config in stable diffusion models (#1952) * first commit * add Sync deriving * static * remove static --- candle-transformers/src/models/stable_diffusion/schedulers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/candle-transformers/src/models/stable_diffusion/schedulers.rs b/candle-transformers/src/models/stable_diffusion/schedulers.rs index 0f0441e0..94f8ab86 100644 --- a/candle-transformers/src/models/stable_diffusion/schedulers.rs +++ b/candle-transformers/src/models/stable_diffusion/schedulers.rs @@ -5,7 +5,7 @@ //! inference speed and quality. use candle::{Result, Tensor}; -pub trait SchedulerConfig: std::fmt::Debug { +pub trait SchedulerConfig: std::fmt::Debug + Send + Sync { fn build(&self, inference_steps: usize) -> Result>; } From b3484e7a5e8d8c613e2a444c6f056142fc1e758d Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 28 Mar 2024 10:17:38 +0100 Subject: [PATCH 074/131] Fix for the RWKV models. (#1955) * Fix for the RWKV models. * More general fix + revert the rwkv hack. * Remove the old hack. 
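Roughly, the general fix accepts layouts where any of m, n or k equals 1, since several stride patterns then describe the same memory. A sketch of such a degenerate case (shapes are illustrative; `candle_core` import path assumed):

```rust
use candle_core::{DType, Device, Result, Tensor};

// With m == 1 the lhs stride on the row dimension is ambiguous, and the
// transposed rhs view has reversed strides; both layouts should now pass the
// gemm configuration instead of erroring as non-contiguous.
fn degenerate_matmul() -> Result<Tensor> {
    let dev = Device::Cpu;
    let lhs = Tensor::zeros((1, 1, 64), DType::F32, &dev)?; // batch 1, m == 1
    let rhs = Tensor::zeros((1, 128, 64), DType::F32, &dev)?.t()?; // view of shape (1, 64, 128)
    lhs.matmul(&rhs) // (1, 1, 128)
}
```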
--- candle-core/src/cuda_backend.rs | 28 ++++++++++++++-------------- candle-metal-kernels/src/lib.rs | 8 ++++---- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/candle-core/src/cuda_backend.rs b/candle-core/src/cuda_backend.rs index 97dc346e..23487330 100644 --- a/candle-core/src/cuda_backend.rs +++ b/candle-core/src/cuda_backend.rs @@ -62,8 +62,8 @@ pub enum CudaError { #[error("matmul is only supported for contiguous tensors lstride: {lhs_stride:?} rstride: {rhs_stride:?} mnk: {mnk:?}")] MatMulNonContiguous { - lhs_stride: Vec, - rhs_stride: Vec, + lhs_stride: Layout, + rhs_stride: Layout, mnk: (usize, usize, usize), }, @@ -1653,28 +1653,28 @@ fn gemm_config( // The a tensor has dims batching, k, n (rhs) // We also allow for the case where the stride on the minor dimension is not as expected but // there is a single element. - let (lda, transa) = if rhs_m1 == 1 && (rhs_m2 == n || b * k == 1) { + let (lda, transa) = if (rhs_m1 == 1 || n == 1) && (rhs_m2 == n || k == 1) { (n as i32, cublasOperation_t::CUBLAS_OP_N) - } else if (rhs_m1 == k || b * n == 1) && rhs_m2 == 1 { + } else if (rhs_m1 == k || n == 1) && (rhs_m2 == 1 || k == 1) { (k as i32, cublasOperation_t::CUBLAS_OP_T) } else { Err(CudaError::MatMulNonContiguous { - lhs_stride: lhs_stride.to_vec(), - rhs_stride: rhs_stride.to_vec(), + lhs_stride: lhs_l.clone(), + rhs_stride: rhs_l.clone(), mnk: (m, n, k), })? }; // The b tensor has dims batching, m, k (lhs) // We also allow for the case where the stride on the minor dimension is not as expected but // there is a single element. - let (ldb, transb) = if lhs_m1 == 1 && (lhs_m2 == k || b * m == 1) { + let (ldb, transb) = if (lhs_m1 == 1 || k == 1) && (lhs_m2 == k || m == 1) { (k as i32, cublasOperation_t::CUBLAS_OP_N) - } else if (lhs_m1 == m || b * k == 1) && lhs_m2 == 1 { + } else if (lhs_m1 == m || k == 1) && (lhs_m2 == 1 || m == 1) { (m as i32, cublasOperation_t::CUBLAS_OP_T) } else { Err(CudaError::MatMulNonContiguous { - lhs_stride: lhs_stride.to_vec(), - rhs_stride: rhs_stride.to_vec(), + lhs_stride: lhs_l.clone(), + rhs_stride: rhs_l.clone(), mnk: (m, n, k), })? }; @@ -1698,8 +1698,8 @@ fn gemm_config( [stride] => stride, [] => m * k, _ => Err(CudaError::MatMulNonContiguous { - lhs_stride: lhs_stride.to_vec(), - rhs_stride: rhs_stride.to_vec(), + lhs_stride: lhs_l.clone(), + rhs_stride: rhs_l.clone(), mnk: (m, n, k), })?, }; @@ -1708,8 +1708,8 @@ fn gemm_config( [stride] => stride, [] => n * k, _ => Err(CudaError::MatMulNonContiguous { - lhs_stride: lhs_stride.to_vec(), - rhs_stride: rhs_stride.to_vec(), + lhs_stride: lhs_l.clone(), + rhs_stride: rhs_l.clone(), mnk: (m, n, k), })?, }; diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index 3f452331..140927e3 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -1454,9 +1454,9 @@ pub fn call_gemm( // lhs has shape b, m, k // We also allow for the case where the stride on the minor dimension is not as expected but // there is a single element. 
- let a_trans = if lhs_m1 == 1 && (lhs_m2 == k || b * m == 1) { + let a_trans = if (lhs_m1 == 1 || k == 1) && (lhs_m2 == k || m == 1) { false - } else if (lhs_m1 == m || b * k == 1) && lhs_m2 == 1 { + } else if (lhs_m1 == m || k == 1) && (lhs_m2 == 1 || m == 1) { true } else { return Err(MetalKernelError::MatMulNonContiguous { @@ -1466,9 +1466,9 @@ pub fn call_gemm( })?; }; // rhs has shape b, k, n - let b_trans = if rhs_m1 == 1 && (rhs_m2 == n || b * k == 1) { + let b_trans = if (rhs_m1 == 1 || n == 1) && (rhs_m2 == n || k == 1) { false - } else if (rhs_m1 == k || b * n == 1) && rhs_m2 == 1 { + } else if (rhs_m1 == k || n == 1) && (rhs_m2 == 1 || k == 1) { true } else { return Err(MetalKernelError::MatMulNonContiguous { From b0340d72ec9dd8f3bb1778e5a7d73111e67a4393 Mon Sep 17 00:00:00 2001 From: Tigran Zhampeissov <81493298+Tigranchick@users.noreply.github.com> Date: Thu, 28 Mar 2024 17:44:12 +0500 Subject: [PATCH 075/131] CLIP model implementation with example (#1950) * CLIP model implementation with example * CLIP Implementation fixes, batch images * CLIP model remove images from git * CLIP model remove unnecessary use of batch_indices --- candle-examples/examples/clip/README.md | 46 +++ candle-examples/examples/clip/main.rs | 202 ++++++++++ candle-transformers/src/models/clip/mod.rs | 167 ++++++++ .../src/models/clip/text_model.rs | 355 ++++++++++++++++++ .../src/models/clip/vision_model.rs | 171 +++++++++ candle-transformers/src/models/mod.rs | 1 + 6 files changed, 942 insertions(+) create mode 100644 candle-examples/examples/clip/README.md create mode 100644 candle-examples/examples/clip/main.rs create mode 100644 candle-transformers/src/models/clip/mod.rs create mode 100644 candle-transformers/src/models/clip/text_model.rs create mode 100644 candle-transformers/src/models/clip/vision_model.rs diff --git a/candle-examples/examples/clip/README.md b/candle-examples/examples/clip/README.md new file mode 100644 index 00000000..f0ee3b2c --- /dev/null +++ b/candle-examples/examples/clip/README.md @@ -0,0 +1,46 @@ +Contrastive Language-Image Pre-Training + +Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on +pairs of images with related texts. 
+ +https://github.com/openai/CLIP + +https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip + +## Running on an example on cpu + +``` +$ cargo run --example clip --release -- --images "candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg","candle-examples/examples/yolo-v8/assets/bike.jpg" --cpu --sequences "a cycling race","a photo of two cats","a robot holding a candle" + + +Results for image: candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg + +INFO clip: Probability: 0.0000% Text: a cycling race +INFO clip: Probability: 0.0000% Text: a photo of two cats +INFO clip: Probability: 100.0000% Text: a robot holding a candle + +Results for image: candle-examples/examples/yolo-v8/assets/bike.jpg + +INFO clip: Probability: 99.9999% Text: a cycling race +INFO clip: Probability: 0.0001% Text: a photo of two cats +INFO clip: Probability: 0.0000% Text: a robot holding a candle +``` + +## Running on an example with metal feature (mac) + +``` +$ cargo run --features metal --example clip --release -- --images "candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg","candle-examples/examples/yolo-v8/assets/bike.jpg" --cpu --sequences "a cycling race","a photo of two cats","a robot holding a candle" + + +Results for image: candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg + +INFO clip: Probability: 0.0000% Text: a cycling race +INFO clip: Probability: 0.0000% Text: a photo of two cats +INFO clip: Probability: 100.0000% Text: a robot holding a candle + +Results for image: candle-examples/examples/yolo-v8/assets/bike.jpg + +INFO clip: Probability: 99.9999% Text: a cycling race +INFO clip: Probability: 0.0001% Text: a photo of two cats +INFO clip: Probability: 0.0000% Text: a robot holding a candle +``` diff --git a/candle-examples/examples/clip/main.rs b/candle-examples/examples/clip/main.rs new file mode 100644 index 00000000..f301d211 --- /dev/null +++ b/candle-examples/examples/clip/main.rs @@ -0,0 +1,202 @@ +#[cfg(feature = "mkl")] +extern crate intel_mkl_src; + +#[cfg(feature = "accelerate")] +extern crate accelerate_src; + +use anyhow::Error as E; +use clap::Parser; + +use candle::{DType, Device, Tensor}; +use candle_nn::{ops::softmax, VarBuilder}; +use candle_transformers::models::clip; + +use tokenizers::Tokenizer; +use tracing::info; + +#[derive(Parser)] +struct Args { + #[arg(long)] + model: Option, + + #[arg(long)] + tokenizer: Option, + + #[arg(long, use_value_delimiter = true)] + images: Option>, + + #[arg(long)] + cpu: bool, + + #[arg(long, use_value_delimiter = true)] + sequences: Option>, +} + +fn load_image>(path: T, image_size: usize) -> anyhow::Result { + let img = image::io::Reader::open(path)?.decode()?; + let (height, width) = (image_size, image_size); + let img = img.resize_to_fill( + width as u32, + height as u32, + image::imageops::FilterType::Triangle, + ); + + let img = img.to_rgb8(); + + let img = img.into_raw(); + let img = Tensor::from_vec(img, (height, width, 3), &Device::Cpu)? + .permute((2, 0, 1))? + .to_dtype(DType::F32)? + .affine(2. 
/ 255., -1.)?; + // .unsqueeze(0)?; + Ok(img) +} + +fn load_images>( + paths: &Vec, + image_size: usize, +) -> anyhow::Result { + let mut images = vec![]; + + for path in paths { + let tensor = load_image(path, image_size)?; + images.push(tensor); + } + + let images = Tensor::stack(&images, 0)?; + + Ok(images) +} + +pub fn main() -> anyhow::Result<()> { + // std::env::set_var("RUST_BACKTRACE", "full"); + + let args = Args::parse(); + + tracing_subscriber::fmt::init(); + + let model_file = match args.model { + None => { + let api = hf_hub::api::sync::Api::new()?; + + let api = api.repo(hf_hub::Repo::with_revision( + "openai/clip-vit-base-patch32".to_string(), + hf_hub::RepoType::Model, + "refs/pr/15".to_string(), + )); + + api.get("model.safetensors")? + } + Some(model) => model.into(), + }; + + let tokenizer = get_tokenizer(args.tokenizer)?; + + let config = clip::ClipConfig::vit_base_patch32(); + + let device = candle_examples::device(args.cpu)?; + + let vec_imgs = match args.images { + Some(imgs) => imgs, + None => vec![ + "candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg".to_string(), + "candle-examples/examples/yolo-v8/assets/bike.jpg".to_string(), + ], + }; + + // let image = load_image(args.image, config.image_size)?.to_device(&device)?; + let images = load_images(&vec_imgs, config.image_size)?.to_device(&device)?; + + let vb = + unsafe { VarBuilder::from_mmaped_safetensors(&[model_file.clone()], DType::F32, &device)? }; + + let model = clip::ClipModel::new(vb, &config)?; + + let (input_ids, vec_seq) = tokenize_sequences(args.sequences, &tokenizer, &device)?; + + let (_logits_per_text, logits_per_image) = model.forward(&images, &input_ids)?; + + let softmax_image = softmax(&logits_per_image, 1)?; + + let softmax_image_vec = softmax_image.flatten_all()?.to_vec1::()?; + + info!("softmax_image_vec: {:?}", softmax_image_vec); + + let probability_vec = softmax_image_vec + .iter() + .map(|v| v * 100.0) + .collect::>(); + + let probability_per_image = probability_vec.len() / vec_imgs.len(); + + for (i, img) in vec_imgs.iter().enumerate() { + let start = i * probability_per_image; + let end = start + probability_per_image; + let prob = &probability_vec[start..end]; + info!("\n\nResults for image: {}\n", img); + + for (i, p) in prob.iter().enumerate() { + info!("Probability: {:.4}% Text: {} ", p, vec_seq[i]); + } + } + + Ok(()) +} + +pub fn get_tokenizer(tokenizer: Option) -> anyhow::Result { + let tokenizer = match tokenizer { + None => { + let api = hf_hub::api::sync::Api::new()?; + let api = api.repo(hf_hub::Repo::with_revision( + "openai/clip-vit-base-patch32".to_string(), + hf_hub::RepoType::Model, + "refs/pr/15".to_string(), + )); + api.get("tokenizer.json")? 
+ } + Some(file) => file.into(), + }; + + Tokenizer::from_file(tokenizer).map_err(E::msg) +} + +pub fn tokenize_sequences( + sequences: Option>, + tokenizer: &Tokenizer, + device: &Device, +) -> anyhow::Result<(Tensor, Vec)> { + let pad_id = *tokenizer + .get_vocab(true) + .get("<|endoftext|>") + .ok_or(E::msg("No pad token"))?; + + let vec_seq = match sequences { + Some(seq) => seq, + None => vec![ + "a cycling race".to_string(), + "a photo of two cats".to_string(), + "a robot holding a candle".to_string(), + ], + }; + + let mut tokens = vec![]; + + for seq in vec_seq.clone() { + let encoding = tokenizer.encode(seq, true).map_err(E::msg)?; + tokens.push(encoding.get_ids().to_vec()); + } + + let max_len = tokens.iter().map(|v| v.len()).max().unwrap_or(0); + + // Pad the sequences to have the same length + for token_vec in tokens.iter_mut() { + let len_diff = max_len - token_vec.len(); + if len_diff > 0 { + token_vec.extend(vec![pad_id; len_diff]); + } + } + + let input_ids = Tensor::new(tokens, device)?; + + Ok((input_ids, vec_seq)) +} diff --git a/candle-transformers/src/models/clip/mod.rs b/candle-transformers/src/models/clip/mod.rs new file mode 100644 index 00000000..02df782b --- /dev/null +++ b/candle-transformers/src/models/clip/mod.rs @@ -0,0 +1,167 @@ +//! Contrastive Language-Image Pre-Training +//! +//! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on +//! pairs of images with related texts. +//! +//! https://github.com/openai/CLIP +//! https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip +use self::{ + text_model::{Activation, ClipTextTransformer}, + vision_model::ClipVisionTransformer, +}; +use candle::{Result, Tensor, D}; +use candle_nn::Module; + +use tracing::warn; + +pub mod text_model; +pub mod vision_model; + +pub struct ClipModel { + text_model: ClipTextTransformer, + vision_model: ClipVisionTransformer, + visual_projection: candle_nn::Linear, + text_projection: candle_nn::Linear, + logit_scale: Tensor, +} + +pub enum EncoderConfig { + Text(text_model::ClipTextConfig), + Vision(vision_model::ClipVisionConfig), +} + +impl EncoderConfig { + pub fn embed_dim(&self) -> usize { + match self { + Self::Text(c) => c.embed_dim, + Self::Vision(c) => c.embed_dim, + } + } + + pub fn num_attention_heads(&self) -> usize { + match self { + Self::Text(c) => c.num_attention_heads, + Self::Vision(c) => c.num_attention_heads, + } + } + + pub fn intermediate_size(&self) -> usize { + match self { + Self::Text(c) => c.intermediate_size, + Self::Vision(c) => c.intermediate_size, + } + } + + pub fn num_hidden_layers(&self) -> usize { + match self { + Self::Text(c) => c.num_hidden_layers, + Self::Vision(c) => c.num_hidden_layers, + } + } + + pub fn activation(&self) -> Activation { + match self { + Self::Text(_c) => Activation::QuickGelu, + Self::Vision(c) => c.activation, + } + } +} + +pub struct ClipConfig { + pub text_config: text_model::ClipTextConfig, + pub vision_config: vision_model::ClipVisionConfig, + pub logit_scale_init_value: f32, + pub image_size: usize, +} + +impl ClipConfig { + // base image size is 224, model size is 600Mb + pub fn vit_base_patch32() -> Self { + let text_config = text_model::ClipTextConfig::vit_base_patch32(); + let vision_config = vision_model::ClipVisionConfig::vit_base_patch32(); + + Self { + text_config, + vision_config, + logit_scale_init_value: 2.6592, + image_size: 224, + } + } +} + +impl ClipModel { + pub fn new(vs: candle_nn::VarBuilder, c: &ClipConfig) -> 
Result { + let text_model = ClipTextTransformer::new(vs.pp("text_model"), &c.text_config)?; + + let vision_model = ClipVisionTransformer::new(vs.pp("vision_model"), &c.vision_config)?; + + let visual_projection = candle_nn::linear_no_bias( + c.vision_config.embed_dim, + c.vision_config.projection_dim, + vs.pp("visual_projection"), + )?; + + let text_projection = candle_nn::linear_no_bias( + c.text_config.embed_dim, + c.text_config.projection_dim, + vs.pp("text_projection"), + )?; + + // originally nn.Parameter + let logit_scale = if vs.contains_tensor("logit_scale") { + vs.get(&[], "logit_scale")? + } else { + warn!("Creating logit_scale tensor, results may vary."); + Tensor::new(&[c.logit_scale_init_value], vs.device())? + }; + + Ok(Self { + text_model, + vision_model, + visual_projection, + text_projection, + logit_scale, + }) + } + + pub fn get_text_features(&self, input_ids: &Tensor) -> Result { + let text_outputs = self.text_model.forward(input_ids)?; + + let text_features = self.text_projection.forward(&text_outputs)?; + + Ok(text_features) + } + + pub fn get_image_features(&self, pixel_values: &Tensor) -> Result { + let image_features = self.vision_model.forward(pixel_values)?; + + let image_features = self.visual_projection.forward(&image_features)?; + + Ok(image_features) + } + + pub fn forward(&self, pixel_values: &Tensor, input_ids: &Tensor) -> Result<(Tensor, Tensor)> { + let image_features = self.get_image_features(pixel_values)?; + + let text_features = self.get_text_features(input_ids)?; + + let image_features_normalized = div_l2_norm(&image_features)?; + + let text_features_normalized = div_l2_norm(&text_features)?; + + let logits_per_text = text_features_normalized.matmul(&image_features_normalized.t()?)?; + + let logit_scale = &self.logit_scale.exp()?; + + let logits_per_text = logits_per_text.broadcast_mul(&logit_scale)?; + + let logits_per_image = logits_per_text.t()?; + + Ok((logits_per_text, logits_per_image)) + } +} + +pub fn div_l2_norm(v: &Tensor) -> Result { + let l2_norm = v.sqr()?.sum_keepdim(D::Minus1)?.sqrt()?; + v.broadcast_div(&l2_norm) +} diff --git a/candle-transformers/src/models/clip/text_model.rs b/candle-transformers/src/models/clip/text_model.rs new file mode 100644 index 00000000..852d3e24 --- /dev/null +++ b/candle-transformers/src/models/clip/text_model.rs @@ -0,0 +1,355 @@ +//! Contrastive Language-Image Pre-Training +//! +//! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on +//! pairs of images with related texts. +//! +//! https://github.com/openai/CLIP +//! 
https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip + +use candle::{DType, Device, IndexOp, Result, Tensor, D}; +use candle_nn as nn; +use candle_nn::Module; + +use super::EncoderConfig; + +#[derive(Debug, Clone, Copy)] +pub enum Activation { + QuickGelu, +} + +impl Module for Activation { + fn forward(&self, xs: &Tensor) -> Result { + match self { + Activation::QuickGelu => xs * nn::ops::sigmoid(&(xs * 1.702f64)?)?, + } + } +} + +#[derive(Debug, Clone)] +pub struct ClipTextConfig { + pub vocab_size: usize, + pub embed_dim: usize, + pub activation: Activation, + pub intermediate_size: usize, + pub max_position_embeddings: usize, + pub pad_with: Option, + pub num_hidden_layers: usize, + pub num_attention_heads: usize, + #[allow(dead_code)] + pub projection_dim: usize, +} + +impl ClipTextConfig { + // The config details can be found in the "text_config" section of this json file: + // https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json + pub fn vit_base_patch32() -> Self { + Self { + vocab_size: 49408, + embed_dim: 512, + intermediate_size: 2048, + max_position_embeddings: 77, + pad_with: None, + num_hidden_layers: 12, + num_attention_heads: 8, + projection_dim: 512, + activation: Activation::QuickGelu, + } + } +} + +// ClipTextEmbeddings mostly based on the existing implementation in the stable diffision model. +// TODO rewrite to be more similar to https://github.com/huggingface/transformers/blob/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip/modeling_clip.py#L142 +#[derive(Debug)] +struct ClipTextEmbeddings { + token_embedding: candle_nn::Embedding, + position_embedding: candle_nn::Embedding, + position_ids: Tensor, +} + +impl ClipTextEmbeddings { + fn new(vs: candle_nn::VarBuilder, c: &ClipTextConfig) -> Result { + let token_embedding = + candle_nn::embedding(c.vocab_size, c.embed_dim, vs.pp("token_embedding"))?; + + let position_embedding: nn::Embedding = candle_nn::embedding( + c.max_position_embeddings, + c.embed_dim, + vs.pp("position_embedding"), + )?; + + let position_ids = + Tensor::arange(0u32, c.max_position_embeddings as u32, vs.device())?.unsqueeze(0)?; + + Ok(ClipTextEmbeddings { + token_embedding, + position_embedding, + position_ids, + }) + } +} + +impl Module for ClipTextEmbeddings { + fn forward(&self, input_ids: &Tensor) -> Result { + let seq_length = input_ids.dim(D::Minus1)?; + + let inputs_embeds = &self.token_embedding.forward(input_ids)?; + + let postion_ids = &self.position_ids.narrow(1, 0, seq_length)?; + + let position_embedding = &self.position_embedding.forward(&postion_ids)?; + + let inputs_embeds = inputs_embeds.broadcast_add(&position_embedding)?; + + Ok(inputs_embeds) + } +} + +#[derive(Debug)] +struct ClipAttention { + k_proj: candle_nn::Linear, + v_proj: candle_nn::Linear, + q_proj: candle_nn::Linear, + out_proj: candle_nn::Linear, + head_dim: usize, + scale: f64, + num_attention_heads: usize, +} + +impl ClipAttention { + fn new(vs: candle_nn::VarBuilder, c: &EncoderConfig) -> Result { + let embed_dim = c.embed_dim(); + let num_attention_heads = c.num_attention_heads(); + let k_proj = candle_nn::linear(embed_dim, embed_dim, vs.pp("k_proj"))?; + let v_proj = candle_nn::linear(embed_dim, embed_dim, vs.pp("v_proj"))?; + let q_proj = candle_nn::linear(embed_dim, embed_dim, vs.pp("q_proj"))?; + let out_proj = candle_nn::linear(embed_dim, embed_dim, vs.pp("out_proj"))?; + let head_dim = embed_dim / num_attention_heads; + let scale = (head_dim as 
f64).powf(-0.5); + + Ok(ClipAttention { + k_proj, + v_proj, + q_proj, + out_proj, + head_dim, + scale, + num_attention_heads, + }) + } + + fn shape(&self, xs: &Tensor, seq_len: usize, bsz: usize) -> Result { + xs.reshape((bsz, seq_len, self.num_attention_heads, self.head_dim))? + .transpose(1, 2)? + .contiguous() + } + + fn forward(&self, xs: &Tensor, causal_attention_mask: Option<&Tensor>) -> Result { + let in_dtype = xs.dtype(); + let (bsz, seq_len, embed_dim) = xs.dims3()?; + + let query_states = (self.q_proj.forward(xs)? * self.scale)?; + let proj_shape = (bsz * self.num_attention_heads, seq_len, self.head_dim); + let query_states = self + .shape(&query_states, seq_len, bsz)? + .reshape(proj_shape)? + .to_dtype(DType::F32)?; + let key_states = self + .shape(&self.k_proj.forward(xs)?, seq_len, bsz)? + .reshape(proj_shape)? + .to_dtype(DType::F32)?; + let value_states = self + .shape(&self.v_proj.forward(xs)?, seq_len, bsz)? + .reshape(proj_shape)? + .to_dtype(DType::F32)?; + let attn_weights = query_states.matmul(&key_states.transpose(1, 2)?)?; + + let src_len = key_states.dim(1)?; + + let attn_weights = if let Some(causal_attention_mask) = causal_attention_mask { + let attn_reshape = + attn_weights.reshape((bsz, self.num_attention_heads, seq_len, src_len))?; + + let attn_weights = attn_reshape.broadcast_add(causal_attention_mask)?; + + let attn_weights = + attn_weights.reshape((bsz * self.num_attention_heads, seq_len, src_len))?; + + attn_weights + } else { + attn_weights + }; + + let attn_weights = candle_nn::ops::softmax(&attn_weights, D::Minus1)?; + + let attn_output = attn_weights.matmul(&value_states)?.to_dtype(in_dtype)?; + let attn_output = attn_output + .reshape((bsz, self.num_attention_heads, seq_len, self.head_dim))? + .transpose(1, 2)? + .reshape((bsz, seq_len, embed_dim))?; + self.out_proj.forward(&attn_output) + } +} + +#[derive(Debug)] +struct ClipMlp { + fc1: candle_nn::Linear, + fc2: candle_nn::Linear, + activation: Activation, +} + +impl ClipMlp { + fn new(vs: candle_nn::VarBuilder, c: &EncoderConfig) -> Result { + let fc1 = candle_nn::linear(c.embed_dim(), c.intermediate_size(), vs.pp("fc1"))?; + let fc2 = candle_nn::linear(c.intermediate_size(), c.embed_dim(), vs.pp("fc2"))?; + + Ok(ClipMlp { + fc1, + fc2, + activation: c.activation(), + }) + } +} + +impl ClipMlp { + fn forward(&self, xs: &Tensor) -> Result { + let xs = self.fc1.forward(xs)?; + self.fc2.forward(&self.activation.forward(&xs)?) 
+ } +} + +#[derive(Debug)] +struct ClipEncoderLayer { + self_attn: ClipAttention, + layer_norm1: candle_nn::LayerNorm, + mlp: ClipMlp, + layer_norm2: candle_nn::LayerNorm, +} + +impl ClipEncoderLayer { + fn new(vs: candle_nn::VarBuilder, c: &EncoderConfig) -> Result { + let self_attn = ClipAttention::new(vs.pp("self_attn"), c)?; + let layer_norm1 = candle_nn::layer_norm(c.embed_dim(), 1e-5, vs.pp("layer_norm1"))?; + let mlp = ClipMlp::new(vs.pp("mlp"), c)?; + let layer_norm2 = candle_nn::layer_norm(c.embed_dim(), 1e-5, vs.pp("layer_norm2"))?; + + Ok(ClipEncoderLayer { + self_attn, + layer_norm1, + mlp, + layer_norm2, + }) + } + + fn forward(&self, xs: &Tensor, causal_attention_mask: Option<&Tensor>) -> Result { + let residual = xs; + let xs = self.layer_norm1.forward(xs)?; + let xs = self.self_attn.forward(&xs, causal_attention_mask)?; + let xs = (xs + residual)?; + + let residual = &xs; + let xs = self.layer_norm2.forward(&xs)?; + let xs = self.mlp.forward(&xs)?; + xs + residual + } +} + +#[derive(Debug)] +pub struct ClipEncoder { + layers: Vec, +} + +impl ClipEncoder { + pub fn new(vs: candle_nn::VarBuilder, c: &EncoderConfig) -> Result { + let vs = vs.pp("layers"); + let mut layers: Vec = Vec::new(); + for index in 0..c.num_hidden_layers() { + let layer = ClipEncoderLayer::new(vs.pp(&index.to_string()), c)?; + layers.push(layer) + } + Ok(ClipEncoder { layers }) + } + + pub fn forward(&self, xs: &Tensor, causal_attention_mask: Option<&Tensor>) -> Result { + let mut xs = xs.clone(); + + for layer in self.layers.iter() { + xs = layer.forward(&xs, causal_attention_mask)?; + } + Ok(xs) + } +} + +/// A CLIP transformer based model. +#[derive(Debug)] +pub struct ClipTextTransformer { + embeddings: ClipTextEmbeddings, + encoder: ClipEncoder, + final_layer_norm: candle_nn::LayerNorm, +} + +impl ClipTextTransformer { + pub fn new(vs: candle_nn::VarBuilder, c: &ClipTextConfig) -> Result { + let embeddings = ClipTextEmbeddings::new(vs.pp("embeddings"), c)?; + let encoder = ClipEncoder::new(vs.pp("encoder"), &EncoderConfig::Text(c.clone()))?; + let final_layer_norm = candle_nn::layer_norm(c.embed_dim, 1e-5, vs.pp("final_layer_norm"))?; + + Ok(ClipTextTransformer { + embeddings, + encoder, + final_layer_norm, + }) + } + + // TODO: rewrrite to newer version + fn build_causal_attention_mask( + bsz: usize, + seq_len: usize, + mask_after: usize, + device: &Device, + ) -> Result { + let mask: Vec<_> = (0..seq_len) + .flat_map(|i| { + (0..seq_len).map(move |j| { + if j > i || j > mask_after { + f32::MIN + } else { + 0. 
+ } + }) + }) + .collect(); + let mask = Tensor::from_slice(&mask, (seq_len, seq_len), device)?; + mask.broadcast_as((bsz, 1, seq_len, seq_len)) + } + + pub fn forward_with_mask(&self, input_ids: &Tensor, mask_after: usize) -> Result { + let (bsz, seq_len) = input_ids.dims2()?; + let input_ids = self.embeddings.forward(input_ids)?; + + let causal_attention_mask = + Self::build_causal_attention_mask(bsz, seq_len, mask_after, input_ids.device())?; + let input_ids = self + .encoder + .forward(&input_ids, Some(&causal_attention_mask))?; + self.final_layer_norm.forward(&input_ids) + } +} + +impl Module for ClipTextTransformer { + fn forward(&self, input_ids: &Tensor) -> Result { + let output = self.forward_with_mask(input_ids, usize::MAX)?; + + let sequence_max_indices = input_ids.argmax(D::Minus1)?.to_dtype(DType::I64)?; + + let mut indices: Vec = Vec::new(); + + for (batch_idx, &seq_idx) in sequence_max_indices.to_vec1::()?.iter().enumerate() { + let index = output.i((batch_idx, seq_idx as usize))?.unsqueeze(0)?; + indices.push(index); + } + + let pooled_output = Tensor::cat(&indices, 0)?; + + Ok(pooled_output) + } +} diff --git a/candle-transformers/src/models/clip/vision_model.rs b/candle-transformers/src/models/clip/vision_model.rs new file mode 100644 index 00000000..af9af7ae --- /dev/null +++ b/candle-transformers/src/models/clip/vision_model.rs @@ -0,0 +1,171 @@ +//! Contrastive Language-Image Pre-Training +//! +//! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on +//! pairs of images with related texts. +//! +//! https://github.com/openai/CLIP +//! https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip + +use candle::{IndexOp, Result, Shape, Tensor, D}; +use candle_nn as nn; +use candle_nn::Module; +use nn::Conv2dConfig; +use tracing::warn; + +use super::{ + text_model::{Activation, ClipEncoder}, + EncoderConfig, +}; + +#[derive(Debug, Clone)] +pub struct ClipVisionConfig { + pub embed_dim: usize, + pub activation: Activation, + pub intermediate_size: usize, + pub num_hidden_layers: usize, + pub num_attention_heads: usize, + #[allow(dead_code)] + pub projection_dim: usize, + pub num_channels: usize, + pub image_size: usize, + pub patch_size: usize, +} + +impl ClipVisionConfig { + // The config details can be found in the "vision_config" section of this json file: + // https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json + pub fn vit_base_patch32() -> Self { + Self { + embed_dim: 768, + activation: Activation::QuickGelu, + intermediate_size: 3072, + num_hidden_layers: 12, + num_attention_heads: 12, + projection_dim: 512, + num_channels: 3, + image_size: 224, + patch_size: 32, + } + } +} + +// https://github.com/huggingface/transformers/blob/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip/modeling_clip.py#L112 +#[derive(Debug)] +struct ClipVisionEmbeddings { + patch_embedding: candle_nn::Conv2d, + position_ids: Tensor, + class_embedding: Tensor, + position_embedding: candle_nn::Embedding, +} + +impl ClipVisionEmbeddings { + fn new(vs: candle_nn::VarBuilder, c: &ClipVisionConfig) -> Result { + // originally nn.Parameter + let class_embedding = if vs.contains_tensor("class_embedding") { + vs.get(c.embed_dim, "class_embedding")? + } else { + warn!("class_embedding not found in the. Initializing a new one."); + Tensor::randn(0.0 as f32, 1.0 as f32, &[c.embed_dim], vs.device())? 
+ }; + + let num_patches = (c.image_size / c.patch_size).pow(2); + + let num_positions = num_patches + 1; + + let position_ids = Tensor::arange(0, num_positions as i64, vs.device())?; + + let conv2dconfig = Conv2dConfig { + stride: c.patch_size, + ..Default::default() + }; + let position_embedding = + candle_nn::embedding(num_positions, c.embed_dim, vs.pp("position_embedding"))?; + + let patch_embedding = candle_nn::conv2d_no_bias( + c.num_channels, + c.embed_dim, + c.patch_size, + conv2dconfig, + vs.pp("patch_embedding"), + )?; + + Ok(Self { + patch_embedding, + position_ids, + class_embedding, + position_embedding, + }) + } +} + +impl Module for ClipVisionEmbeddings { + fn forward(&self, pixel_values: &Tensor) -> Result { + let batch_size = pixel_values.shape().dims(); + + let patch_embeds = self.patch_embedding.forward(&pixel_values)?; + + let patch_embeds = patch_embeds.flatten_from(2)?; + + let patch_embeds = patch_embeds.transpose(1, 2)?; + + let class_embedding = self.class_embedding.clone(); + + let shape = Shape::from(vec![batch_size[0], 1, class_embedding.dim(D::Minus1)?]); + + let class_embeds = class_embedding.expand(shape)?; + + let embeddings = Tensor::cat(&[class_embeds, patch_embeds], 1)?; + + let position_embedding = self.position_embedding.forward(&self.position_ids)?; + + let embeddings = embeddings.broadcast_add(&position_embedding)?; + + Ok(embeddings) + } +} + +// https://github.com/huggingface/transformers/blob/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip/modeling_clip.py#L743 +#[derive(Debug)] +pub struct ClipVisionTransformer { + embeddings: ClipVisionEmbeddings, + encoder: ClipEncoder, + pre_layer_norm: candle_nn::LayerNorm, + final_layer_norm: candle_nn::LayerNorm, +} + +impl ClipVisionTransformer { + pub fn new(vs: candle_nn::VarBuilder, c: &ClipVisionConfig) -> Result { + let embeddings = ClipVisionEmbeddings::new(vs.pp("embeddings"), c)?; + + let pre_layer_norm = candle_nn::layer_norm(c.embed_dim, 1e-5, vs.pp("pre_layrnorm"))?; + + let encoder = ClipEncoder::new(vs.pp("encoder"), &EncoderConfig::Vision(c.clone()))?; + + let final_layer_norm = candle_nn::layer_norm(c.embed_dim, 1e-5, vs.pp("post_layernorm"))?; + + Ok(Self { + embeddings, + encoder, + final_layer_norm, + pre_layer_norm, + }) + } +} + +impl Module for ClipVisionTransformer { + fn forward(&self, pixel_values: &Tensor) -> Result { + let hidden_states = self.embeddings.forward(pixel_values)?; + + let hidden_states = self.pre_layer_norm.forward(&hidden_states)?; + + let encoder_outputs = self.encoder.forward(&hidden_states, None)?; + + // https://github.com/huggingface/transformers/blob/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip/modeling_clip.py#L787 + // pooled_output = encoder_outputs[:, 0, :] + let pooled_output = encoder_outputs.i((.., 0, ..))?; + + let output = self.final_layer_norm.forward(&pooled_output)?; + + Ok(output) + } +} diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs index 389d1a80..4267059c 100644 --- a/candle-transformers/src/models/mod.rs +++ b/candle-transformers/src/models/mod.rs @@ -12,6 +12,7 @@ pub mod efficientvit; pub mod encodec; pub mod falcon; pub mod gemma; +pub mod clip; pub mod jina_bert; pub mod llama; pub mod llama2_c; From cdc8b57b5cf28ad92642b076d67e610bdb958b2d Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 28 Mar 2024 14:17:46 +0100 Subject: [PATCH 076/131] Fix clippy lints + minor cleanups. (#1957) * Fix clippy lints + minor cleanups. * fmt. * Derive clone. 
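One idiom this cleanup leans on is candle's `apply` combinator, which replaces explicit `forward` calls so pipelines read top-to-bottom; a minimal sketch (the `Linear` layers are stand-ins):

```rust
use candle_core::{Result, Tensor};
use candle_nn::Linear;

// `x.apply(&layer)` is equivalent to `layer.forward(&x)` and chains cleanly,
// which is how the CLIP projection calls below are rewritten.
fn project(x: &Tensor, proj1: &Linear, proj2: &Linear) -> Result<Tensor> {
    x.apply(proj1)?.apply(proj2)
}
```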
--- candle-transformers/src/models/clip/mod.rs | 33 ++++-------- .../src/models/clip/text_model.rs | 52 ++++++------------ .../src/models/clip/vision_model.rs | 54 ++++++------------- candle-transformers/src/models/mod.rs | 2 +- 4 files changed, 41 insertions(+), 100 deletions(-) diff --git a/candle-transformers/src/models/clip/mod.rs b/candle-transformers/src/models/clip/mod.rs index 02df782b..9613fdab 100644 --- a/candle-transformers/src/models/clip/mod.rs +++ b/candle-transformers/src/models/clip/mod.rs @@ -10,13 +10,11 @@ use self::{ vision_model::ClipVisionTransformer, }; use candle::{Result, Tensor, D}; -use candle_nn::Module; - -use tracing::warn; pub mod text_model; pub mod vision_model; +#[derive(Clone, Debug)] pub struct ClipModel { text_model: ClipTextTransformer, vision_model: ClipVisionTransformer, @@ -25,6 +23,7 @@ pub struct ClipModel { logit_scale: Tensor, } +#[derive(Clone, Debug)] pub enum EncoderConfig { Text(text_model::ClipTextConfig), Vision(vision_model::ClipVisionConfig), @@ -67,6 +66,7 @@ impl EncoderConfig { } } +#[derive(Clone, Debug)] pub struct ClipConfig { pub text_config: text_model::ClipTextConfig, pub vision_config: vision_model::ClipVisionConfig, @@ -111,7 +111,6 @@ impl ClipModel { let logit_scale = if vs.contains_tensor("logit_scale") { vs.get(&[], "logit_scale")? } else { - warn!("Creating logit_scale tensor, results may vary."); Tensor::new(&[c.logit_scale_init_value], vs.device())? }; @@ -125,38 +124,26 @@ impl ClipModel { } pub fn get_text_features(&self, input_ids: &Tensor) -> Result { - let text_outputs = self.text_model.forward(input_ids)?; - - let text_features = self.text_projection.forward(&text_outputs)?; - - Ok(text_features) + input_ids + .apply(&self.text_model)? + .apply(&self.text_projection) } pub fn get_image_features(&self, pixel_values: &Tensor) -> Result { - let image_features = self.vision_model.forward(pixel_values)?; - - let image_features = self.visual_projection.forward(&image_features)?; - - Ok(image_features) + pixel_values + .apply(&self.vision_model)? + .apply(&self.visual_projection) } pub fn forward(&self, pixel_values: &Tensor, input_ids: &Tensor) -> Result<(Tensor, Tensor)> { let image_features = self.get_image_features(pixel_values)?; - let text_features = self.get_text_features(input_ids)?; - let image_features_normalized = div_l2_norm(&image_features)?; - let text_features_normalized = div_l2_norm(&text_features)?; - let logits_per_text = text_features_normalized.matmul(&image_features_normalized.t()?)?; - - let logit_scale = &self.logit_scale.exp()?; - + let logit_scale = self.logit_scale.exp()?; let logits_per_text = logits_per_text.broadcast_mul(&logit_scale)?; - let logits_per_image = logits_per_text.t()?; - Ok((logits_per_text, logits_per_image)) } } diff --git a/candle-transformers/src/models/clip/text_model.rs b/candle-transformers/src/models/clip/text_model.rs index 852d3e24..d3ba26ff 100644 --- a/candle-transformers/src/models/clip/text_model.rs +++ b/candle-transformers/src/models/clip/text_model.rs @@ -59,7 +59,7 @@ impl ClipTextConfig { // ClipTextEmbeddings mostly based on the existing implementation in the stable diffision model. 
// TODO rewrite to be more similar to https://github.com/huggingface/transformers/blob/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip/modeling_clip.py#L142 -#[derive(Debug)] +#[derive(Clone, Debug)] struct ClipTextEmbeddings { token_embedding: candle_nn::Embedding, position_embedding: candle_nn::Embedding, @@ -70,16 +70,13 @@ impl ClipTextEmbeddings { fn new(vs: candle_nn::VarBuilder, c: &ClipTextConfig) -> Result { let token_embedding = candle_nn::embedding(c.vocab_size, c.embed_dim, vs.pp("token_embedding"))?; - let position_embedding: nn::Embedding = candle_nn::embedding( c.max_position_embeddings, c.embed_dim, vs.pp("position_embedding"), )?; - let position_ids = Tensor::arange(0u32, c.max_position_embeddings as u32, vs.device())?.unsqueeze(0)?; - Ok(ClipTextEmbeddings { token_embedding, position_embedding, @@ -91,20 +88,14 @@ impl ClipTextEmbeddings { impl Module for ClipTextEmbeddings { fn forward(&self, input_ids: &Tensor) -> Result { let seq_length = input_ids.dim(D::Minus1)?; - - let inputs_embeds = &self.token_embedding.forward(input_ids)?; - - let postion_ids = &self.position_ids.narrow(1, 0, seq_length)?; - - let position_embedding = &self.position_embedding.forward(&postion_ids)?; - - let inputs_embeds = inputs_embeds.broadcast_add(&position_embedding)?; - - Ok(inputs_embeds) + let inputs_embeds = self.token_embedding.forward(input_ids)?; + let position_ids = self.position_ids.narrow(1, 0, seq_length)?; + let position_embedding = self.position_embedding.forward(&position_ids)?; + inputs_embeds.broadcast_add(&position_embedding) } } -#[derive(Debug)] +#[derive(Clone, Debug)] struct ClipAttention { k_proj: candle_nn::Linear, v_proj: candle_nn::Linear, @@ -166,15 +157,10 @@ impl ClipAttention { let src_len = key_states.dim(1)?; let attn_weights = if let Some(causal_attention_mask) = causal_attention_mask { - let attn_reshape = - attn_weights.reshape((bsz, self.num_attention_heads, seq_len, src_len))?; - - let attn_weights = attn_reshape.broadcast_add(causal_attention_mask)?; - - let attn_weights = - attn_weights.reshape((bsz * self.num_attention_heads, seq_len, src_len))?; - attn_weights + .reshape((bsz, self.num_attention_heads, seq_len, src_len))? + .broadcast_add(causal_attention_mask)? + .reshape((bsz * self.num_attention_heads, seq_len, src_len))? } else { attn_weights }; @@ -190,7 +176,7 @@ impl ClipAttention { } } -#[derive(Debug)] +#[derive(Clone, Debug)] struct ClipMlp { fc1: candle_nn::Linear, fc2: candle_nn::Linear, @@ -217,7 +203,7 @@ impl ClipMlp { } } -#[derive(Debug)] +#[derive(Clone, Debug)] struct ClipEncoderLayer { self_attn: ClipAttention, layer_norm1: candle_nn::LayerNorm, @@ -253,7 +239,7 @@ impl ClipEncoderLayer { } } -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct ClipEncoder { layers: Vec, } @@ -271,7 +257,6 @@ impl ClipEncoder { pub fn forward(&self, xs: &Tensor, causal_attention_mask: Option<&Tensor>) -> Result { let mut xs = xs.clone(); - for layer in self.layers.iter() { xs = layer.forward(&xs, causal_attention_mask)?; } @@ -280,7 +265,7 @@ impl ClipEncoder { } /// A CLIP transformer based model. 
-#[derive(Debug)] +#[derive(Clone, Debug)] pub struct ClipTextTransformer { embeddings: ClipTextEmbeddings, encoder: ClipEncoder, @@ -292,7 +277,6 @@ impl ClipTextTransformer { let embeddings = ClipTextEmbeddings::new(vs.pp("embeddings"), c)?; let encoder = ClipEncoder::new(vs.pp("encoder"), &EncoderConfig::Text(c.clone()))?; let final_layer_norm = candle_nn::layer_norm(c.embed_dim, 1e-5, vs.pp("final_layer_norm"))?; - Ok(ClipTextTransformer { embeddings, encoder, @@ -325,7 +309,6 @@ impl ClipTextTransformer { pub fn forward_with_mask(&self, input_ids: &Tensor, mask_after: usize) -> Result { let (bsz, seq_len) = input_ids.dims2()?; let input_ids = self.embeddings.forward(input_ids)?; - let causal_attention_mask = Self::build_causal_attention_mask(bsz, seq_len, mask_after, input_ids.device())?; let input_ids = self @@ -338,18 +321,13 @@ impl ClipTextTransformer { impl Module for ClipTextTransformer { fn forward(&self, input_ids: &Tensor) -> Result { let output = self.forward_with_mask(input_ids, usize::MAX)?; - let sequence_max_indices = input_ids.argmax(D::Minus1)?.to_dtype(DType::I64)?; - let mut indices: Vec = Vec::new(); - + let mut indices = Vec::new(); for (batch_idx, &seq_idx) in sequence_max_indices.to_vec1::()?.iter().enumerate() { let index = output.i((batch_idx, seq_idx as usize))?.unsqueeze(0)?; indices.push(index); } - - let pooled_output = Tensor::cat(&indices, 0)?; - - Ok(pooled_output) + Tensor::cat(&indices, 0) } } diff --git a/candle-transformers/src/models/clip/vision_model.rs b/candle-transformers/src/models/clip/vision_model.rs index af9af7ae..88992434 100644 --- a/candle-transformers/src/models/clip/vision_model.rs +++ b/candle-transformers/src/models/clip/vision_model.rs @@ -10,7 +10,6 @@ use candle::{IndexOp, Result, Shape, Tensor, D}; use candle_nn as nn; use candle_nn::Module; use nn::Conv2dConfig; -use tracing::warn; use super::{ text_model::{Activation, ClipEncoder}, @@ -50,7 +49,7 @@ impl ClipVisionConfig { } // https://github.com/huggingface/transformers/blob/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip/modeling_clip.py#L112 -#[derive(Debug)] +#[derive(Clone, Debug)] struct ClipVisionEmbeddings { patch_embedding: candle_nn::Conv2d, position_ids: Tensor, @@ -64,14 +63,11 @@ impl ClipVisionEmbeddings { let class_embedding = if vs.contains_tensor("class_embedding") { vs.get(c.embed_dim, "class_embedding")? } else { - warn!("class_embedding not found in the. Initializing a new one."); - Tensor::randn(0.0 as f32, 1.0 as f32, &[c.embed_dim], vs.device())? + Tensor::randn(0f32, 1f32, c.embed_dim, vs.device())? 
}; let num_patches = (c.image_size / c.patch_size).pow(2); - let num_positions = num_patches + 1; - let position_ids = Tensor::arange(0, num_positions as i64, vs.device())?; let conv2dconfig = Conv2dConfig { @@ -80,7 +76,6 @@ impl ClipVisionEmbeddings { }; let position_embedding = candle_nn::embedding(num_positions, c.embed_dim, vs.pp("position_embedding"))?; - let patch_embedding = candle_nn::conv2d_no_bias( c.num_channels, c.embed_dim, @@ -88,7 +83,6 @@ impl ClipVisionEmbeddings { conv2dconfig, vs.pp("patch_embedding"), )?; - Ok(Self { patch_embedding, position_ids, @@ -101,31 +95,21 @@ impl ClipVisionEmbeddings { impl Module for ClipVisionEmbeddings { fn forward(&self, pixel_values: &Tensor) -> Result { let batch_size = pixel_values.shape().dims(); - - let patch_embeds = self.patch_embedding.forward(&pixel_values)?; - - let patch_embeds = patch_embeds.flatten_from(2)?; - - let patch_embeds = patch_embeds.transpose(1, 2)?; - - let class_embedding = self.class_embedding.clone(); - - let shape = Shape::from(vec![batch_size[0], 1, class_embedding.dim(D::Minus1)?]); - - let class_embeds = class_embedding.expand(shape)?; - + let patch_embeds = self + .patch_embedding + .forward(pixel_values)? + .flatten_from(2)? + .transpose(1, 2)?; + let shape = Shape::from((batch_size[0], 1, self.class_embedding.dim(D::Minus1)?)); + let class_embeds = self.class_embedding.expand(shape)?; let embeddings = Tensor::cat(&[class_embeds, patch_embeds], 1)?; - let position_embedding = self.position_embedding.forward(&self.position_ids)?; - - let embeddings = embeddings.broadcast_add(&position_embedding)?; - - Ok(embeddings) + embeddings.broadcast_add(&position_embedding) } } // https://github.com/huggingface/transformers/blob/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip/modeling_clip.py#L743 -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct ClipVisionTransformer { embeddings: ClipVisionEmbeddings, encoder: ClipEncoder, @@ -136,13 +120,9 @@ pub struct ClipVisionTransformer { impl ClipVisionTransformer { pub fn new(vs: candle_nn::VarBuilder, c: &ClipVisionConfig) -> Result { let embeddings = ClipVisionEmbeddings::new(vs.pp("embeddings"), c)?; - let pre_layer_norm = candle_nn::layer_norm(c.embed_dim, 1e-5, vs.pp("pre_layrnorm"))?; - let encoder = ClipEncoder::new(vs.pp("encoder"), &EncoderConfig::Vision(c.clone()))?; - let final_layer_norm = candle_nn::layer_norm(c.embed_dim, 1e-5, vs.pp("post_layernorm"))?; - Ok(Self { embeddings, encoder, @@ -154,18 +134,14 @@ impl ClipVisionTransformer { impl Module for ClipVisionTransformer { fn forward(&self, pixel_values: &Tensor) -> Result { - let hidden_states = self.embeddings.forward(pixel_values)?; - - let hidden_states = self.pre_layer_norm.forward(&hidden_states)?; + let hidden_states = pixel_values + .apply(&self.embeddings)? 
+ .apply(&self.pre_layer_norm)?; let encoder_outputs = self.encoder.forward(&hidden_states, None)?; - // https://github.com/huggingface/transformers/blob/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip/modeling_clip.py#L787 // pooled_output = encoder_outputs[:, 0, :] let pooled_output = encoder_outputs.i((.., 0, ..))?; - - let output = self.final_layer_norm.forward(&pooled_output)?; - - Ok(output) + self.final_layer_norm.forward(&pooled_output) } } diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs index 4267059c..6fbc1844 100644 --- a/candle-transformers/src/models/mod.rs +++ b/candle-transformers/src/models/mod.rs @@ -3,6 +3,7 @@ pub mod bigcode; pub mod blip; pub mod blip_text; pub mod chatglm; +pub mod clip; pub mod convmixer; pub mod convnext; pub mod dinov2; @@ -12,7 +13,6 @@ pub mod efficientvit; pub mod encodec; pub mod falcon; pub mod gemma; -pub mod clip; pub mod jina_bert; pub mod llama; pub mod llama2_c; From c5092f2c2977dbb0b45d16a869d22f4c2790a1e2 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 28 Mar 2024 17:58:06 +0100 Subject: [PATCH 077/131] Add a couple t5 models. (#1958) --- candle-examples/examples/t5/main.rs | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/candle-examples/examples/t5/main.rs b/candle-examples/examples/t5/main.rs index 8ef108b6..be6bc6b5 100644 --- a/candle-examples/examples/t5/main.rs +++ b/candle-examples/examples/t5/main.rs @@ -12,12 +12,19 @@ use anyhow::{Error as E, Result}; use candle::{DType, Device, Tensor}; use candle_nn::VarBuilder; use candle_transformers::generation::LogitsProcessor; -use clap::Parser; +use clap::{Parser, ValueEnum}; use hf_hub::{api::sync::Api, Repo, RepoType}; use tokenizers::Tokenizer; const DTYPE: DType = DType::F32; +#[derive(Clone, Debug, Copy, ValueEnum)] +enum Which { + T5Base, + T5Small, + T5_3B, +} + #[derive(Parser, Debug, Clone)] #[command(author, version, about, long_about = None)] struct Args { @@ -71,6 +78,10 @@ struct Args { /// The context size to consider for the repeat penalty. #[arg(long, default_value_t = 64)] repeat_last_n: usize, + + /// The model to be used. + #[arg(long, default_value = "t5-small")] + which: Which, } struct T5ModelBuilder { @@ -82,8 +93,13 @@ struct T5ModelBuilder { impl T5ModelBuilder { pub fn load(args: &Args) -> Result<(Self, Tokenizer)> { let device = candle_examples::device(args.cpu)?; - let default_model = "t5-small".to_string(); - let default_revision = "refs/pr/15".to_string(); + let (default_model, default_revision) = match args.which { + Which::T5Base => ("t5-base", "main"), + Which::T5Small => ("t5-small", "refs/pr/15"), + Which::T5_3B => ("t5-3b", "main"), + }; + let default_model = default_model.to_string(); + let default_revision = default_revision.to_string(); let (model_id, revision) = match (args.model_id.to_owned(), args.revision.to_owned()) { (Some(model_id), Some(revision)) => (model_id, revision), (Some(model_id), None) => (model_id, "main".to_string()), From 708e422456e8ed783923dd05ca7f4922099eda8c Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 28 Mar 2024 23:10:57 +0100 Subject: [PATCH 078/131] Qwen MoE model. (#1960) * Qwen MoE model. * Add the MoE model to the example. * Fix the scaling. * Readme updates. * Readme tweaks. 
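
A minimal standalone sketch of the per-token routing this adds (illustrative
names; the actual implementation below works on tensors and batches of rows):

```rust
// Pick the top-k experts for one token from softmax-normalized router scores,
// optionally renormalizing the kept weights (the `norm_topk_prob` option).
fn top_k_routing(scores: &[f32], k: usize, renormalize: bool) -> Vec<(usize, f32)> {
    let mut order: Vec<usize> = (0..scores.len()).collect();
    // Sort expert indices by descending routing weight.
    order.sort_by(|&i, &j| scores[j].total_cmp(&scores[i]));
    let mut picked: Vec<(usize, f32)> =
        order.into_iter().take(k).map(|i| (i, scores[i])).collect();
    if renormalize {
        let total: f32 = picked.iter().map(|&(_, w)| w).sum();
        for (_, w) in picked.iter_mut() {
            *w /= total;
        }
    }
    picked
}
```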
--- README.md | 4 +- candle-examples/examples/qwen/README.md | 27 ++ candle-examples/examples/qwen/main.rs | 38 +- candle-transformers/src/models/mod.rs | 1 + candle-transformers/src/models/qwen2_moe.rs | 488 ++++++++++++++++++++ 5 files changed, 553 insertions(+), 5 deletions(-) create mode 100644 candle-examples/examples/qwen/README.md create mode 100644 candle-transformers/src/models/qwen2_moe.rs diff --git a/README.md b/README.md index b0a3b118..1208956c 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,8 @@ We also provide a some command line based examples using state of the art models [RepVGG](./candle-examples/examples/repvgg): computer vision models. - [BLIP](./candle-examples/examples/blip/): image to text model, can be used to generate captions for an image. +- [CLIP](./candle-examples/examples/clip/): multi-model vision and language + model. - [TrOCR](./candle-examples/examples/trocr/): a transformer OCR model, with dedicated submodels for hand-writing and printed recognition. - [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation @@ -206,7 +208,7 @@ If you have an addition to this list, please submit a pull request. - Replit-code-v1.5-3B. - Bert. - Yi-6B and Yi-34B. - - Qwen1.5. + - Qwen1.5, Qwen1.5 MoE. - RWKV v5 and v6. - Quantized LLMs. - Llama 7b, 13b, 70b, as well as the chat and code variants. diff --git a/candle-examples/examples/qwen/README.md b/candle-examples/examples/qwen/README.md new file mode 100644 index 00000000..44a50b72 --- /dev/null +++ b/candle-examples/examples/qwen/README.md @@ -0,0 +1,27 @@ +# candle-qwen: large language model series from Alibaba Cloud + +Qwen 1.5 is a series of large language models that provide strong performances +on English and Chinese. + +- [Blog post](https://qwenlm.github.io/blog/qwen1.5/) introducing Qwen1.5. +- [Model card](https://huggingface.co/Qwen/Qwen1.5-0.5B) on the HuggingFace Hub. +- [Blog post](https://qwenlm.github.io/blog/qwen-moe/) for the + mixture-of-experts (MoE) variant. + +## Running the example + +```bash +$ cargo run --example qwen --release -- --prompt "Hello there " +``` + +Various model sizes are available via the `--model` argument, including the MoE +variant. 
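+The accepted values are `0.5b`, `1.8b`, `4b`, `7b`, `14b`, `72b`, and
+`moe-a2.7b`.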
+ +```bash +$ cargo run --example qwen --release -- --prompt "Hello there " --model moe-a2.7b --prompt 'def print_prime(n: int): ' +def print_prime(n: int): # n is the number of primes to be printed + for i in range(2, n + 1): + if all(i % j != 0 for j in range(2, i)): + print(i) +``` + diff --git a/candle-examples/examples/qwen/main.rs b/candle-examples/examples/qwen/main.rs index d040d4b0..a203ad8e 100644 --- a/candle-examples/examples/qwen/main.rs +++ b/candle-examples/examples/qwen/main.rs @@ -7,7 +7,8 @@ extern crate accelerate_src; use anyhow::{Error as E, Result}; use clap::Parser; -use candle_transformers::models::qwen2::{Config, Model}; +use candle_transformers::models::qwen2::{Config as ConfigBase, Model as ModelBase}; +use candle_transformers::models::qwen2_moe::{Config as ConfigMoe, Model as ModelMoe}; use candle::{DType, Device, Tensor}; use candle_examples::token_output_stream::TokenOutputStream; @@ -16,6 +17,20 @@ use candle_transformers::generation::LogitsProcessor; use hf_hub::{api::sync::Api, Repo, RepoType}; use tokenizers::Tokenizer; +enum Model { + Base(ModelBase), + Moe(ModelMoe), +} + +impl Model { + fn forward(&mut self, xs: &Tensor, s: usize) -> candle::Result { + match self { + Self::Moe(ref mut m) => m.forward(xs, s), + Self::Base(ref mut m) => m.forward(xs, s), + } + } +} + struct TextGeneration { model: Model, device: Device, @@ -127,6 +142,8 @@ enum WhichModel { W14b, #[value(name = "72b")] W72b, + #[value(name = "moe-a2.7b")] + MoeA27b, } #[derive(Parser, Debug)] @@ -224,6 +241,7 @@ fn main() -> Result<()> { WhichModel::W7b => "7B", WhichModel::W14b => "14B", WhichModel::W72b => "72B", + WhichModel::MoeA27b => "MoE-A2.7B", }; format!("Qwen/Qwen1.5-{size}") } @@ -244,7 +262,11 @@ fn main() -> Result<()> { .collect::>(), None => match args.model { WhichModel::W0_5b | WhichModel::W1_8b => vec![repo.get("model.safetensors")?], - WhichModel::W4b | WhichModel::W7b | WhichModel::W14b | WhichModel::W72b => { + WhichModel::W4b + | WhichModel::W7b + | WhichModel::W14b + | WhichModel::W72b + | WhichModel::MoeA27b => { candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")? } }, @@ -254,7 +276,6 @@ fn main() -> Result<()> { let start = std::time::Instant::now(); let config_file = repo.get("config.json")?; - let config: Config = serde_json::from_slice(&std::fs::read(config_file)?)?; let device = candle_examples::device(args.cpu)?; let dtype = if device.is_cuda() { DType::BF16 @@ -262,7 +283,16 @@ fn main() -> Result<()> { DType::F32 }; let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? }; - let model = Model::new(&config, vb)?; + let model = match args.model { + WhichModel::MoeA27b => { + let config: ConfigMoe = serde_json::from_slice(&std::fs::read(config_file)?)?; + Model::Moe(ModelMoe::new(&config, vb)?) + } + _ => { + let config: ConfigBase = serde_json::from_slice(&std::fs::read(config_file)?)?; + Model::Base(ModelBase::new(&config, vb)?) 
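+                // NB: the two arms deserialize different config schemas; the MoE
+                // config carries extra fields such as the expert count and sparse step.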
+ } + }; println!("loaded the model in {:?}", start.elapsed()); diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs index 6fbc1844..980ba535 100644 --- a/candle-transformers/src/models/mod.rs +++ b/candle-transformers/src/models/mod.rs @@ -40,6 +40,7 @@ pub mod quantized_rwkv_v6; pub mod quantized_stable_lm; pub mod quantized_t5; pub mod qwen2; +pub mod qwen2_moe; pub mod repvgg; pub mod resnet; pub mod rwkv_v5; diff --git a/candle-transformers/src/models/qwen2_moe.rs b/candle-transformers/src/models/qwen2_moe.rs new file mode 100644 index 00000000..d6566e90 --- /dev/null +++ b/candle-transformers/src/models/qwen2_moe.rs @@ -0,0 +1,488 @@ +use crate::models::with_tracing::{linear, linear_no_bias, Linear, RmsNorm}; +use candle::{DType, Device, Module, Result, Tensor, D}; +use candle_nn::{Activation, VarBuilder}; +use std::sync::Arc; + +#[derive(Debug, Clone, PartialEq, serde::Deserialize)] +pub struct Config { + pub vocab_size: usize, + pub hidden_size: usize, + pub intermediate_size: usize, + pub num_hidden_layers: usize, + pub num_attention_heads: usize, + pub num_key_value_heads: usize, + pub max_position_embeddings: usize, + pub sliding_window: usize, + pub max_window_layers: usize, + pub tie_word_embeddings: bool, + pub rope_theta: f64, + pub rms_norm_eps: f64, + pub use_sliding_window: bool, + pub hidden_act: Activation, + pub decoder_sparse_step: usize, + pub moe_intermediate_size: usize, + pub shared_expert_intermediate_size: usize, + pub num_experts_per_tok: usize, + pub num_experts: usize, + pub norm_topk_prob: bool, +} + +#[derive(Debug, Clone)] +struct RotaryEmbedding { + sin: Tensor, + cos: Tensor, +} + +fn rotate_half(xs: &Tensor) -> Result { + let last_dim = xs.dim(D::Minus1)?; + let xs1 = xs.narrow(D::Minus1, 0, last_dim / 2)?; + let xs2 = xs.narrow(D::Minus1, last_dim / 2, last_dim - last_dim / 2)?; + Tensor::cat(&[&xs2.neg()?, &xs1], D::Minus1) +} + +impl RotaryEmbedding { + fn new(dtype: DType, cfg: &Config, dev: &Device) -> Result { + let dim = cfg.hidden_size / cfg.num_attention_heads; + let max_seq_len = cfg.max_position_embeddings; + let inv_freq: Vec<_> = (0..dim) + .step_by(2) + .map(|i| 1f32 / cfg.rope_theta.powf(i as f64 / dim as f64) as f32) + .collect(); + let inv_freq_len = inv_freq.len(); + let inv_freq = Tensor::from_vec(inv_freq, (1, inv_freq_len), dev)?.to_dtype(dtype)?; + let t = Tensor::arange(0u32, max_seq_len as u32, dev)? + .to_dtype(dtype)? + .reshape((max_seq_len, 1))?; + let freqs = t.matmul(&inv_freq)?; + let freqs = Tensor::cat(&[&freqs, &freqs], D::Minus1)?; + Ok(Self { + sin: freqs.sin()?, + cos: freqs.cos()?, + }) + } + + fn apply_rotary_emb_qkv( + &self, + q: &Tensor, + k: &Tensor, + seqlen_offset: usize, + ) -> Result<(Tensor, Tensor)> { + let (_b_sz, _h, seq_len, _n_embd) = q.dims4()?; + let cos = self.cos.narrow(0, seqlen_offset, seq_len)?; + let sin = self.sin.narrow(0, seqlen_offset, seq_len)?; + let cos = cos.unsqueeze(0)?.unsqueeze(0)?; // (1, 1, seq_len, dim) + let sin = sin.unsqueeze(0)?.unsqueeze(0)?; // (1, 1, seq_len, dim) + let q_embed = (q.broadcast_mul(&cos)? + rotate_half(q)?.broadcast_mul(&sin))?; + let k_embed = (k.broadcast_mul(&cos)? 
+ rotate_half(k)?.broadcast_mul(&sin))?; + Ok((q_embed, k_embed)) + } +} + +#[derive(Debug, Clone)] +#[allow(clippy::upper_case_acronyms)] +struct MLP { + gate_proj: Linear, + up_proj: Linear, + down_proj: Linear, + act_fn: Activation, +} + +impl MLP { + fn new(intermediate_sz: usize, cfg: &Config, vb: VarBuilder) -> Result { + let hidden_sz = cfg.hidden_size; + let gate_proj = linear_no_bias(hidden_sz, intermediate_sz, vb.pp("gate_proj"))?; + let up_proj = linear_no_bias(hidden_sz, intermediate_sz, vb.pp("up_proj"))?; + let down_proj = linear_no_bias(intermediate_sz, hidden_sz, vb.pp("down_proj"))?; + Ok(Self { + gate_proj, + up_proj, + down_proj, + act_fn: cfg.hidden_act, + }) + } +} + +impl Module for MLP { + fn forward(&self, xs: &Tensor) -> Result { + let lhs = xs.apply(&self.gate_proj)?.apply(&self.act_fn)?; + let rhs = xs.apply(&self.up_proj)?; + (lhs * rhs)?.apply(&self.down_proj) + } +} + +#[derive(Debug, Clone)] +struct Attention { + q_proj: Linear, + k_proj: Linear, + v_proj: Linear, + o_proj: Linear, + num_heads: usize, + num_kv_heads: usize, + num_kv_groups: usize, + head_dim: usize, + hidden_size: usize, + rotary_emb: Arc, + kv_cache: Option<(Tensor, Tensor)>, +} + +impl Attention { + fn new(rotary_emb: Arc, cfg: &Config, vb: VarBuilder) -> Result { + let hidden_sz = cfg.hidden_size; + let num_heads = cfg.num_attention_heads; + let num_kv_heads = cfg.num_key_value_heads; + let num_kv_groups = num_heads / num_kv_heads; + let head_dim = hidden_sz / num_heads; + let q_proj = linear(hidden_sz, num_heads * head_dim, vb.pp("q_proj"))?; + let k_proj = linear(hidden_sz, num_kv_heads * head_dim, vb.pp("k_proj"))?; + let v_proj = linear(hidden_sz, num_kv_heads * head_dim, vb.pp("v_proj"))?; + let o_proj = linear_no_bias(num_heads * head_dim, hidden_sz, vb.pp("o_proj"))?; + Ok(Self { + q_proj, + k_proj, + v_proj, + o_proj, + num_heads, + num_kv_heads, + num_kv_groups, + head_dim, + hidden_size: hidden_sz, + rotary_emb, + kv_cache: None, + }) + } + + fn repeat_kv(&self, xs: Tensor) -> Result { + let n_rep = self.num_kv_groups; + if n_rep == 1 { + Ok(xs) + } else { + let (b_sz, num_kv_heads, seq_len, head_dim) = xs.dims4()?; + xs.unsqueeze(2)? + .expand((b_sz, num_kv_heads, n_rep, seq_len, head_dim))? + .reshape((b_sz, num_kv_heads * n_rep, seq_len, head_dim)) + } + } + + fn forward( + &mut self, + xs: &Tensor, + attention_mask: Option<&Tensor>, + seqlen_offset: usize, + ) -> Result { + let (b_sz, q_len, _) = xs.dims3()?; + + let query_states = self.q_proj.forward(xs)?; + let key_states = self.k_proj.forward(xs)?; + let value_states = self.v_proj.forward(xs)?; + + let query_states = query_states + .reshape((b_sz, q_len, self.num_heads, self.head_dim))? + .transpose(1, 2)?; + let key_states = key_states + .reshape((b_sz, q_len, self.num_kv_heads, self.head_dim))? + .transpose(1, 2)?; + let value_states = value_states + .reshape((b_sz, q_len, self.num_kv_heads, self.head_dim))? 
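+            // -> (b_sz, num_kv_heads, q_len, head_dim)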
+ .transpose(1, 2)?; + + let (query_states, key_states) = + self.rotary_emb + .apply_rotary_emb_qkv(&query_states, &key_states, seqlen_offset)?; + + let (key_states, value_states) = match &self.kv_cache { + None => (key_states, value_states), + Some((prev_k, prev_v)) => { + let key_states = Tensor::cat(&[prev_k, &key_states], 2)?; + let value_states = Tensor::cat(&[prev_v, &value_states], 2)?; + (key_states, value_states) + } + }; + self.kv_cache = Some((key_states.clone(), value_states.clone())); + + let key_states = self.repeat_kv(key_states)?.contiguous()?; + let value_states = self.repeat_kv(value_states)?.contiguous()?; + + let attn_output = { + let scale = 1f64 / f64::sqrt(self.head_dim as f64); + let attn_weights = (query_states.matmul(&key_states.transpose(2, 3)?)? * scale)?; + + let attn_weights = match attention_mask { + None => attn_weights, + Some(mask) => attn_weights.broadcast_add(mask)?, + }; + let attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?; + attn_weights.matmul(&value_states)? + }; + attn_output + .transpose(1, 2)? + .reshape((b_sz, q_len, self.hidden_size))? + .apply(&self.o_proj) + } + + fn clear_kv_cache(&mut self) { + self.kv_cache = None + } +} + +// https://github.com/huggingface/transformers/blob/536ea2aca234fb48c5c69769431d643b0d93b233/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py#L800 +#[derive(Debug, Clone)] +struct SparseMoeBlock { + gate: Linear, + experts: Vec, + shared_expert: MLP, + shared_expert_gate: Linear, + norm_topk_prob: bool, + num_experts_per_tok: usize, +} + +impl SparseMoeBlock { + fn new(cfg: &Config, vb: VarBuilder) -> Result { + let gate = linear_no_bias(cfg.hidden_size, cfg.num_experts, vb.pp("gate"))?; + let mut experts = Vec::with_capacity(cfg.num_experts); + let vb_e = vb.pp("experts"); + for idx in 0..cfg.num_experts { + let expert = MLP::new(cfg.moe_intermediate_size, cfg, vb_e.pp(idx))?; + experts.push(expert) + } + let shared_expert = MLP::new( + cfg.shared_expert_intermediate_size, + cfg, + vb.pp("shared_expert"), + )?; + let shared_expert_gate = linear_no_bias(cfg.hidden_size, 1, vb.pp("shared_expert_gate"))?; + Ok(Self { + gate, + experts, + shared_expert, + shared_expert_gate, + norm_topk_prob: cfg.norm_topk_prob, + num_experts_per_tok: cfg.num_experts_per_tok, + }) + } +} + +impl Module for SparseMoeBlock { + fn forward(&self, xs: &Tensor) -> Result { + let (b_size, seq_len, hidden_dim) = xs.dims3()?; + let xs = xs.reshape(((), hidden_dim))?; + let router_logits = xs.apply(&self.gate)?; + let routing_weights = candle_nn::ops::softmax_last_dim(&router_logits)?; + + // In order to extract topk, we extract the data from the tensor and manipulate it + // directly. Maybe we will want to use some custom ops instead at some point. + let routing_weights = routing_weights.to_dtype(DType::F32)?.to_vec2::()?; + + // routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1) + // top_x contains the row indexes to evaluate for each expert. 
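+        // For every token row: rank the experts by routing weight, keep the top
+        // `num_experts_per_tok`, and optionally renormalize the kept weights so they
+        // sum to one (`norm_topk_prob`) before scattering them into per-expert lists.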
+ let mut top_x = vec![vec![]; self.experts.len()]; + let mut selected_experts = vec![vec![]; self.experts.len()]; + for (row_idx, rw) in routing_weights.iter().enumerate() { + let mut dst = (0..rw.len() as u32).collect::>(); + dst.sort_by(|&i, &j| rw[j as usize].total_cmp(&rw[i as usize])); + let mut sum_routing_weights = 0f32; + for &expert_idx in dst.iter().take(self.num_experts_per_tok) { + let expert_idx = expert_idx as usize; + let routing_weight = rw[expert_idx]; + sum_routing_weights += routing_weight; + top_x[expert_idx].push(row_idx as u32); + } + for &expert_idx in dst.iter().take(self.num_experts_per_tok) { + let expert_idx = expert_idx as usize; + let routing_weight = if self.norm_topk_prob { + rw[expert_idx] / sum_routing_weights + } else { + rw[expert_idx] + }; + selected_experts[expert_idx].push(routing_weight) + } + } + + let mut ys = xs.zeros_like()?; + for (expert_idx, expert_layer) in self.experts.iter().enumerate() { + let top_x = &top_x[expert_idx]; + if top_x.is_empty() { + continue; + } + let top_x = Tensor::new(top_x.as_slice(), xs.device())?; + let selected_experts = + Tensor::new(selected_experts[expert_idx].as_slice(), xs.device())? + .reshape(((), 1))? + .to_dtype(xs.dtype())?; + // Index the correct hidden states and compute the expert hidden state for + // the current expert. We need to make sure to multiply the output hidden + // states by `routing_weights` on the corresponding tokens (top-1 and top-2) + let current_state = xs.index_select(&top_x, 0)?.reshape(((), hidden_dim))?; + // current_hidden_states = expert_layer(current_state, routing_weights[top_x_list, idx_list, None]) + let current_hidden_states = expert_layer.forward(¤t_state)?; + let current_hidden_states = current_hidden_states.broadcast_mul(&selected_experts)?; + ys = ys.index_add(&top_x, ¤t_hidden_states, 0)?; + } + let shared_expert_output = xs.apply(&self.shared_expert)?; + let shared_expert_output = shared_expert_output.broadcast_mul(&candle_nn::ops::sigmoid( + &xs.apply(&self.shared_expert_gate)?, + )?)?; + let ys = (ys + shared_expert_output)?; + let ys = ys.reshape((b_size, seq_len, hidden_dim))?; + Ok(ys) + } +} + +#[derive(Debug, Clone)] +enum MlpOrMoeBlock { + Mlp(MLP), + MoeBlock(SparseMoeBlock), +} + +impl Module for MlpOrMoeBlock { + fn forward(&self, xs: &Tensor) -> Result { + match self { + Self::MoeBlock(m) => m.forward(xs), + Self::Mlp(m) => m.forward(xs), + } + } +} + +#[derive(Debug, Clone)] +struct DecoderLayer { + self_attn: Attention, + mlp: MlpOrMoeBlock, + input_layernorm: RmsNorm, + post_attention_layernorm: RmsNorm, +} + +impl DecoderLayer { + fn new( + layer_idx: usize, + rotary_emb: Arc, + cfg: &Config, + vb: VarBuilder, + ) -> Result { + let self_attn = Attention::new(rotary_emb, cfg, vb.pp("self_attn"))?; + let mlp = if cfg.num_experts > 0 && (layer_idx + 1) % cfg.decoder_sparse_step == 0 { + MlpOrMoeBlock::MoeBlock(SparseMoeBlock::new(cfg, vb.pp("mlp"))?) + } else { + MlpOrMoeBlock::Mlp(MLP::new(cfg.intermediate_size, cfg, vb.pp("mlp"))?) 
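+            // Layers that do not hit the sparse step (or configs with no experts)
+            // fall back to a dense MLP.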
+ }; + let input_layernorm = + RmsNorm::new(cfg.hidden_size, cfg.rms_norm_eps, vb.pp("input_layernorm"))?; + let post_attention_layernorm = RmsNorm::new( + cfg.hidden_size, + cfg.rms_norm_eps, + vb.pp("post_attention_layernorm"), + )?; + Ok(Self { + self_attn, + mlp, + input_layernorm, + post_attention_layernorm, + }) + } + + fn forward( + &mut self, + xs: &Tensor, + attention_mask: Option<&Tensor>, + seqlen_offset: usize, + ) -> Result { + let residual = xs; + let xs = self.input_layernorm.forward(xs)?; + let xs = self.self_attn.forward(&xs, attention_mask, seqlen_offset)?; + let xs = (xs + residual)?; + let residual = &xs; + let xs = xs.apply(&self.post_attention_layernorm)?.apply(&self.mlp)?; + residual + xs + } + + fn clear_kv_cache(&mut self) { + self.self_attn.clear_kv_cache() + } +} + +#[derive(Debug, Clone)] +pub struct Model { + embed_tokens: candle_nn::Embedding, + layers: Vec, + norm: RmsNorm, + lm_head: Linear, + sliding_window: usize, + device: Device, + dtype: DType, +} + +impl Model { + pub fn new(cfg: &Config, vb: VarBuilder) -> Result { + let vb_m = vb.pp("model"); + let embed_tokens = + candle_nn::embedding(cfg.vocab_size, cfg.hidden_size, vb_m.pp("embed_tokens"))?; + let rotary_emb = Arc::new(RotaryEmbedding::new(vb.dtype(), cfg, vb_m.device())?); + let mut layers = Vec::with_capacity(cfg.num_hidden_layers); + let vb_l = vb_m.pp("layers"); + for layer_idx in 0..cfg.num_hidden_layers { + let layer = DecoderLayer::new(layer_idx, rotary_emb.clone(), cfg, vb_l.pp(layer_idx))?; + layers.push(layer) + } + let norm = RmsNorm::new(cfg.hidden_size, cfg.rms_norm_eps, vb_m.pp("norm"))?; + let lm_head = linear_no_bias(cfg.hidden_size, cfg.vocab_size, vb.pp("lm_head"))?; + Ok(Self { + embed_tokens, + layers, + norm, + lm_head, + sliding_window: cfg.sliding_window, + device: vb.device().clone(), + dtype: vb.dtype(), + }) + } + + fn prepare_decoder_attention_mask( + &self, + b_size: usize, + tgt_len: usize, + seqlen_offset: usize, + ) -> Result { + // Sliding window mask? + let mask: Vec<_> = (0..tgt_len) + .flat_map(|i| { + (0..tgt_len).map(move |j| { + if i < j || j + self.sliding_window < i { + f32::NEG_INFINITY + } else { + 0. + } + }) + }) + .collect(); + let mask = Tensor::from_slice(&mask, (tgt_len, tgt_len), &self.device)?; + let mask = if seqlen_offset > 0 { + let mask0 = Tensor::zeros((tgt_len, seqlen_offset), DType::F32, &self.device)?; + Tensor::cat(&[&mask0, &mask], D::Minus1)? + } else { + mask + }; + mask.expand((b_size, 1, tgt_len, tgt_len + seqlen_offset))? + .to_dtype(self.dtype) + } + + pub fn forward(&mut self, input_ids: &Tensor, seqlen_offset: usize) -> Result { + let (b_size, seq_len) = input_ids.dims2()?; + let attention_mask = if seq_len <= 1 { + None + } else { + let mask = self.prepare_decoder_attention_mask(b_size, seq_len, seqlen_offset)?; + Some(mask) + }; + let mut xs = self.embed_tokens.forward(input_ids)?; + for layer in self.layers.iter_mut() { + xs = layer.forward(&xs, attention_mask.as_ref(), seqlen_offset)? + } + xs.narrow(1, seq_len - 1, 1)? + .apply(&self.norm)? + .apply(&self.lm_head) + } + + pub fn clear_kv_cache(&mut self) { + for layer in self.layers.iter_mut() { + layer.clear_kv_cache() + } + } +} From eb1b27abcd1402f268e9cce143065fdfe3a7d693 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 28 Mar 2024 23:24:46 +0100 Subject: [PATCH 079/131] Readme fix. 
(#1961) --- candle-examples/examples/qwen/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/candle-examples/examples/qwen/README.md b/candle-examples/examples/qwen/README.md index 44a50b72..cb785f21 100644 --- a/candle-examples/examples/qwen/README.md +++ b/candle-examples/examples/qwen/README.md @@ -18,7 +18,7 @@ Various model sizes are available via the `--model` argument, including the MoE variant. ```bash -$ cargo run --example qwen --release -- --prompt "Hello there " --model moe-a2.7b --prompt 'def print_prime(n: int): ' +$ cargo run --example qwen --release -- --model moe-a2.7b --prompt 'def print_prime(n: int): ' def print_prime(n: int): # n is the number of primes to be printed for i in range(2, n + 1): if all(i % j != 0 for j in range(2, i)): From 8ad12a0e81849d0bdb2e2b59d0f18e2b54174cd0 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Fri, 29 Mar 2024 18:09:29 +0100 Subject: [PATCH 080/131] Add some examples using the MT5 variants. (#1963) --- candle-examples/examples/t5/main.rs | 52 ++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/candle-examples/examples/t5/main.rs b/candle-examples/examples/t5/main.rs index be6bc6b5..34ae0ead 100644 --- a/candle-examples/examples/t5/main.rs +++ b/candle-examples/examples/t5/main.rs @@ -23,6 +23,9 @@ enum Which { T5Base, T5Small, T5_3B, + Mt5Base, + Mt5Small, + Mt5Large, } #[derive(Parser, Debug, Clone)] @@ -43,6 +46,15 @@ struct Args { #[arg(long)] revision: Option, + #[arg(long)] + model_file: Option, + + #[arg(long)] + tokenizer_file: Option, + + #[arg(long)] + config_file: Option, + /// Enable decoding. #[arg(long)] decode: bool, @@ -97,6 +109,9 @@ impl T5ModelBuilder { Which::T5Base => ("t5-base", "main"), Which::T5Small => ("t5-small", "refs/pr/15"), Which::T5_3B => ("t5-3b", "main"), + Which::Mt5Base => ("google/mt5-base", "refs/pr/5"), + Which::Mt5Small => ("google/mt5-small", "refs/pr/6"), + Which::Mt5Large => ("google/mt5-large", "refs/pr/2"), }; let default_model = default_model.to_string(); let default_revision = default_revision.to_string(); @@ -109,14 +124,35 @@ impl T5ModelBuilder { let repo = Repo::with_revision(model_id.clone(), RepoType::Model, revision); let api = Api::new()?; - let api = api.repo(repo); - let config_filename = api.get("config.json")?; - let tokenizer_filename = api.get("tokenizer.json")?; - let weights_filename = if model_id == "google/flan-t5-xxl" || model_id == "google/flan-ul2" - { - candle_examples::hub_load_safetensors(&api, "model.safetensors.index.json")? - } else { - vec![api.get("model.safetensors")?] + let repo = api.repo(repo); + let config_filename = match &args.config_file { + None => repo.get("config.json")?, + Some(f) => f.into(), + }; + let tokenizer_filename = match &args.tokenizer_file { + None => match args.which { + Which::Mt5Base => api + .model("lmz/mt5-tokenizers".into()) + .get("mt5-base.tokenizer.json")?, + Which::Mt5Small => api + .model("lmz/mt5-tokenizers".into()) + .get("mt5-small.tokenizer.json")?, + Which::Mt5Large => api + .model("lmz/mt5-tokenizers".into()) + .get("mt5-large.tokenizer.json")?, + _ => repo.get("tokenizer.json")?, + }, + Some(f) => f.into(), + }; + let weights_filename = match &args.model_file { + Some(f) => f.split(',').map(|v| v.into()).collect::>(), + None => { + if model_id == "google/flan-t5-xxl" || model_id == "google/flan-ul2" { + candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")? + } else { + vec![repo.get("model.safetensors")?] 
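+                    // Sharded checkpoints are resolved via the safetensors index
+                    // file above; single-file models are fetched directly.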
+ } + } }; let config = std::fs::read_to_string(config_filename)?; let mut config: t5::Config = serde_json::from_str(&config)?; From 7ecbc6d50b778f226e52029170c1696399fea51b Mon Sep 17 00:00:00 2001 From: Marco Inacio Date: Fri, 29 Mar 2024 17:09:57 +0000 Subject: [PATCH 081/131] fix minor typo (#1924) --- candle-core/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs index 1508b9c0..911e379f 100644 --- a/candle-core/src/lib.rs +++ b/candle-core/src/lib.rs @@ -14,7 +14,7 @@ //! //! ## Features //! -//! - Simple syntax (looks and like PyTorch) +//! - Simple syntax (looks and feels like PyTorch) //! - CPU and Cuda backends (and M1 support) //! - Enable serverless (CPU) small and fast deployments //! - Model training From 356a170ae92ea85411e605de1be2685b4c923358 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 29 Mar 2024 21:58:15 +0100 Subject: [PATCH 082/131] Update parquet requirement from 50.0.0 to 51.0.0 (#1867) Updates the requirements on [parquet](https://github.com/apache/arrow-rs) to permit the latest version. - [Changelog](https://github.com/apache/arrow-rs/blob/master/CHANGELOG-old.md) - [Commits](https://github.com/apache/arrow-rs/compare/50.0.0...50.0.0) --- updated-dependencies: - dependency-name: parquet dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 1f0067b7..d71cc4bc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,7 +55,7 @@ log = "0.4" memmap2 = { version = "0.9.3", features = ["stable_deref_trait"] } num_cpus = "1.15.0" num-traits = "0.2.15" -parquet = { version = "50.0.0" } +parquet = { version = "51.0.0" } rand = "0.8.5" rand_distr = "0.4.3" rayon = "1.7.0" From 665da304878326e267b178fa6e6d85424249126b Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Fri, 29 Mar 2024 23:02:11 +0100 Subject: [PATCH 083/131] Backend refactoring. (#1966) * Backend refactoring. * Metal tweaks. * Move the cudnn module. 
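
As a sketch of what the extracted helpers are for (paths as re-exported by the
new `cpu_backend/mod.rs`; the `Square` kernel is hypothetical, not part of this
change):

```rust
use candle_core::cpu_backend::{unary_map, Map1};
use candle_core::{Layout, Result, WithDType};

// A hypothetical elementwise kernel that squares every input element.
struct Square;

impl Map1 for Square {
    fn f<T: WithDType>(&self, vs: &[T], layout: &Layout) -> Result<Vec<T>> {
        // unary_map walks the (possibly strided) layout and applies the closure,
        // producing a contiguous output vector.
        Ok(unary_map(vs, layout, |v| v * v))
    }
}
```

`Square.map(&storage, &layout)` then dispatches on the dtype of the input
CpuStorage and returns a storage of the same dtype.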
--- .../{cpu_backend.rs => cpu_backend/mod.rs} | 370 +----------- candle-core/src/cpu_backend/utils.rs | 350 ++++++++++++ candle-core/src/{ => cuda_backend}/cudnn.rs | 0 candle-core/src/cuda_backend/device.rs | 410 +++++++++++++ .../{cuda_backend.rs => cuda_backend/mod.rs} | 539 +----------------- candle-core/src/cuda_backend/utils.rs | 134 +++++ candle-core/src/lib.rs | 5 +- candle-core/src/metal_backend/device.rs | 287 ++++++++++ .../mod.rs} | 291 +--------- 9 files changed, 1202 insertions(+), 1184 deletions(-) rename candle-core/src/{cpu_backend.rs => cpu_backend/mod.rs} (87%) create mode 100644 candle-core/src/cpu_backend/utils.rs rename candle-core/src/{ => cuda_backend}/cudnn.rs (100%) create mode 100644 candle-core/src/cuda_backend/device.rs rename candle-core/src/{cuda_backend.rs => cuda_backend/mod.rs} (78%) create mode 100644 candle-core/src/cuda_backend/utils.rs create mode 100644 candle-core/src/metal_backend/device.rs rename candle-core/src/{metal_backend.rs => metal_backend/mod.rs} (86%) diff --git a/candle-core/src/cpu_backend.rs b/candle-core/src/cpu_backend/mod.rs similarity index 87% rename from candle-core/src/cpu_backend.rs rename to candle-core/src/cpu_backend/mod.rs index 6d2ba361..d686440a 100644 --- a/candle-core/src/cpu_backend.rs +++ b/candle-core/src/cpu_backend/mod.rs @@ -4,6 +4,11 @@ use crate::{DType, Error, IntDType, Layout, Result, Shape, WithDType}; use half::{bf16, f16}; use rayon::prelude::*; +mod utils; +pub use utils::{ + binary_map, binary_map_vec, unary_map, unary_map_vec, Map1, Map1Any, Map2, Map2U8, +}; + const USE_IM2COL_CONV1D: bool = true; const USE_IM2COL_CONV1D_TR: bool = true; const USE_IM2COL_CONV2D: bool = true; @@ -24,102 +29,6 @@ pub enum CpuStorage { #[derive(Debug, Clone)] pub struct CpuDevice; -pub trait Map1 { - fn f(&self, vs: &[T], layout: &Layout) -> Result>; - - fn map(&self, vs: &CpuStorage, layout: &Layout) -> Result { - match vs { - CpuStorage::U8(vs) => Ok(CpuStorage::U8(self.f(vs, layout)?)), - CpuStorage::U32(vs) => Ok(CpuStorage::U32(self.f(vs, layout)?)), - CpuStorage::I64(vs) => Ok(CpuStorage::I64(self.f(vs, layout)?)), - CpuStorage::BF16(vs) => Ok(CpuStorage::BF16(self.f(vs, layout)?)), - CpuStorage::F16(vs) => Ok(CpuStorage::F16(self.f(vs, layout)?)), - CpuStorage::F32(vs) => Ok(CpuStorage::F32(self.f(vs, layout)?)), - CpuStorage::F64(vs) => Ok(CpuStorage::F64(self.f(vs, layout)?)), - } - } -} - -pub trait Map1Any { - fn f) -> CpuStorage>( - &self, - vs: &[T], - layout: &Layout, - wrap: W, - ) -> Result; - - fn map(&self, vs: &CpuStorage, layout: &Layout) -> Result { - match vs { - CpuStorage::U8(vs) => Ok(self.f(vs, layout, CpuStorage::U8)?), - CpuStorage::U32(vs) => Ok(self.f(vs, layout, CpuStorage::U32)?), - CpuStorage::I64(vs) => Ok(self.f(vs, layout, CpuStorage::I64)?), - CpuStorage::BF16(vs) => Ok(self.f(vs, layout, CpuStorage::BF16)?), - CpuStorage::F16(vs) => Ok(self.f(vs, layout, CpuStorage::F16)?), - CpuStorage::F32(vs) => Ok(self.f(vs, layout, CpuStorage::F32)?), - CpuStorage::F64(vs) => Ok(self.f(vs, layout, CpuStorage::F64)?), - } - } -} - -type C = CpuStorage; -pub trait Map2 { - const OP: &'static str; - fn f(&self, v1: &[T], l1: &Layout, v2: &[T], l2: &Layout) -> Result>; - - fn map( - &self, - v1: &CpuStorage, - l1: &Layout, - v2: &CpuStorage, - l2: &Layout, - ) -> Result { - match (v1, v2) { - (C::U8(v1), C::U8(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), - (C::U32(v1), C::U32(v2)) => Ok(C::U32(self.f(v1, l1, v2, l2)?)), - (C::I64(v1), C::I64(v2)) => Ok(C::I64(self.f(v1, l1, v2, l2)?)), - (C::BF16(v1), 
C::BF16(v2)) => Ok(C::BF16(self.f(v1, l1, v2, l2)?)), - (C::F16(v1), C::F16(v2)) => Ok(C::F16(self.f(v1, l1, v2, l2)?)), - (C::F32(v1), C::F32(v2)) => Ok(C::F32(self.f(v1, l1, v2, l2)?)), - (C::F64(v1), C::F64(v2)) => Ok(C::F64(self.f(v1, l1, v2, l2)?)), - _ => Err(Error::DTypeMismatchBinaryOp { - lhs: v1.dtype(), - rhs: v2.dtype(), - op: Self::OP, - } - .bt()), - } - } -} - -pub trait Map2U8 { - const OP: &'static str; - fn f(&self, v1: &[T], l1: &Layout, v2: &[T], l2: &Layout) -> Result>; - - fn map( - &self, - v1: &CpuStorage, - l1: &Layout, - v2: &CpuStorage, - l2: &Layout, - ) -> Result { - match (v1, v2) { - (C::U8(v1), C::U8(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), - (C::U32(v1), C::U32(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), - (C::I64(v1), C::I64(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), - (C::BF16(v1), C::BF16(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), - (C::F16(v1), C::F16(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), - (C::F32(v1), C::F32(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), - (C::F64(v1), C::F64(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), - _ => Err(Error::DTypeMismatchBinaryOp { - lhs: v1.dtype(), - rhs: v2.dtype(), - op: Self::OP, - } - .bt()), - } - } -} - struct Cmp(CmpOp); impl Map2U8 for Cmp { const OP: &'static str = "cmp"; @@ -366,275 +275,6 @@ impl<'a> Map1 for ReduceSum<'a> { } } -pub fn unary_map U>( - vs: &[T], - layout: &Layout, - mut f: F, -) -> Vec { - match layout.strided_blocks() { - crate::StridedBlocks::SingleBlock { start_offset, len } => vs - [start_offset..start_offset + len] - .iter() - .map(|&v| f(v)) - .collect(), - crate::StridedBlocks::MultipleBlocks { - block_start_index, - block_len, - } => { - let mut result = Vec::with_capacity(layout.shape().elem_count()); - // Specialize the case where block_len is one to avoid the second loop. - if block_len == 1 { - for index in block_start_index { - let v = unsafe { vs.get_unchecked(index) }; - result.push(f(*v)) - } - } else { - for index in block_start_index { - for offset in 0..block_len { - let v = unsafe { vs.get_unchecked(index + offset) }; - result.push(f(*v)) - } - } - } - result - } - } -} - -pub fn unary_map_vec U, FV: FnMut(&[T], &mut [U])>( - vs: &[T], - layout: &Layout, - mut f: F, - mut f_vec: FV, -) -> Vec { - match layout.strided_blocks() { - crate::StridedBlocks::SingleBlock { start_offset, len } => { - let mut ys: Vec = Vec::with_capacity(len); - let ys_to_set = ys.spare_capacity_mut(); - let ys_to_set = unsafe { std::mem::transmute::<_, &mut [U]>(ys_to_set) }; - f_vec(&vs[start_offset..start_offset + len], ys_to_set); - // SAFETY: values are all set by f_vec. - unsafe { ys.set_len(len) }; - ys - } - crate::StridedBlocks::MultipleBlocks { - block_start_index, - block_len, - } => { - let el_count = layout.shape().elem_count(); - // Specialize the case where block_len is one to avoid the second loop. - if block_len == 1 { - let mut result = Vec::with_capacity(el_count); - for index in block_start_index { - let v = unsafe { vs.get_unchecked(index) }; - result.push(f(*v)) - } - result - } else { - let mut ys: Vec = Vec::with_capacity(el_count); - let ys_to_set = ys.spare_capacity_mut(); - let ys_to_set = unsafe { std::mem::transmute::<_, &mut [U]>(ys_to_set) }; - let mut dst_index = 0; - for src_index in block_start_index { - let vs = &vs[src_index..src_index + block_len]; - let ys = &mut ys_to_set[dst_index..dst_index + block_len]; - f_vec(vs, ys); - dst_index += block_len; - } - // SAFETY: values are all set by f_vec. 
- unsafe { ys.set_len(el_count) }; - ys - } - } - } -} - -// This function maps over two strided index sequences. -pub fn binary_map U>( - lhs_l: &Layout, - rhs_l: &Layout, - lhs: &[T], - rhs: &[T], - mut f: F, -) -> Vec { - match (lhs_l.contiguous_offsets(), rhs_l.contiguous_offsets()) { - (Some((o_l1, o_l2)), Some((o_r1, o_r2))) => lhs[o_l1..o_l2] - .iter() - .zip(rhs[o_r1..o_r2].iter()) - .map(|(&l, &r)| f(l, r)) - .collect(), - (Some((o_l1, o_l2)), None) => { - // TODO: Maybe we want to avoid going through the layout twice. - match rhs_l.offsets_b() { - Some(ob) => { - let mut i_in_block = 0; - let mut i_right_broadcast = 0; - lhs[o_l1..o_l2] - .iter() - .map(|&l| { - let r = unsafe { rhs.get_unchecked(i_in_block + ob.start) }; - i_right_broadcast += 1; - if i_right_broadcast >= ob.right_broadcast { - i_in_block += 1; - i_right_broadcast = 0; - } - if i_in_block >= ob.len { - i_in_block = 0 - } - f(l, *r) - }) - .collect() - } - None => lhs_l - .strided_index() - .zip(rhs_l.strided_index()) - .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) - .collect(), - } - } - (None, Some((o_r1, o_r2))) => { - // TODO: Maybe we want to avoid going through the layout twice. - match lhs_l.offsets_b() { - Some(ob) => { - let mut i_in_block = 0; - let mut i_right_broadcast = 0; - rhs[o_r1..o_r2] - .iter() - .map(|&r| { - let l = unsafe { lhs.get_unchecked(i_in_block + ob.start) }; - i_right_broadcast += 1; - if i_right_broadcast >= ob.right_broadcast { - i_in_block += 1; - i_right_broadcast = 0; - } - if i_in_block >= ob.len { - i_in_block = 0 - } - f(*l, r) - }) - .collect() - } - None => lhs_l - .strided_index() - .zip(rhs_l.strided_index()) - .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) - .collect(), - } - } - _ => lhs_l - .strided_index() - .zip(rhs_l.strided_index()) - .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) - .collect(), - } -} - -// Similar to binary_map but with vectorized variants. -pub fn binary_map_vec T, FV: FnMut(&[T], &[T], &mut [T])>( - lhs_l: &Layout, - rhs_l: &Layout, - lhs: &[T], - rhs: &[T], - mut f: F, - mut f_vec: FV, -) -> Vec { - let el_count = lhs_l.shape().elem_count(); - match (lhs_l.contiguous_offsets(), rhs_l.contiguous_offsets()) { - (Some((o_l1, o_l2)), Some((o_r1, o_r2))) => { - let mut ys: Vec = Vec::with_capacity(el_count); - let ys_to_set = ys.spare_capacity_mut(); - let ys_to_set = unsafe { std::mem::transmute::<_, &mut [T]>(ys_to_set) }; - f_vec(&lhs[o_l1..o_l2], &rhs[o_r1..o_r2], ys_to_set); - // SAFETY: values are all set by f_vec. - unsafe { ys.set_len(el_count) }; - ys - } - (Some((o_l1, o_l2)), None) => match rhs_l.offsets_b() { - Some(ob) if ob.right_broadcast == 1 => { - let rhs = &rhs[ob.start..ob.start + ob.len]; - let mut ys: Vec = Vec::with_capacity(el_count); - let ys_to_set = ys.spare_capacity_mut(); - let ys_to_set = unsafe { std::mem::transmute::<_, &mut [T]>(ys_to_set) }; - let mut dst_i = 0; - for src_i in (o_l1..o_l2).step_by(ob.len) { - f_vec( - &lhs[src_i..src_i + ob.len], - rhs, - &mut ys_to_set[dst_i..dst_i + ob.len], - ); - dst_i += ob.len; - } - // SAFETY: values are all set by f_vec. 
- unsafe { ys.set_len(el_count) }; - ys - } - Some(ob) => { - let rhs = &rhs[ob.start..ob.start + ob.len]; - let mut ys = lhs[o_l1..o_l2].to_vec(); - for idx_l in 0..ob.left_broadcast { - let start = idx_l * ob.len * ob.right_broadcast; - for (i, &r) in rhs.iter().enumerate() { - let start = start + i * ob.right_broadcast; - for v in ys[start..start + ob.right_broadcast].iter_mut() { - *v = f(*v, r) - } - } - } - ys - } - None => lhs_l - .strided_index() - .zip(rhs_l.strided_index()) - .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) - .collect(), - }, - (None, Some((o_r1, o_r2))) => match lhs_l.offsets_b() { - Some(ob) if ob.right_broadcast == 1 => { - let lhs = &lhs[ob.start..ob.start + ob.len]; - let mut ys: Vec = Vec::with_capacity(el_count); - let ys_to_set = ys.spare_capacity_mut(); - let ys_to_set = unsafe { std::mem::transmute::<_, &mut [T]>(ys_to_set) }; - let mut dst_i = 0; - for src_i in (o_r1..o_r2).step_by(ob.len) { - f_vec( - lhs, - &rhs[src_i..src_i + ob.len], - &mut ys_to_set[dst_i..dst_i + ob.len], - ); - dst_i += ob.len; - } - // SAFETY: values are all set by f_vec. - unsafe { ys.set_len(el_count) }; - ys - } - Some(ob) => { - let lhs = &lhs[ob.start..ob.start + ob.len]; - let mut ys = rhs[o_r1..o_r2].to_vec(); - for idx_l in 0..ob.left_broadcast { - let start = idx_l * ob.len * ob.right_broadcast; - for (i, &l) in lhs.iter().enumerate() { - let start = start + i * ob.right_broadcast; - for v in ys[start..start + ob.right_broadcast].iter_mut() { - *v = f(l, *v) - } - } - } - ys - } - None => lhs_l - .strided_index() - .zip(rhs_l.strided_index()) - .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) - .collect(), - }, - _ => lhs_l - .strided_index() - .zip(rhs_l.strided_index()) - .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) - .collect(), - } -} - struct Affine(f64, f64); impl Map1 for Affine { diff --git a/candle-core/src/cpu_backend/utils.rs b/candle-core/src/cpu_backend/utils.rs new file mode 100644 index 00000000..af25a2af --- /dev/null +++ b/candle-core/src/cpu_backend/utils.rs @@ -0,0 +1,350 @@ +/// Helper functions to write CPU kernels. 
+use crate::backend::BackendStorage; +use crate::{Error, Layout, Result, WithDType}; + +type C = super::CpuStorage; +pub trait Map1 { + fn f(&self, vs: &[T], layout: &Layout) -> Result>; + + fn map(&self, vs: &C, layout: &Layout) -> Result { + match vs { + C::U8(vs) => Ok(C::U8(self.f(vs, layout)?)), + C::U32(vs) => Ok(C::U32(self.f(vs, layout)?)), + C::I64(vs) => Ok(C::I64(self.f(vs, layout)?)), + C::BF16(vs) => Ok(C::BF16(self.f(vs, layout)?)), + C::F16(vs) => Ok(C::F16(self.f(vs, layout)?)), + C::F32(vs) => Ok(C::F32(self.f(vs, layout)?)), + C::F64(vs) => Ok(C::F64(self.f(vs, layout)?)), + } + } +} + +pub trait Map1Any { + fn f) -> C>(&self, vs: &[T], layout: &Layout, wrap: W) -> Result; + + fn map(&self, vs: &C, layout: &Layout) -> Result { + match vs { + C::U8(vs) => Ok(self.f(vs, layout, C::U8)?), + C::U32(vs) => Ok(self.f(vs, layout, C::U32)?), + C::I64(vs) => Ok(self.f(vs, layout, C::I64)?), + C::BF16(vs) => Ok(self.f(vs, layout, C::BF16)?), + C::F16(vs) => Ok(self.f(vs, layout, C::F16)?), + C::F32(vs) => Ok(self.f(vs, layout, C::F32)?), + C::F64(vs) => Ok(self.f(vs, layout, C::F64)?), + } + } +} + +pub trait Map2 { + const OP: &'static str; + fn f(&self, v1: &[T], l1: &Layout, v2: &[T], l2: &Layout) -> Result>; + + fn map(&self, v1: &C, l1: &Layout, v2: &C, l2: &Layout) -> Result { + match (v1, v2) { + (C::U8(v1), C::U8(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), + (C::U32(v1), C::U32(v2)) => Ok(C::U32(self.f(v1, l1, v2, l2)?)), + (C::I64(v1), C::I64(v2)) => Ok(C::I64(self.f(v1, l1, v2, l2)?)), + (C::BF16(v1), C::BF16(v2)) => Ok(C::BF16(self.f(v1, l1, v2, l2)?)), + (C::F16(v1), C::F16(v2)) => Ok(C::F16(self.f(v1, l1, v2, l2)?)), + (C::F32(v1), C::F32(v2)) => Ok(C::F32(self.f(v1, l1, v2, l2)?)), + (C::F64(v1), C::F64(v2)) => Ok(C::F64(self.f(v1, l1, v2, l2)?)), + _ => Err(Error::DTypeMismatchBinaryOp { + lhs: v1.dtype(), + rhs: v2.dtype(), + op: Self::OP, + } + .bt()), + } + } +} + +pub trait Map2U8 { + const OP: &'static str; + fn f(&self, v1: &[T], l1: &Layout, v2: &[T], l2: &Layout) -> Result>; + + fn map(&self, v1: &C, l1: &Layout, v2: &C, l2: &Layout) -> Result { + match (v1, v2) { + (C::U8(v1), C::U8(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), + (C::U32(v1), C::U32(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), + (C::I64(v1), C::I64(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), + (C::BF16(v1), C::BF16(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), + (C::F16(v1), C::F16(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), + (C::F32(v1), C::F32(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), + (C::F64(v1), C::F64(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), + _ => Err(Error::DTypeMismatchBinaryOp { + lhs: v1.dtype(), + rhs: v2.dtype(), + op: Self::OP, + } + .bt()), + } + } +} + +pub fn binary_map U>( + lhs_l: &Layout, + rhs_l: &Layout, + lhs: &[T], + rhs: &[T], + mut f: F, +) -> Vec { + match (lhs_l.contiguous_offsets(), rhs_l.contiguous_offsets()) { + (Some((o_l1, o_l2)), Some((o_r1, o_r2))) => lhs[o_l1..o_l2] + .iter() + .zip(rhs[o_r1..o_r2].iter()) + .map(|(&l, &r)| f(l, r)) + .collect(), + (Some((o_l1, o_l2)), None) => { + // TODO: Maybe we want to avoid going through the layout twice. 
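+            // offsets_b() describes rhs as a broadcast of one contiguous block:
+            // each of its `len` values repeats `right_broadcast` times, so the fast
+            // path below walks lhs linearly instead of doing a full strided index.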
+ match rhs_l.offsets_b() { + Some(ob) => { + let mut i_in_block = 0; + let mut i_right_broadcast = 0; + lhs[o_l1..o_l2] + .iter() + .map(|&l| { + let r = unsafe { rhs.get_unchecked(i_in_block + ob.start) }; + i_right_broadcast += 1; + if i_right_broadcast >= ob.right_broadcast { + i_in_block += 1; + i_right_broadcast = 0; + } + if i_in_block >= ob.len { + i_in_block = 0 + } + f(l, *r) + }) + .collect() + } + None => lhs_l + .strided_index() + .zip(rhs_l.strided_index()) + .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) + .collect(), + } + } + (None, Some((o_r1, o_r2))) => { + // TODO: Maybe we want to avoid going through the layout twice. + match lhs_l.offsets_b() { + Some(ob) => { + let mut i_in_block = 0; + let mut i_right_broadcast = 0; + rhs[o_r1..o_r2] + .iter() + .map(|&r| { + let l = unsafe { lhs.get_unchecked(i_in_block + ob.start) }; + i_right_broadcast += 1; + if i_right_broadcast >= ob.right_broadcast { + i_in_block += 1; + i_right_broadcast = 0; + } + if i_in_block >= ob.len { + i_in_block = 0 + } + f(*l, r) + }) + .collect() + } + None => lhs_l + .strided_index() + .zip(rhs_l.strided_index()) + .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) + .collect(), + } + } + _ => lhs_l + .strided_index() + .zip(rhs_l.strided_index()) + .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) + .collect(), + } +} + +// Similar to binary_map but with vectorized variants. +pub fn binary_map_vec T, FV: FnMut(&[T], &[T], &mut [T])>( + lhs_l: &Layout, + rhs_l: &Layout, + lhs: &[T], + rhs: &[T], + mut f: F, + mut f_vec: FV, +) -> Vec { + let el_count = lhs_l.shape().elem_count(); + match (lhs_l.contiguous_offsets(), rhs_l.contiguous_offsets()) { + (Some((o_l1, o_l2)), Some((o_r1, o_r2))) => { + let mut ys: Vec = Vec::with_capacity(el_count); + let ys_to_set = ys.spare_capacity_mut(); + let ys_to_set = unsafe { std::mem::transmute::<_, &mut [T]>(ys_to_set) }; + f_vec(&lhs[o_l1..o_l2], &rhs[o_r1..o_r2], ys_to_set); + // SAFETY: values are all set by f_vec. + unsafe { ys.set_len(el_count) }; + ys + } + (Some((o_l1, o_l2)), None) => match rhs_l.offsets_b() { + Some(ob) if ob.right_broadcast == 1 => { + let rhs = &rhs[ob.start..ob.start + ob.len]; + let mut ys: Vec = Vec::with_capacity(el_count); + let ys_to_set = ys.spare_capacity_mut(); + let ys_to_set = unsafe { std::mem::transmute::<_, &mut [T]>(ys_to_set) }; + let mut dst_i = 0; + for src_i in (o_l1..o_l2).step_by(ob.len) { + f_vec( + &lhs[src_i..src_i + ob.len], + rhs, + &mut ys_to_set[dst_i..dst_i + ob.len], + ); + dst_i += ob.len; + } + // SAFETY: values are all set by f_vec. 
+ unsafe { ys.set_len(el_count) }; + ys + } + Some(ob) => { + let rhs = &rhs[ob.start..ob.start + ob.len]; + let mut ys = lhs[o_l1..o_l2].to_vec(); + for idx_l in 0..ob.left_broadcast { + let start = idx_l * ob.len * ob.right_broadcast; + for (i, &r) in rhs.iter().enumerate() { + let start = start + i * ob.right_broadcast; + for v in ys[start..start + ob.right_broadcast].iter_mut() { + *v = f(*v, r) + } + } + } + ys + } + None => lhs_l + .strided_index() + .zip(rhs_l.strided_index()) + .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) + .collect(), + }, + (None, Some((o_r1, o_r2))) => match lhs_l.offsets_b() { + Some(ob) if ob.right_broadcast == 1 => { + let lhs = &lhs[ob.start..ob.start + ob.len]; + let mut ys: Vec = Vec::with_capacity(el_count); + let ys_to_set = ys.spare_capacity_mut(); + let ys_to_set = unsafe { std::mem::transmute::<_, &mut [T]>(ys_to_set) }; + let mut dst_i = 0; + for src_i in (o_r1..o_r2).step_by(ob.len) { + f_vec( + lhs, + &rhs[src_i..src_i + ob.len], + &mut ys_to_set[dst_i..dst_i + ob.len], + ); + dst_i += ob.len; + } + // SAFETY: values are all set by f_vec. + unsafe { ys.set_len(el_count) }; + ys + } + Some(ob) => { + let lhs = &lhs[ob.start..ob.start + ob.len]; + let mut ys = rhs[o_r1..o_r2].to_vec(); + for idx_l in 0..ob.left_broadcast { + let start = idx_l * ob.len * ob.right_broadcast; + for (i, &l) in lhs.iter().enumerate() { + let start = start + i * ob.right_broadcast; + for v in ys[start..start + ob.right_broadcast].iter_mut() { + *v = f(l, *v) + } + } + } + ys + } + None => lhs_l + .strided_index() + .zip(rhs_l.strided_index()) + .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) + .collect(), + }, + _ => lhs_l + .strided_index() + .zip(rhs_l.strided_index()) + .map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i])) + .collect(), + } +} + +pub fn unary_map U>( + vs: &[T], + layout: &Layout, + mut f: F, +) -> Vec { + match layout.strided_blocks() { + crate::StridedBlocks::SingleBlock { start_offset, len } => vs + [start_offset..start_offset + len] + .iter() + .map(|&v| f(v)) + .collect(), + crate::StridedBlocks::MultipleBlocks { + block_start_index, + block_len, + } => { + let mut result = Vec::with_capacity(layout.shape().elem_count()); + // Specialize the case where block_len is one to avoid the second loop. + if block_len == 1 { + for index in block_start_index { + let v = unsafe { vs.get_unchecked(index) }; + result.push(f(*v)) + } + } else { + for index in block_start_index { + for offset in 0..block_len { + let v = unsafe { vs.get_unchecked(index + offset) }; + result.push(f(*v)) + } + } + } + result + } + } +} + +pub fn unary_map_vec U, FV: FnMut(&[T], &mut [U])>( + vs: &[T], + layout: &Layout, + mut f: F, + mut f_vec: FV, +) -> Vec { + match layout.strided_blocks() { + crate::StridedBlocks::SingleBlock { start_offset, len } => { + let mut ys: Vec = Vec::with_capacity(len); + let ys_to_set = ys.spare_capacity_mut(); + let ys_to_set = unsafe { std::mem::transmute::<_, &mut [U]>(ys_to_set) }; + f_vec(&vs[start_offset..start_offset + len], ys_to_set); + // SAFETY: values are all set by f_vec. + unsafe { ys.set_len(len) }; + ys + } + crate::StridedBlocks::MultipleBlocks { + block_start_index, + block_len, + } => { + let el_count = layout.shape().elem_count(); + // Specialize the case where block_len is one to avoid the second loop. 
+            if block_len == 1 {
+                let mut result = Vec::with_capacity(el_count);
+                for index in block_start_index {
+                    let v = unsafe { vs.get_unchecked(index) };
+                    result.push(f(*v))
+                }
+                result
+            } else {
+                let mut ys: Vec<U> = Vec::with_capacity(el_count);
+                let ys_to_set = ys.spare_capacity_mut();
+                let ys_to_set = unsafe { std::mem::transmute::<_, &mut [U]>(ys_to_set) };
+                let mut dst_index = 0;
+                for src_index in block_start_index {
+                    let vs = &vs[src_index..src_index + block_len];
+                    let ys = &mut ys_to_set[dst_index..dst_index + block_len];
+                    f_vec(vs, ys);
+                    dst_index += block_len;
+                }
+                // SAFETY: values are all set by f_vec.
+                unsafe { ys.set_len(el_count) };
+                ys
+            }
+        }
+    }
+}
diff --git a/candle-core/src/cudnn.rs b/candle-core/src/cuda_backend/cudnn.rs
similarity index 100%
rename from candle-core/src/cudnn.rs
rename to candle-core/src/cuda_backend/cudnn.rs
diff --git a/candle-core/src/cuda_backend/device.rs b/candle-core/src/cuda_backend/device.rs
new file mode 100644
index 00000000..0859d756
--- /dev/null
+++ b/candle-core/src/cuda_backend/device.rs
@@ -0,0 +1,410 @@
+use crate::backend::BackendDevice;
+use crate::{CpuStorage, DType, Layout, Result, Shape};
+pub use candle_kernels as kernels;
+pub use cudarc;
+use cudarc::driver::{CudaFunction, LaunchAsync, LaunchConfig};
+use half::{bf16, f16};
+use std::sync::{Arc, Mutex};
+
+use super::{CudaError, CudaStorage, CudaStorageSlice, WrapErr};
+
+/// Unique identifier for cuda devices.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub struct DeviceId(usize);
+
+impl DeviceId {
+    fn new() -> Self {
+        // https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805
+        use std::sync::atomic;
+        static COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::new(1);
+        Self(COUNTER.fetch_add(1, atomic::Ordering::Relaxed))
+    }
+}
+
+struct CudaRng(cudarc::curand::CudaRng);
+unsafe impl Send for CudaRng {}
+
+#[derive(Clone)]
+pub struct CudaDevice {
+    id: DeviceId,
+    device: Arc<cudarc::driver::CudaDevice>,
+    pub(crate) blas: Arc<cudarc::cublas::CudaBlas>,
+    curand: Arc<Mutex<CudaRng>>,
+}
+
+impl std::fmt::Debug for CudaDevice {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "CudaDevice({:?})", self.id)
+    }
+}
+
+impl std::ops::Deref for CudaDevice {
+    type Target = Arc<cudarc::driver::CudaDevice>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.device
+    }
+}
+
+impl CudaDevice {
+    pub fn cuda_device(&self) -> Arc<cudarc::driver::CudaDevice> {
+        self.device.clone()
+    }
+
+    pub fn id(&self) -> DeviceId {
+        self.id
+    }
+
+    fn const_impl(&self, v: f64, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
+        let elem_count = shape.elem_count();
+        let cfg = LaunchConfig::for_num_elems(elem_count as u32);
+        let slice = match dtype {
+            DType::U8 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<u8>(elem_count) }.w()?;
+                let func = self.get_or_load_func("fill_u8", kernels::FILL)?;
+                let params = (&data, v as u8, elem_count);
+                unsafe { func.launch(cfg, params) }.w()?;
+                CudaStorageSlice::U8(data)
+            }
+            DType::U32 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<u32>(elem_count) }.w()?;
+                let func = self.get_or_load_func("fill_u32", kernels::FILL)?;
+                let params = (&data, v as u32, elem_count);
+                unsafe { func.launch(cfg, params) }.w()?;
+                CudaStorageSlice::U32(data)
+            }
+            DType::I64 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<i64>(elem_count) }.w()?;
+                let func = self.get_or_load_func("fill_i64", kernels::FILL)?;
+                let params = (&data, v as i64, elem_count);
+                unsafe { func.launch(cfg, params) }.w()?;
+                CudaStorageSlice::I64(data)
+            }
+            DType::BF16 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<bf16>(elem_count) }.w()?;
+                let func = self.get_or_load_func("fill_bf16", kernels::FILL)?;
+                let params = (&data, bf16::from_f64(v), elem_count);
+                unsafe { func.launch(cfg, params) }.w()?;
+                CudaStorageSlice::BF16(data)
+            }
+            DType::F16 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<f16>(elem_count) }.w()?;
+                let func = self.get_or_load_func("fill_f16", kernels::FILL)?;
+                let params = (&data, f16::from_f64(v), elem_count);
+                unsafe { func.launch(cfg, params) }.w()?;
+                CudaStorageSlice::F16(data)
+            }
+            DType::F32 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<f32>(elem_count) }.w()?;
+                let func = self.get_or_load_func("fill_f32", kernels::FILL)?;
+                let params = (&data, v as f32, elem_count);
+                unsafe { func.launch(cfg, params) }.w()?;
+                CudaStorageSlice::F32(data)
+            }
+            DType::F64 => {
+                // SAFETY: Set later by running the fill kernel.
+                let data = unsafe { self.alloc::<f64>(elem_count) }.w()?;
+                let func = self.get_or_load_func("fill_f64", kernels::FILL)?;
+                let params = (&data, v, elem_count);
+                unsafe { func.launch(cfg, params) }.w()?;
+                CudaStorageSlice::F64(data)
+            }
+        };
+        Ok(CudaStorage {
+            slice,
+            device: self.clone(),
+        })
+    }
+
+    pub fn get_or_load_func(&self, module_name: &str, ptx: &'static str) -> Result<CudaFunction> {
+        if !self.has_func(module_name, module_name) {
+            // Leaking the string here is a bit sad but we need a &'static str and this is only
+            // done once per kernel name.
+            let static_module_name = Box::leak(module_name.to_string().into_boxed_str());
+            self.load_ptx(ptx.into(), module_name, &[static_module_name])
+                .map_err(|cuda| CudaError::Load {
+                    cuda,
+                    module_name: module_name.to_string(),
+                })
+                .w()?;
+        }
+        self.get_func(module_name, module_name)
+            // Clippy recommends this `ok_or` rather than `ok_or_else` so hopefully the compiler is
+            // able to only build the error value if needed.
+            .ok_or(CudaError::MissingKernel {
+                module_name: module_name.to_string(),
+            })
+            .w()
+    }
+}
+
+impl BackendDevice for CudaDevice {
+    type Storage = CudaStorage;
+
+    fn new(ordinal: usize) -> Result<Self> {
+        let device = cudarc::driver::CudaDevice::new(ordinal).w()?;
+        let blas = cudarc::cublas::CudaBlas::new(device.clone()).w()?;
+        let curand = cudarc::curand::CudaRng::new(299792458, device.clone()).w()?;
+        Ok(Self {
+            id: DeviceId::new(),
+            device,
+            blas: Arc::new(blas),
+            curand: Arc::new(Mutex::new(CudaRng(curand))),
+        })
+    }
+
+    fn set_seed(&self, seed: u64) -> Result<()> {
+        // We do not call set_seed but instead create a new curand object. This ensures that the
+        // state will be identical and the same random numbers will be generated.
+        let mut curand = self.curand.lock().unwrap();
+        curand.0 = cudarc::curand::CudaRng::new(seed, self.device.clone()).w()?;
+        Ok(())
+    }
+
+    fn location(&self) -> crate::DeviceLocation {
+        crate::DeviceLocation::Cuda {
+            gpu_id: self.device.ordinal(),
+        }
+    }
+
+    fn same_device(&self, rhs: &Self) -> bool {
+        self.id == rhs.id
+    }
+
+    fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
+        let elem_count = shape.elem_count();
+        let slice = match dtype {
+            DType::U8 => {
+                let data = self.alloc_zeros::<u8>(elem_count).w()?;
+                CudaStorageSlice::U8(data)
+            }
+            DType::U32 => {
+                let data = self.alloc_zeros::<u32>(elem_count).w()?;
+                CudaStorageSlice::U32(data)
+            }
+            DType::I64 => {
+                let data = self.alloc_zeros::<i64>(elem_count).w()?;
+                CudaStorageSlice::I64(data)
+            }
+            DType::BF16 => {
+                let data = self.alloc_zeros::<bf16>(elem_count).w()?;
+                CudaStorageSlice::BF16(data)
+            }
+            DType::F16 => {
+                let data = self.alloc_zeros::<f16>(elem_count).w()?;
+                CudaStorageSlice::F16(data)
+            }
+            DType::F32 => {
+                let data = self.alloc_zeros::<f32>(elem_count).w()?;
+                CudaStorageSlice::F32(data)
+            }
+            DType::F64 => {
+                let data = self.alloc_zeros::<f64>(elem_count).w()?;
+                CudaStorageSlice::F64(data)
+            }
+        };
+        Ok(CudaStorage {
+            slice,
+            device: self.clone(),
+        })
+    }
+
+    fn rand_uniform(&self, shape: &Shape, dtype: DType, lo: f64, up: f64) -> Result<CudaStorage> {
+        let elem_count = shape.elem_count();
+        let curand = self.curand.lock().unwrap();
+        let slice = match dtype {
+            // TODO: Add support for F16 and BF16 though this is likely to require some upstream
+            // cudarc changes.
+            DType::U8 | DType::U32 | DType::I64 | DType::F16 | DType::BF16 => {
+                Err(CudaError::UnsupportedDtype {
+                    dtype,
+                    op: "rand_uniform",
+                })
+                .w()?
+            }
+            DType::F32 => {
+                let mut data = unsafe { self.alloc::<f32>(elem_count) }.w()?;
+                curand.0.fill_with_uniform(&mut data).w()?;
+                CudaStorageSlice::F32(data)
+            }
+            DType::F64 => {
+                let mut data = unsafe { self.alloc::<f64>(elem_count) }.w()?;
+                curand.0.fill_with_uniform(&mut data).w()?;
+                CudaStorageSlice::F64(data)
+            }
+        };
+        let slice = if lo == 0. && up == 1.0 {
+            slice
+        } else {
+            use super::utils::Map1;
+            let layout = Layout::contiguous(shape);
+            super::Affine(up - lo, lo).map(&slice, self, &layout)?
+        };
+        Ok(CudaStorage {
+            slice,
+            device: self.clone(),
+        })
+    }
+
+    fn rand_normal(&self, shape: &Shape, dtype: DType, mean: f64, std: f64) -> Result<CudaStorage> {
+        // TODO: Add support for F16 and BF16 though this is likely to require some upstream
+        // cudarc changes.
+        let elem_count = shape.elem_count();
+        let curand = self.curand.lock().unwrap();
+        // curand can only generate an odd number of values.
+        // https://github.com/huggingface/candle/issues/734
+        let elem_count_round = if elem_count % 2 == 1 {
+            elem_count + 1
+        } else {
+            elem_count
+        };
+        let slice = match dtype {
+            DType::U8 | DType::U32 | DType::I64 | DType::F16 | DType::BF16 => {
+                Err(CudaError::UnsupportedDtype {
+                    dtype,
+                    op: "rand_normal",
+                })
+                .w()?
+            }
+            DType::F32 => {
+                let mut data = unsafe { self.alloc::<f32>(elem_count_round) }.w()?;
+                curand
+                    .0
+                    .fill_with_normal(&mut data, mean as f32, std as f32)
+                    .w()?;
+                CudaStorageSlice::F32(data)
+            }
+            DType::F64 => {
+                let mut data = unsafe { self.alloc::<f64>(elem_count_round) }.w()?;
+                curand.0.fill_with_normal(&mut data, mean, std).w()?;
+                CudaStorageSlice::F64(data)
+            }
+        };
+        Ok(CudaStorage {
+            slice,
+            device: self.clone(),
+        })
+    }
+
+    fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
+        self.const_impl(1., shape, dtype)
+    }
+
+    unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
+        let elem_count = shape.elem_count();
+        let slice = match dtype {
+            DType::U8 => {
+                let data = self.alloc::<u8>(elem_count).w()?;
+                CudaStorageSlice::U8(data)
+            }
+            DType::U32 => {
+                let data = self.alloc::<u32>(elem_count).w()?;
+                CudaStorageSlice::U32(data)
+            }
+            DType::I64 => {
+                let data = self.alloc::<i64>(elem_count).w()?;
+                CudaStorageSlice::I64(data)
+            }
+            DType::BF16 => {
+                let data = self.alloc::<bf16>(elem_count).w()?;
+                CudaStorageSlice::BF16(data)
+            }
+            DType::F16 => {
+                let data = self.alloc::<f16>(elem_count).w()?;
+                CudaStorageSlice::F16(data)
+            }
+            DType::F32 => {
+                let data = self.alloc::<f32>(elem_count).w()?;
+                CudaStorageSlice::F32(data)
+            }
+            DType::F64 => {
+                let data = self.alloc::<f64>(elem_count).w()?;
+                CudaStorageSlice::F64(data)
+            }
+        };
+        Ok(CudaStorage {
+            slice,
+            device: self.clone(),
+        })
+    }
+
+    fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<CudaStorage> {
+        let slice = match storage {
+            CpuStorage::U8(storage) => {
+                let data = self.htod_sync_copy(storage).w()?;
+                CudaStorageSlice::U8(data)
+            }
+            CpuStorage::U32(storage) => {
+                let data = self.htod_sync_copy(storage).w()?;
+                CudaStorageSlice::U32(data)
+            }
+            CpuStorage::I64(storage) => {
+                let data = self.htod_sync_copy(storage).w()?;
+                CudaStorageSlice::I64(data)
+            }
+            CpuStorage::BF16(storage) => {
+                let data = self.htod_sync_copy(storage).w()?;
+                CudaStorageSlice::BF16(data)
+            }
+            CpuStorage::F16(storage) => {
+                let data = self.htod_sync_copy(storage).w()?;
+                CudaStorageSlice::F16(data)
+            }
+            CpuStorage::F32(storage) => {
+                let data = self.htod_sync_copy(storage).w()?;
+                CudaStorageSlice::F32(data)
+            }
+            CpuStorage::F64(storage) => {
+                let data = self.htod_sync_copy(storage).w()?;
+                CudaStorageSlice::F64(data)
+            }
+        };
+        Ok(CudaStorage {
+            slice,
+            device: self.clone(),
+        })
+    }
+
+    fn storage_from_cpu_storage_owned(&self, storage: CpuStorage) -> Result<CudaStorage> {
+        let slice = match storage {
+            CpuStorage::U8(storage) => {
+                let data = self.htod_copy(storage).w()?;
+                CudaStorageSlice::U8(data)
+            }
+            CpuStorage::U32(storage) => {
+                let data = self.htod_copy(storage).w()?;
+                CudaStorageSlice::U32(data)
+            }
+            CpuStorage::I64(storage) => {
+                let data = self.htod_copy(storage).w()?;
+                CudaStorageSlice::I64(data)
+            }
+            CpuStorage::BF16(storage) => {
+                let data = self.htod_copy(storage).w()?;
+                CudaStorageSlice::BF16(data)
+            }
+            CpuStorage::F16(storage) => {
+                let data = self.htod_copy(storage).w()?;
+                CudaStorageSlice::F16(data)
+            }
+            CpuStorage::F32(storage) => {
+                let data = self.htod_copy(storage).w()?;
+                CudaStorageSlice::F32(data)
+            }
+            CpuStorage::F64(storage) => {
+                let data = self.htod_copy(storage).w()?;
+                CudaStorageSlice::F64(data)
+            }
+        };
+        Ok(CudaStorage {
+            slice,
+            device: self.clone(),
+        })
+    }
+}
diff --git a/candle-core/src/cuda_backend.rs b/candle-core/src/cuda_backend/mod.rs
similarity index 78%
rename from candle-core/src/cuda_backend.rs
rename to candle-core/src/cuda_backend/mod.rs
index 23487330..78aebd9b
100644 --- a/candle-core/src/cuda_backend.rs +++ b/candle-core/src/cuda_backend/mod.rs @@ -5,11 +5,17 @@ pub use candle_kernels as kernels; pub use cudarc; use cudarc::cublas::{Gemm, GemmConfig, StridedBatchedConfig}; use cudarc::driver::{ - CudaFunction, CudaSlice, DevicePtr, DeviceRepr, DeviceSlice, LaunchAsync, LaunchConfig, - ValidAsZeroBits, + CudaSlice, DevicePtr, DeviceRepr, DeviceSlice, LaunchAsync, LaunchConfig, ValidAsZeroBits, }; use half::{bf16, f16}; -use std::sync::{Arc, Mutex}; + +mod device; +pub use device::{CudaDevice, DeviceId}; +mod utils; +pub use utils::{Map1, Map1Any, Map2, Map2Any, Map2InPlace, S}; + +#[cfg(feature = "cudnn")] +pub mod cudnn; enum SlicePtrOrNull { Ptr(CudaSlice), @@ -87,44 +93,6 @@ impl From for crate::Error { } } -/// Unique identifier for cuda devices. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub struct DeviceId(usize); - -impl DeviceId { - fn new() -> Self { - // https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805 - use std::sync::atomic; - static COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::new(1); - Self(COUNTER.fetch_add(1, atomic::Ordering::Relaxed)) - } -} - -struct CudaRng(cudarc::curand::CudaRng); -unsafe impl Send for CudaRng {} - -#[derive(Clone)] -pub struct CudaDevice { - id: DeviceId, - device: Arc, - blas: Arc, - curand: Arc>, -} - -impl std::fmt::Debug for CudaDevice { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "CudaDevice({:?})", self.id) - } -} - -impl std::ops::Deref for CudaDevice { - type Target = Arc; - - fn deref(&self) -> &Self::Target { - &self.device - } -} - pub trait WrapErr { fn w(self) -> std::result::Result; } @@ -135,368 +103,6 @@ impl> WrapErr for std::result::Result { } } -impl CudaDevice { - pub fn cuda_device(&self) -> Arc { - self.device.clone() - } - - pub fn id(&self) -> DeviceId { - self.id - } - - fn const_impl(&self, v: f64, shape: &Shape, dtype: DType) -> Result { - let elem_count = shape.elem_count(); - let cfg = LaunchConfig::for_num_elems(elem_count as u32); - let slice = match dtype { - DType::U8 => { - // SAFETY: Set later by running the fill kernel. - let data = unsafe { self.alloc::(elem_count) }.w()?; - let func = self.get_or_load_func("fill_u8", kernels::FILL)?; - let params = (&data, v as u8, elem_count); - unsafe { func.launch(cfg, params) }.w()?; - CudaStorageSlice::U8(data) - } - DType::U32 => { - // SAFETY: Set later by running the fill kernel. - let data = unsafe { self.alloc::(elem_count) }.w()?; - let func = self.get_or_load_func("fill_u32", kernels::FILL)?; - let params = (&data, v as u32, elem_count); - unsafe { func.launch(cfg, params) }.w()?; - CudaStorageSlice::U32(data) - } - DType::I64 => { - // SAFETY: Set later by running the fill kernel. - let data = unsafe { self.alloc::(elem_count) }.w()?; - let func = self.get_or_load_func("fill_i64", kernels::FILL)?; - let params = (&data, v as i64, elem_count); - unsafe { func.launch(cfg, params) }.w()?; - CudaStorageSlice::I64(data) - } - DType::BF16 => { - // SAFETY: Set later by running the fill kernel. - let data = unsafe { self.alloc::(elem_count) }.w()?; - let func = self.get_or_load_func("fill_bf16", kernels::FILL)?; - let params = (&data, bf16::from_f64(v), elem_count); - unsafe { func.launch(cfg, params) }.w()?; - CudaStorageSlice::BF16(data) - } - DType::F16 => { - // SAFETY: Set later by running the fill kernel. 
- let data = unsafe { self.alloc::(elem_count) }.w()?; - let func = self.get_or_load_func("fill_f16", kernels::FILL)?; - let params = (&data, f16::from_f64(v), elem_count); - unsafe { func.launch(cfg, params) }.w()?; - CudaStorageSlice::F16(data) - } - DType::F32 => { - // SAFETY: Set later by running the fill kernel. - let data = unsafe { self.alloc::(elem_count) }.w()?; - let func = self.get_or_load_func("fill_f32", kernels::FILL)?; - let params = (&data, v as f32, elem_count); - unsafe { func.launch(cfg, params) }.w()?; - CudaStorageSlice::F32(data) - } - DType::F64 => { - // SAFETY: Set later by running the fill kernel. - let data = unsafe { self.alloc::(elem_count) }.w()?; - let func = self.get_or_load_func("fill_f64", kernels::FILL)?; - let params = (&data, v, elem_count); - unsafe { func.launch(cfg, params) }.w()?; - CudaStorageSlice::F64(data) - } - }; - Ok(CudaStorage { - slice, - device: self.clone(), - }) - } - - pub fn get_or_load_func(&self, module_name: &str, ptx: &'static str) -> Result { - if !self.has_func(module_name, module_name) { - // Leaking the string here is a bit sad but we need a &'static str and this is only - // done once per kernel name. - let static_module_name = Box::leak(module_name.to_string().into_boxed_str()); - self.load_ptx(ptx.into(), module_name, &[static_module_name]) - .map_err(|cuda| CudaError::Load { - cuda, - module_name: module_name.to_string(), - }) - .w()?; - } - self.get_func(module_name, module_name) - // Clippy recommends this `ok_or` rather than `ok_or_else` so hopefully the compiler is - // able to only build the error value if needed. - .ok_or(CudaError::MissingKernel { - module_name: module_name.to_string(), - }) - .w() - } -} - -impl BackendDevice for CudaDevice { - type Storage = CudaStorage; - - fn new(ordinal: usize) -> Result { - let device = cudarc::driver::CudaDevice::new(ordinal).w()?; - let blas = cudarc::cublas::CudaBlas::new(device.clone()).w()?; - let curand = cudarc::curand::CudaRng::new(299792458, device.clone()).w()?; - Ok(Self { - id: DeviceId::new(), - device, - blas: Arc::new(blas), - curand: Arc::new(Mutex::new(CudaRng(curand))), - }) - } - - fn set_seed(&self, seed: u64) -> Result<()> { - // We do not call set_seed but instead create a new curand object. This ensures that the - // state will be identical and the same random numbers will be generated. 
- let mut curand = self.curand.lock().unwrap(); - curand.0 = cudarc::curand::CudaRng::new(seed, self.device.clone()).w()?; - Ok(()) - } - - fn location(&self) -> crate::DeviceLocation { - crate::DeviceLocation::Cuda { - gpu_id: self.device.ordinal(), - } - } - - fn same_device(&self, rhs: &Self) -> bool { - self.id == rhs.id - } - - fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result { - let elem_count = shape.elem_count(); - let slice = match dtype { - DType::U8 => { - let data = self.alloc_zeros::(elem_count).w()?; - CudaStorageSlice::U8(data) - } - DType::U32 => { - let data = self.alloc_zeros::(elem_count).w()?; - CudaStorageSlice::U32(data) - } - DType::I64 => { - let data = self.alloc_zeros::(elem_count).w()?; - CudaStorageSlice::I64(data) - } - DType::BF16 => { - let data = self.alloc_zeros::(elem_count).w()?; - CudaStorageSlice::BF16(data) - } - DType::F16 => { - let data = self.alloc_zeros::(elem_count).w()?; - CudaStorageSlice::F16(data) - } - DType::F32 => { - let data = self.alloc_zeros::(elem_count).w()?; - CudaStorageSlice::F32(data) - } - DType::F64 => { - let data = self.alloc_zeros::(elem_count).w()?; - CudaStorageSlice::F64(data) - } - }; - Ok(CudaStorage { - slice, - device: self.clone(), - }) - } - - fn rand_uniform(&self, shape: &Shape, dtype: DType, lo: f64, up: f64) -> Result { - let elem_count = shape.elem_count(); - let curand = self.curand.lock().unwrap(); - let slice = match dtype { - // TODO: Add support for F16 and BF16 though this is likely to require some upstream - // cudarc changes. - DType::U8 | DType::U32 | DType::I64 | DType::F16 | DType::BF16 => { - Err(CudaError::UnsupportedDtype { - dtype, - op: "rand_uniform", - }) - .w()? - } - DType::F32 => { - let mut data = unsafe { self.alloc::(elem_count) }.w()?; - curand.0.fill_with_uniform(&mut data).w()?; - CudaStorageSlice::F32(data) - } - DType::F64 => { - let mut data = unsafe { self.alloc::(elem_count) }.w()?; - curand.0.fill_with_uniform(&mut data).w()?; - CudaStorageSlice::F64(data) - } - }; - let slice = if lo == 0. && up == 1.0 { - slice - } else { - let layout = Layout::contiguous(shape); - Affine(up - lo, lo).map(&slice, self, &layout)? - }; - Ok(CudaStorage { - slice, - device: self.clone(), - }) - } - - fn rand_normal(&self, shape: &Shape, dtype: DType, mean: f64, std: f64) -> Result { - // TODO: Add support for F16 and BF16 though this is likely to require some upstream - // cudarc changes. - let elem_count = shape.elem_count(); - let curand = self.curand.lock().unwrap(); - // curand can only generate an odd number of values. - // https://github.com/huggingface/candle/issues/734 - let elem_count_round = if elem_count % 2 == 1 { - elem_count + 1 - } else { - elem_count - }; - let slice = match dtype { - DType::U8 | DType::U32 | DType::I64 | DType::F16 | DType::BF16 => { - Err(CudaError::UnsupportedDtype { - dtype, - op: "rand_normal", - }) - .w()? 
- } - DType::F32 => { - let mut data = unsafe { self.alloc::(elem_count_round) }.w()?; - curand - .0 - .fill_with_normal(&mut data, mean as f32, std as f32) - .w()?; - CudaStorageSlice::F32(data) - } - DType::F64 => { - let mut data = unsafe { self.alloc::(elem_count_round) }.w()?; - curand.0.fill_with_normal(&mut data, mean, std).w()?; - CudaStorageSlice::F64(data) - } - }; - Ok(CudaStorage { - slice, - device: self.clone(), - }) - } - - fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result { - self.const_impl(1., shape, dtype) - } - - unsafe fn alloc_uninit(&self, shape: &Shape, dtype: DType) -> Result { - let elem_count = shape.elem_count(); - let slice = match dtype { - DType::U8 => { - let data = self.alloc::(elem_count).w()?; - CudaStorageSlice::U8(data) - } - DType::U32 => { - let data = self.alloc::(elem_count).w()?; - CudaStorageSlice::U32(data) - } - DType::I64 => { - let data = self.alloc::(elem_count).w()?; - CudaStorageSlice::I64(data) - } - DType::BF16 => { - let data = self.alloc::(elem_count).w()?; - CudaStorageSlice::BF16(data) - } - DType::F16 => { - let data = self.alloc::(elem_count).w()?; - CudaStorageSlice::F16(data) - } - DType::F32 => { - let data = self.alloc::(elem_count).w()?; - CudaStorageSlice::F32(data) - } - DType::F64 => { - let data = self.alloc::(elem_count).w()?; - CudaStorageSlice::F64(data) - } - }; - Ok(CudaStorage { - slice, - device: self.clone(), - }) - } - - fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result { - let slice = match storage { - CpuStorage::U8(storage) => { - let data = self.htod_sync_copy(storage).w()?; - CudaStorageSlice::U8(data) - } - CpuStorage::U32(storage) => { - let data = self.htod_sync_copy(storage).w()?; - CudaStorageSlice::U32(data) - } - CpuStorage::I64(storage) => { - let data = self.htod_sync_copy(storage).w()?; - CudaStorageSlice::I64(data) - } - CpuStorage::BF16(storage) => { - let data = self.htod_sync_copy(storage).w()?; - CudaStorageSlice::BF16(data) - } - CpuStorage::F16(storage) => { - let data = self.htod_sync_copy(storage).w()?; - CudaStorageSlice::F16(data) - } - CpuStorage::F32(storage) => { - let data = self.htod_sync_copy(storage).w()?; - CudaStorageSlice::F32(data) - } - CpuStorage::F64(storage) => { - let data = self.htod_sync_copy(storage).w()?; - CudaStorageSlice::F64(data) - } - }; - Ok(CudaStorage { - slice, - device: self.clone(), - }) - } - - fn storage_from_cpu_storage_owned(&self, storage: CpuStorage) -> Result { - let slice = match storage { - CpuStorage::U8(storage) => { - let data = self.htod_copy(storage).w()?; - CudaStorageSlice::U8(data) - } - CpuStorage::U32(storage) => { - let data = self.htod_copy(storage).w()?; - CudaStorageSlice::U32(data) - } - CpuStorage::I64(storage) => { - let data = self.htod_copy(storage).w()?; - CudaStorageSlice::I64(data) - } - CpuStorage::BF16(storage) => { - let data = self.htod_copy(storage).w()?; - CudaStorageSlice::BF16(data) - } - CpuStorage::F16(storage) => { - let data = self.htod_copy(storage).w()?; - CudaStorageSlice::F16(data) - } - CpuStorage::F32(storage) => { - let data = self.htod_copy(storage).w()?; - CudaStorageSlice::F32(data) - } - CpuStorage::F64(storage) => { - let data = self.htod_copy(storage).w()?; - CudaStorageSlice::F64(data) - } - }; - Ok(CudaStorage { - slice, - device: self.clone(), - }) - } -} - #[derive(Debug)] pub enum CudaStorageSlice { U8(CudaSlice), @@ -507,133 +113,6 @@ pub enum CudaStorageSlice { F32(CudaSlice), F64(CudaSlice), } -type S = CudaStorageSlice; - -pub trait Map1 { - fn f( - &self, - src: 
&CudaSlice, - dev: &CudaDevice, - layout: &Layout, - ) -> Result>; - - fn map(&self, s: &S, d: &CudaDevice, l: &Layout) -> Result { - let out = match s { - S::U8(s) => S::U8(self.f(s, d, l)?), - S::U32(s) => S::U32(self.f(s, d, l)?), - S::I64(s) => S::I64(self.f(s, d, l)?), - S::BF16(s) => S::BF16(self.f(s, d, l)?), - S::F16(s) => S::F16(self.f(s, d, l)?), - S::F32(s) => S::F32(self.f(s, d, l)?), - S::F64(s) => S::F64(self.f(s, d, l)?), - }; - Ok(out) - } -} - -pub trait Map2 { - fn f( - &self, - src1: &CudaSlice, - layout1: &Layout, - src2: &CudaSlice, - layout2: &Layout, - dev: &CudaDevice, - ) -> Result>; - - fn map(&self, s1: &S, l1: &Layout, s2: &S, l2: &Layout, d: &CudaDevice) -> Result { - let out = match (s1, s2) { - (S::U8(s1), S::U8(s2)) => S::U8(self.f(s1, l1, s2, l2, d)?), - (S::U32(s1), S::U32(s2)) => S::U32(self.f(s1, l1, s2, l2, d)?), - (S::I64(s1), S::I64(s2)) => S::I64(self.f(s1, l1, s2, l2, d)?), - (S::BF16(s1), S::BF16(s2)) => S::BF16(self.f(s1, l1, s2, l2, d)?), - (S::F16(s1), S::F16(s2)) => S::F16(self.f(s1, l1, s2, l2, d)?), - (S::F32(s1), S::F32(s2)) => S::F32(self.f(s1, l1, s2, l2, d)?), - (S::F64(s1), S::F64(s2)) => S::F64(self.f(s1, l1, s2, l2, d)?), - _ => Err(CudaError::InternalError("dtype mismatch in binary op"))?, - }; - Ok(out) - } -} - -pub trait Map2InPlace { - fn f( - &self, - dst: &mut CudaSlice, - dst_shape: &Shape, - src: &CudaSlice, - src_l: &Layout, - dev: &CudaDevice, - ) -> Result<()>; - - fn map( - &self, - dst: &mut S, - dst_s: &Shape, - src: &S, - src_l: &Layout, - d: &CudaDevice, - ) -> Result<()> { - match (dst, src) { - (S::U8(dst), S::U8(src)) => self.f(dst, dst_s, src, src_l, d), - (S::U32(dst), S::U32(src)) => self.f(dst, dst_s, src, src_l, d), - (S::I64(dst), S::I64(src)) => self.f(dst, dst_s, src, src_l, d), - (S::BF16(dst), S::BF16(src)) => self.f(dst, dst_s, src, src_l, d), - (S::F16(dst), S::F16(src)) => self.f(dst, dst_s, src, src_l, d), - (S::F32(dst), S::F32(src)) => self.f(dst, dst_s, src, src_l, d), - (S::F64(dst), S::F64(src)) => self.f(dst, dst_s, src, src_l, d), - _ => Err(CudaError::InternalError("dtype mismatch in binary op"))?, - } - } -} - -pub trait Map1Any { - fn f) -> S>( - &self, - src: &CudaSlice, - dev: &CudaDevice, - layout: &Layout, - wrap: W, - ) -> Result; - - fn map(&self, s: &S, d: &CudaDevice, l: &Layout) -> Result { - let out = match s { - S::U8(s) => self.f(s, d, l, S::U8)?, - S::U32(s) => self.f(s, d, l, S::U32)?, - S::I64(s) => self.f(s, d, l, S::I64)?, - S::BF16(s) => self.f(s, d, l, S::BF16)?, - S::F16(s) => self.f(s, d, l, S::F16)?, - S::F32(s) => self.f(s, d, l, S::F32)?, - S::F64(s) => self.f(s, d, l, S::F64)?, - }; - Ok(out) - } -} - -pub trait Map2Any { - fn f( - &self, - src1: &CudaSlice, - layout1: &Layout, - src2: &CudaSlice, - layout2: &Layout, - dev: &CudaDevice, - ) -> Result; - - fn map(&self, s1: &S, l1: &Layout, s2: &S, l2: &Layout, d: &CudaDevice) -> Result { - let out = match (s1, s2) { - (S::U8(s1), S::U8(s2)) => self.f(s1, l1, s2, l2, d)?, - (S::U32(s1), S::U32(s2)) => self.f(s1, l1, s2, l2, d)?, - (S::I64(s1), S::I64(s2)) => self.f(s1, l1, s2, l2, d)?, - (S::BF16(s1), S::BF16(s2)) => self.f(s1, l1, s2, l2, d)?, - (S::F16(s1), S::F16(s2)) => self.f(s1, l1, s2, l2, d)?, - (S::F32(s1), S::F32(s2)) => self.f(s1, l1, s2, l2, d)?, - (S::F64(s1), S::F64(s2)) => self.f(s1, l1, s2, l2, d)?, - _ => Err(CudaError::InternalError("dtype mismatch in binary op")).w()?, - }; - Ok(out) - } -} struct Clone; impl Map1 for Clone { diff --git a/candle-core/src/cuda_backend/utils.rs 
b/candle-core/src/cuda_backend/utils.rs
new file mode 100644
index 00000000..8dd5be77
--- /dev/null
+++ b/candle-core/src/cuda_backend/utils.rs
@@ -0,0 +1,134 @@
+/// Helper functions to plug cuda kernels in candle.
+use crate::{Layout, Result, Shape, WithDType};
+pub use cudarc;
+use cudarc::driver::{CudaSlice, DeviceRepr, ValidAsZeroBits};
+
+use super::{CudaDevice, CudaError, WrapErr};
+
+pub type S = super::CudaStorageSlice;
+
+pub trait Map1 {
+    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
+        &self,
+        src: &CudaSlice<T>,
+        dev: &CudaDevice,
+        layout: &Layout,
+    ) -> Result<CudaSlice<T>>;
+
+    fn map(&self, s: &S, d: &CudaDevice, l: &Layout) -> Result<S> {
+        let out = match s {
+            S::U8(s) => S::U8(self.f(s, d, l)?),
+            S::U32(s) => S::U32(self.f(s, d, l)?),
+            S::I64(s) => S::I64(self.f(s, d, l)?),
+            S::BF16(s) => S::BF16(self.f(s, d, l)?),
+            S::F16(s) => S::F16(self.f(s, d, l)?),
+            S::F32(s) => S::F32(self.f(s, d, l)?),
+            S::F64(s) => S::F64(self.f(s, d, l)?),
+        };
+        Ok(out)
+    }
+}
+
+pub trait Map2 {
+    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
+        &self,
+        src1: &CudaSlice<T>,
+        layout1: &Layout,
+        src2: &CudaSlice<T>,
+        layout2: &Layout,
+        dev: &CudaDevice,
+    ) -> Result<CudaSlice<T>>;
+
+    fn map(&self, s1: &S, l1: &Layout, s2: &S, l2: &Layout, d: &CudaDevice) -> Result<S> {
+        let out = match (s1, s2) {
+            (S::U8(s1), S::U8(s2)) => S::U8(self.f(s1, l1, s2, l2, d)?),
+            (S::U32(s1), S::U32(s2)) => S::U32(self.f(s1, l1, s2, l2, d)?),
+            (S::I64(s1), S::I64(s2)) => S::I64(self.f(s1, l1, s2, l2, d)?),
+            (S::BF16(s1), S::BF16(s2)) => S::BF16(self.f(s1, l1, s2, l2, d)?),
+            (S::F16(s1), S::F16(s2)) => S::F16(self.f(s1, l1, s2, l2, d)?),
+            (S::F32(s1), S::F32(s2)) => S::F32(self.f(s1, l1, s2, l2, d)?),
+            (S::F64(s1), S::F64(s2)) => S::F64(self.f(s1, l1, s2, l2, d)?),
+            _ => Err(CudaError::InternalError("dtype mismatch in binary op"))?,
+        };
+        Ok(out)
+    }
+}
+
+pub trait Map2InPlace {
+    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
+        &self,
+        dst: &mut CudaSlice<T>,
+        dst_shape: &Shape,
+        src: &CudaSlice<T>,
+        src_l: &Layout,
+        dev: &CudaDevice,
+    ) -> Result<()>;
+
+    fn map(
+        &self,
+        dst: &mut S,
+        dst_s: &Shape,
+        src: &S,
+        src_l: &Layout,
+        d: &CudaDevice,
+    ) -> Result<()> {
+        match (dst, src) {
+            (S::U8(dst), S::U8(src)) => self.f(dst, dst_s, src, src_l, d),
+            (S::U32(dst), S::U32(src)) => self.f(dst, dst_s, src, src_l, d),
+            (S::I64(dst), S::I64(src)) => self.f(dst, dst_s, src, src_l, d),
+            (S::BF16(dst), S::BF16(src)) => self.f(dst, dst_s, src, src_l, d),
+            (S::F16(dst), S::F16(src)) => self.f(dst, dst_s, src, src_l, d),
+            (S::F32(dst), S::F32(src)) => self.f(dst, dst_s, src, src_l, d),
+            (S::F64(dst), S::F64(src)) => self.f(dst, dst_s, src, src_l, d),
+            _ => Err(CudaError::InternalError("dtype mismatch in binary op"))?,
+        }
+    }
+}
+
+pub trait Map1Any {
+    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits, W: Fn(CudaSlice<T>) -> S>(
+        &self,
+        src: &CudaSlice<T>,
+        dev: &CudaDevice,
+        layout: &Layout,
+        wrap: W,
+    ) -> Result<S>;
+
+    fn map(&self, s: &S, d: &CudaDevice, l: &Layout) -> Result<S> {
+        let out = match s {
+            S::U8(s) => self.f(s, d, l, S::U8)?,
+            S::U32(s) => self.f(s, d, l, S::U32)?,
+            S::I64(s) => self.f(s, d, l, S::I64)?,
+            S::BF16(s) => self.f(s, d, l, S::BF16)?,
+            S::F16(s) => self.f(s, d, l, S::F16)?,
+            S::F32(s) => self.f(s, d, l, S::F32)?,
+            S::F64(s) => self.f(s, d, l, S::F64)?,
+        };
+        Ok(out)
+    }
+}
+
+pub trait Map2Any {
+    fn f<T: DeviceRepr + WithDType + ValidAsZeroBits>(
+        &self,
+        src1: &CudaSlice<T>,
+        layout1: &Layout,
+        src2: &CudaSlice<T>,
+        layout2: &Layout,
+        dev: &CudaDevice,
+    ) -> Result<S>;
+
+    fn map(&self, s1: &S, l1: &Layout, s2: &S, l2: &Layout, d: &CudaDevice) -> Result<S> {
+        let out = match (s1, s2) {
+            (S::U8(s1), S::U8(s2)) => self.f(s1, l1, s2, l2, d)?,
+            (S::U32(s1), S::U32(s2)) => self.f(s1, l1, s2, l2, d)?,
+            (S::I64(s1), S::I64(s2)) => self.f(s1, l1, s2, l2, d)?,
+            (S::BF16(s1), S::BF16(s2)) => self.f(s1, l1, s2, l2, d)?,
+            (S::F16(s1), S::F16(s2)) => self.f(s1, l1, s2, l2, d)?,
+            (S::F32(s1), S::F32(s2)) => self.f(s1, l1, s2, l2, d)?,
+            (S::F64(s1), S::F64(s2)) => self.f(s1, l1, s2, l2, d)?,
+            _ => Err(CudaError::InternalError("dtype mismatch in binary op")).w()?,
+        };
+        Ok(out)
+    }
+}
diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs
index 911e379f..862436ab 100644
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@@ -43,8 +43,6 @@ pub mod cpu;
 pub mod cpu_backend;
 #[cfg(feature = "cuda")]
 pub mod cuda_backend;
-#[cfg(feature = "cudnn")]
-pub mod cudnn;
 mod custom_op;
 mod device;
 pub mod display;
@@ -73,6 +71,9 @@ pub mod test_utils;
 pub mod utils;
 mod variable;
 
+#[cfg(feature = "cudnn")]
+pub use cuda_backend::cudnn;
+
 pub use cpu_backend::CpuStorage;
 pub use custom_op::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3};
 pub use device::{Device, DeviceLocation, NdArray};
diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs
new file mode 100644
index 00000000..fdeca13f
--- /dev/null
+++ b/candle-core/src/metal_backend/device.rs
@@ -0,0 +1,287 @@
+use crate::{DType, Result};
+use candle_metal_kernels::Kernels;
+use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger};
+use std::collections::HashMap;
+use std::ffi::c_void;
+use std::path::Path;
+use std::sync::{Arc, Mutex, RwLock, RwLockWriteGuard};
+
+use super::MetalError;
+
+/// Unique identifier for metal devices.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub struct DeviceId(usize);
+
+impl DeviceId {
+    pub(crate) fn new() -> Self {
+        // https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805
+        use std::sync::atomic;
+        static COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::new(1);
+        Self(COUNTER.fetch_add(1, atomic::Ordering::Relaxed))
+    }
+}
+
+type BufferMap = HashMap<(NSUInteger, MTLResourceOptions), Vec<Arc<Buffer>>>;
+type AllocatedBuffers = Arc<RwLock<BufferMap>>;
+
+#[derive(Clone)]
+pub struct MetalDevice {
+    /// Unique identifier, the registryID is not sufficient as it identifies the GPU rather than
+    /// the device itself.
+    pub(crate) id: DeviceId,
+
+    /// Raw metal device:
+    pub(crate) device: metal::Device,
+
+    /// Single command queue for the entire device.
+    pub(crate) command_queue: CommandQueue,
+    /// One command buffer at a time.
+    /// The scheduler works by allowing multiple
+    /// [ComputeCommandEncoder](https://developer.apple.com/documentation/metal/mtlcomputecommandencoder?language=objc)
+    /// on a single command buffer. Using a single command buffer would be fastest on the GPU but
+    /// prevents overlapping of CPU and GPU commands (because command buffer needs to be committed
+    /// to start to work).
+    /// Despite what the documentation says, command buffers are NOT ordered. They are ordered
+    /// for their START time, but there's no guarantee that command buffer1 will finish before
+    /// command buffer2 starts (or there are metal bugs there)
+    pub(crate) command_buffer: Arc<RwLock<CommandBuffer>>,
+    /// Keeps track of the current number of compute command encoders on the current
+    /// command buffer
+    /// Arc, RwLock because of the interior mutability.
+    pub(crate) command_buffer_index: Arc<RwLock<usize>>,
+    /// The maximum number of [compute command encoders](https://developer.apple.com/documentation/metal/mtlcomputecommandencoder?language=objc) per [command buffer](https://developer.apple.com/documentation/metal/mtlcommandbuffer?language=objc)
+    pub(crate) compute_per_buffer: usize,
+    /// Simple keeper struct to keep track of the already compiled kernels so we can reuse them.
+    /// Heavily used by [`candle_metal_kernels`]
+    pub(crate) kernels: Arc<Kernels>,
+    /// Simple allocator struct.
+    /// The buffers are stored in size buckets since ML tends to use similar shapes over and over.
+    /// We store the buffers in [`Arc`] because it's much faster than Obj-c internal ref counting
+    /// (could be linked to FFI communication overhead).
+    ///
+    /// Whenever a buffer has a strong_count==1, we can reuse it, it means it was dropped in the
+    /// graph calculation, and only we the allocator kept a reference to it, therefore it's free
+    /// to be reused. However, in order for this to work, we need to guarantee the order of
+    /// operation, so that this buffer is not being used by another kernel at the same time.
+    /// Arc is the CPU reference count, it doesn't mean anything on the GPU side of things.
+    ///
+    /// Whenever we actually allocate a new buffer, we make a full sweep to clean up unused buffers
+    /// (strong_count = 1).
+    pub(crate) buffers: AllocatedBuffers,
+    /// Seed for random number generation.
+    pub(crate) seed: Arc<Mutex<Buffer>>,
+}
+
+impl std::fmt::Debug for MetalDevice {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "MetalDevice({:?})", self.id)
+    }
+}
+
+impl std::ops::Deref for MetalDevice {
+    type Target = metal::DeviceRef;
+
+    fn deref(&self) -> &Self::Target {
+        &self.device
+    }
+}
+
+impl MetalDevice {
+    pub fn id(&self) -> DeviceId {
+        self.id
+    }
+
+    pub fn metal_device(&self) -> &metal::Device {
+        &self.device
+    }
+
+    pub fn command_queue(&self) -> &CommandQueue {
+        &self.command_queue
+    }
+
+    pub fn command_buffer(&self) -> Result<CommandBuffer> {
+        let mut command_buffer_lock = self.command_buffer.try_write().map_err(MetalError::from)?;
+        let mut command_buffer = command_buffer_lock.to_owned();
+        let mut index = self
+            .command_buffer_index
+            .try_write()
+            .map_err(MetalError::from)?;
+        if *index > self.compute_per_buffer {
+            command_buffer.commit();
+            command_buffer = self.command_queue.new_command_buffer().to_owned();
+            *command_buffer_lock = command_buffer.clone();
+            *index = 0;
+
+            self.drop_unused_buffers()?;
+        }
+        *index += 1;
+        Ok(command_buffer)
+    }
+
+    pub fn wait_until_completed(&self) -> Result<()> {
+        let mut command_buffer = self.command_buffer.try_write().map_err(MetalError::from)?;
+        match command_buffer.status() {
+            metal::MTLCommandBufferStatus::Committed
+            | metal::MTLCommandBufferStatus::Scheduled
+            | metal::MTLCommandBufferStatus::Completed => {
+                panic!("Already committed");
+            }
+            _ => {}
+        }
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
+        *command_buffer = self.command_queue.new_command_buffer().to_owned();
+
+        Ok(())
+    }
+
+    pub fn kernels(&self) -> &Kernels {
+        &self.kernels
+    }
+
+    pub fn device(&self) -> &metal::Device {
+        &self.device
+    }
+
+    /// Creates a new buffer (not necessarily zeroed).
+    /// The buffer is [MTLPrivate](https://developer.apple.com/documentation/metal/mtlstoragemode)
+    /// This means the buffer data cannot be read on the CPU directly.
+    ///
+    /// [`name`] is only used to keep track of the resource origin in case of bugs
+    pub fn new_buffer(
+        &self,
+        element_count: usize,
+        dtype: DType,
+        name: &str,
+    ) -> Result<Arc<Buffer>> {
+        let size = (element_count * dtype.size_in_bytes()) as NSUInteger;
+        self.allocate_buffer(size, MTLResourceOptions::StorageModePrivate, name)
+    }
+
+    /// Creates a new buffer (not necessarily zeroed).
+    /// The buffer is [MTLManaged](https://developer.apple.com/documentation/metal/mtlstoragemode)
+    /// This means the buffer can be read on the CPU but will require manual
+    /// synchronization when the CPU memory is modified
+    /// Used as a bridge to gather data back from the GPU
+    pub fn new_buffer_managed(&self, size: NSUInteger) -> Result<Arc<Buffer>> {
+        self.allocate_buffer(size, MTLResourceOptions::StorageModeManaged, "managed")
+    }
+
+    /// Creates a new buffer from data.
+    /// The buffer is [MTLManaged](https://developer.apple.com/documentation/metal/mtlstoragemode)
+    ///
+    /// Does not require synchronization, as [newBufferWithBytes](https://developer.apple.com/documentation/metal/mtldevice/1433429-newbufferwithbytes)
+    /// allocates the buffer and copies over the existing data before returning the MTLBuffer.
+    pub fn new_buffer_with_data<T>(&self, data: &[T]) -> Result<Arc<Buffer>> {
+        let size = core::mem::size_of_val(data) as NSUInteger;
+        let new_buffer = self.device.new_buffer_with_data(
+            data.as_ptr() as *const c_void,
+            size,
+            MTLResourceOptions::StorageModeManaged,
+        );
+        let mut buffers = self.buffers.try_write().map_err(MetalError::from)?;
+        let subbuffers = buffers
+            .entry((size, MTLResourceOptions::StorageModeManaged))
+            .or_insert(vec![]);
+
+        let new_buffer = Arc::new(new_buffer);
+        subbuffers.push(new_buffer.clone());
+        Ok(new_buffer)
+    }
+
+    pub fn allocate_zeros(&self, size_in_bytes: usize) -> Result<Arc<Buffer>> {
+        let buffer = self.allocate_buffer(
+            size_in_bytes as NSUInteger,
+            MTLResourceOptions::StorageModePrivate,
+            "allocate_zeros",
+        )?;
+        let command_buffer = self.command_buffer()?;
+        command_buffer.set_label("zeros");
+        let blit = command_buffer.new_blit_command_encoder();
+        blit.fill_buffer(
+            &buffer,
+            metal::NSRange {
+                location: 0,
+                length: buffer.length(),
+            },
+            0,
+        );
+        blit.end_encoding();
+        Ok(buffer)
+    }
+
+    fn find_available_buffer(
+        &self,
+        size: NSUInteger,
+        option: MTLResourceOptions,
+        buffers: &RwLockWriteGuard<BufferMap>,
+    ) -> Option<Arc<Buffer>> {
+        let mut best_buffer: Option<&Arc<Buffer>> = None;
+        let mut best_buffer_size: NSUInteger = NSUInteger::MAX;
+        for ((buffer_size, buffer_option), subbuffers) in buffers.iter() {
+            if buffer_size >= &size && buffer_size < &best_buffer_size && buffer_option == &option {
+                for sub in subbuffers {
+                    if Arc::strong_count(sub) == 1 {
+                        best_buffer = Some(sub);
+                        best_buffer_size = *buffer_size;
+                    }
+                }
+            }
+        }
+        best_buffer.cloned()
+    }
+
+    fn drop_unused_buffers(&self) -> Result<()> {
+        let mut buffers = self.buffers.try_write().map_err(MetalError::from)?;
+        for subbuffers in buffers.values_mut() {
+            let newbuffers = subbuffers
+                .iter()
+                .filter(|s| Arc::strong_count(*s) > 1)
+                .map(Arc::clone)
+                .collect();
+            *subbuffers = newbuffers;
+        }
+        Ok(())
+    }
+
+    /// The critical allocator algorithm
+    fn allocate_buffer(
+        &self,
+        size: NSUInteger,
+        option: MTLResourceOptions,
+        _name: &str,
+    ) -> Result<Arc<Buffer>> {
+        let mut buffers = self.buffers.try_write().map_err(MetalError::from)?;
+        if let Some(b) = self.find_available_buffer(size, option, &buffers) {
+            // Cloning also ensures we increment the strong count
+            return Ok(b.clone());
+        }
+
+        let size = buf_size(size);
+        let subbuffers = buffers.entry((size, option)).or_insert(vec![]);
+
+        let new_buffer = self.device.new_buffer(size as NSUInteger, option);
+        let new_buffer = Arc::new(new_buffer);
+        subbuffers.push(new_buffer.clone());
+
+        Ok(new_buffer)
+    }
+
+    /// Create a metal GPU capture trace on [`path`].
+    pub fn capture<P: AsRef<Path>>(&self, path: P) -> Result<()> {
+        let capture = metal::CaptureManager::shared();
+        let descriptor = metal::CaptureDescriptor::new();
+        descriptor.set_destination(metal::MTLCaptureDestination::GpuTraceDocument);
+        descriptor.set_capture_device(self);
+        descriptor.set_output_url(path);
+
+        capture
+            .start_capture(&descriptor)
+            .map_err(MetalError::from)?;
+        Ok(())
+    }
+}
+
+fn buf_size(size: NSUInteger) -> NSUInteger {
+    (size - 1).next_power_of_two() as NSUInteger
+}
diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend/mod.rs
similarity index 86%
rename from candle-core/src/metal_backend.rs
rename to candle-core/src/metal_backend/mod.rs
index fed7db13..deb7a401 100644
--- a/candle-core/src/metal_backend.rs
+++ b/candle-core/src/metal_backend/mod.rs
@@ -4,24 +4,13 @@ use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
 use crate::{CpuStorage, DType, Layout, Result, Shape};
 use candle_metal_kernels::CallConvTranspose2dCfg;
 use candle_metal_kernels::Kernels;
-use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger};
+use metal::{Buffer, MTLResourceOptions, NSUInteger};
 use std::collections::HashMap;
 use std::ffi::c_void;
-use std::path::Path;
-use std::sync::{Arc, Mutex, RwLock, RwLockWriteGuard, TryLockError};
+use std::sync::{Arc, Mutex, RwLock, TryLockError};
 
-/// Unique identifier for cuda devices.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
-pub struct DeviceId(usize);
-
-impl DeviceId {
-    fn new() -> Self {
-        // https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805
-        use std::sync::atomic;
-        static COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::new(1);
-        Self(COUNTER.fetch_add(1, atomic::Ordering::Relaxed))
-    }
-}
+mod device;
+pub use device::{DeviceId, MetalDevice};
 
 /// Simple way to catch lock error without
 /// depending on T
@@ -49,13 +38,6 @@ pub enum MetalError {
     Message(String),
     #[error(transparent)]
     KernelError(#[from] candle_metal_kernels::MetalKernelError),
-
-    #[error("matmul is only supported for contiguous tensors lstride: {lhs_stride:?} rstride: {rhs_stride:?} mnk: {mnk:?}")]
-    MatMulNonContiguous {
-        lhs_stride: Vec<usize>,
-        rhs_stride: Vec<usize>,
-        mnk: (usize, usize, usize),
-    },
     #[error("{0:?}")]
     LockError(LockError),
     #[error("{msg}, expected: {expected:?}, got: {got:?}")]
@@ -72,267 +54,6 @@ impl<T> From<TryLockError<T>> for MetalError {
     }
 }
 
-type BufferMap = HashMap<(NSUInteger, MTLResourceOptions), Vec<Arc<Buffer>>>;
-type AllocatedBuffers = Arc<RwLock<BufferMap>>;
-
-#[derive(Clone)]
-pub struct MetalDevice {
-    /// Unique identifier, the registryID is not sufficient as it identifies the GPU rather than
-    /// the device itself.
-    id: DeviceId,
-
-    /// Raw metal device:
-    device: metal::Device,
-
-    /// Single command queue for the entire device.
-    command_queue: CommandQueue,
-    /// One command buffer at a time.
-    /// The scheduler works by allowing multiple
-    /// [ComputeCommandEncoder](https://developer.apple.com/documentation/metal/mtlcomputecommandencoder?language=objc)
-    /// on a single command buffer. Using a single command buffer would be fastest on the GPU but
-    /// prevents overlapping of CPU and GPU commands (because command buffer needs to be committed
-    /// to start to work).
- /// Despite what the documentation says, command buffers are NOT ordered. They are ordered - /// for their START time, but there's no guarantee that command buffer1 will finish before - /// command buffer2 starts (or there are metal bugs there) - command_buffer: Arc>, - /// Keeps track of the current amount of compute command encoders on the current - /// command buffer - /// Arc, RwLock because of the interior mutability. - command_buffer_index: Arc>, - /// The maximum amount of [compute command encoder](https://developer.apple.com/documentation/metal/mtlcomputecommandencoder?language=objc) per [command buffer](https://developer.apple.com/documentation/metal/mtlcommandbuffer?language=objc) - compute_per_buffer: usize, - /// Simple keeper struct to keep track of the already compiled kernels so we can reuse them. - /// Heavily used by [`candle_metal_kernels`] - kernels: Arc, - /// Simple allocator struct. - /// The buffers are stored in size buckets since ML tends to use similar shapes over and over. - /// We store the buffers in [`Arc`] because it's much faster than Obj-c internal ref counting - /// (could be linked to FFI communication overhead). - /// - /// Whenever a buffer has a strong_count==1, we can reuse it, it means it was dropped in the - /// graph calculation, and only we the allocator kept a reference to it, therefore it's free - /// to be reused. However, in order for this to work, we need to guarantee the order of - /// operation, so that this buffer is not being used by another kernel at the same time. - /// Arc is the CPU reference count, it doesn't mean anything on the GPU side of things. - /// - /// Whenever we actually allocate a new buffer, we make a full sweep to clean up unused buffers - /// (strong_count = 1). - buffers: AllocatedBuffers, - /// Seed for random number generation. 
- seed: Arc>, -} - -impl std::fmt::Debug for MetalDevice { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "MetalDevice({:?})", self.id) - } -} - -impl std::ops::Deref for MetalDevice { - type Target = metal::DeviceRef; - - fn deref(&self) -> &Self::Target { - &self.device - } -} - -impl MetalDevice { - pub fn id(&self) -> DeviceId { - self.id - } - - pub fn metal_device(&self) -> &metal::Device { - &self.device - } - - pub fn command_queue(&self) -> &CommandQueue { - &self.command_queue - } - - pub fn command_buffer(&self) -> Result { - let mut command_buffer_lock = self.command_buffer.try_write().map_err(MetalError::from)?; - let mut command_buffer = command_buffer_lock.to_owned(); - let mut index = self - .command_buffer_index - .try_write() - .map_err(MetalError::from)?; - if *index > self.compute_per_buffer { - command_buffer.commit(); - command_buffer = self.command_queue.new_command_buffer().to_owned(); - *command_buffer_lock = command_buffer.clone(); - *index = 0; - - self.drop_unused_buffers()?; - } - *index += 1; - Ok(command_buffer) - } - - pub fn wait_until_completed(&self) -> Result<()> { - let mut command_buffer = self.command_buffer.try_write().map_err(MetalError::from)?; - match command_buffer.status() { - metal::MTLCommandBufferStatus::Committed - | metal::MTLCommandBufferStatus::Scheduled - | metal::MTLCommandBufferStatus::Completed => { - panic!("Already committed"); - } - _ => {} - } - command_buffer.commit(); - command_buffer.wait_until_completed(); - *command_buffer = self.command_queue.new_command_buffer().to_owned(); - - Ok(()) - } - - pub fn kernels(&self) -> &Kernels { - &self.kernels - } - - pub fn device(&self) -> &metal::Device { - &self.device - } - - /// Creates a new buffer (not necessarily zeroed). - /// The buffer is [MTLPrivate](https://developer.apple.com/documentation/metal/mtlstoragemode) - /// This means the buffer data cannot be read on the CPU directly. - /// - /// [`name`] is only used to keep track of the resource origin in case of bugs - pub fn new_buffer( - &self, - element_count: usize, - dtype: DType, - name: &str, - ) -> Result> { - let size = (element_count * dtype.size_in_bytes()) as NSUInteger; - self.allocate_buffer(size, MTLResourceOptions::StorageModePrivate, name) - } - - /// Creates a new buffer (not necessarily zeroed). - /// The buffer is [MTLManaged](https://developer.apple.com/documentation/metal/mtlstoragemode) - /// This means the buffer can be read on the CPU but will require manual - /// synchronization when the CPU memory is modified - /// Used as a bridge to gather data back from the GPU - pub fn new_buffer_managed(&self, size: NSUInteger) -> Result> { - self.allocate_buffer(size, MTLResourceOptions::StorageModeManaged, "managed") - } - - /// Creates a new buffer from data. - /// The buffer is [MTLManaged](https://developer.apple.com/documentation/metal/mtlstoragemode) - /// - /// Does not require synchronization, as [newBufferWithBytes](https://developer.apple.com/documentation/metal/mtldevice/1433429-newbufferwithbytes) - /// allocates the buffer and copies over the existing data before returning the MTLBuffer. 
- pub fn new_buffer_with_data(&self, data: &[T]) -> Result> { - let size = core::mem::size_of_val(data) as NSUInteger; - let new_buffer = self.device.new_buffer_with_data( - data.as_ptr() as *const c_void, - size, - MTLResourceOptions::StorageModeManaged, - ); - let mut buffers = self.buffers.try_write().map_err(MetalError::from)?; - let subbuffers = buffers - .entry((size, MTLResourceOptions::StorageModeManaged)) - .or_insert(vec![]); - - let new_buffer = Arc::new(new_buffer); - subbuffers.push(new_buffer.clone()); - Ok(new_buffer) - } - - pub fn allocate_zeros(&self, size_in_bytes: usize) -> Result> { - let buffer = self.allocate_buffer( - size_in_bytes as NSUInteger, - MTLResourceOptions::StorageModePrivate, - "allocate_zeros", - )?; - let command_buffer = self.command_buffer()?; - command_buffer.set_label("zeros"); - let blit = command_buffer.new_blit_command_encoder(); - blit.fill_buffer( - &buffer, - metal::NSRange { - location: 0, - length: buffer.length(), - }, - 0, - ); - blit.end_encoding(); - Ok(buffer) - } - - fn find_available_buffer( - &self, - size: NSUInteger, - option: MTLResourceOptions, - buffers: &RwLockWriteGuard, - ) -> Option> { - let mut best_buffer: Option<&Arc> = None; - let mut best_buffer_size: NSUInteger = NSUInteger::MAX; - for ((buffer_size, buffer_option), subbuffers) in buffers.iter() { - if buffer_size >= &size && buffer_size < &best_buffer_size && buffer_option == &option { - for sub in subbuffers { - if Arc::strong_count(sub) == 1 { - best_buffer = Some(sub); - best_buffer_size = *buffer_size; - } - } - } - } - best_buffer.cloned() - } - - fn drop_unused_buffers(&self) -> Result<()> { - let mut buffers = self.buffers.try_write().map_err(MetalError::from)?; - for subbuffers in buffers.values_mut() { - let newbuffers = subbuffers - .iter() - .filter(|s| Arc::strong_count(*s) > 1) - .map(Arc::clone) - .collect(); - *subbuffers = newbuffers; - } - Ok(()) - } - - /// The critical allocator algorithm - fn allocate_buffer( - &self, - size: NSUInteger, - option: MTLResourceOptions, - _name: &str, - ) -> Result> { - let mut buffers = self.buffers.try_write().map_err(MetalError::from)?; - if let Some(b) = self.find_available_buffer(size, option, &buffers) { - // Cloning also ensures we increment the strong count - return Ok(b.clone()); - } - - let size = buf_size(size); - let subbuffers = buffers.entry((size, option)).or_insert(vec![]); - - let new_buffer = self.device.new_buffer(size as NSUInteger, option); - let new_buffer = Arc::new(new_buffer); - subbuffers.push(new_buffer.clone()); - - Ok(new_buffer) - } - - /// Create a metal GPU capture trace on [`path`]. - pub fn capture>(&self, path: P) -> Result<()> { - let capture = metal::CaptureManager::shared(); - let descriptor = metal::CaptureDescriptor::new(); - descriptor.set_destination(metal::MTLCaptureDestination::GpuTraceDocument); - descriptor.set_capture_device(self); - descriptor.set_output_url(path); - - capture - .start_capture(&descriptor) - .map_err(MetalError::from)?; - Ok(()) - } -} - #[derive(Debug, Clone)] pub struct MetalStorage { /// The actual buffer containing the data. 
@@ -2055,10 +1776,6 @@ impl BackendDevice for MetalDevice {
     }
 }
 
-fn buf_size(size: NSUInteger) -> NSUInteger {
-    (size - 1).next_power_of_two() as NSUInteger
-}
-
 fn read_to_vec<T: Clone>(buffer: &Buffer, n: usize) -> Vec<T> {
     let ptr = buffer.contents() as *const T;
     assert!(!ptr.is_null());
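One practical consequence of the seeding code in the refactored device modules is worth spelling out: `set_seed` rebuilds the generator from scratch rather than mutating its state, so re-seeding a device replays the exact same random stream. A minimal sketch of that invariant, assuming a build where the device supports seeding and using the public `candle_core` API (the shape and seed values here are arbitrary):

```rust
use candle_core::{Device, Result, Tensor};

fn check_seed_replay(device: &Device) -> Result<()> {
    // Re-creating the generator from the same seed resets its state entirely,
    // so two draws separated by a re-seed must be identical.
    device.set_seed(42)?;
    let a = Tensor::rand(0f32, 1f32, (2, 3), device)?;
    device.set_seed(42)?;
    let b = Tensor::rand(0f32, 1f32, (2, 3), device)?;
    assert_eq!(a.to_vec2::<f32>()?, b.to_vec2::<f32>()?);
    Ok(())
}
```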
From efe4a0c84b55b60f7555a89ea7e0ba8d300104cd Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Sat, 30 Mar 2024 11:34:33 +0100
Subject: [PATCH 084/131] Add a print command to tensor-tools. (#1967)

* Add a print command to tensor-tools.

* Add some flags to tweak the formatting.
---
 candle-core/examples/tensor-tools.rs | 124 +++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)

diff --git a/candle-core/examples/tensor-tools.rs b/candle-core/examples/tensor-tools.rs
index 1801ac58..5dc49cd8 100644
--- a/candle-core/examples/tensor-tools.rs
+++ b/candle-core/examples/tensor-tools.rs
@@ -117,6 +117,24 @@ enum Command {
         verbose: bool,
     },
 
+    Print {
+        file: std::path::PathBuf,
+
+        names: Vec<String>,
+
+        /// The file format to use, if unspecified infer from the file extension.
+        #[arg(long, value_enum)]
+        format: Option<Format>,
+
+        /// Print the whole content of each tensor.
+        #[arg(long)]
+        full: bool,
+
+        /// Line width for printing the tensors.
+        #[arg(long)]
+        line_width: Option<usize>,
+    },
+
     Quantize {
         /// The input file(s), in safetensors format.
         in_file: Vec<std::path::PathBuf>,
@@ -150,6 +168,105 @@ struct Args {
     command: Command,
 }
 
+fn run_print(
+    file: &std::path::PathBuf,
+    names: Vec<String>,
+    format: Option<Format>,
+    full: bool,
+    line_width: Option<usize>,
+    device: &Device,
+) -> Result<()> {
+    if full {
+        candle_core::display::set_print_options_full();
+    }
+    if let Some(line_width) = line_width {
+        candle_core::display::set_line_width(line_width)
+    }
+    let format = match format {
+        Some(format) => format,
+        None => match Format::infer(file) {
+            Some(format) => format,
+            None => {
+                println!(
+                    "{file:?}: cannot infer format from file extension, use the --format flag"
+                );
+                return Ok(());
+            }
+        },
+    };
+    match format {
+        Format::Npz => {
+            let tensors = candle_core::npy::NpzTensors::new(file)?;
+            for name in names.iter() {
+                println!("==== {name} ====");
+                match tensors.get(name)? {
+                    Some(tensor) => println!("{tensor}"),
+                    None => println!("not found"),
+                }
+            }
+        }
+        Format::Safetensors => {
+            use candle_core::safetensors::Load;
+            let tensors = unsafe { candle_core::safetensors::MmapedSafetensors::new(file)? };
+            let tensors: std::collections::HashMap<_, _> = tensors.tensors().into_iter().collect();
+            for name in names.iter() {
+                println!("==== {name} ====");
+                match tensors.get(name) {
+                    Some(tensor_view) => {
+                        let tensor = tensor_view.load(device)?;
+                        println!("{tensor}")
+                    }
+                    None => println!("not found"),
+                }
+            }
+        }
+        Format::Pth => {
+            let pth_file = candle_core::pickle::PthTensors::new(file, None)?;
+            for name in names.iter() {
+                println!("==== {name} ====");
+                match pth_file.get(name)? {
+                    Some(tensor) => {
+                        println!("{tensor}")
+                    }
+                    None => println!("not found"),
+                }
+            }
+        }
+        Format::Pickle => {
+            candle_core::bail!("pickle format is not supported for print")
+        }
+        Format::Ggml => {
+            let mut file = std::fs::File::open(file)?;
+            let content = candle_core::quantized::ggml_file::Content::read(&mut file, device)?;
+            for name in names.iter() {
+                println!("==== {name} ====");
+                match content.tensors.get(name) {
+                    Some(tensor) => {
+                        let tensor = tensor.dequantize(device)?;
+                        println!("{tensor}")
+                    }
+                    None => println!("not found"),
+                }
+            }
+        }
+        Format::Gguf => {
+            let mut file = std::fs::File::open(file)?;
+            let content = gguf_file::Content::read(&mut file)?;
+            for name in names.iter() {
+                println!("==== {name} ====");
+                match content.tensor(&mut file, name, device) {
+                    Ok(tensor) => {
+                        let tensor = tensor.dequantize(device)?;
+                        println!("{tensor}")
+                    }
+                    Err(_) => println!("not found"),
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
 fn run_ls(
     file: &std::path::PathBuf,
     format: Option<Format>,
@@ -377,6 +494,13 @@ fn main() -> anyhow::Result<()> {
                 run_ls(file, format.clone(), verbose, &device)?
             }
         }
+        Command::Print {
+            file,
+            names,
+            format,
+            full,
+            line_width,
+        } => run_print(&file, names, format, full, line_width, &device)?,
         Command::Quantize {
             in_file,
             out_file,
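Since the subcommand above is a clap derive, a typical invocation follows directly from the field names; the file and tensor names below are placeholders:

```bash
$ cargo run --example tensor-tools --release -- print --full --line-width 132 model.safetensors some.tensor.name
```

The `--format` flag is only needed when the file extension does not identify the format.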
It's + // actually a no-op except when processing the initial prompt so has no significant + // impact on performance. + .contiguous()?; let q = self.apply_rotary_emb(&q, index_pos)?; let k = self.apply_rotary_emb(&k, index_pos)?; @@ -191,8 +197,8 @@ impl LayerWeights { if index_pos == 0 { (k, v) } else { - let k = Tensor::cat(&[k_cache, &k], 2)?.contiguous()?; - let v = Tensor::cat(&[v_cache, &v], 2)?.contiguous()?; + let k = Tensor::cat(&[k_cache, &k], 2)?; + let v = Tensor::cat(&[v_cache, &v], 2)?; (k, v) } } @@ -486,7 +492,7 @@ impl ModelWeights { layer_in = x } let x = self.norm.forward(&layer_in)?; - let x = x.i((.., seq_len - 1, ..))?.contiguous()?; + let x = x.i((.., seq_len - 1, ..))?; let _enter = self.span_output.enter(); self.output.forward(&x) } From 3144150b8d1b80b2c6b469dcab5b717598f0a458 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sat, 30 Mar 2024 15:49:37 +0100 Subject: [PATCH 086/131] Move the tensor-tools binary in a separate crate. (#1969) --- Cargo.toml | 1 + .../examples/quantized-t5/README.md | 2 +- tensor-tools/Cargo.toml | 16 +++++++ .../src/main.rs | 42 +++++++++---------- 4 files changed, 39 insertions(+), 22 deletions(-) create mode 100644 tensor-tools/Cargo.toml rename candle-core/examples/tensor-tools.rs => tensor-tools/src/main.rs (90%) diff --git a/Cargo.toml b/Cargo.toml index d71cc4bc..313c68f9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ members = [ "candle-transformers", "candle-wasm-examples/*", "candle-wasm-tests", + "tensor-tools", ] exclude = [ "candle-flash-attn", diff --git a/candle-examples/examples/quantized-t5/README.md b/candle-examples/examples/quantized-t5/README.md index 8b8179eb..c86e746d 100644 --- a/candle-examples/examples/quantized-t5/README.md +++ b/candle-examples/examples/quantized-t5/README.md @@ -17,7 +17,7 @@ generate quantized weight files from the original safetensors file by using the `tensor-tools` command line utility via: ```bash -$ cargo run --example tensor-tools --release -- quantize --quantization q6k PATH/TO/T5/model.safetensors /tmp/model.gguf +$ cargo run --bin tensor-tools --release -- quantize --quantization q6k PATH/TO/T5/model.safetensors /tmp/model.gguf ``` ## Using custom models diff --git a/tensor-tools/Cargo.toml b/tensor-tools/Cargo.toml new file mode 100644 index 00000000..eecd7e43 --- /dev/null +++ b/tensor-tools/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "tensor-tools" +version.workspace = true +edition.workspace = true +description.workspace = true +repository.workspace = true +keywords.workspace = true +categories.workspace = true +license.workspace = true + +[dependencies] +anyhow = { workspace = true } +candle = { workspace = true } +clap = { workspace = true } +rayon = { workspace = true } +safetensors = { workspace = true } diff --git a/candle-core/examples/tensor-tools.rs b/tensor-tools/src/main.rs similarity index 90% rename from candle-core/examples/tensor-tools.rs rename to tensor-tools/src/main.rs index 5dc49cd8..ad351171 100644 --- a/candle-core/examples/tensor-tools.rs +++ b/tensor-tools/src/main.rs @@ -1,5 +1,5 @@ -use candle_core::quantized::{gguf_file, GgmlDType, QTensor}; -use candle_core::{Device, Result}; +use candle::quantized::{gguf_file, GgmlDType, QTensor}; +use candle::{Device, Result}; use clap::{Parser, Subcommand, ValueEnum}; use rayon::prelude::*; @@ -177,10 +177,10 @@ fn run_print( device: &Device, ) -> Result<()> { if full { - candle_core::display::set_print_options_full(); + candle::display::set_print_options_full(); } if let Some(line_width) = 
line_width { - candle_core::display::set_line_width(line_width) + candle::display::set_line_width(line_width) } let format = match format { Some(format) => format, @@ -196,7 +196,7 @@ fn run_print( }; match format { Format::Npz => { - let tensors = candle_core::npy::NpzTensors::new(file)?; + let tensors = candle::npy::NpzTensors::new(file)?; for name in names.iter() { println!("==== {name} ===="); match tensors.get(name)? { @@ -206,8 +206,8 @@ fn run_print( } } Format::Safetensors => { - use candle_core::safetensors::Load; - let tensors = unsafe { candle_core::safetensors::MmapedSafetensors::new(file)? }; + use candle::safetensors::Load; + let tensors = unsafe { candle::safetensors::MmapedSafetensors::new(file)? }; let tensors: std::collections::HashMap<_, _> = tensors.tensors().into_iter().collect(); for name in names.iter() { println!("==== {name} ===="); @@ -221,7 +221,7 @@ fn run_print( } } Format::Pth => { - let pth_file = candle_core::pickle::PthTensors::new(file, None)?; + let pth_file = candle::pickle::PthTensors::new(file, None)?; for name in names.iter() { println!("==== {name} ===="); match pth_file.get(name)? { @@ -233,11 +233,11 @@ fn run_print( } } Format::Pickle => { - candle_core::bail!("pickle format is not supported for print") + candle::bail!("pickle format is not supported for print") } Format::Ggml => { let mut file = std::fs::File::open(file)?; - let content = candle_core::quantized::ggml_file::Content::read(&mut file, device)?; + let content = candle::quantized::ggml_file::Content::read(&mut file, device)?; for name in names.iter() { println!("==== {name} ===="); match content.tensors.get(name) { @@ -287,7 +287,7 @@ fn run_ls( }; match format { Format::Npz => { - let tensors = candle_core::npy::NpzTensors::new(file)?; + let tensors = candle::npy::NpzTensors::new(file)?; let mut names = tensors.names(); names.sort(); for name in names { @@ -299,12 +299,12 @@ fn run_ls( } } Format::Safetensors => { - let tensors = unsafe { candle_core::safetensors::MmapedSafetensors::new(file)? }; + let tensors = unsafe { candle::safetensors::MmapedSafetensors::new(file)? 
}; let mut tensors = tensors.tensors(); tensors.sort_by(|a, b| a.0.cmp(&b.0)); for (name, view) in tensors.iter() { let dtype = view.dtype(); - let dtype = match candle_core::DType::try_from(dtype) { + let dtype = match candle::DType::try_from(dtype) { Ok(dtype) => format!("{dtype:?}"), Err(_) => format!("{dtype:?}"), }; @@ -313,7 +313,7 @@ fn run_ls( } } Format::Pth => { - let mut tensors = candle_core::pickle::read_pth_tensor_info(file, verbose, None)?; + let mut tensors = candle::pickle::read_pth_tensor_info(file, verbose, None)?; tensors.sort_by(|a, b| a.name.cmp(&b.name)); for tensor_info in tensors.iter() { println!( @@ -330,7 +330,7 @@ fn run_ls( Format::Pickle => { let file = std::fs::File::open(file)?; let mut reader = std::io::BufReader::new(file); - let mut stack = candle_core::pickle::Stack::empty(); + let mut stack = candle::pickle::Stack::empty(); stack.read_loop(&mut reader)?; for (i, obj) in stack.stack().iter().enumerate() { println!("{i} {obj:?}"); @@ -338,7 +338,7 @@ fn run_ls( } Format::Ggml => { let mut file = std::fs::File::open(file)?; - let content = candle_core::quantized::ggml_file::Content::read(&mut file, device)?; + let content = candle::quantized::ggml_file::Content::read(&mut file, device)?; let mut tensors = content.tensors.into_iter().collect::>(); tensors.sort_by(|a, b| a.0.cmp(&b.0)); for (name, qtensor) in tensors.iter() { @@ -374,7 +374,7 @@ fn run_quantize_safetensors( let mut out_file = std::fs::File::create(out_file)?; let mut tensors = std::collections::HashMap::new(); for in_file in in_files.iter() { - let in_tensors = candle_core::safetensors::load(in_file, &Device::Cpu)?; + let in_tensors = candle::safetensors::load(in_file, &Device::Cpu)?; tensors.extend(in_tensors) } println!("tensors: {}", tensors.len()); @@ -416,7 +416,7 @@ fn run_dequantize( let tensor = tensor.dequantize(device)?; tensors.insert(tensor_name.to_string(), tensor); } - candle_core::safetensors::save(&tensors, out_file)?; + candle::safetensors::save(&tensors, out_file)?; Ok(()) } @@ -428,11 +428,11 @@ fn run_quantize( device: &Device, ) -> Result<()> { if in_files.is_empty() { - candle_core::bail!("no specified input files") + candle::bail!("no specified input files") } if let Some(extension) = out_file.extension() { if extension == "safetensors" { - candle_core::bail!("the generated file cannot use the safetensors extension") + candle::bail!("the generated file cannot use the safetensors extension") } } if let Some(extension) = in_files[0].extension() { @@ -442,7 +442,7 @@ fn run_quantize( } if in_files.len() != 1 { - candle_core::bail!("only a single in-file can be used when quantizing gguf files") + candle::bail!("only a single in-file can be used when quantizing gguf files") } // Open the out file early so as to fail directly on missing directories etc. 
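
A short illustration of the `Tensor::cat` fast path that PATCH 085 ("Remove some
unnecessary calls to contiguous") relies on: once every input is contiguous, the
concatenation goes through `cat_contiguous` for any target dim, so the kv-cache
no longer needs a trailing `.contiguous()`. A minimal sketch, assuming the crate
is imported as `candle` (the workspace alias for `candle-core`, as in the
`tensor-tools` Cargo.toml above):

```rust
use candle::{DType, Device, Result, Tensor};

fn main() -> Result<()> {
    let dev = Device::Cpu;
    // Llama-style kv-cache layout: (batch, n_kv_head, seq_len, head_dim).
    let k_cache = Tensor::zeros((1, 8, 16, 64), DType::F32, &dev)?;
    // A single decoding step appends one position along dim 2.
    let k_new = Tensor::zeros((1, 8, 1, 64), DType::F32, &dev)?;
    // Both inputs are contiguous, so the concatenation takes the contiguous
    // fast path and already yields a contiguous tensor.
    let k = Tensor::cat(&[&k_cache, &k_new], 2)?;
    assert!(k.is_contiguous());
    assert_eq!(k.dims(), &[1, 8, 17, 64]);
    Ok(())
}
```

Note the reordering in `tensor_cat.rs` above: the contiguity check now takes
precedence over the `dim == 0` special case, so fully contiguous inputs hit
`cat_contiguous` even when concatenating along the first dim.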
From 92f81d2fcb4a116fd30d5bd1b6fd46dc6c0e8463 Mon Sep 17 00:00:00 2001
From: Santiago Medina
Date: Sat, 30 Mar 2024 23:54:56 -0700
Subject: [PATCH 087/131] Add Moondream transformer implementation and example
 (#1970)

* moondream implementation

* add moondream example

* change config default activation

* Add assets and integrate phi mixformer with example

* Make use of kv cache and fix seq_len bug; Clean up example code

* Add README link to example

* Remove pos_embed scaling; Remove assets; Add to README; Expand VisionConfig

* Delete image

* Use apply instead of forward
---
 README.md                                    |   2 +
 candle-examples/examples/moondream/README.md |  26 ++
 candle-examples/examples/moondream/main.rs   | 245 +++++++++++++++
 candle-transformers/src/models/mixformer.rs  |  20 ++
 candle-transformers/src/models/mod.rs        |   1 +
 candle-transformers/src/models/moondream.rs  | 308 +++++++++++++++++++
 6 files changed, 602 insertions(+)
 create mode 100644 candle-examples/examples/moondream/README.md
 create mode 100644 candle-examples/examples/moondream/main.rs
 create mode 100644 candle-transformers/src/models/moondream.rs

diff --git a/README.md b/README.md
index 1208956c..0fdcedca 100644
--- a/README.md
+++ b/README.md
@@ -131,6 +131,8 @@ We also provide some command line based examples using state of the art models
   dedicated submodels for hand-writing and printed recognition.
 - [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation
   model, generates the translated text from the input text.
+- [Moondream](./candle-examples/examples/moondream/): tiny computer-vision model
+  that can answer real-world questions about images.
 
 Run them using commands like:
 ```
diff --git a/candle-examples/examples/moondream/README.md b/candle-examples/examples/moondream/README.md
new file mode 100644
index 00000000..e202de7c
--- /dev/null
+++ b/candle-examples/examples/moondream/README.md
@@ -0,0 +1,26 @@
+# candle-moondream
+
+[Moondream](https://github.com/vikhyat/moondream) is a computer-vision model that can answer real-world questions about images. It's tiny by today's standards, with only 1.6B parameters. That enables it to run on a variety of devices, including mobile phones and edge devices.
+
+## Running some examples
+First download an example image:
+```bash
+$ wget https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg
+```
+
+<img src="https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg" width="200">
+
+Now you can run Moondream from the `candle-examples` crate:
+```bash
+$ cargo run --example moondream --release -- --prompt "What is the girl eating?" \
--image "./demo-1.jpg" + +avavx: false, neon: true, simd128: false, f16c: false +temp: 0.00 repeat-penalty: 1.00 repeat-last-n: 64 +retrieved the files in 3.395583ms +Running on CPU, to run on GPU(metal), build this example with `--features metal` +loaded the model in 5.485493792s +loaded and encoded the image Tensor[dims 3, 378, 378; f32] in 4.801396417s +starting the inference loop + The girl is eating a hamburger.< +9 tokens generated (0.68 token/s) +``` \ No newline at end of file diff --git a/candle-examples/examples/moondream/main.rs b/candle-examples/examples/moondream/main.rs new file mode 100644 index 00000000..7ea6570f --- /dev/null +++ b/candle-examples/examples/moondream/main.rs @@ -0,0 +1,245 @@ +#[cfg(feature = "mkl")] +extern crate intel_mkl_src; + +#[cfg(feature = "accelerate")] +extern crate accelerate_src; + +use anyhow::{Error as E, Result}; +use clap::Parser; + +use candle::{DType, Device, Tensor}; +use candle_nn::VarBuilder; +use candle_transformers::{generation::LogitsProcessor, models::moondream}; +use tokenizers::Tokenizer; + +struct TextGeneration { + model: moondream::Model, + device: Device, + tokenizer: Tokenizer, + logits_processor: LogitsProcessor, + repeat_penalty: f32, + repeat_last_n: usize, + verbose_prompt: bool, +} + +impl TextGeneration { + #[allow(clippy::too_many_arguments)] + fn new( + model: moondream::Model, + tokenizer: Tokenizer, + seed: u64, + temp: Option, + top_p: Option, + repeat_penalty: f32, + repeat_last_n: usize, + verbose_prompt: bool, + device: &Device, + ) -> Self { + let logits_processor = LogitsProcessor::new(seed, temp, top_p); + Self { + model, + tokenizer, + logits_processor, + repeat_penalty, + repeat_last_n, + verbose_prompt, + device: device.clone(), + } + } + + fn run(&mut self, prompt: &str, image_embeds: &Tensor, sample_len: usize) -> Result<()> { + use std::io::Write; + println!("starting the inference loop"); + let tokens = self.tokenizer.encode(prompt, true).map_err(E::msg)?; + if tokens.is_empty() { + anyhow::bail!("Empty prompts are not supported in the Moondream model.") + } + if self.verbose_prompt { + for (token, id) in tokens.get_tokens().iter().zip(tokens.get_ids().iter()) { + let token = token.replace('▁', " ").replace("<0x0A>", "\n"); + println!("{id:7} -> '{token}'"); + } + } + + let mut tokens = tokens.get_ids().to_vec(); + let mut generated_tokens = 0usize; + + let eos_token = match self.tokenizer.get_vocab(true).get("END") { + Some(token) => *token, + None => anyhow::bail!("cannot find the EOS token"), + }; + + let start_gen = std::time::Instant::now(); + for index in 0..sample_len { + let context_size = if index > 0 { 1 } else { tokens.len() }; + let ctxt = &tokens[tokens.len().saturating_sub(context_size)..]; + let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?; + let logits = if index > 0 { + self.model.text_model.forward(&input)? + } else { + self.model + .text_model + .forward_with_img(&input, &image_embeds)? + }; + let logits = logits.squeeze(0)?.to_dtype(DType::F32)?; + let logits = if self.repeat_penalty == 1. { + logits + } else { + let start_at = tokens.len().saturating_sub(self.repeat_last_n); + candle_transformers::utils::apply_repeat_penalty( + &logits, + self.repeat_penalty, + &tokens[start_at..], + )? 
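+                // Note: apply_repeat_penalty only re-weights the logits of the
+                // tokens that occur in the trailing `repeat_last_n` window, so
+                // recent output is discouraged without touching the rest of the
+                // vocabulary.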
+ }; + let next_token = self.logits_processor.sample(&logits)?; + tokens.push(next_token); + generated_tokens += 1; + if next_token == eos_token { + break; + } + let token = self.tokenizer.decode(&[next_token], true).map_err(E::msg)?; + print!("{token}"); + std::io::stdout().flush()?; + } + + let dt = start_gen.elapsed(); + println!( + "\n{generated_tokens} tokens generated ({:.2} token/s)", + generated_tokens as f64 / dt.as_secs_f64() + ); + + Ok(()) + } +} + +#[derive(Parser)] +struct Args { + /// Run on CPU rather than on GPU. + #[arg(long)] + cpu: bool, + + /// Enable tracing (generates a trace-timestamp.json file). + #[arg(long)] + tracing: bool, + + /// Display the token for the specified prompt. + #[arg(long)] + verbose_prompt: bool, + + #[arg(long)] + prompt: String, + + #[arg(long)] + image: String, + + /// The temperature used to generate samples. + #[arg(long)] + temperature: Option, + + /// Nucleus sampling probability cutoff. + #[arg(long)] + top_p: Option, + + /// The seed to use when generating random samples. + #[arg(long, default_value_t = 299792458)] + seed: u64, + + #[arg(long, default_value_t = 5000)] + sample_len: usize, + + /// Penalty to be applied for repeating tokens, 1. means no penalty. + #[arg(long, default_value_t = 1.0)] + repeat_penalty: f32, + + /// The context size to consider for the repeat penalty. + #[arg(long, default_value_t = 64)] + repeat_last_n: usize, +} + +/// Loads an image from disk using the image crate, this returns a tensor with shape +/// (3, 378, 378). +pub fn load_image>(p: P) -> candle::Result { + let img = image::io::Reader::open(p)? + .decode() + .map_err(candle::Error::wrap)? + .resize_to_fill(378, 378, image::imageops::FilterType::Triangle); // Adjusted to 378x378 + let img = img.to_rgb8(); + let data = img.into_raw(); + let data = Tensor::from_vec(data, (378, 378, 3), &Device::Cpu)?.permute((2, 0, 1))?; + let mean = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?; + let std = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?; + (data.to_dtype(candle::DType::F32)? / 255.)? + .broadcast_sub(&mean)? + .broadcast_div(&std) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + use tracing_chrome::ChromeLayerBuilder; + use tracing_subscriber::prelude::*; + + let args = Args::parse(); + + let _guard = if args.tracing { + let (chrome_layer, guard) = ChromeLayerBuilder::new().build(); + tracing_subscriber::registry().with(chrome_layer).init(); + Some(guard) + } else { + None + }; + println!( + "avx: {}, neon: {}, simd128: {}, f16c: {}", + candle::utils::with_avx(), + candle::utils::with_neon(), + candle::utils::with_simd128(), + candle::utils::with_f16c() + ); + println!( + "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}", + args.temperature.unwrap_or(0.), + args.repeat_penalty, + args.repeat_last_n + ); + + let start = std::time::Instant::now(); + let api = hf_hub::api::tokio::Api::new()?; + let repo = api.model("vikhyatk/moondream2".to_string()); + let model_file = repo.get("model.safetensors").await?; + let tokenizer = repo.get("tokenizer.json").await?; + println!("retrieved the files in {:?}", start.elapsed()); + let tokenizer = Tokenizer::from_file(tokenizer).map_err(E::msg)?; + + let start = std::time::Instant::now(); + let device = candle_examples::device(args.cpu)?; + let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? 
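+        // Memory-mapping the safetensors file avoids copying the weights into
+        // RAM; the call is unsafe because the mapping is only sound as long as
+        // the file is not modified from outside while the model is alive.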
}; + let config = moondream::Config::v2(); + let model = moondream::Model::new(&config, vb)?; + println!("loaded the model in {:?}", start.elapsed()); + + let start = std::time::Instant::now(); + let image = load_image(args.image)?.to_device(&device)?; + let image_embeds = image.unsqueeze(0)?; + let image_embeds = image_embeds.apply(model.vision_encoder())?; + println!( + "loaded and encoded the image {image:?} in {:?}", + start.elapsed() + ); + + let prompt = format!("\n\nQuestion: {0}\n\nAnswer:", args.prompt); + + let mut pipeline = TextGeneration::new( + model, + tokenizer, + args.seed, + args.temperature, + args.top_p, + args.repeat_penalty, + args.repeat_last_n, + args.verbose_prompt, + &device, + ); + pipeline.run(&prompt, &image_embeds, args.sample_len)?; + + Ok(()) +} diff --git a/candle-transformers/src/models/mixformer.rs b/candle-transformers/src/models/mixformer.rs index f7eb0abe..edca8b9d 100644 --- a/candle-transformers/src/models/mixformer.rs +++ b/candle-transformers/src/models/mixformer.rs @@ -438,6 +438,26 @@ impl MixFormerSequentialForCausalLM { xs.narrow(1, seq_len - 1, 1)?.apply(&self.head)?.squeeze(1) } + pub fn forward_with_img(&mut self, xs: &Tensor, img_embeds: &Tensor) -> Result { + let _enter = self.span.enter(); + let xs = xs.apply(&self.embedding)?; + let mut xs = Tensor::cat(&[img_embeds.clone(), xs], 1)?; + let (_b_size, seq_len, _embds) = xs.dims3()?; + let mask = if seq_len <= 1 { + None + } else { + Some(get_mask(seq_len, xs.device())?) + }; + for block in self.blocks.iter_mut() { + xs = block.forward(&xs, mask.as_ref())? + } + let xs = xs + .narrow(1, seq_len - 1, 1)? + .apply(&self.head)? + .squeeze(1)?; + Ok(xs) + } + pub fn clear_kv_cache(&mut self) { self.blocks.iter_mut().for_each(|b| b.clear_kv_cache()) } diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs index 980ba535..ed0e0de7 100644 --- a/candle-transformers/src/models/mod.rs +++ b/candle-transformers/src/models/mod.rs @@ -24,6 +24,7 @@ pub mod mistral; pub mod mixformer; pub mod mixtral; pub mod mobileone; +pub mod moondream; pub mod mpt; pub mod persimmon; pub mod phi; diff --git a/candle-transformers/src/models/moondream.rs b/candle-transformers/src/models/moondream.rs new file mode 100644 index 00000000..1172bf71 --- /dev/null +++ b/candle-transformers/src/models/moondream.rs @@ -0,0 +1,308 @@ +use crate::models::mixformer::{Config as PhiConfig, MixFormerSequentialForCausalLM as PhiModel}; +use candle::{IndexOp, Result, Tensor, D}; +use candle_nn::{layer_norm, linear_b, Linear, Module, VarBuilder}; + +pub struct Config { + pub phi_config: PhiConfig, + pub vision_config: VisionConfig, +} + +impl Config { + pub fn v2() -> Self { + Self { + phi_config: PhiConfig::v1_5(), + vision_config: VisionConfig::v2(), + } + } +} + +fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result { + let dim = q.dim(D::Minus1)?; + let scale_factor = 1.0 / (dim as f64).sqrt(); + let k = k.transpose(D::Minus2, D::Minus1)?.contiguous()?; + let mut attn_weights = (q.contiguous()?.matmul(&k)? 
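+        // scale the raw attention scores by 1 / sqrt(head_dim) so that they do
+        // not grow with the head dimension before the softmax is applied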
* scale_factor)?; + attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?.contiguous()?; + let attn_weights = attn_weights.matmul(&v.contiguous()?)?; + Ok(attn_weights) +} + +#[derive(Debug, Clone, PartialEq, serde::Deserialize)] +pub struct VisionConfig { + image_embedding_dim: usize, + model_dim: usize, + hidden_dim: usize, + hidden_features: usize, + embed_len: usize, + embed_dim: usize, + num_blocks: usize, + num_heads: usize, + act: candle_nn::Activation, +} + +impl VisionConfig { + pub fn v2() -> Self { + Self { + image_embedding_dim: 1152, + model_dim: 2048, + hidden_dim: 2048 * 4, + hidden_features: 4304, + embed_len: 729, + embed_dim: 1152, + num_blocks: 27, + num_heads: 16, + act: candle_nn::Activation::Gelu, + } + } +} + +#[derive(Debug, Clone)] +struct LinearPatchEmbedding { + linear: Linear, +} + +impl LinearPatchEmbedding { + fn new(vb: VarBuilder) -> Result { + let linear = linear_b(588, 1152, true, vb.pp("linear"))?; + Ok(Self { linear }) + } +} + +impl Module for LinearPatchEmbedding { + fn forward(&self, xs: &Tensor) -> Result { + xs.apply(&self.linear) + } +} + +#[derive(Debug, Clone)] +struct Attention { + num_heads: usize, + head_dim: usize, + qkv: Linear, + proj: Linear, +} + +impl Attention { + pub fn new(vb: VarBuilder, dim: usize, num_heads: usize) -> Result { + let qkv = linear_b(dim, dim * 3, true, vb.pp("qkv"))?; + let proj = linear_b(dim, dim, true, vb.pp("proj"))?; + Ok(Self { + num_heads, + head_dim: dim / num_heads, + qkv, + proj, + }) + } +} + +impl Module for Attention { + fn forward(&self, xs: &Tensor) -> Result { + let (b, n, c) = xs.dims3()?; + let qkv = xs + .apply(&self.qkv)? + .reshape((b, n, 3, self.num_heads, self.head_dim))? + .permute((2, 0, 3, 1, 4))?; + let (q, k, v) = (qkv.i(0)?, qkv.i(1)?, qkv.i(2)?); + let attn_weights = scaled_dot_product_attention(&q, &k, &v)?; + let attn_weights = attn_weights.transpose(1, 2)?.reshape((b, n, c))?; + attn_weights.apply(&self.proj) + } +} + +#[derive(Debug, Clone)] +struct VitBlock { + attn: Attention, + mlp: Mlp, + norm1: candle_nn::LayerNorm, + norm2: candle_nn::LayerNorm, +} + +impl VitBlock { + fn new(vb: VarBuilder, dim: usize, num_heads: usize, cfg: &VisionConfig) -> Result { + let attn = Attention::new(vb.pp("attn"), dim, num_heads)?; + let mlp = Mlp::new(vb.pp("mlp"), dim, cfg.hidden_features, dim, cfg.act)?; + let norm1 = layer_norm(dim, 1e-5, vb.pp("norm1"))?; + let norm2 = layer_norm(dim, 1e-5, vb.pp("norm2"))?; + Ok(Self { + attn, + mlp, + norm1, + norm2, + }) + } +} + +impl Module for VitBlock { + fn forward(&self, xs: &Tensor) -> Result { + let ys = xs.apply(&self.norm1)?.apply(&self.attn)?; + let xs = (xs + &ys)?; + let ys = xs.apply(&self.norm2)?.apply(&self.mlp)?; + let xs = (&xs + &ys)?; + Ok(xs) + } +} + +#[derive(Debug, Clone)] +struct VisionTransformer { + patch_embed: LinearPatchEmbedding, + pos_embed: Tensor, + blocks: Vec, + norm: candle_nn::LayerNorm, +} + +impl VisionTransformer { + fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result { + let patch_embed = LinearPatchEmbedding::new(vb.pp("patch_embed"))?; + let pos_embed = vb.get((1, cfg.embed_len, cfg.embed_dim), "pos_embed")?; + let blocks = (0..cfg.num_blocks) + .map(|i| { + VitBlock::new( + vb.pp(&format!("blocks.{}", i)), + cfg.embed_dim, + cfg.num_heads, + cfg, + ) + }) + .collect::>()?; + let norm = layer_norm(cfg.embed_dim, 1e-5, vb.pp("norm"))?; + Ok(Self { + patch_embed, + pos_embed, + blocks, + norm, + }) + } +} + +impl Module for VisionTransformer { + fn forward(&self, xs: &Tensor) -> Result { + let mut xs = 
(&xs.apply(&self.patch_embed)? + &self.pos_embed)?; + for block in self.blocks.iter() { + xs = xs.apply(block)?; + } + xs.apply(&self.norm) + } +} + +#[derive(Debug, Clone)] +pub struct Encoder { + model: VisionTransformer, +} + +impl Encoder { + fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result { + let model = VisionTransformer::new(cfg, vb.pp("model.visual"))?; + Ok(Self { model }) + } +} + +impl Module for Encoder { + fn forward(&self, xs: &Tensor) -> Result { + xs.apply(&self.model) + } +} + +#[derive(Debug, Clone)] +struct Mlp { + fc1: Linear, + act: candle_nn::Activation, + fc2: Linear, +} + +impl Mlp { + fn new( + vb: VarBuilder, + in_features: usize, + hidden_features: usize, + out_features: usize, + act: candle_nn::Activation, + ) -> Result { + let fc1 = linear_b(in_features, hidden_features, true, vb.pp("fc1"))?; + let fc2 = linear_b(hidden_features, out_features, true, vb.pp("fc2"))?; + Ok(Self { fc1, act, fc2 }) + } +} + +impl Module for Mlp { + fn forward(&self, xs: &Tensor) -> Result { + xs.apply(&self.fc1)?.apply(&self.act)?.apply(&self.fc2) + } +} + +#[derive(Debug, Clone)] +struct VisionProjection { + mlp: Mlp, +} + +impl VisionProjection { + fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result { + let mlp = Mlp::new( + vb.pp("mlp"), + cfg.image_embedding_dim, + cfg.hidden_dim, + cfg.model_dim, + cfg.act, + )?; + Ok(Self { mlp }) + } +} + +impl Module for VisionProjection { + fn forward(&self, xs: &Tensor) -> Result { + xs.apply(&self.mlp) + } +} + +#[derive(Debug, Clone)] +pub struct VisionEncoder { + encoder: Encoder, + projection: VisionProjection, +} + +impl VisionEncoder { + pub fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result { + let encoder = Encoder::new(cfg, vb.pp("encoder"))?; + let projection = VisionProjection::new(cfg, vb.pp("projection"))?; + Ok(Self { + encoder, + projection, + }) + } +} + +impl Module for VisionEncoder { + fn forward(&self, xs: &Tensor) -> Result { + let (b, c, hp1, wp2) = xs.dims4()?; + let (p1, p2) = (14, 14); + let h = hp1 / p1; + let w = wp2 / p2; + let xs = xs + .reshape((b, c, h, p1, h, p2))? + .permute((0, 2, 4, 1, 3, 5))? + .reshape((b, h * w, c * p1 * p2))?; + xs.apply(&self.encoder)?.apply(&self.projection) + } +} + +pub struct Model { + pub text_model: PhiModel, + pub vision_encoder: VisionEncoder, +} + +impl Model { + pub fn new(config: &Config, vb: VarBuilder) -> Result { + let text_model = PhiModel::new_v2(&config.phi_config, vb.pp("text_model"))?; + let vision_encoder = VisionEncoder::new(&config.vision_config, vb.pp("vision_encoder"))?; + Ok(Self { + text_model, + vision_encoder, + }) + } + + pub fn vision_encoder(&self) -> &VisionEncoder { + &self.vision_encoder + } + + pub fn text_model(&mut self) -> &mut PhiModel { + &mut self.text_model + } +} From eead1dcead35556191e065a34053c5e27bb4d5c6 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 31 Mar 2024 08:57:40 +0200 Subject: [PATCH 088/131] Clippy fix. (#1972) --- candle-examples/examples/moondream/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/candle-examples/examples/moondream/main.rs b/candle-examples/examples/moondream/main.rs index 7ea6570f..2ec04256 100644 --- a/candle-examples/examples/moondream/main.rs +++ b/candle-examples/examples/moondream/main.rs @@ -79,7 +79,7 @@ impl TextGeneration { } else { self.model .text_model - .forward_with_img(&input, &image_embeds)? + .forward_with_img(&input, image_embeds)? }; let logits = logits.squeeze(0)?.to_dtype(DType::F32)?; let logits = if self.repeat_penalty == 1. 
{ From f9954b73bac9fed91a9a08d952adc1cfb836a568 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 31 Mar 2024 09:32:50 +0200 Subject: [PATCH 089/131] Add options to use local files + specify a custom repo or branch. (#1973) --- candle-examples/examples/moondream/main.rs | 28 ++++++++++++++++++--- candle-transformers/src/models/moondream.rs | 28 +++++++++++---------- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/candle-examples/examples/moondream/main.rs b/candle-examples/examples/moondream/main.rs index 2ec04256..3e0f6d57 100644 --- a/candle-examples/examples/moondream/main.rs +++ b/candle-examples/examples/moondream/main.rs @@ -155,6 +155,18 @@ struct Args { /// The context size to consider for the repeat penalty. #[arg(long, default_value_t = 64)] repeat_last_n: usize, + + #[arg(long, default_value = "vikhyatk/moondream2")] + model_id: String, + + #[arg(long, default_value = "main")] + revision: String, + + #[arg(long)] + model_file: Option, + + #[arg(long)] + tokenizer_file: Option, } /// Loads an image from disk using the image crate, this returns a tensor with shape @@ -204,9 +216,19 @@ async fn main() -> anyhow::Result<()> { let start = std::time::Instant::now(); let api = hf_hub::api::tokio::Api::new()?; - let repo = api.model("vikhyatk/moondream2".to_string()); - let model_file = repo.get("model.safetensors").await?; - let tokenizer = repo.get("tokenizer.json").await?; + let repo = api.repo(hf_hub::Repo::with_revision( + args.model_id, + hf_hub::RepoType::Model, + args.revision, + )); + let model_file = match args.model_file { + Some(m) => m.into(), + None => repo.get("model.safetensors").await?, + }; + let tokenizer = match args.tokenizer_file { + Some(m) => m.into(), + None => repo.get("tokenizer.json").await?, + }; println!("retrieved the files in {:?}", start.elapsed()); let tokenizer = Tokenizer::from_file(tokenizer).map_err(E::msg)?; diff --git a/candle-transformers/src/models/moondream.rs b/candle-transformers/src/models/moondream.rs index 1172bf71..c36052c6 100644 --- a/candle-transformers/src/models/moondream.rs +++ b/candle-transformers/src/models/moondream.rs @@ -19,11 +19,8 @@ impl Config { fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result { let dim = q.dim(D::Minus1)?; let scale_factor = 1.0 / (dim as f64).sqrt(); - let k = k.transpose(D::Minus2, D::Minus1)?.contiguous()?; - let mut attn_weights = (q.contiguous()?.matmul(&k)? * scale_factor)?; - attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?.contiguous()?; - let attn_weights = attn_weights.matmul(&v.contiguous()?)?; - Ok(attn_weights) + let attn_weights = (q.matmul(&k.t()?)? * scale_factor)?; + candle_nn::ops::softmax_last_dim(&attn_weights)?.matmul(v) } #[derive(Debug, Clone, PartialEq, serde::Deserialize)] @@ -101,10 +98,15 @@ impl Module for Attention { .apply(&self.qkv)? .reshape((b, n, 3, self.num_heads, self.head_dim))? .permute((2, 0, 3, 1, 4))?; - let (q, k, v) = (qkv.i(0)?, qkv.i(1)?, qkv.i(2)?); - let attn_weights = scaled_dot_product_attention(&q, &k, &v)?; - let attn_weights = attn_weights.transpose(1, 2)?.reshape((b, n, c))?; - attn_weights.apply(&self.proj) + let (q, k, v) = ( + qkv.i(0)?.contiguous()?, + qkv.i(1)?.contiguous()?, + qkv.i(2)?.contiguous()?, + ); + scaled_dot_product_attention(&q, &k, &v)? + .transpose(1, 2)? + .reshape((b, n, c))? + .apply(&self.proj) } } @@ -275,11 +277,11 @@ impl Module for VisionEncoder { let (p1, p2) = (14, 14); let h = hp1 / p1; let w = wp2 / p2; - let xs = xs - .reshape((b, c, h, p1, h, p2))? 
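+        // cut the (b, c, H, W) image into p1 x p2 patches and flatten each
+        // patch into a row, giving a (b, h * w, c * p1 * p2) patch sequence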
+ xs.reshape((b, c, h, p1, h, p2))? .permute((0, 2, 4, 1, 3, 5))? - .reshape((b, h * w, c * p1 * p2))?; - xs.apply(&self.encoder)?.apply(&self.projection) + .reshape((b, h * w, c * p1 * p2))? + .apply(&self.encoder)? + .apply(&self.projection) } } From cd29c7ccd420a840d883361c290ee92d06b9b96c Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 1 Apr 2024 00:15:48 +0200 Subject: [PATCH 090/131] More ggml cuda kernels (#1977) * Add more cuda kernels for quantized matmul. * Add the vec-dot bits. * Expose the quantized matmul-vec kernels. * Also include the quantize-q8-1 kernel. * Glue code for the q8-1 quantization. * mm-vec product via q8-1 quantization. * Add a test. * Add a mm test. * Get the test to return some sensible results. * Also test dmmv. * Fix the launch params. * Allow for tweaking the force_dmmv parameter while it's experimental. --- candle-core/src/quantized/cuda.rs | 154 ++- candle-examples/examples/quantized/main.rs | 8 + candle-kernels/src/quantized.cu | 1089 ++++++++++++++++++-- 3 files changed, 1169 insertions(+), 82 deletions(-) diff --git a/candle-core/src/quantized/cuda.rs b/candle-core/src/quantized/cuda.rs index c90cf576..a8f0d622 100644 --- a/candle-core/src/quantized/cuda.rs +++ b/candle-core/src/quantized/cuda.rs @@ -2,7 +2,7 @@ use super::{GgmlDType, QStorage}; use crate::{backend::BackendDevice, cuda_backend::WrapErr}; use crate::{CudaDevice, CudaStorage, Result}; -use cudarc::driver::{CudaSlice, DeviceSlice}; +use cudarc::driver::{CudaSlice, CudaView, DeviceSlice}; pub struct QCudaStorage { data: CudaSlice, @@ -10,13 +10,43 @@ pub struct QCudaStorage { device: CudaDevice, } +static FORCE_DMMV: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(true); + +pub fn set_force_dmmv(f: bool) { + FORCE_DMMV.store(f, std::sync::atomic::Ordering::Relaxed) +} + pub const WARP_SIZE: usize = 32; pub const MMQ_X_Q4_0_AMPERE: usize = 4; pub const MMQ_Y_Q4_0_AMPERE: usize = 32; pub const NWARPS_Q4_0_AMPERE: usize = 4; pub const GGML_CUDA_MMV_X: usize = 32; pub const GGML_CUDA_MMV_Y: usize = 1; +pub const CUDA_QUANTIZE_BLOCK_SIZE: usize = 256; pub const CUDA_DEQUANTIZE_BLOCK_SIZE: usize = 256; +pub const MATRIX_ROW_PADDING: usize = 512; + +fn quantize_q8_1( + src: &CudaView, + dst: &mut CudaSlice, + elem_count: usize, + dev: &CudaDevice, +) -> Result<()> { + use cudarc::driver::LaunchAsync; + + let kx = elem_count; + let kx_padded = (kx + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING; + let num_blocks = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + let func = dev.get_or_load_func("quantize_q8_1", candle_kernels::QUANTIZED)?; + let cfg = cudarc::driver::LaunchConfig { + grid_dim: (num_blocks as u32, 1, 1), + block_dim: (CUDA_QUANTIZE_BLOCK_SIZE as u32, 1, 1), + shared_mem_bytes: 0, + }; + let params = (src, dst, kx as i32, kx_padded as i32); + unsafe { func.launch(cfg, params) }.w()?; + Ok(()) +} fn dequantize( data: &CudaSlice, @@ -60,7 +90,7 @@ fn dequantize( _ => crate::bail!("unsupported dtype for dequantize {dtype:?}"), }; let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?; - let dst = dev.alloc_zeros::(elem_count).w()?; + let dst = unsafe { dev.alloc::(elem_count).w()? }; // See e.g. 
// https://github.com/ggerganov/llama.cpp/blob/cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b/ggml-cuda.cu#L7270 let cfg = cudarc::driver::LaunchConfig { @@ -83,9 +113,9 @@ fn dequantize( Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone())) } -fn dequantize_mut_mal_vec( +fn dequantize_mul_mat_vec( data: &CudaSlice, - y: &cudarc::driver::CudaView, + y: &CudaView, dtype: GgmlDType, ncols: usize, nrows: usize, @@ -107,7 +137,7 @@ fn dequantize_mut_mal_vec( _ => crate::bail!("unsupported dtype for quantized matmul {dtype:?}"), }; let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?; - let dst = dev.alloc_zeros::(nrows).w()?; + let dst = unsafe { dev.alloc::(nrows).w()? }; let block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; let cfg = cudarc::driver::LaunchConfig { grid_dim: (block_num_y as u32, 1, 1), @@ -120,6 +150,56 @@ fn dequantize_mut_mal_vec( Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone())) } +fn mul_mat_vec_via_q8_1( + data: &CudaSlice, + y: &CudaView, + dtype: GgmlDType, + ncols: usize, + nrows: usize, + dev: &CudaDevice, +) -> Result { + use cudarc::driver::LaunchAsync; + + // Start by quantizing y + let ncols_padded = (ncols + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING; + let y_size_in_bytes = ncols_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size(); + let mut y_q8_1 = unsafe { dev.alloc::(y_size_in_bytes).w()? }; + quantize_q8_1(y, &mut y_q8_1, ncols, dev)?; + + let kernel_name = match dtype { + GgmlDType::Q4_0 => "mul_mat_vec_q4_0_q8_1_cuda", + GgmlDType::Q4_1 => "mul_mat_vec_q4_1_q8_1_cuda", + GgmlDType::Q5_0 => "mul_mat_vec_q5_0_q8_1_cuda", + GgmlDType::Q5_1 => "mul_mat_vec_q5_1_q8_1_cuda", + GgmlDType::Q8_0 => "mul_mat_vec_q8_0_q8_1_cuda", + GgmlDType::Q2K => "mul_mat_vec_q2_K_q8_1_cuda", + GgmlDType::Q3K => "mul_mat_vec_q3_K_q8_1_cuda", + GgmlDType::Q4K => "mul_mat_vec_q4_K_q8_1_cuda", + GgmlDType::Q5K => "mul_mat_vec_q5_K_q8_1_cuda", + GgmlDType::Q6K => "mul_mat_vec_q6_K_q8_1_cuda", + _ => crate::bail!("unsupported dtype for quantized matmul {dtype:?}"), + }; + let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?; + let dst = unsafe { dev.alloc::(nrows).w()? }; + let cfg = cudarc::driver::LaunchConfig { + grid_dim: (nrows as u32, 1, 1), + block_dim: (WARP_SIZE as u32, 4, 1), + shared_mem_bytes: 0, + }; + + let params = ( + data, + &y_q8_1, + &dst, + /* ncols_x */ ncols as i32, + /* nrows_x */ nrows as i32, + /* nrows_y */ ncols as i32, + /* nrows_dst */ nrows as i32, + ); + unsafe { func.launch(cfg, params) }.w()?; + Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone())) +} + impl QCudaStorage { pub fn zeros(device: &CudaDevice, el_count: usize, dtype: GgmlDType) -> Result { let size_in_bytes = el_count * dtype.type_size() / dtype.block_size(); @@ -285,8 +365,11 @@ impl QCudaStorage { crate::bail!("mismatch on matmul dim {self_shape:?} {:?}", rhs_l.shape()) } - let out = - dequantize_mut_mal_vec(&self.data, &rhs, self.dtype, ncols, nrows, self.device())?; + let out = if FORCE_DMMV.load(std::sync::atomic::Ordering::Relaxed) { + dequantize_mul_mat_vec(&self.data, &rhs, self.dtype, ncols, nrows, self.device())? + } else { + mul_mat_vec_via_q8_1(&self.data, &rhs, self.dtype, ncols, nrows, self.device())? 
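+            // the q8_1 path quantizes the f32 rhs once (quantize_q8_1, padded
+            // to MATRIX_ROW_PADDING) and then runs the integer dot-product
+            // kernels, whereas the dmmv path dequantizes lhs blocks on the fly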
+ }; let out_shape = if with_batch { vec![1, 1, nrows] } else { @@ -341,3 +424,60 @@ pub fn load_quantized( dtype: T::DTYPE, })) } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn cuda_quantize_q8_1() -> Result<()> { + let dev = CudaDevice::new(0)?; + let el = 256; + let el_padded = (el + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING; + let y_size_in_bytes = + el_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size(); + let mut y_q8_1 = unsafe { dev.alloc::(y_size_in_bytes).w()? }; + let vs: Vec = (0..el).map(|v| v as f32).collect(); + let y = dev.htod_sync_copy(&vs).w()?; + quantize_q8_1(&y.slice(..), &mut y_q8_1, el, &dev)?; + Ok(()) + } + + #[test] + fn cuda_mmv_q8_1() -> Result<()> { + let dev = CudaDevice::new(0)?; + let ncols = 256; + let vs: Vec = (0..ncols).map(|v| v as f32).collect(); + let y = dev.htod_sync_copy(&vs).w()?; + let mut xs = QCudaStorage::zeros(&dev, ncols, GgmlDType::Q4_0)?; + xs.quantize(&CudaStorage::wrap_cuda_slice(y.clone(), dev.clone()))?; + let cuda_storage = mul_mat_vec_via_q8_1( + &xs.data, + &y.slice(..), + /* dtype */ GgmlDType::Q4_0, + /* ncols */ ncols, + /* nrows */ 1, + &dev, + )?; + let vs = cuda_storage.as_cuda_slice::()?; + let vs = dev.dtoh_sync_copy(&vs.slice(..)).unwrap(); + assert_eq!(vs.len(), 1); + // for n = 255, n.(n+1).(2n+1) / 6 = 5559680 + // Q8 means 1/256 precision. + assert_eq!(vs[0], 5561664.5); + + let cuda_storage = dequantize_mul_mat_vec( + &xs.data, + &y.slice(..), + /* dtype */ GgmlDType::Q4_0, + /* ncols */ ncols, + /* nrows */ 1, + &dev, + )?; + let vs = cuda_storage.as_cuda_slice::()?; + let vs = dev.dtoh_sync_copy(&vs.slice(..)).unwrap(); + assert_eq!(vs.len(), 1); + assert_eq!(vs[0], 5561851.0); + Ok(()) + } +} diff --git a/candle-examples/examples/quantized/main.rs b/candle-examples/examples/quantized/main.rs index 96344a49..3cabc3a4 100644 --- a/candle-examples/examples/quantized/main.rs +++ b/candle-examples/examples/quantized/main.rs @@ -235,6 +235,10 @@ struct Args { /// Group-Query Attention, use 8 for the 70B version of LLaMAv2. #[arg(long)] gqa: Option, + + /// Use the (experimental) fast cuda kernels. + #[arg(long)] + fast_cuda: bool, } impl Args { @@ -341,6 +345,10 @@ fn main() -> anyhow::Result<()> { use tracing_subscriber::prelude::*; let args = Args::parse(); + + #[cfg(feature = "cuda")] + candle::quantized::cuda::set_force_dmmv(!args.fast_cuda); + let temperature = if args.temperature == 0. 
{ None } else { diff --git a/candle-kernels/src/quantized.cu b/candle-kernels/src/quantized.cu index f8becbbc..f91dbb32 100644 --- a/candle-kernels/src/quantized.cu +++ b/candle-kernels/src/quantized.cu @@ -23,6 +23,22 @@ typedef float dfloat; // dequantize float typedef float2 dfloat2; typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); +static __device__ __forceinline__ float warp_reduce_sum(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, mask, 32); + } + return x; +} + +static __device__ __forceinline__ float warp_reduce_max(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + } + return x; +} + static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) { const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment @@ -233,57 +249,6 @@ typedef struct { static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding"); -// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called -// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q - -#define VDR_Q4_0_Q8_1_MMVQ 2 -#define VDR_Q4_0_Q8_1_MMQ 4 - -template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( - const int * v, const int * u, const float & d4, const half2 & ds8) { - - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; - const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; - - // SIMD dot product of quantized values - sumi = __dp4a(vi0, u[2*i+0], sumi); - sumi = __dp4a(vi1, u[2*i+1], sumi); - } - - const float2 ds8f = __half22float2(ds8); - - // second part effectively subtracts 8 from each quant value - const float res = d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); - printf("%f %f %f %f %f %f\n", res, d4, sumi, ds8f.x, vdr/QI4_0, ds8f.y); - return res; -} - - -static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; (void)x_sc; - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const float * x_dmf = (const float *) x_dm; - - int u[2*VDR_Q4_0_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; - } - - return vec_dot_q4_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - template static __device__ __forceinline__ void mul_mat_q( @@ -447,30 +412,6 @@ template static __device__ __forceinline__ void allocate_tiles_q4_0( *x_dm = (half2 *) tile_x_d; } -extern "C" __global__ void mul_mat_q4_0_check( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - const int mmq_x = MMQ_X_Q4_0_AMPERE; - const int mmq_y = MMQ_Y_Q4_0_AMPERE; - const int nwarps = NWARPS_Q4_0_AMPERE; - - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, 
dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - -extern "C" __global__ void mul_mat_q4_0_no_check( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - const int mmq_x = MMQ_X_Q4_0_AMPERE; - const int mmq_y = MMQ_Y_Q4_0_AMPERE; - const int nwarps = NWARPS_Q4_0_AMPERE; - - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -} - static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q4_0 * x = (const block_q4_0 *) vx; @@ -1595,3 +1536,1001 @@ extern "C" __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ dst[row] = tmp; } } + +// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called +// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q + +#define VDR_Q4_0_Q8_1_MMVQ 2 +#define VDR_Q4_0_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( + const int * v, const int * u, const float & d4, const half2 & ds8) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 8 from each quant value + return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); +} + +#define VDR_Q4_1_Q8_1_MMVQ 2 +#define VDR_Q4_1_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( + const int * v, const int * u, const half2 & dm4, const half2 & ds8) { + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm4, ds8)); + const float d4d8 = tmp.x; + const float m4s8 = tmp.y; +#else + const float2 dm4f = __half22float2(dm4); + const float2 ds8f = __half22float2(ds8); + const float d4d8 = dm4f.x * ds8f.x; + const float m4s8 = dm4f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it + return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); +} + +#define VDR_Q5_0_Q8_1_MMVQ 2 +#define VDR_Q5_0_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( + const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 
0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 16 from each quant value + return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); +} + +#define VDR_Q5_1_Q8_1_MMVQ 2 +#define VDR_Q5_1_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( + const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm5, ds8)); + const float d5d8 = tmp.x; + const float m5s8 = tmp.y; +#else + const float2 dm5f = __half22float2(dm5); + const float2 ds8f = __half22float2(ds8); + const float d5d8 = dm5f.x * ds8f.x; + const float m5s8 = dm5f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it + return sumi*d5d8 + m5s8 / (QI5_1 / vdr); +} + +#define VDR_Q8_0_Q8_1_MMVQ 2 +#define VDR_Q8_0_Q8_1_MMQ 8 + +template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( + const int * v, const int * u, const float & d8_0, const float & d8_1) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + + return d8_0*d8_1 * sumi; +} + +template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl( + const int * v, const int * u, const half2 & dm8, const half2 & ds8) { + + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm8, ds8)); + const float d8d8 = tmp.x; + const float m8s8 = tmp.y; +#else + const float2 dm8f = __half22float2(dm8); + const float2 ds8f = __half22float2(ds8); + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it + return sumi*d8d8 + m8s8 / (QI8_1 / vdr); +} + +#define VDR_Q2_K_Q8_1_MMVQ 1 +#define VDR_Q2_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( + const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float * __restrict__ d8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR2_K; ++i) { + const int sc = scales[2*i]; + + const int vi = (v >> (2*i)) & 0x03030303; + + sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + sumf_m += 
d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values + } + + const float2 dm2f = __half22float2(dm2); + + return dm2f.x*sumf_d - dm2f.y*sumf_m; +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float & d8) { + + int sumi_d = 0; + int sumi_m = 0; + +#pragma unroll + for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { + int sumi_d_sc = 0; + + const int sc = scales[i0 / (QI8_1/2)]; + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + +#pragma unroll + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product + sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m + } + + sumi_d += sumi_d_sc * (sc & 0xF); + } + + const float2 dm2f = __half22float2(dm2); + + return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); +} + +#define VDR_Q3_K_Q8_1_MMVQ 1 +#define VDR_Q3_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const int & scale_offset, const float & d3, const float * __restrict__ d8) { + + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = __vsubss4(vil, vih); + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d3 * sumf; +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d3, const float & d8) { + + int sumi = 0; + +#pragma unroll + for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { + int sumi_sc = 0; + + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product + } + + sumi += sumi_sc * scales[i0 / (QI8_1/2)]; + } + + return d3*d8 * sumi; +} + +#define VDR_Q4_K_Q8_1_MMVQ 2 +#define VDR_Q4_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K; ++i) { + const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; + const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; + + const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; +} + +// 
contiguous u/y values +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; +} + +#define VDR_Q5_K_Q8_1_MMVQ 2 +#define VDR_Q5_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( + const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; + const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; + + const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; + const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; + + const int v0i = vl0i | vh0i; + const int v1i = vl1i | vh1i; + + const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); + + } + + const float2 dm5f = __half22float2(dm5); + + return dm5f.x*sumf_d - dm5f.y*sumf_m; +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; +} + +#define VDR_Q6_K_Q8_1_MMVQ 1 +#define VDR_Q6_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d, const float * __restrict__ d8) { + + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4*i]; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4*i)) << 4) & 0x30303030; + + const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d*sumf; +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, 
const int8_t * __restrict__ sc, + const float & d6, const float * __restrict__ d8) { + + float sumf_d = 0.0f; + +#pragma unroll + for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { + int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale + +#pragma unroll + for (int i = i0; i < i0 + 2; ++i) { + sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product + sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product + + sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product + sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product + } + + sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y); + } + + return d6 * sumf_d; +} + +static __device__ __forceinline__ float vec_dot_q4_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + + int v[VDR_Q4_0_Q8_1_MMVQ]; + int u[2*VDR_Q4_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); + } + + return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); +} + + +static __device__ __forceinline__ float vec_dot_q4_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; + + int v[VDR_Q4_1_Q8_1_MMVQ]; + int u[2*VDR_Q4_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); + } + + return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; + + int vl[VDR_Q5_0_Q8_1_MMVQ]; + int vh[VDR_Q5_0_Q8_1_MMVQ]; + int u[2*VDR_Q5_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); + vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); + } + + return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); +} + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; + + int vl[VDR_Q5_1_Q8_1_MMVQ]; + int vh[VDR_Q5_1_Q8_1_MMVQ]; + int u[2*VDR_Q5_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); + vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); + } + + return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); +} + +static __device__ __forceinline__ float vec_dot_q8_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; + + int v[VDR_Q8_0_Q8_1_MMVQ]; + int u[VDR_Q8_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i 
= 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_int8(bq8_0->qs, iqs + i); + u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + } + + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const uint8_t * scales = bq2_K->scales + scale_offset; + + const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); + int u[QR2_K]; + float d8[QR2_K]; + +#pragma unroll + for (int i = 0; i < QR2_K; ++ i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = __low2float(bq8_1[bq8_offset + i].ds); + } + + return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const float d = bq3_K->d; + + const int vl = get_int_from_uint8(bq3_K->qs, iqs); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = __low2float(bq8_1[bq8_offset + i].ds); + } + + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + +#ifndef GGML_QKK_64 + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + int v[2]; + int u[2*QR4_K]; + float d8[QR4_K]; + + // iqs is in 0,2..30. 
bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); + + // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 + // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 + // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 + // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 + + const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + v[0] = q4[0]; + v[1] = q4[4]; + + const uint16_t * scales = (const uint16_t *)bq4_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + + for (int i = 0; i < QR4_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = __low2float(bq8i->ds); + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); + +#else + + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + const uint16_t * a = (const uint16_t *)bq4_K->scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const float dall = bq4_K->dm[0]; + const float dmin = bq4_K->dm[1]; + + const float d8_1 = __low2float(bq8_1[0].ds); + const float d8_2 = __low2float(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * q4 = (const int *)bq4_K->qs + (iqs/2); + const int v1 = q4[0]; + const int v2 = q4[4]; + + const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0)); + const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); + const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); + const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0)); + + sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); + sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); + + return dall * sumf_d - dmin * sumf_m; +#endif +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + +#ifndef GGML_QKK_64 + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + int vl[2]; + int vh[2]; + int u[2*QR5_K]; + float d8[QR5_K]; + + const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); + const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); + + vl[0] = ql[0]; + vl[1] = ql[4]; + + vh[0] = qh[0] >> bq8_offset; + vh[1] = qh[4] >> bq8_offset; + + const uint16_t * scales = (const uint16_t *)bq5_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; 
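+    // At this point sc holds the two decoded 6-bit block scales and m the two
+    // matching mins, unpacked from the packed q5_K scale bytes above.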
+ +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = __low2float(bq8i->ds); + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); + +#else + + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int8_t * s = bq5_K->scales; + + const float d = bq5_K->d; + + const float d8_1 = __low2half(bq8_1[0].ds); + const float d8_2 = __low2half(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * ql = (const int *)bq5_K->qs + (iqs/2); + const int vl1 = ql[0]; + const int vl2 = ql[4]; + + const int step = 4 * (iqs/2); // 0, 4, 8, 12 + const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 + const int in = step%8; // 0, 4, 0, 4 + const int vh = (*((const int *)(bq5_K->qh + in))) >> im; + + const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); + const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); + const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); + const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); + + const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1]) + + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]); + + return d * sumf_d; +#endif +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + const int vl = get_int_from_uint8(bq6_K->ql, iqs); + const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; + + const int8_t * scales = bq6_K->scales + scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); + d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds); + } + + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); +} + +// https://github.com/ggerganov/llama.cpp/blob/c50a82ce0f71558cbb8e555146ba124251504b38/ggml-cuda/mmvq.cu#L4 +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); + +template +static __device__ void mul_mat_vec_q( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) + constexpr int nwarps = 1; + constexpr int rows_per_cuda_block = 1; +#else + constexpr int nwarps = ncols_y <= 4 ? 4 : 2; + constexpr int rows_per_cuda_block = ncols_y == 1 ? 
1 : 2; +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) + + const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; + const int row0 = rows_per_cuda_block*blockIdx.x; + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi; + +// partial sum for each thread + float tmp[ncols_y][rows_per_cuda_block] = {0.0f}; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { + const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx + + // x block quant index when casting the quants to int + const int kqs = vdr * (tid % (qi/vdr)); + +#pragma unroll + for (int j = 0; j < ncols_y; ++j) { +#pragma unroll + for (int i = 0; i < rows_per_cuda_block; ++i) { + tmp[j][i] += vec_dot_q_cuda( + &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs); + } + } + } + + __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE]; + if (threadIdx.y > 0) { +#pragma unroll + for (int j = 0; j < ncols_y; ++j) { +#pragma unroll + for (int i = 0; i < rows_per_cuda_block; ++i) { + tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i]; + } + } + } + __syncthreads(); + if (threadIdx.y > 0) { + return; + } + + // sum up partial sums and write back result +#pragma unroll + for (int j = 0; j < ncols_y; ++j) { +#pragma unroll + for (int i = 0; i < rows_per_cuda_block; ++i) { +#pragma unroll + for (int l = 0; l < nwarps-1; ++l) { + tmp[j][i] += tmp_shared[l][j][i][threadIdx.x]; + } + tmp[j][i] = warp_reduce_sum(tmp[j][i]); + } + + if (threadIdx.x < rows_per_cuda_block) { + dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x]; + } + } +} + +extern "C" __global__ void mul_mat_vec_q4_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + + mul_mat_vec_q<1, QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1> + (vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); +} + +extern "C" __global__ void mul_mat_vec_q4_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + + mul_mat_vec_q<1, QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1> + (vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); +} + +extern "C" __global__ void mul_mat_vec_q5_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + + mul_mat_vec_q<1, QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1> + (vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); +} + +extern "C" __global__ void mul_mat_vec_q5_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + + mul_mat_vec_q<1, QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1> + (vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); +} + +extern "C" __global__ void mul_mat_vec_q8_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + + mul_mat_vec_q<1, QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1> + (vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); +} + 
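+// The k-quant wrappers below instantiate the same mul_mat_vec_q template as the
+// 4/5/8-bit wrappers above; only the block type, block constants and the
+// dot-product callback change.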
+extern "C" __global__ void mul_mat_vec_q2_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + + mul_mat_vec_q<1, QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1> + (vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); +} + +extern "C" __global__ void mul_mat_vec_q3_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + + mul_mat_vec_q<1, QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1> + (vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); +} + +extern "C" __global__ void mul_mat_vec_q4_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + + mul_mat_vec_q<1, QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1> + (vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); +} + +extern "C" __global__ void mul_mat_vec_q5_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + + mul_mat_vec_q<1, QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1> + (vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); +} + +extern "C" __global__ void mul_mat_vec_q6_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + + mul_mat_vec_q<1, QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1> + (vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); +} + +extern "C" __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) { + const int ix = blockDim.x*blockIdx.x + threadIdx.x; + + if (ix >= kx_padded) { + return; + } + + const int iy = blockDim.y*blockIdx.y + threadIdx.y; + + const int i_padded = iy*kx_padded + ix; + + block_q8_1 * y = (block_q8_1 *) vy; + + const int ib = i_padded / QK8_1; // block index + const int iqs = i_padded % QK8_1; // quant index + + const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; + float amax = fabsf(xi); + float sum = xi; + + amax = warp_reduce_max(amax); + sum = warp_reduce_sum(sum); + + const float d = amax / 127; + const int8_t q = amax == 0.0f ? 0 : roundf(xi / d); + + y[ib].qs[iqs] = q; + + if (iqs > 0) { + return; + } + + reinterpret_cast(y[ib].ds.x) = d; + reinterpret_cast(y[ib].ds.y) = sum; +} From c7557b65dcccbb45e53695db71e8d7c1bfd38dc2 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 1 Apr 2024 10:00:11 +0200 Subject: [PATCH 091/131] Switch the default to using the faster kernels. (#1978) * Switch the default to using the faster kernels. * Add the force-dmmv flag. 
--- candle-core/src/quantized/cuda.rs | 2 +- candle-examples/examples/mistral/main.rs | 7 +++++++ candle-examples/examples/quantized/main.rs | 6 +++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/candle-core/src/quantized/cuda.rs b/candle-core/src/quantized/cuda.rs index a8f0d622..64404beb 100644 --- a/candle-core/src/quantized/cuda.rs +++ b/candle-core/src/quantized/cuda.rs @@ -10,7 +10,7 @@ pub struct QCudaStorage { device: CudaDevice, } -static FORCE_DMMV: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(true); +static FORCE_DMMV: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); pub fn set_force_dmmv(f: bool) { FORCE_DMMV.store(f, std::sync::atomic::Ordering::Relaxed) diff --git a/candle-examples/examples/mistral/main.rs b/candle-examples/examples/mistral/main.rs index a972279c..c00af3fe 100644 --- a/candle-examples/examples/mistral/main.rs +++ b/candle-examples/examples/mistral/main.rs @@ -196,6 +196,10 @@ struct Args { /// The context size to consider for the repeat penalty. #[arg(long, default_value_t = 64)] repeat_last_n: usize, + + /// Use the slower dmmv cuda kernel. + #[arg(long)] + force_dmmv: bool, } fn main() -> Result<()> { @@ -203,6 +207,9 @@ fn main() -> Result<()> { use tracing_subscriber::prelude::*; let args = Args::parse(); + #[cfg(feature = "cuda")] + candle::quantized::cuda::set_force_dmmv(args.force_dmmv); + let _guard = if args.tracing { let (chrome_layer, guard) = ChromeLayerBuilder::new().build(); tracing_subscriber::registry().with(chrome_layer).init(); diff --git a/candle-examples/examples/quantized/main.rs b/candle-examples/examples/quantized/main.rs index 3cabc3a4..b03768ed 100644 --- a/candle-examples/examples/quantized/main.rs +++ b/candle-examples/examples/quantized/main.rs @@ -236,9 +236,9 @@ struct Args { #[arg(long)] gqa: Option, - /// Use the (experimental) fast cuda kernels. + /// Use the slower dmmv cuda kernel. #[arg(long)] - fast_cuda: bool, + force_dmmv: bool, } impl Args { @@ -347,7 +347,7 @@ fn main() -> anyhow::Result<()> { let args = Args::parse(); #[cfg(feature = "cuda")] - candle::quantized::cuda::set_force_dmmv(!args.fast_cuda); + candle::quantized::cuda::set_force_dmmv(args.force_dmmv); let temperature = if args.temperature == 0. { None From 318cb82f1657a7ebd238e94bce134203c9fd04b4 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 1 Apr 2024 11:06:42 +0200 Subject: [PATCH 092/131] Quantized cuda tweaks. (#1981) * Quantized cuda tweaks. * Add some safety checks. * Factorize the dequantization bits. 
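As a quick illustration of the padding helpers factored out below (the two
function bodies are lifted from the diff; the sample values are made up):

    fn ceil_div(p: usize, q: usize) -> usize { (p + q - 1) / q }
    fn pad(p: usize, q: usize) -> usize { ceil_div(p, q) * q }

    // pad(300, 512) == 512: rows are padded up to MATRIX_ROW_PADDING.
    // pad(1024, 512) == 1024: already-aligned sizes are left unchanged.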
--- candle-core/src/quantized/cuda.rs | 151 ++++++++++++------------------ 1 file changed, 62 insertions(+), 89 deletions(-) diff --git a/candle-core/src/quantized/cuda.rs b/candle-core/src/quantized/cuda.rs index 64404beb..07f8c13e 100644 --- a/candle-core/src/quantized/cuda.rs +++ b/candle-core/src/quantized/cuda.rs @@ -1,9 +1,11 @@ use super::{GgmlDType, QStorage}; +use crate::quantized::k_quants::GgmlType; use crate::{backend::BackendDevice, cuda_backend::WrapErr}; use crate::{CudaDevice, CudaStorage, Result}; use cudarc::driver::{CudaSlice, CudaView, DeviceSlice}; +#[derive(Clone, Debug)] pub struct QCudaStorage { data: CudaSlice, dtype: GgmlDType, @@ -26,6 +28,14 @@ pub const CUDA_QUANTIZE_BLOCK_SIZE: usize = 256; pub const CUDA_DEQUANTIZE_BLOCK_SIZE: usize = 256; pub const MATRIX_ROW_PADDING: usize = 512; +fn ceil_div(p: usize, q: usize) -> usize { + (p + q - 1) / q +} + +fn pad(p: usize, q: usize) -> usize { + ceil_div(p, q) * q +} + fn quantize_q8_1( src: &CudaView, dst: &mut CudaSlice, @@ -35,8 +45,8 @@ fn quantize_q8_1( use cudarc::driver::LaunchAsync; let kx = elem_count; - let kx_padded = (kx + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING; - let num_blocks = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + let kx_padded = pad(kx, MATRIX_ROW_PADDING); + let num_blocks = ceil_div(kx_padded, CUDA_QUANTIZE_BLOCK_SIZE); let func = dev.get_or_load_func("quantize_q8_1", candle_kernels::QUANTIZED)?; let cfg = cudarc::driver::LaunchConfig { grid_dim: (num_blocks as u32, 1, 1), @@ -60,26 +70,18 @@ fn dequantize( let (kernel_name, is_k, block_dim, num_blocks) = match dtype { GgmlDType::Q4_0 => ("dequantize_block_q4_0", false, 32, nb), GgmlDType::Q4_1 => ("dequantize_block_q4_1", false, 32, nb), - GgmlDType::Q5_0 => { - let nb = (elem_count + 2 * CUDA_DEQUANTIZE_BLOCK_SIZE - 1) - / (2 * CUDA_DEQUANTIZE_BLOCK_SIZE); - ( - "dequantize_block_q5_0", - false, - CUDA_DEQUANTIZE_BLOCK_SIZE, - nb, - ) - } - GgmlDType::Q5_1 => { - let nb = (elem_count + 2 * CUDA_DEQUANTIZE_BLOCK_SIZE - 1) - / (2 * CUDA_DEQUANTIZE_BLOCK_SIZE); - ( - "dequantize_block_q5_1", - false, - CUDA_DEQUANTIZE_BLOCK_SIZE, - nb, - ) - } + GgmlDType::Q5_0 => ( + "dequantize_block_q5_0", + false, + CUDA_DEQUANTIZE_BLOCK_SIZE, + ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE), + ), + GgmlDType::Q5_1 => ( + "dequantize_block_q5_1", + false, + CUDA_DEQUANTIZE_BLOCK_SIZE, + ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE), + ), GgmlDType::Q8_0 => ("dequantize_block_q8_0", false, 32, nb), GgmlDType::Q2K => ("dequantize_block_q2_K", true, 64, nb), GgmlDType::Q3K => ("dequantize_block_q3_K", true, 64, nb), @@ -123,6 +125,13 @@ fn dequantize_mul_mat_vec( ) -> Result { use cudarc::driver::LaunchAsync; + let data_elems = data.len() / dtype.type_size() * dtype.block_size(); + if data_elems < ncols * nrows { + crate::bail!("unexpected data size {}, ncols {ncols} {nrows}", data_elems) + } + if y.len() != ncols { + crate::bail!("unexpected y size {}, ncols {ncols} {nrows}", y.len()) + } let kernel_name = match dtype { GgmlDType::Q4_0 => "dequantize_mul_mat_vec_q4_0_cuda", GgmlDType::Q4_1 => "dequantize_mul_mat_vec_q4_1_cuda", @@ -138,7 +147,7 @@ fn dequantize_mul_mat_vec( }; let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?; let dst = unsafe { dev.alloc::(nrows).w()? 
}; - let block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + let block_num_y = ceil_div(nrows, GGML_CUDA_MMV_Y); let cfg = cudarc::driver::LaunchConfig { grid_dim: (block_num_y as u32, 1, 1), block_dim: (WARP_SIZE as u32, GGML_CUDA_MMV_Y as u32, 1), @@ -160,8 +169,15 @@ fn mul_mat_vec_via_q8_1( ) -> Result { use cudarc::driver::LaunchAsync; + let data_elems = data.len() / dtype.type_size() * dtype.block_size(); + if data_elems < ncols * nrows { + crate::bail!("unexpected data size {}, ncols {ncols} {nrows}", data_elems) + } + if y.len() != ncols { + crate::bail!("unexpected y size {}, ncols {ncols} {nrows}", y.len()) + } // Start by quantizing y - let ncols_padded = (ncols + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING; + let ncols_padded = pad(ncols, MATRIX_ROW_PADDING); let y_size_in_bytes = ncols_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size(); let mut y_q8_1 = unsafe { dev.alloc::(y_size_in_bytes).w()? }; quantize_q8_1(y, &mut y_q8_1, ncols, dev)?; @@ -202,7 +218,7 @@ fn mul_mat_vec_via_q8_1( impl QCudaStorage { pub fn zeros(device: &CudaDevice, el_count: usize, dtype: GgmlDType) -> Result { - let size_in_bytes = el_count * dtype.type_size() / dtype.block_size(); + let size_in_bytes = ceil_div(el_count, dtype.block_size()) * dtype.type_size(); let data = device.alloc_zeros::(size_in_bytes).w()?; Ok(QCudaStorage { data, @@ -220,6 +236,12 @@ impl QCudaStorage { } pub fn dequantize(&self, elem_count: usize) -> Result { + fn deq(buffer: &[u8], n: usize, dst: &mut [f32]) -> Result<()> { + let slice = unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const T, n) }; + let vec = slice.to_vec(); + T::to_float(&vec, dst) + } + let fast_kernel = matches!( self.dtype, GgmlDType::Q4_0 @@ -238,69 +260,25 @@ impl QCudaStorage { return dequantize(&self.data, self.dtype, elem_count, self.device()); } // Run the dequantization on cpu. 
- use crate::quantized::k_quants::GgmlType; let buffer = self.device.dtoh_sync_copy(&self.data).w()?; let mut out = vec![0.0; elem_count]; let block_len = elem_count / self.dtype.block_size(); match self.dtype { - GgmlDType::F32 => { - let slice = - unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const f32, block_len) }; - out.copy_from_slice(slice) - } - GgmlDType::F16 => { - let vec: Vec = read_to_vec(&buffer, block_len); - half::f16::to_float(&vec, &mut out)?; - } - GgmlDType::Q4_0 => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ4_0::to_float(&vec, &mut out)?; - } - GgmlDType::Q4_1 => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ4_1::to_float(&vec, &mut out)?; - } - GgmlDType::Q5_0 => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ5_0::to_float(&vec, &mut out)?; - } - GgmlDType::Q5_1 => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ5_1::to_float(&vec, &mut out)?; - } - GgmlDType::Q8_0 => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ8_0::to_float(&vec, &mut out)?; - } - GgmlDType::Q8_1 => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ8_1::to_float(&vec, &mut out)?; - } - GgmlDType::Q2K => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ2K::to_float(&vec, &mut out)?; - } - GgmlDType::Q3K => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ3K::to_float(&vec, &mut out)?; - } - GgmlDType::Q4K => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ4K::to_float(&vec, &mut out)?; - } - GgmlDType::Q5K => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ5K::to_float(&vec, &mut out)?; - } - GgmlDType::Q6K => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ6K::to_float(&vec, &mut out)?; - } - GgmlDType::Q8K => { - let vec: Vec = read_to_vec(&buffer, block_len); - crate::quantized::BlockQ8K::to_float(&vec, &mut out)?; - } + GgmlDType::F32 => deq::(&buffer, block_len, &mut out)?, + GgmlDType::F16 => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q4_0 => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q4_1 => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q5_0 => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q5_1 => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q8_0 => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q8_1 => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q2K => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q3K => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q4K => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q5K => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q6K => deq::(&buffer, block_len, &mut out)?, + GgmlDType::Q8K => deq::(&buffer, block_len, &mut out)?, } self.device @@ -405,11 +383,6 @@ impl QCudaStorage { } } -fn read_to_vec(buffer: &[u8], n: usize) -> Vec { - let slice = unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const T, n) }; - slice.to_vec() -} - pub fn load_quantized( device: &CudaDevice, data: &[T], @@ -433,7 +406,7 @@ mod test { fn cuda_quantize_q8_1() -> Result<()> { let dev = CudaDevice::new(0)?; let el = 256; - let el_padded = (el + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING; + let el_padded = pad(el, MATRIX_ROW_PADDING); let y_size_in_bytes = el_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size(); let mut y_q8_1 = unsafe { 
dev.alloc::(y_size_in_bytes).w()? }; From 888c09a3dbf8413c3aa76076e49cf52460334bbd Mon Sep 17 00:00:00 2001 From: Mauro Sciancalepore <40116397+masc-it@users.noreply.github.com> Date: Mon, 1 Apr 2024 12:08:25 +0200 Subject: [PATCH 093/131] add identity op (#1976) --- candle-onnx/src/eval.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/candle-onnx/src/eval.rs b/candle-onnx/src/eval.rs index cacb56ca..15cadf1d 100644 --- a/candle-onnx/src/eval.rs +++ b/candle-onnx/src/eval.rs @@ -776,6 +776,11 @@ pub fn simple_eval( let output = input.reshape(new_shape)?; values.insert(node.output[0].clone(), output); } + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#identity + "Identity" => { + let input = get(&node.input[0])?; + values.insert(node.output[0].clone(), input.clone()); + } op_type => bail!("unsupported op_type {op_type} for op {node:?}"), } } From 5522bbc57c2967f3c8fb8fa9ab8a82d2c9ff8db8 Mon Sep 17 00:00:00 2001 From: yinqiwen Date: Mon, 1 Apr 2024 18:10:08 +0800 Subject: [PATCH 094/131] Add fn 'get_with_hints_dtype' in VarBuilder (#1877) (#1897) * quantized models(awq/squeezellm/...) have multiple data type tensors, use 'get_with_hints_dtype' to load tensors with given dtype --- candle-nn/src/var_builder.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/candle-nn/src/var_builder.rs b/candle-nn/src/var_builder.rs index bf090219..7de46044 100644 --- a/candle-nn/src/var_builder.rs +++ b/candle-nn/src/var_builder.rs @@ -178,16 +178,27 @@ impl<'a, B: Backend> VarBuilderArgs<'a, B> { name: &str, hints: B::Hints, ) -> Result { - let path = self.path(name); - self.data - .backend - .get(s.into(), &path, hints, self.data.dtype, &self.data.device) + self.get_with_hints_dtype(s, name, hints, self.data.dtype) } /// Retrieve the tensor associated with the given name at the current path. pub fn get>(&self, s: S, name: &str) -> Result { self.get_with_hints(s, name, Default::default()) } + + /// Retrieve the tensor associated with the given name & dtype at the current path. + pub fn get_with_hints_dtype>( + &self, + s: S, + name: &str, + hints: B::Hints, + dtype: DType, + ) -> Result { + let path = self.path(name); + self.data + .backend + .get(s.into(), &path, hints, dtype, &self.data.device) + } } struct Zeros; From b20acd622ced28f062d9f91410948282c10661ce Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 1 Apr 2024 17:07:02 +0200 Subject: [PATCH 095/131] Update for pyo3 0.21. (#1985) * Update for pyo3 0.21. * Also adapt the RL example. * Fix for the pyo3-onnx bindings... * Print details on failures. * Revert pyi. 
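Most of the churn is mechanical: pyo3 0.21 deprecates the GIL-ref API in
favour of `Bound` handles. A schematic before/after using calls that appear
in the diff (illustrative, not an exhaustive list):

    // pyo3 0.20
    let kwargs = PyDict::new(py);
    let state = self.env.call_method(py, "reset", (), Some(kwargs))?;

    // pyo3 0.21
    let kwargs = PyDict::new_bound(py);
    let state = self.env.call_method_bound(py, "reset", (), Some(&kwargs))?;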
--- candle-examples/Cargo.toml | 2 +- .../reinforcement-learning/gym_env.rs | 14 +-- .../reinforcement-learning/vec_gym_env.rs | 10 +-- candle-pyo3/Cargo.toml | 4 +- candle-pyo3/py_src/candle/nn/__init__.pyi | 19 ++++ candle-pyo3/src/lib.rs | 88 +++++++++---------- candle-pyo3/src/onnx.rs | 2 +- candle-pyo3/stub.py | 4 + 8 files changed, 84 insertions(+), 59 deletions(-) create mode 100644 candle-pyo3/py_src/candle/nn/__init__.pyi diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml index 864d2f6a..5b90f140 100644 --- a/candle-examples/Cargo.toml +++ b/candle-examples/Cargo.toml @@ -25,7 +25,7 @@ hf-hub = { workspace = true, features = ["tokio"] } image = { workspace = true } intel-mkl-src = { workspace = true, optional = true } num-traits = { workspace = true } -pyo3 = { version = "0.20.0", features = ["auto-initialize"], optional = true } +pyo3 = { version = "0.21.0", features = ["auto-initialize"], optional = true } rayon = { workspace = true } rubato = { version = "0.15.0", optional = true } safetensors = { workspace = true } diff --git a/candle-examples/examples/reinforcement-learning/gym_env.rs b/candle-examples/examples/reinforcement-learning/gym_env.rs index 8868c188..a2b6652f 100644 --- a/candle-examples/examples/reinforcement-learning/gym_env.rs +++ b/candle-examples/examples/reinforcement-learning/gym_env.rs @@ -42,7 +42,7 @@ impl GymEnv { /// Creates a new session of the specified OpenAI Gym environment. pub fn new(name: &str) -> Result { Python::with_gil(|py| { - let gym = py.import("gymnasium")?; + let gym = py.import_bound("gymnasium")?; let make = gym.getattr("make")?; let env = make.call1((name,))?; let action_space = env.getattr("action_space")?; @@ -66,10 +66,10 @@ impl GymEnv { /// Resets the environment, returning the observation tensor. 
pub fn reset(&self, seed: u64) -> Result { let state: Vec = Python::with_gil(|py| { - let kwargs = PyDict::new(py); + let kwargs = PyDict::new_bound(py); kwargs.set_item("seed", seed)?; - let state = self.env.call_method(py, "reset", (), Some(kwargs))?; - state.as_ref(py).get_item(0)?.extract() + let state = self.env.call_method_bound(py, "reset", (), Some(&kwargs))?; + state.bind(py).get_item(0)?.extract() }) .map_err(w)?; Tensor::new(state, &Device::Cpu) @@ -81,8 +81,10 @@ impl GymEnv { action: A, ) -> Result> { let (state, reward, terminated, truncated) = Python::with_gil(|py| { - let step = self.env.call_method(py, "step", (action.clone(),), None)?; - let step = step.as_ref(py); + let step = self + .env + .call_method_bound(py, "step", (action.clone(),), None)?; + let step = step.bind(py); let state: Vec = step.get_item(0)?.extract()?; let reward: f64 = step.get_item(1)?.extract()?; let terminated: bool = step.get_item(2)?.extract()?; diff --git a/candle-examples/examples/reinforcement-learning/vec_gym_env.rs b/candle-examples/examples/reinforcement-learning/vec_gym_env.rs index 8f8f30bd..e382ad76 100644 --- a/candle-examples/examples/reinforcement-learning/vec_gym_env.rs +++ b/candle-examples/examples/reinforcement-learning/vec_gym_env.rs @@ -24,13 +24,13 @@ fn w(res: PyErr) -> candle::Error { impl VecGymEnv { pub fn new(name: &str, img_dir: Option<&str>, nprocesses: usize) -> Result { Python::with_gil(|py| { - let sys = py.import("sys")?; + let sys = py.import_bound("sys")?; let path = sys.getattr("path")?; let _ = path.call_method1( "append", ("candle-examples/examples/reinforcement-learning",), )?; - let gym = py.import("atari_wrappers")?; + let gym = py.import_bound("atari_wrappers")?; let make = gym.getattr("make")?; let env = make.call1((name, img_dir, nprocesses))?; let action_space = env.getattr("action_space")?; @@ -60,10 +60,10 @@ impl VecGymEnv { pub fn step(&self, action: Vec) -> Result { let (obs, reward, is_done) = Python::with_gil(|py| { - let step = self.env.call_method(py, "step", (action,), None)?; - let step = step.as_ref(py); + let step = self.env.call_method_bound(py, "step", (action,), None)?; + let step = step.bind(py); let obs = step.get_item(0)?.call_method("flatten", (), None)?; - let obs_buffer = pyo3::buffer::PyBuffer::get(obs)?; + let obs_buffer = pyo3::buffer::PyBuffer::get_bound(&obs)?; let obs: Vec = obs_buffer.to_vec(py)?; let reward: Vec = step.get_item(1)?.extract()?; let is_done: Vec = step.get_item(2)?.extract()?; diff --git a/candle-pyo3/Cargo.toml b/candle-pyo3/Cargo.toml index 7c6fbd68..88001334 100644 --- a/candle-pyo3/Cargo.toml +++ b/candle-pyo3/Cargo.toml @@ -20,10 +20,10 @@ candle-nn = { workspace = true } candle-onnx = { workspace = true, optional = true } half = { workspace = true } intel-mkl-src = { workspace = true, optional = true } -pyo3 = { version = "0.20.0", features = ["extension-module", "abi3-py38"] } +pyo3 = { version = "0.21.0", features = ["extension-module", "abi3-py38"] } [build-dependencies] -pyo3-build-config = "0.20" +pyo3-build-config = "0.21" [features] default = [] diff --git a/candle-pyo3/py_src/candle/nn/__init__.pyi b/candle-pyo3/py_src/candle/nn/__init__.pyi new file mode 100644 index 00000000..118c4cff --- /dev/null +++ b/candle-pyo3/py_src/candle/nn/__init__.pyi @@ -0,0 +1,19 @@ +# Generated content DO NOT EDIT +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Sequence +from os import PathLike +from candle.typing import _ArrayLike, Device, Scalar, Index, Shape +from candle import Tensor, 
DType, QTensor + +@staticmethod +def silu(tensor: Tensor) -> Tensor: + """ + Applies the Sigmoid Linear Unit (SiLU) function to a given tensor. + """ + pass + +@staticmethod +def softmax(tensor: Tensor, dim: int) -> Tensor: + """ + Applies the Softmax function to a given tensor.# + """ + pass diff --git a/candle-pyo3/src/lib.rs b/candle-pyo3/src/lib.rs index e0d3bf30..0da2c700 100644 --- a/candle-pyo3/src/lib.rs +++ b/candle-pyo3/src/lib.rs @@ -60,8 +60,8 @@ impl PyDType { impl PyDType { fn from_pyobject(ob: PyObject, py: Python<'_>) -> PyResult { use std::str::FromStr; - if let Ok(dtype) = ob.extract::<&str>(py) { - let dtype = DType::from_str(dtype) + if let Ok(dtype) = ob.extract::(py) { + let dtype = DType::from_str(&dtype) .map_err(|_| PyTypeError::new_err(format!("invalid dtype '{dtype}'")))?; Ok(Self(dtype)) } else { @@ -116,8 +116,8 @@ impl PyDevice { impl<'source> FromPyObject<'source> for PyDevice { fn extract(ob: &'source PyAny) -> PyResult { - let device: &str = ob.extract()?; - let device = match device { + let device: String = ob.extract()?; + let device = match device.as_str() { "cpu" => PyDevice::Cpu, "cuda" => PyDevice::Cuda, _ => Err(PyTypeError::new_err(format!("invalid device '{device}'")))?, @@ -265,7 +265,7 @@ impl PyTensor { } else if let Ok(TorchTensor(numpy)) = data.extract::(py) { return PyTensor::new(py, numpy); } else { - let ty = data.as_ref(py).get_type(); + let ty = data.bind(py).get_type(); Err(PyTypeError::new_err(format!( "incorrect type {ty} for tensor" )))? @@ -322,7 +322,7 @@ impl PyTensor { fn to_torch(&self, py: Python<'_>) -> PyResult { let candle_values = self.values(py)?; let torch_tensor: PyObject = py - .import("torch")? + .import_bound("torch")? .getattr("tensor")? .call1((candle_values,))? .extract()?; @@ -333,7 +333,7 @@ impl PyTensor { /// Gets the tensor's shape. /// &RETURNS&: Tuple[int] fn shape(&self, py: Python<'_>) -> PyObject { - PyTuple::new(py, self.0.dims()).to_object(py) + PyTuple::new_bound(py, self.0.dims()).to_object(py) } #[getter] @@ -347,7 +347,7 @@ impl PyTensor { /// Gets the tensor's strides. /// &RETURNS&: Tuple[int] fn stride(&self, py: Python<'_>) -> PyObject { - PyTuple::new(py, self.0.stride()).to_object(py) + PyTuple::new_bound(py, self.0.stride()).to_object(py) } #[getter] @@ -527,7 +527,7 @@ impl PyTensor { } fn extract_indexer( - py_indexer: &PyAny, + py_indexer: &Bound, current_dim: usize, dims: &[usize], index_argument_count: usize, @@ -567,7 +567,7 @@ impl PyTensor { ), current_dim + 1, )) - } else if py_indexer.is_ellipsis() { + } else if py_indexer.is(&py_indexer.py().Ellipsis()) { // Handle '...' e.g. tensor[..., 0] if current_dim > 0 { return Err(PyTypeError::new_err( @@ -586,7 +586,7 @@ impl PyTensor { } } - if let Ok(tuple) = idx.downcast::(py) { + if let Ok(tuple) = idx.downcast_bound::(py) { let not_none_count: usize = tuple.iter().filter(|x| !x.is_none()).count(); if not_none_count > dims.len() { @@ -596,12 +596,12 @@ impl PyTensor { let mut current_dim = 0; for item in tuple.iter() { let (indexer, new_current_dim) = - extract_indexer(item, current_dim, dims, not_none_count)?; + extract_indexer(&item, current_dim, dims, not_none_count)?; current_dim = new_current_dim; indexers.push(indexer); } } else { - let (indexer, _) = extract_indexer(idx.downcast::(py)?, 0, dims, 1)?; + let (indexer, _) = extract_indexer(idx.downcast_bound::(py)?, 0, dims, 1)?; indexers.push(indexer); } @@ -652,7 +652,7 @@ impl PyTensor { /// Add two tensors. 
/// &RETURNS&: Tensor - fn __add__(&self, rhs: &PyAny) -> PyResult { + fn __add__(&self, rhs: &Bound) -> PyResult { let tensor = if let Ok(rhs) = rhs.extract::() { self.0.broadcast_add(&rhs.0).map_err(wrap_err)? } else if let Ok(rhs) = rhs.extract::() { @@ -663,13 +663,13 @@ impl PyTensor { Ok(Self(tensor)) } - fn __radd__(&self, rhs: &PyAny) -> PyResult { + fn __radd__(&self, rhs: &Bound) -> PyResult { self.__add__(rhs) } /// Multiply two tensors. /// &RETURNS&: Tensor - fn __mul__(&self, rhs: &PyAny) -> PyResult { + fn __mul__(&self, rhs: &Bound) -> PyResult { let tensor = if let Ok(rhs) = rhs.extract::() { self.0.broadcast_mul(&rhs.0).map_err(wrap_err)? } else if let Ok(rhs) = rhs.extract::() { @@ -680,13 +680,13 @@ impl PyTensor { Ok(Self(tensor)) } - fn __rmul__(&self, rhs: &PyAny) -> PyResult { + fn __rmul__(&self, rhs: &Bound) -> PyResult { self.__mul__(rhs) } /// Subtract two tensors. /// &RETURNS&: Tensor - fn __sub__(&self, rhs: &PyAny) -> PyResult { + fn __sub__(&self, rhs: &Bound) -> PyResult { let tensor = if let Ok(rhs) = rhs.extract::() { self.0.broadcast_sub(&rhs.0).map_err(wrap_err)? } else if let Ok(rhs) = rhs.extract::() { @@ -699,7 +699,7 @@ impl PyTensor { /// Divide two tensors. /// &RETURNS&: Tensor - fn __truediv__(&self, rhs: &PyAny) -> PyResult { + fn __truediv__(&self, rhs: &Bound) -> PyResult { let tensor = if let Ok(rhs) = rhs.extract::() { self.0.broadcast_div(&rhs.0).map_err(wrap_err)? } else if let Ok(rhs) = rhs.extract::() { @@ -711,7 +711,7 @@ impl PyTensor { } /// Rich-compare two tensors. /// &RETURNS&: Tensor - fn __richcmp__(&self, rhs: &PyAny, op: CompareOp) -> PyResult { + fn __richcmp__(&self, rhs: &Bound, op: CompareOp) -> PyResult { let compare = |lhs: &Tensor, rhs: &Tensor| { let t = match op { CompareOp::Eq => lhs.eq(rhs), @@ -957,7 +957,7 @@ impl PyTensor { #[pyo3(signature = (*args, **kwargs), text_signature = "(self, *args, **kwargs)")] /// Performs Tensor dtype and/or device conversion. /// &RETURNS&: Tensor - fn to(&self, args: &PyTuple, kwargs: Option<&PyDict>) -> PyResult { + fn to(&self, args: &Bound, kwargs: Option<&Bound>) -> PyResult { let mut device: Option = None; let mut dtype: Option = None; let mut other: Option = None; @@ -1227,7 +1227,7 @@ impl PyQTensor { ///Gets the shape of the tensor. 
/// &RETURNS&: Tuple[int] fn shape(&self, py: Python<'_>) -> PyObject { - PyTuple::new(py, self.0.shape().dims()).to_object(py) + PyTuple::new_bound(py, self.0.shape().dims()).to_object(py) } fn __repr__(&self) -> String { @@ -1265,7 +1265,7 @@ fn load_safetensors(path: &str, py: Python<'_>) -> PyResult { .into_iter() .map(|(key, value)| (key, PyTensor(value).into_py(py))) .collect::>(); - Ok(res.into_py_dict(py).to_object(py)) + Ok(res.into_py_dict_bound(py).to_object(py)) } #[pyfunction] @@ -1303,7 +1303,7 @@ fn load_ggml( .map(|(key, qtensor)| Ok((key, PyQTensor(Arc::new(qtensor)).into_py(py)))) .collect::<::candle::Result>>() .map_err(wrap_err)?; - let tensors = tensors.into_py_dict(py).to_object(py); + let tensors = tensors.into_py_dict_bound(py).to_object(py); let hparams = [ ("n_vocab", ggml.hparams.n_vocab), ("n_embd", ggml.hparams.n_embd), @@ -1313,7 +1313,7 @@ fn load_ggml( ("n_rot", ggml.hparams.n_rot), ("ftype", ggml.hparams.ftype), ]; - let hparams = hparams.into_py_dict(py).to_object(py); + let hparams = hparams.into_py_dict_bound(py).to_object(py); let vocab = ggml .vocab .token_score_pairs @@ -1351,7 +1351,7 @@ fn load_gguf( gguf_file::Value::Bool(x) => x.into_py(py), gguf_file::Value::String(x) => x.into_py(py), gguf_file::Value::Array(x) => { - let list = pyo3::types::PyList::empty(py); + let list = pyo3::types::PyList::empty_bound(py); for elem in x.iter() { list.append(gguf_value_to_pyobject(elem, py)?)?; } @@ -1371,13 +1371,13 @@ fn load_gguf( }) .collect::<::candle::Result>>() .map_err(wrap_err)?; - let tensors = tensors.into_py_dict(py).to_object(py); + let tensors = tensors.into_py_dict_bound(py).to_object(py); let metadata = gguf .metadata .iter() .map(|(key, value)| Ok((key, gguf_value_to_pyobject(value, py)?))) .collect::>>()? 
- .into_py_dict(py) + .into_py_dict_bound(py) .to_object(py); Ok((tensors, metadata)) } @@ -1390,7 +1390,7 @@ fn load_gguf( fn save_gguf(path: &str, tensors: PyObject, metadata: PyObject, py: Python<'_>) -> PyResult<()> { use ::candle::quantized::gguf_file; - fn pyobject_to_gguf_value(v: &PyAny, py: Python<'_>) -> PyResult { + fn pyobject_to_gguf_value(v: &Bound, py: Python<'_>) -> PyResult { let v: gguf_file::Value = if let Ok(x) = v.extract::() { gguf_file::Value::U8(x) } else if let Ok(x) = v.extract::() { @@ -1418,7 +1418,7 @@ fn save_gguf(path: &str, tensors: PyObject, metadata: PyObject, py: Python<'_>) } else if let Ok(x) = v.extract::>() { let x = x .into_iter() - .map(|f| pyobject_to_gguf_value(f.as_ref(py), py)) + .map(|f| pyobject_to_gguf_value(f.bind(py), py)) .collect::>>()?; gguf_file::Value::Array(x) } else { @@ -1450,7 +1450,7 @@ fn save_gguf(path: &str, tensors: PyObject, metadata: PyObject, py: Python<'_>) Ok(( key.extract::() .map_err(|_| PyErr::new::("keys must be strings"))?, - pyobject_to_gguf_value(value, py)?, + pyobject_to_gguf_value(&value.as_borrowed(), py)?, )) }) .collect::>>()?; @@ -1498,7 +1498,7 @@ fn get_num_threads() -> usize { ::candle::utils::get_num_threads() } -fn candle_utils(_py: Python<'_>, m: &PyModule) -> PyResult<()> { +fn candle_utils(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(cuda_is_available, m)?)?; m.add_function(wrap_pyfunction!(get_num_threads, m)?)?; m.add_function(wrap_pyfunction!(has_accelerate, m)?)?; @@ -1579,7 +1579,7 @@ fn tanh(tensor: PyTensor) -> PyResult { Ok(PyTensor(s)) } -fn candle_functional_m(_py: Python<'_>, m: &PyModule) -> PyResult<()> { +fn candle_functional_m(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(silu, m)?)?; m.add_function(wrap_pyfunction!(softmax, m)?)?; m.add_function(wrap_pyfunction!(max_pool2d, m)?)?; @@ -1591,7 +1591,7 @@ fn candle_functional_m(_py: Python<'_>, m: &PyModule) -> PyResult<()> { } #[cfg(feature = "onnx")] -fn candle_onnx_m(_py: Python<'_>, m: &PyModule) -> PyResult<()> { +fn candle_onnx_m(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { use onnx::{PyONNXModel, PyONNXTensorDescriptor}; m.add_class::()?; m.add_class::()?; @@ -1599,18 +1599,18 @@ fn candle_onnx_m(_py: Python<'_>, m: &PyModule) -> PyResult<()> { } #[pymodule] -fn candle(py: Python<'_>, m: &PyModule) -> PyResult<()> { - let utils = PyModule::new(py, "utils")?; - candle_utils(py, utils)?; - m.add_submodule(utils)?; - let nn = PyModule::new(py, "functional")?; - candle_functional_m(py, nn)?; - m.add_submodule(nn)?; +fn candle(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + let utils = PyModule::new_bound(py, "utils")?; + candle_utils(py, &utils)?; + m.add_submodule(&utils)?; + let nn = PyModule::new_bound(py, "functional")?; + candle_functional_m(py, &nn)?; + m.add_submodule(&nn)?; #[cfg(feature = "onnx")] { - let onnx = PyModule::new(py, "onnx")?; - candle_onnx_m(py, onnx)?; - m.add_submodule(onnx)?; + let onnx = PyModule::new_bound(py, "onnx")?; + candle_onnx_m(py, &onnx)?; + m.add_submodule(&onnx)?; } m.add_class::()?; m.add_class::()?; diff --git a/candle-pyo3/src/onnx.rs b/candle-pyo3/src/onnx.rs index b9a0eb22..a2e9a087 100644 --- a/candle-pyo3/src/onnx.rs +++ b/candle-pyo3/src/onnx.rs @@ -39,7 +39,7 @@ impl PyONNXTensorDescriptor { /// The shape of the tensor. 
/// &RETURNS&: Tuple[Union[int,str,Any]] fn shape(&self, py: Python) -> PyResult> { - let shape = PyList::empty(py); + let shape = PyList::empty_bound(py); if let Some(d) = &self.0.shape { for dim in d.dim.iter() { if let Some(value) = &dim.value { diff --git a/candle-pyo3/stub.py b/candle-pyo3/stub.py index 165941bd..b0e472e6 100644 --- a/candle-pyo3/stub.py +++ b/candle-pyo3/stub.py @@ -206,6 +206,8 @@ def write(module, directory, origin, check=False): if check: with open(filename, "r") as f: data = f.read() + print("generated content") + print(pyi_content) assert data == pyi_content, f"The content of {filename} seems outdated, please run `python stub.py`" else: with open(filename, "w") as f: @@ -229,6 +231,8 @@ def write(module, directory, origin, check=False): if check: with open(filename, "r") as f: data = f.read() + print("generated content") + print(py_content) assert data == py_content, f"The content of {filename} seems outdated, please run `python stub.py`" else: with open(filename, "w") as f: From 308ea070edd4fa52bdbbee3eb9279325c7500556 Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Mon, 1 Apr 2024 11:44:49 -0400 Subject: [PATCH 096/131] modify access for conv and op to be pub to allow external packages to have custom backends (#1986) --- candle-core/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs index 862436ab..ca73bb1c 100644 --- a/candle-core/src/lib.rs +++ b/candle-core/src/lib.rs @@ -37,7 +37,7 @@ mod accelerate; pub mod backend; pub mod backprop; -mod conv; +pub mod conv; mod convert; pub mod cpu; pub mod cpu_backend; @@ -57,7 +57,7 @@ pub mod metal_backend; #[cfg(feature = "mkl")] mod mkl; pub mod npy; -mod op; +pub mod op; pub mod pickle; pub mod quantized; pub mod safetensors; From ea0d8d3753b53a936c472c30ae5dc0d52bfa81fa Mon Sep 17 00:00:00 2001 From: Santiago Medina Date: Mon, 1 Apr 2024 10:37:54 -0700 Subject: [PATCH 097/131] Quantized moondream implementation and BOS token (#1980) * moondream implementation * add moondream example * change config default activation * Add assets and integrate phi mixformer with example * Make use of kv cache and fix seq_len bug; Clean up example code * Add README link to example * Remove pos_embed scaling; Remove assets; Add to README; Expand VisionConfig * Delete image * Use apply instead of forward * Pass bos token at the beginning of tensor. * Quantize moondream. * Forward with image bos token. * Clippy. * Use q4_0 quantization. 
* Add pointers for sequence and tokens; Remove seq_len conditional --- candle-examples/examples/moondream/main.rs | 93 ++++-- candle-transformers/src/models/mixformer.rs | 18 +- candle-transformers/src/models/mod.rs | 1 + candle-transformers/src/models/moondream.rs | 18 +- .../src/models/quantized_mixformer.rs | 24 ++ .../src/models/quantized_moondream.rs | 271 ++++++++++++++++++ 6 files changed, 393 insertions(+), 32 deletions(-) create mode 100644 candle-transformers/src/models/quantized_moondream.rs diff --git a/candle-examples/examples/moondream/main.rs b/candle-examples/examples/moondream/main.rs index 3e0f6d57..008346f0 100644 --- a/candle-examples/examples/moondream/main.rs +++ b/candle-examples/examples/moondream/main.rs @@ -9,11 +9,19 @@ use clap::Parser; use candle::{DType, Device, Tensor}; use candle_nn::VarBuilder; -use candle_transformers::{generation::LogitsProcessor, models::moondream}; +use candle_transformers::{ + generation::LogitsProcessor, + models::{moondream, quantized_moondream}, +}; use tokenizers::Tokenizer; +enum Model { + Moondream(moondream::Model), + Quantized(quantized_moondream::Model), +} + struct TextGeneration { - model: moondream::Model, + model: Model, device: Device, tokenizer: Tokenizer, logits_processor: LogitsProcessor, @@ -25,7 +33,7 @@ struct TextGeneration { impl TextGeneration { #[allow(clippy::too_many_arguments)] fn new( - model: moondream::Model, + model: Model, tokenizer: Tokenizer, seed: u64, temp: Option, @@ -64,6 +72,14 @@ impl TextGeneration { let mut tokens = tokens.get_ids().to_vec(); let mut generated_tokens = 0usize; + // Moondream tokenizer bos_token is "<|endoftext|>" + // https://huggingface.co/vikhyatk/moondream2/blob/main/special_tokens_map.json + let bos_token = match self.tokenizer.get_vocab(true).get("<|endoftext|>") { + Some(token) => *token, + None => anyhow::bail!("cannot find the BOS token"), + }; + // eos_token is "END" + // https://github.com/vikhyat/moondream/blob/a9d788a20d1543fb1479edc54106e88cff7759d3/moondream/moondream.py#L100 let eos_token = match self.tokenizer.get_vocab(true).get("END") { Some(token) => *token, None => anyhow::bail!("cannot find the EOS token"), @@ -75,11 +91,24 @@ impl TextGeneration { let ctxt = &tokens[tokens.len().saturating_sub(context_size)..]; let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?; let logits = if index > 0 { - self.model.text_model.forward(&input)? + match self.model { + Model::Moondream(ref mut model) => model.text_model.forward(&input)?, + Model::Quantized(ref mut model) => model.text_model.forward(&input)?, + } } else { - self.model - .text_model - .forward_with_img(&input, image_embeds)? + let bos_token = Tensor::new(&[bos_token], &self.device)?.unsqueeze(0)?; + match self.model { + Model::Moondream(ref mut model) => { + model + .text_model + .forward_with_img(&bos_token, &input, image_embeds)? + } + Model::Quantized(ref mut model) => { + model + .text_model + .forward_with_img(&bos_token, &input, image_embeds)? + } + } }; let logits = logits.squeeze(0)?.to_dtype(DType::F32)?; let logits = if self.repeat_penalty == 1. { @@ -142,7 +171,7 @@ struct Args { top_p: Option, /// The seed to use when generating random samples. 
- #[arg(long, default_value_t = 299792458)] + #[arg(long, default_value_t = 0)] seed: u64, #[arg(long, default_value_t = 5000)] @@ -156,12 +185,15 @@ struct Args { #[arg(long, default_value_t = 64)] repeat_last_n: usize, - #[arg(long, default_value = "vikhyatk/moondream2")] - model_id: String, + #[arg(long)] + model_id: Option, #[arg(long, default_value = "main")] revision: String, + #[arg(long)] + quantized: bool, + #[arg(long)] model_file: Option, @@ -216,14 +248,30 @@ async fn main() -> anyhow::Result<()> { let start = std::time::Instant::now(); let api = hf_hub::api::tokio::Api::new()?; + let model_id = match args.model_id { + Some(model_id) => model_id.to_string(), + None => { + if args.quantized { + "santiagomed/candle-moondream".to_string() + } else { + "vikhyatk/moondream2".to_string() + } + } + }; let repo = api.repo(hf_hub::Repo::with_revision( - args.model_id, + model_id, hf_hub::RepoType::Model, args.revision, )); let model_file = match args.model_file { Some(m) => m.into(), - None => repo.get("model.safetensors").await?, + None => { + if args.quantized { + repo.get("model-q4_0.gguf").await? + } else { + repo.get("model.safetensors").await? + } + } }; let tokenizer = match args.tokenizer_file { Some(m) => m.into(), @@ -234,22 +282,35 @@ async fn main() -> anyhow::Result<()> { let start = std::time::Instant::now(); let device = candle_examples::device(args.cpu)?; - let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? }; let config = moondream::Config::v2(); - let model = moondream::Model::new(&config, vb)?; + let model = if args.quantized { + let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf( + &model_file, + &device, + )?; + let model = quantized_moondream::Model::new(&config, vb)?; + Model::Quantized(model) + } else { + let vb = + unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? 
}; + let model = moondream::Model::new(&config, vb)?; + Model::Moondream(model) + }; println!("loaded the model in {:?}", start.elapsed()); let start = std::time::Instant::now(); let image = load_image(args.image)?.to_device(&device)?; let image_embeds = image.unsqueeze(0)?; - let image_embeds = image_embeds.apply(model.vision_encoder())?; + let image_embeds = match model { + Model::Moondream(ref m) => image_embeds.apply(m.vision_encoder())?, + Model::Quantized(ref m) => image_embeds.apply(m.vision_encoder())?, + }; println!( "loaded and encoded the image {image:?} in {:?}", start.elapsed() ); let prompt = format!("\n\nQuestion: {0}\n\nAnswer:", args.prompt); - let mut pipeline = TextGeneration::new( model, tokenizer, diff --git a/candle-transformers/src/models/mixformer.rs b/candle-transformers/src/models/mixformer.rs index edca8b9d..65a1665a 100644 --- a/candle-transformers/src/models/mixformer.rs +++ b/candle-transformers/src/models/mixformer.rs @@ -438,16 +438,20 @@ impl MixFormerSequentialForCausalLM { xs.narrow(1, seq_len - 1, 1)?.apply(&self.head)?.squeeze(1) } - pub fn forward_with_img(&mut self, xs: &Tensor, img_embeds: &Tensor) -> Result { + pub fn forward_with_img( + &mut self, + bos_token: &Tensor, + xs: &Tensor, + img_embeds: &Tensor, + ) -> Result { let _enter = self.span.enter(); let xs = xs.apply(&self.embedding)?; - let mut xs = Tensor::cat(&[img_embeds.clone(), xs], 1)?; + let bos_token = bos_token.apply(&self.embedding)?; + // Python implementation sequence order is + // https://github.com/vikhyat/moondream/blob/a9d788a20d1543fb1479edc54106e88cff7759d3/moondream/moondream.py#L43-L56 + let mut xs = Tensor::cat(&[bos_token, img_embeds.clone(), xs], 1)?; let (_b_size, seq_len, _embds) = xs.dims3()?; - let mask = if seq_len <= 1 { - None - } else { - Some(get_mask(seq_len, xs.device())?) - }; + let mask = Some(get_mask(seq_len, xs.device())?); for block in self.blocks.iter_mut() { xs = block.forward(&xs, mask.as_ref())? } diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs index ed0e0de7..3514e648 100644 --- a/candle-transformers/src/models/mod.rs +++ b/candle-transformers/src/models/mod.rs @@ -35,6 +35,7 @@ pub mod quantized_llama2_c; pub mod quantized_metavoice; pub mod quantized_mistral; pub mod quantized_mixformer; +pub mod quantized_moondream; pub mod quantized_mpt; pub mod quantized_rwkv_v5; pub mod quantized_rwkv_v6; diff --git a/candle-transformers/src/models/moondream.rs b/candle-transformers/src/models/moondream.rs index c36052c6..42b24fb8 100644 --- a/candle-transformers/src/models/moondream.rs +++ b/candle-transformers/src/models/moondream.rs @@ -25,15 +25,15 @@ fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result Result { + let _enter = self.span.enter(); + let xs = xs.apply(&self.embedding)?; + let bos_token = bos_token.apply(&self.embedding)?; + // Python implementation sequence order is + // https://github.com/vikhyat/moondream/blob/a9d788a20d1543fb1479edc54106e88cff7759d3/moondream/moondream.py#L43-L56 + let mut xs = Tensor::cat(&[bos_token, img_embeds.clone(), xs], 1)?; + let (_b_size, seq_len, _embds) = xs.dims3()?; + let mask = Some(get_mask(seq_len, xs.device())?); + for block in self.blocks.iter_mut() { + xs = block.forward(&xs, mask.as_ref())? + } + let xs = xs + .narrow(1, seq_len - 1, 1)? + .apply(&self.head)? 
+ .squeeze(1)?; + Ok(xs) + } + pub fn clear_kv_cache(&mut self) { self.blocks.iter_mut().for_each(|b| b.clear_kv_cache()) } diff --git a/candle-transformers/src/models/quantized_moondream.rs b/candle-transformers/src/models/quantized_moondream.rs new file mode 100644 index 00000000..1b125d93 --- /dev/null +++ b/candle-transformers/src/models/quantized_moondream.rs @@ -0,0 +1,271 @@ +use crate::models::moondream::{Config, VisionConfig}; +use crate::models::quantized_mixformer::MixFormerSequentialForCausalLM as PhiModel; +use crate::quantized_nn::{layer_norm, linear_b, Linear}; +use crate::quantized_var_builder::VarBuilder; +use candle::{IndexOp, Module, Result, Tensor, D}; + +fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result { + let dim = q.dim(D::Minus1)?; + let scale_factor = 1.0 / (dim as f64).sqrt(); + let attn_weights = (q.matmul(&k.t()?)? * scale_factor)?; + candle_nn::ops::softmax_last_dim(&attn_weights)?.matmul(v) +} + +#[derive(Debug, Clone)] +struct LinearPatchEmbedding { + linear: Linear, +} + +impl LinearPatchEmbedding { + fn new(vb: VarBuilder) -> Result { + let linear = linear_b(588, 1152, true, vb.pp("linear"))?; + Ok(Self { linear }) + } +} + +impl Module for LinearPatchEmbedding { + fn forward(&self, xs: &Tensor) -> Result { + xs.apply(&self.linear) + } +} + +#[derive(Debug, Clone)] +struct Attention { + num_heads: usize, + head_dim: usize, + qkv: Linear, + proj: Linear, +} + +impl Attention { + pub fn new(vb: VarBuilder, dim: usize, num_heads: usize) -> Result { + let qkv = linear_b(dim, dim * 3, true, vb.pp("qkv"))?; + let proj = linear_b(dim, dim, true, vb.pp("proj"))?; + Ok(Self { + num_heads, + head_dim: dim / num_heads, + qkv, + proj, + }) + } +} + +impl Module for Attention { + fn forward(&self, xs: &Tensor) -> Result { + let (b, n, c) = xs.dims3()?; + let qkv = xs + .apply(&self.qkv)? + .reshape((b, n, 3, self.num_heads, self.head_dim))? + .permute((2, 0, 3, 1, 4))?; + let (q, k, v) = ( + qkv.i(0)?.contiguous()?, + qkv.i(1)?.contiguous()?, + qkv.i(2)?.contiguous()?, + ); + scaled_dot_product_attention(&q, &k, &v)? + .transpose(1, 2)? + .reshape((b, n, c))? + .apply(&self.proj) + } +} + +#[derive(Debug, Clone)] +struct VitBlock { + attn: Attention, + mlp: Mlp, + norm1: candle_nn::LayerNorm, + norm2: candle_nn::LayerNorm, +} + +impl VitBlock { + fn new(vb: VarBuilder, dim: usize, num_heads: usize, cfg: &VisionConfig) -> Result { + let attn = Attention::new(vb.pp("attn"), dim, num_heads)?; + let mlp = Mlp::new(vb.pp("mlp"), dim, cfg.hidden_features, dim, cfg.act)?; + let norm1 = layer_norm(dim, 1e-5, vb.pp("norm1"))?; + let norm2 = layer_norm(dim, 1e-5, vb.pp("norm2"))?; + Ok(Self { + attn, + mlp, + norm1, + norm2, + }) + } +} + +impl Module for VitBlock { + fn forward(&self, xs: &Tensor) -> Result { + let ys = xs.apply(&self.norm1)?.apply(&self.attn)?; + let xs = (xs + &ys)?; + let ys = xs.apply(&self.norm2)?.apply(&self.mlp)?; + let xs = (&xs + &ys)?; + Ok(xs) + } +} + +#[derive(Debug, Clone)] +struct VisionTransformer { + patch_embed: LinearPatchEmbedding, + pos_embed: Tensor, + blocks: Vec, + norm: candle_nn::LayerNorm, +} + +impl VisionTransformer { + fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result { + let patch_embed = LinearPatchEmbedding::new(vb.pp("patch_embed"))?; + let pos_embed = vb + .get((1, cfg.embed_len, cfg.embed_dim), "pos_embed")? 
+ .dequantize(vb.device())?; + let blocks = (0..cfg.num_blocks) + .map(|i| { + VitBlock::new( + vb.pp(format!("blocks.{}", i)), + cfg.embed_dim, + cfg.num_heads, + cfg, + ) + }) + .collect::>()?; + let norm = layer_norm(cfg.embed_dim, 1e-5, vb.pp("norm"))?; + Ok(Self { + patch_embed, + pos_embed, + blocks, + norm, + }) + } +} + +impl Module for VisionTransformer { + fn forward(&self, xs: &Tensor) -> Result { + let mut xs = (&xs.apply(&self.patch_embed)? + &self.pos_embed)?; + for block in self.blocks.iter() { + xs = xs.apply(block)?; + } + xs.apply(&self.norm) + } +} + +#[derive(Debug, Clone)] +pub struct Encoder { + model: VisionTransformer, +} + +impl Encoder { + fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result { + let model = VisionTransformer::new(cfg, vb.pp("model.visual"))?; + Ok(Self { model }) + } +} + +impl Module for Encoder { + fn forward(&self, xs: &Tensor) -> Result { + xs.apply(&self.model) + } +} + +#[derive(Debug, Clone)] +struct Mlp { + fc1: Linear, + act: candle_nn::Activation, + fc2: Linear, +} + +impl Mlp { + fn new( + vb: VarBuilder, + in_features: usize, + hidden_features: usize, + out_features: usize, + act: candle_nn::Activation, + ) -> Result { + let fc1 = linear_b(in_features, hidden_features, true, vb.pp("fc1"))?; + let fc2 = linear_b(hidden_features, out_features, true, vb.pp("fc2"))?; + Ok(Self { fc1, act, fc2 }) + } +} + +impl Module for Mlp { + fn forward(&self, xs: &Tensor) -> Result { + xs.apply(&self.fc1)?.apply(&self.act)?.apply(&self.fc2) + } +} + +#[derive(Debug, Clone)] +struct VisionProjection { + mlp: Mlp, +} + +impl VisionProjection { + fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result { + let mlp = Mlp::new( + vb.pp("mlp"), + cfg.image_embedding_dim, + cfg.hidden_dim, + cfg.model_dim, + cfg.act, + )?; + Ok(Self { mlp }) + } +} + +impl Module for VisionProjection { + fn forward(&self, xs: &Tensor) -> Result { + xs.apply(&self.mlp) + } +} + +#[derive(Debug, Clone)] +pub struct VisionEncoder { + encoder: Encoder, + projection: VisionProjection, +} + +impl VisionEncoder { + pub fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result { + let encoder = Encoder::new(cfg, vb.pp("encoder"))?; + let projection = VisionProjection::new(cfg, vb.pp("projection"))?; + Ok(Self { + encoder, + projection, + }) + } +} + +impl Module for VisionEncoder { + fn forward(&self, xs: &Tensor) -> Result { + let (b, c, hp1, wp2) = xs.dims4()?; + let (p1, p2) = (14, 14); + let h = hp1 / p1; + let w = wp2 / p2; + xs.reshape((b, c, h, p1, h, p2))? + .permute((0, 2, 4, 1, 3, 5))? + .reshape((b, h * w, c * p1 * p2))? + .apply(&self.encoder)? + .apply(&self.projection) + } +} + +pub struct Model { + pub text_model: PhiModel, + pub vision_encoder: VisionEncoder, +} + +impl Model { + pub fn new(config: &Config, vb: VarBuilder) -> Result { + let text_model = PhiModel::new_v2(&config.phi_config, vb.pp("text_model"))?; + let vision_encoder = VisionEncoder::new(&config.vision_config, vb.pp("vision_encoder"))?; + Ok(Self { + text_model, + vision_encoder, + }) + } + + pub fn vision_encoder(&self) -> &VisionEncoder { + &self.vision_encoder + } + + pub fn text_model(&mut self) -> &mut PhiModel { + &mut self.text_model + } +} From be9c200cbb16b59fe1f1e8c0f606981412c9b757 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 1 Apr 2024 20:58:34 +0200 Subject: [PATCH 098/131] Expose the t5 config fields + allow t5-large. 
(#1987) --- candle-examples/examples/t5/main.rs | 2 ++ candle-transformers/src/models/t5.rs | 32 ++++++++++++++-------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/candle-examples/examples/t5/main.rs b/candle-examples/examples/t5/main.rs index 34ae0ead..902282c1 100644 --- a/candle-examples/examples/t5/main.rs +++ b/candle-examples/examples/t5/main.rs @@ -22,6 +22,7 @@ const DTYPE: DType = DType::F32; enum Which { T5Base, T5Small, + T5Large, T5_3B, Mt5Base, Mt5Small, @@ -108,6 +109,7 @@ impl T5ModelBuilder { let (default_model, default_revision) = match args.which { Which::T5Base => ("t5-base", "main"), Which::T5Small => ("t5-small", "refs/pr/15"), + Which::T5Large => ("t5-large", "main"), Which::T5_3B => ("t5-3b", "main"), Which::Mt5Base => ("google/mt5-base", "refs/pr/5"), Which::Mt5Small => ("google/mt5-small", "refs/pr/6"), diff --git a/candle-transformers/src/models/t5.rs b/candle-transformers/src/models/t5.rs index 5dc44cb5..f4b5b4b0 100644 --- a/candle-transformers/src/models/t5.rs +++ b/candle-transformers/src/models/t5.rs @@ -70,26 +70,26 @@ where #[derive(Debug, Clone, PartialEq, Deserialize)] pub struct Config { - vocab_size: usize, - d_model: usize, - d_kv: usize, - d_ff: usize, - num_layers: usize, - num_decoder_layers: Option, - num_heads: usize, - relative_attention_num_buckets: usize, + pub vocab_size: usize, + pub d_model: usize, + pub d_kv: usize, + pub d_ff: usize, + pub num_layers: usize, + pub num_decoder_layers: Option, + pub num_heads: usize, + pub relative_attention_num_buckets: usize, #[serde(default = "default_relative_attention_max_distance")] - relative_attention_max_distance: usize, - dropout_rate: f64, - layer_norm_epsilon: f64, - initializer_factor: f64, + pub relative_attention_max_distance: usize, + pub dropout_rate: f64, + pub layer_norm_epsilon: f64, + pub initializer_factor: f64, #[serde(default, deserialize_with = "deserialize_feed_forward_proj_activation")] - feed_forward_proj: ActivationWithOptionalGating, + pub feed_forward_proj: ActivationWithOptionalGating, #[serde(default = "default_tie_word_embeddings")] - tie_word_embeddings: bool, + pub tie_word_embeddings: bool, #[serde(default = "default_is_decoder")] - is_decoder: bool, - is_encoder_decoder: bool, + pub is_decoder: bool, + pub is_encoder_decoder: bool, #[serde(default = "default_use_cache")] pub use_cache: bool, pub pad_token_id: usize, From b23436bf90b99eb17aed36aaa219875d3c962a7e Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Tue, 2 Apr 2024 14:36:28 +0200 Subject: [PATCH 099/131] Stable diffusion fix. (#1993) * Stable diffusion fix. * And add a comment. --- candle-transformers/src/models/stable_diffusion/attention.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/candle-transformers/src/models/stable_diffusion/attention.rs b/candle-transformers/src/models/stable_diffusion/attention.rs index 07ce0fe4..05e51e44 100644 --- a/candle-transformers/src/models/stable_diffusion/attention.rs +++ b/candle-transformers/src/models/stable_diffusion/attention.rs @@ -533,7 +533,9 @@ impl Module for AttentionBlock { let attention_scores = (query_states * scale)?.matmul(&(key_states.t()? * scale)?)?; let attention_probs = nn::ops::softmax(&attention_scores, D::Minus1)?; - let xs = attention_probs.matmul(&value_states.contiguous()?)?; + // TODO: revert the call to force_contiguous once the three matmul kernels have been + // adapted to handle layout with some dims set to 1. 
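+ // (unlike contiguous(), force_contiguous always materializes a fresh row-major copy,
+ // even when the layout already reports as contiguous)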
+ let xs = attention_probs.matmul(&value_states.force_contiguous()?)?; let xs = xs.to_dtype(in_dtype)?; let xs = xs.transpose(1, 2)?.contiguous()?; let xs = xs.flatten_from(D::Minus2)?; From fb918a23c85c298dbe52cac339dae2886086a0d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Ant=C3=B3nio?= Date: Tue, 2 Apr 2024 15:31:05 +0100 Subject: [PATCH 100/131] first commit (#1994) --- candle-transformers/src/models/falcon.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/candle-transformers/src/models/falcon.rs b/candle-transformers/src/models/falcon.rs index 24fd3c46..5fea27b9 100644 --- a/candle-transformers/src/models/falcon.rs +++ b/candle-transformers/src/models/falcon.rs @@ -1,5 +1,6 @@ use candle::{DType, Device, Result, Tensor, D}; use candle_nn::{embedding, linear_b as linear, Embedding, LayerNorm, Linear, Module, VarBuilder}; +use serde::Deserialize; const MAX_SEQ_LEN: usize = 5000; @@ -18,7 +19,7 @@ fn layer_norm(size: usize, eps: f64, vb: VarBuilder) -> Result { } // https://raw.githubusercontent.com/huggingface/transformers/030c863aaa0165e98352b61697430bf69bf33755/src/transformers/models/falcon/configuration_falcon.py -#[derive(Debug)] +#[derive(Clone, Debug, Deserialize)] pub struct Config { pub vocab_size: usize, pub hidden_size: usize, From d17b2cdad9ea60c762c887e7cddd6081d3b74bce Mon Sep 17 00:00:00 2001 From: Santiago Medina Date: Tue, 2 Apr 2024 12:37:09 -0700 Subject: [PATCH 101/131] Match Moondream's latest release (#1997) * moondream implementation * add moondream example * change config default activation * Add assets and integrate phi mixformer with example * Make use of kv cache and fix seq_len bug; Clean up example code * Add README link to example * Remove pos_embed scaling; Remove assets; Add to README; Expand VisionConfig * Delete image * Use apply instead of forward * Use latest release special token; Fix token/s accuracy; Use GeluPytorchTanh in VisionConfig v2 --- candle-examples/examples/moondream/main.rs | 28 ++++++++++----------- candle-transformers/src/models/moondream.rs | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/candle-examples/examples/moondream/main.rs b/candle-examples/examples/moondream/main.rs index 008346f0..bcc21337 100644 --- a/candle-examples/examples/moondream/main.rs +++ b/candle-examples/examples/moondream/main.rs @@ -72,20 +72,16 @@ impl TextGeneration { let mut tokens = tokens.get_ids().to_vec(); let mut generated_tokens = 0usize; - // Moondream tokenizer bos_token is "<|endoftext|>" + // Moondream tokenizer bos_token and eos_token is "<|endoftext|>" // https://huggingface.co/vikhyatk/moondream2/blob/main/special_tokens_map.json - let bos_token = match self.tokenizer.get_vocab(true).get("<|endoftext|>") { + let special_token = match self.tokenizer.get_vocab(true).get("<|endoftext|>") { Some(token) => *token, - None => anyhow::bail!("cannot find the BOS token"), - }; - // eos_token is "END" - // https://github.com/vikhyat/moondream/blob/a9d788a20d1543fb1479edc54106e88cff7759d3/moondream/moondream.py#L100 - let eos_token = match self.tokenizer.get_vocab(true).get("END") { - Some(token) => *token, - None => anyhow::bail!("cannot find the EOS token"), + None => anyhow::bail!("cannot find the special token"), }; + let (bos_token, eos_token) = (special_token, special_token); let start_gen = std::time::Instant::now(); + let mut load_t = std::time::Duration::from_secs_f64(0f64); for index in 0..sample_len { let context_size = if index > 0 { 1 } else { tokens.len() }; let ctxt = 
&tokens[tokens.len().saturating_sub(context_size)..]; @@ -97,7 +93,7 @@ impl TextGeneration { } } else { let bos_token = Tensor::new(&[bos_token], &self.device)?.unsqueeze(0)?; - match self.model { + let logits = match self.model { Model::Moondream(ref mut model) => { model .text_model @@ -108,7 +104,10 @@ impl TextGeneration { .text_model .forward_with_img(&bos_token, &input, image_embeds)? } - } + }; + load_t = start_gen.elapsed(); + println!("load_t: {:?}", load_t); + logits }; let logits = logits.squeeze(0)?.to_dtype(DType::F32)?; let logits = if self.repeat_penalty == 1. { @@ -132,10 +131,11 @@ impl TextGeneration { std::io::stdout().flush()?; } - let dt = start_gen.elapsed(); + let dt = start_gen.elapsed() - load_t; println!( - "\n{generated_tokens} tokens generated ({:.2} token/s)", - generated_tokens as f64 / dt.as_secs_f64() + "\ngenerated in {} seconds\n{generated_tokens} tokens generated ({:.2} token/s)", + dt.as_secs_f64(), + (generated_tokens - 1) as f64 / dt.as_secs_f64() ); Ok(()) diff --git a/candle-transformers/src/models/moondream.rs b/candle-transformers/src/models/moondream.rs index 42b24fb8..717f3bb4 100644 --- a/candle-transformers/src/models/moondream.rs +++ b/candle-transformers/src/models/moondream.rs @@ -47,7 +47,7 @@ impl VisionConfig { embed_dim: 1152, num_blocks: 27, num_heads: 16, - act: candle_nn::Activation::Gelu, + act: candle_nn::Activation::GeluPytorchTanh, } } } From 08c049def36a8df4bd6d7231a62b1e342fcc9a65 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Tue, 2 Apr 2024 23:17:05 +0200 Subject: [PATCH 102/131] Improve the handling of matmul with squeezed layouts. (#1998) * Improve the handling of matmul with squeezed layouts. * Fix for the cuda backend. * Revert the temporary fix. --- candle-core/src/cpu_backend/mod.rs | 65 ++++------ candle-core/src/cuda_backend/mod.rs | 4 + candle-core/tests/matmul_tests.rs | 119 ++++++++++++++++++ candle-core/tests/tensor_tests.rs | 100 --------------- .../src/models/stable_diffusion/attention.rs | 2 +- 5 files changed, 151 insertions(+), 139 deletions(-) create mode 100644 candle-core/tests/matmul_tests.rs diff --git a/candle-core/src/cpu_backend/mod.rs b/candle-core/src/cpu_backend/mod.rs index d686440a..09226b58 100644 --- a/candle-core/src/cpu_backend/mod.rs +++ b/candle-core/src/cpu_backend/mod.rs @@ -1204,6 +1204,30 @@ impl MatMul { })) .bt() } + + fn ab_skip(&self, lhs_l: &Layout, rhs_l: &Layout) -> Result<(usize, usize)> { + let lhs_stride = lhs_l.stride(); + let rhs_stride = rhs_l.stride(); + let rank = lhs_stride.len(); + let (_b, m, n, k) = self.0; + let a_skip: usize = match lhs_stride[..rank - 2] { + [s1, stride] if s1 == stride * lhs_l.dims()[1] => stride, + [_, stride] if lhs_l.dims()[0] == 1 => stride, + [stride, _] if lhs_l.dims()[1] == 1 => stride, + [stride] => stride, + [] => m * k, + _ => Err(self.striding_error(lhs_l, rhs_l, "non-contiguous lhs"))?, + }; + let b_skip: usize = match rhs_stride[..rank - 2] { + [s1, stride] if s1 == stride * rhs_l.dims()[1] => stride, + [_, stride] if rhs_l.dims()[0] == 1 => stride, + [stride, _] if rhs_l.dims()[1] == 1 => stride, + [stride] => stride, + [] => n * k, + _ => Err(self.striding_error(lhs_l, rhs_l, "non-contiguous rhs"))?, + }; + Ok((a_skip, b_skip)) + } } impl Map2 for MatMul { @@ -1237,18 +1261,7 @@ impl Map2 for MatMul { let rhs_cs = rhs_stride[rank - 1]; let rhs_rs = rhs_stride[rank - 2]; - let a_skip: usize = match lhs_stride[..rank - 2] { - [s1, stride] if s1 == stride * lhs_l.dims()[1] => stride, - [stride] => stride, - [] => m * k, - _ => 
Err(self.striding_error(lhs_l, rhs_l, "non-contiguous lhs"))?, - }; - let b_skip: usize = match rhs_stride[..rank - 2] { - [s1, stride] if s1 == stride * rhs_l.dims()[1] => stride, - [stride] => stride, - [] => n * k, - _ => Err(self.striding_error(lhs_l, rhs_l, "non-contiguous rhs"))?, - }; + let (a_skip, b_skip) = self.ab_skip(lhs_l, rhs_l)?; let c_skip: usize = m * n; let dst_shape: Shape = (m, n).into(); @@ -1308,20 +1321,8 @@ impl Map2 for MatMul { let lhs_stride = lhs_l.stride(); let rhs_stride = rhs_l.stride(); - let rank = lhs_stride.len(); - let a_skip: usize = match lhs_stride[..rank - 2] { - [s1, stride] if s1 == stride * lhs_l.dims()[1] => stride, - [stride] => stride, - [] => m * k, - _ => Err(self.striding_error(lhs_l, rhs_l, "non-contiguous lhs"))?, - }; - let b_skip: usize = match rhs_stride[..rank - 2] { - [s1, stride] if s1 == stride * rhs_l.dims()[1] => stride, - [stride] => stride, - [] => n * k, - _ => Err(self.striding_error(lhs_l, rhs_l, "non-contiguous rhs"))?, - }; + let (a_skip, b_skip) = self.ab_skip(lhs_l, rhs_l)?; let c_skip: usize = m * n; let rhs_m1 = rhs_stride[rhs_stride.len() - 1]; @@ -1411,20 +1412,8 @@ impl Map2 for MatMul { let lhs_stride = lhs_l.stride(); let rhs_stride = rhs_l.stride(); - let rank = lhs_stride.len(); - let a_skip: usize = match lhs_stride[..rank - 2] { - [s1, stride] if s1 == stride * lhs_l.dims()[1] => stride, - [stride] => stride, - [] => m * k, - _ => Err(self.striding_error(lhs_l, rhs_l, "non-contiguous lhs"))?, - }; - let b_skip: usize = match rhs_stride[..rank - 2] { - [s1, stride] if s1 == stride * rhs_l.dims()[1] => stride, - [stride] => stride, - [] => n * k, - _ => Err(self.striding_error(lhs_l, rhs_l, "non-contiguous rhs"))?, - }; + let (a_skip, b_skip) = self.ab_skip(lhs_l, rhs_l)?; let c_skip: usize = m * n; let rhs_m1 = rhs_stride[rhs_stride.len() - 1]; diff --git a/candle-core/src/cuda_backend/mod.rs b/candle-core/src/cuda_backend/mod.rs index 78aebd9b..3690e0dc 100644 --- a/candle-core/src/cuda_backend/mod.rs +++ b/candle-core/src/cuda_backend/mod.rs @@ -1174,6 +1174,8 @@ fn gemm_config( let stride_b: usize = match lhs_stride[..lhs_stride.len() - 2] { [s1, stride] if s1 == stride * lhs_l.dims()[1] => stride, + [_, stride] if lhs_l.dims()[0] == 1 => stride, + [stride, _] if lhs_l.dims()[1] == 1 => stride, [stride] => stride, [] => m * k, _ => Err(CudaError::MatMulNonContiguous { @@ -1184,6 +1186,8 @@ fn gemm_config( }; let stride_a: usize = match rhs_stride[..rhs_stride.len() - 2] { [s1, stride] if s1 == stride * rhs_l.dims()[1] => stride, + [_, stride] if rhs_l.dims()[0] == 1 => stride, + [stride, _] if rhs_l.dims()[1] == 1 => stride, [stride] => stride, [] => n * k, _ => Err(CudaError::MatMulNonContiguous { diff --git a/candle-core/tests/matmul_tests.rs b/candle-core/tests/matmul_tests.rs new file mode 100644 index 00000000..834da29a --- /dev/null +++ b/candle-core/tests/matmul_tests.rs @@ -0,0 +1,119 @@ +use candle_core::{test_device, DType, Device, IndexOp, Result, Tensor}; + +fn matmul(device: &Device) -> Result<()> { + let data = vec![1.0f32, 2.0, 3.0, 4.0]; + let a = Tensor::from_slice(&data, (2, 2), device)?; + let data = vec![1.0f32, 2.0, 3.0, 4.0]; + let b = Tensor::from_slice(&data, (2, 2), device)?; + + let c = a.matmul(&b)?; + assert_eq!(c.to_vec2::()?, &[[7.0f32, 10.0], [15.0, 22.0]]); + + let data = vec![1.0f32, 2.0]; + let a = Tensor::from_slice(&data, (2, 1), device)?; + let data = vec![3.0f32, 4.0]; + let b = Tensor::from_slice(&data, (1, 2), device)?; + let c = a.matmul(&b)?; + 
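+ // the (2, 1) x (1, 2) product is an outer product: entry (i, j) equals a[i] * b[j]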
assert_eq!(c.to_vec2::<f32>()?, &[&[3.0, 4.0], &[6.0, 8.0]]);
+
+ let data: Vec<_> = (0..6).map(|i| i as f32).collect();
+ let a = Tensor::from_slice(&data, (2, 3), device)?;
+ let data: Vec<_> = (0..6).map(|i| (i + 2) as f32).collect();
+ let b = Tensor::from_slice(&data, (3, 2), device)?;
+ let c = a.matmul(&b)?;
+ assert_eq!(c.to_vec2::<f32>()?, &[&[16., 19.], &[52., 64.]]);
+
+ let data: Vec<_> = (0..12).map(|i| i as f32).collect();
+ let a = Tensor::from_slice(&data, (2, 2, 3), device)?;
+ let data: Vec<_> = (0..12).map(|i| (i + 2) as f32).collect();
+ let b = Tensor::from_slice(&data, (2, 3, 2), device)?;
+ let expected = [[[16., 19.], [52., 64.]], [[214., 235.], [304., 334.]]];
+
+ let c = a.matmul(&b)?;
+ assert_eq!(c.to_vec3::<f32>()?, &expected);
+
+ // Also perform the matmul on contiguous transposed versions.
+ let a_tt = a.t()?.contiguous()?.t()?;
+ assert!(!a_tt.is_contiguous());
+ assert_eq!(a.dims(), a_tt.dims());
+ assert_eq!(a_tt.stride(), &[6, 1, 2]);
+
+ let b_tt = b.t()?.contiguous()?.t()?;
+ assert!(!b_tt.is_contiguous());
+ assert_eq!(b.dims(), b_tt.dims());
+ assert_eq!(b_tt.stride(), &[6, 1, 3]);
+
+ assert_eq!(a_tt.matmul(&b)?.to_vec3::<f32>()?, &expected);
+ assert_eq!(a.matmul(&b_tt)?.to_vec3::<f32>()?, &expected);
+ assert_eq!(a_tt.matmul(&b_tt)?.to_vec3::<f32>()?, &expected);
+ Ok(())
+}
+
+fn broadcast_matmul(device: &Device) -> Result<()> {
+ let lhs = Tensor::randn(0f32, 1f32, (3, 1, 4, 5), device)?;
+ let rhs = Tensor::randn(0f32, 1f32, (6, 5, 2), device)?;
+ let out = lhs.broadcast_matmul(&rhs)?;
+ assert_eq!(out.dims(), &[3, 6, 4, 2]);
+ for idx1 in 0..3 {
+ for idx2 in 0..6 {
+ let out = out.i((idx1, idx2))?;
+ let lhs = lhs.i((idx1, 0))?;
+ let rhs = rhs.i(idx2)?;
+ let out2 = lhs.matmul(&rhs);
+ let sum_diff2 = (out - out2)?.sqr()?.sum_all()?;
+ // With cuda, we see errors of up to ~1e-12.
+ assert!(sum_diff2.to_vec0::<f32>()? < 1e-6)
+ }
+ }
+ Ok(())
+}
+
+// https://github.com/huggingface/candle/issues/1948
+fn squeeze_mm(device: &Device) -> Result<()> {
+ let seq_len = 8_usize;
+ let a = Tensor::zeros((1, seq_len, 16), DType::F32, device)?;
+ let x = a.i((.., seq_len - 1, ..))?;
+ println!(
+ "x shape:{:?}, stride:{:?}, is_contiguous:{}",
+ x.shape(),
+ x.stride(),
+ x.is_contiguous()
+ );
+
+ let w = Tensor::zeros((32, 16), DType::F32, device)?.t()?;
+ println!(
+ "w shape:{:?}, stride:{:?}, is_contiguous:{}",
+ w.shape(),
+ w.stride(),
+ w.is_contiguous()
+ );
+ let x = x.matmul(&w)?;
+ assert_eq!(x.dims(), &[1, 32]);
+ Ok(())
+}
+
+// https://github.com/huggingface/candle/issues/1992
+fn mm_layout(device: &Device) -> Result<()> {
+ let a = Tensor::arange(0f32, 16f32, device)?.reshape((1, 1, 4, 4))?;
+ let b = Tensor::arange(0f32, 8f32, device)?.reshape((1, 1, 4, 2))?;
+ let mm1 = a.matmul(&b)?;
+ // Forces the layout to be:
+ // shape: [1, 1, 4, 2], stride: [8, 2, 2, 1], start_offset: 0
+ // This is still a contiguous matrix, but the matmul checks only expect the last two
+ // dimensions to have non-1 sizes, so they may be reluctant to handle this layout.
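+ // the two products must match exactly: the values are unchanged, only the strides differ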
+ let b = b.transpose(1, 2)?.force_contiguous()?.transpose(1, 2)?; + let mm2 = a.matmul(&b)?; + let diff = (mm1 - mm2)?.abs()?.sum_all()?.to_vec0::()?; + assert_eq!(diff, 0.); + Ok(()) +} + +test_device!(matmul, matmul_cpu, matmul_gpu, matmul_metal); +test_device!( + broadcast_matmul, + broadcast_matmul_cpu, + broadcast_matmul_gpu, + broadcast_matmul_metal +); +test_device!(squeeze_mm, squeeze_mm_cpu, squeeze_mm_gpu, squeeze_mm_metal); +test_device!(mm_layout, mm_layout_cpu, mm_layout_gpu, mm_layout_metal); diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs index 8aacc05d..902b84f7 100644 --- a/candle-core/tests/tensor_tests.rs +++ b/candle-core/tests/tensor_tests.rs @@ -938,74 +938,6 @@ fn gather(device: &Device) -> Result<()> { Ok(()) } -fn matmul(device: &Device) -> Result<()> { - let data = vec![1.0f32, 2.0, 3.0, 4.0]; - let a = Tensor::from_slice(&data, (2, 2), device)?; - let data = vec![1.0f32, 2.0, 3.0, 4.0]; - let b = Tensor::from_slice(&data, (2, 2), device)?; - - let c = a.matmul(&b)?; - assert_eq!(c.to_vec2::()?, &[[7.0f32, 10.0], [15.0, 22.0]]); - - let data = vec![1.0f32, 2.0]; - let a = Tensor::from_slice(&data, (2, 1), device)?; - let data = vec![3.0f32, 4.0]; - let b = Tensor::from_slice(&data, (1, 2), device)?; - let c = a.matmul(&b)?; - assert_eq!(c.to_vec2::()?, &[&[3.0, 4.0], &[6.0, 8.0]]); - - let data: Vec<_> = (0..6).map(|i| i as f32).collect(); - let a = Tensor::from_slice(&data, (2, 3), device)?; - let data: Vec<_> = (0..6).map(|i| (i + 2) as f32).collect(); - let b = Tensor::from_slice(&data, (3, 2), device)?; - let c = a.matmul(&b)?; - assert_eq!(c.to_vec2::()?, &[&[16., 19.], &[52., 64.]]); - - let data: Vec<_> = (0..12).map(|i| i as f32).collect(); - let a = Tensor::from_slice(&data, (2, 2, 3), device)?; - let data: Vec<_> = (0..12).map(|i| (i + 2) as f32).collect(); - let b = Tensor::from_slice(&data, (2, 3, 2), device)?; - let expected = [[[16., 19.], [52., 64.]], [[214., 235.], [304., 334.]]]; - - let c = a.matmul(&b)?; - assert_eq!(c.to_vec3::()?, &expected); - - // Also perform the matmul on contiguous transposed versions. - let a_tt = a.t()?.contiguous()?.t()?; - assert!(!a_tt.is_contiguous()); - assert_eq!(a.dims(), a_tt.dims()); - assert_eq!(a_tt.stride(), &[6, 1, 2]); - - let b_tt = b.t()?.contiguous()?.t()?; - assert!(!b_tt.is_contiguous()); - assert_eq!(b.dims(), b_tt.dims()); - assert_eq!(b_tt.stride(), &[6, 1, 3]); - - assert_eq!(a_tt.matmul(&b)?.to_vec3::()?, &expected); - assert_eq!(a.matmul(&b_tt)?.to_vec3::()?, &expected); - assert_eq!(a_tt.matmul(&b_tt)?.to_vec3::()?, &expected); - Ok(()) -} - -fn broadcast_matmul(device: &Device) -> Result<()> { - let lhs = Tensor::randn(0f32, 1f32, (3, 1, 4, 5), device)?; - let rhs = Tensor::randn(0f32, 1f32, (6, 5, 2), device)?; - let out = lhs.broadcast_matmul(&rhs)?; - assert_eq!(out.dims(), &[3, 6, 4, 2]); - for idx1 in 0..3 { - for idx2 in 0..6 { - let out = out.i((idx1, idx2))?; - let lhs = lhs.i((idx1, 0))?; - let rhs = rhs.i(idx2)?; - let out2 = lhs.matmul(&rhs); - let sum_diff2 = (out - out2)?.sqr()?.sum_all()?; - // With cuda, we see errors of up to ~1e-12. - assert!(sum_diff2.to_vec0::()? 
< 1e-6) - } - } - Ok(()) -} - fn broadcasting(device: &Device) -> Result<()> { let t1 = Tensor::arange(0f32, 24f32, device)?.reshape((4, 2, 3))?; let t2 = Tensor::new(&[100f32, 200f32], device)?; @@ -1140,30 +1072,6 @@ fn randn(device: &Device) -> Result<()> { Ok(()) } -// https://github.com/huggingface/candle/issues/1948 -fn squeeze_mm(device: &Device) -> Result<()> { - let seq_len = 8_usize; - let a = Tensor::zeros((1, seq_len, 16), DType::F32, device)?; - let x = a.i((.., seq_len - 1, ..))?; - println!( - "x shape:{:?}, stride:{:?}, is_contiguous:{}", - x.shape(), - x.stride(), - x.is_contiguous() - ); - - let w = Tensor::zeros((32, 16), DType::F32, device)?.t()?; - println!( - "w shape:{:?}, stride:{:?}, is_contiguous:{}", - w.shape(), - w.stride(), - w.is_contiguous() - ); - let x = x.matmul(&w)?; - assert_eq!(x.dims(), &[1, 32]); - Ok(()) -} - test_device!(zeros, zeros_cpu, zeros_gpu, zeros_metal); test_device!(ones, ones_cpu, ones_gpu, ones_metal); test_device!(full, full_cpu, full_gpu, full_metal); @@ -1183,13 +1091,6 @@ test_device!(unary_op, unary_op_cpu, unary_op_gpu, unary_op_metal); test_device!(binary_op, binary_op_cpu, binary_op_gpu, binary_op_metal); test_device!(embeddings, embeddings_cpu, embeddings_gpu, embeddings_metal); test_device!(cmp, cmp_cpu, cmp_gpu, cmp_metal); -test_device!(matmul, matmul_cpu, matmul_gpu, matmul_metal); -test_device!( - broadcast_matmul, - broadcast_matmul_cpu, - broadcast_matmul_gpu, - broadcast_matmul_metal -); test_device!( broadcasting, broadcasting_cpu, @@ -1219,7 +1120,6 @@ test_device!( test_device!(randn, randn_cpu, randn_gpu, randn_metal); test_device!(clamp, clamp_cpu, clamp_gpu, clamp_metal); test_device!(var, var_cpu, var_gpu, var_metal); -test_device!(squeeze_mm, squeeze_mm_cpu, squeeze_mm_gpu, squeeze_mm_metal); // There was originally a bug on the CPU implementation for randn // https://github.com/huggingface/candle/issues/381 diff --git a/candle-transformers/src/models/stable_diffusion/attention.rs b/candle-transformers/src/models/stable_diffusion/attention.rs index 05e51e44..4d5a7c47 100644 --- a/candle-transformers/src/models/stable_diffusion/attention.rs +++ b/candle-transformers/src/models/stable_diffusion/attention.rs @@ -535,7 +535,7 @@ impl Module for AttentionBlock { // TODO: revert the call to force_contiguous once the three matmul kernels have been // adapted to handle layout with some dims set to 1. 
- let xs = attention_probs.matmul(&value_states.force_contiguous()?)?; + let xs = attention_probs.matmul(&value_states)?; let xs = xs.to_dtype(in_dtype)?; let xs = xs.transpose(1, 2)?.contiguous()?; let xs = xs.flatten_from(D::Minus2)?; From cd6b9e317c9499ffde396391d8b2b18fa9aa6afb Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Wed, 3 Apr 2024 01:03:54 -0400 Subject: [PATCH 103/131] Add benchmarks for the candle-nn package (#1995) * add benchmarks for the candle-nn package * uncomment test * format --- candle-nn/Cargo.toml | 5 ++ candle-nn/benches/bench_main.rs | 4 ++ candle-nn/benches/benchmarks/conv.rs | 54 ++++++++++++++++++ candle-nn/benches/benchmarks/layer_norm.rs | 48 ++++++++++++++++ candle-nn/benches/benchmarks/mod.rs | 64 ++++++++++++++++++++++ 5 files changed, 175 insertions(+) create mode 100644 candle-nn/benches/bench_main.rs create mode 100644 candle-nn/benches/benchmarks/conv.rs create mode 100644 candle-nn/benches/benchmarks/layer_norm.rs create mode 100644 candle-nn/benches/benchmarks/mod.rs diff --git a/candle-nn/Cargo.toml b/candle-nn/Cargo.toml index 3408dae3..9f0d56bd 100644 --- a/candle-nn/Cargo.toml +++ b/candle-nn/Cargo.toml @@ -26,6 +26,7 @@ candle-metal-kernels = { workspace = true, optional = true } anyhow = { workspace = true } clap = { workspace = true } rand = { workspace = true } +criterion = { workspace = true } [features] default = [] @@ -33,3 +34,7 @@ accelerate = ["dep:accelerate-src", "candle/accelerate"] cuda = ["candle/cuda"] mkl = ["dep:intel-mkl-src", "candle/mkl"] metal = ["candle/metal", "dep:candle-metal-kernels", "dep:metal"] + +[[bench]] +name = "bench_main" +harness = false \ No newline at end of file diff --git a/candle-nn/benches/bench_main.rs b/candle-nn/benches/bench_main.rs new file mode 100644 index 00000000..4db1d35c --- /dev/null +++ b/candle-nn/benches/bench_main.rs @@ -0,0 +1,4 @@ +mod benchmarks; + +use criterion::criterion_main; +criterion_main!(benchmarks::layer_norm::benches, benchmarks::conv::benches); diff --git a/candle-nn/benches/benchmarks/conv.rs b/candle-nn/benches/benchmarks/conv.rs new file mode 100644 index 00000000..eb80645b --- /dev/null +++ b/candle-nn/benches/benchmarks/conv.rs @@ -0,0 +1,54 @@ +use crate::benchmarks::{BenchDevice, BenchDeviceHandler}; +use candle::{DType, Device, Module, Tensor}; +use candle_nn::{Conv2d, Conv2dConfig}; +use criterion::{black_box, criterion_group, Criterion}; +use std::time::Instant; + +const B: usize = 1; +const C: usize = 1; +const M: usize = 128; +const K: usize = 128; +const K_SIZE: usize = 3; + +fn run(input: Tensor, weight: Tensor, bias: Tensor, config: Conv2dConfig) { + Conv2d::new(weight, Some(bias), config) + .forward(&input) + .unwrap(); +} + +fn run_conv2d_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) { + let weight = Tensor::ones((1, 1, K_SIZE, K_SIZE), dtype, device) + .unwrap() + .to_dtype(dtype) + .unwrap(); + let bias = Tensor::zeros(K, dtype, device).unwrap(); + let input = Tensor::ones((B, C, M, K), dtype, device).unwrap(); + + let mut group = c.benchmark_group(device.bench_name(name)); + group.bench_function("iter", move |b| { + b.iter_custom(|iters| { + let start = Instant::now(); + for _i in 0..iters { + run( + black_box(input.clone()), + black_box(weight.clone()), + black_box(bias.clone()), + Default::default(), + ); + } + device.sync().unwrap(); + start.elapsed() + }) + }); + group.finish(); +} + +fn criterion_benchmark(c: &mut Criterion) { + let device = BenchDeviceHandler::new().unwrap(); + for d in device.devices { + 
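+ // run the same conv2d benchmark on every detected backend, once per dtype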
run_conv2d_benchmark(c, &d, DType::F32, "conv2d_f32"); + run_conv2d_benchmark(c, &d, DType::F16, "conv2d_f16"); + } +} + +criterion_group!(benches, criterion_benchmark); diff --git a/candle-nn/benches/benchmarks/layer_norm.rs b/candle-nn/benches/benchmarks/layer_norm.rs new file mode 100644 index 00000000..0be5c450 --- /dev/null +++ b/candle-nn/benches/benchmarks/layer_norm.rs @@ -0,0 +1,48 @@ +use crate::benchmarks::{BenchDevice, BenchDeviceHandler}; +use candle::{DType, Device, Module, Tensor}; +use candle_nn::LayerNorm; +use criterion::{black_box, criterion_group, Criterion}; +use std::time::Instant; + +fn run(input: &Tensor, weight: &Tensor, bias: &Tensor) { + let _ = LayerNorm::new(weight.clone(), bias.clone(), 1e-5).forward(&input); +} + +const B: usize = 1; +const M: usize = 1024; +const K: usize = 1024; + +fn run_layer_norm_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) { + let elements = B * M * K; + + let weight = Tensor::arange(0.0, elements as f32, device) + .unwrap() + .to_dtype(dtype) + .unwrap(); + let bias = weight.ones_like().unwrap(); + let input = weight.ones_like().unwrap(); + + let mut group = c.benchmark_group(device.bench_name(name)); + group.bench_function("iter", move |b| { + b.iter_custom(|iters| { + let start = Instant::now(); + for _i in 0..iters { + run(black_box(&input), black_box(&weight), black_box(&bias)); + } + device.sync().unwrap(); + start.elapsed() + }) + }); + group.finish(); +} + +fn criterion_benchmark(c: &mut Criterion) { + let device = BenchDeviceHandler::new().unwrap(); + for d in device.devices { + run_layer_norm_benchmark(c, &d, DType::F32, "layer_norm_f32"); + run_layer_norm_benchmark(c, &d, DType::BF16, "layer_norm_bf16"); + run_layer_norm_benchmark(c, &d, DType::F16, "layer_norm_f16"); + } +} + +criterion_group!(benches, criterion_benchmark); diff --git a/candle-nn/benches/benchmarks/mod.rs b/candle-nn/benches/benchmarks/mod.rs new file mode 100644 index 00000000..30a6ab6a --- /dev/null +++ b/candle-nn/benches/benchmarks/mod.rs @@ -0,0 +1,64 @@ +pub(crate) mod conv; +pub(crate) mod layer_norm; + +use candle::{Device, Result}; + +pub(crate) trait BenchDevice { + fn sync(&self) -> Result<()>; + + fn bench_name>(&self, name: S) -> String; +} + +impl BenchDevice for Device { + fn sync(&self) -> Result<()> { + match self { + Device::Cpu => Ok(()), + Device::Cuda(device) => { + #[cfg(feature = "cuda")] + return Ok(device.synchronize()?); + #[cfg(not(feature = "cuda"))] + panic!("Cuda device without cuda feature enabled: {:?}", device) + } + Device::Metal(device) => { + #[cfg(feature = "metal")] + return Ok(device.wait_until_completed()?); + #[cfg(not(feature = "metal"))] + panic!("Metal device without metal feature enabled: {:?}", device) + } + } + } + + fn bench_name>(&self, name: S) -> String { + match self { + Device::Cpu => { + let cpu_type = if cfg!(feature = "accelerate") { + "accelerate" + } else if cfg!(feature = "mkl") { + "mkl" + } else { + "cpu" + }; + format!("{}_{}", cpu_type, name.into()) + } + Device::Cuda(_) => format!("cuda_{}", name.into()), + Device::Metal(_) => format!("metal_{}", name.into()), + } + } +} + +struct BenchDeviceHandler { + devices: Vec, +} + +impl BenchDeviceHandler { + pub fn new() -> Result { + let mut devices = Vec::new(); + if cfg!(feature = "metal") { + devices.push(Device::new_metal(0)?); + } else if cfg!(feature = "cuda") { + devices.push(Device::new_cuda(0)?); + } + devices.push(Device::Cpu); + Ok(Self { devices }) + } +} From 26226068a4912891f1d374b036aa2d1f6e5af4d2 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Radam=C3=A9s=20Ajna?= Date: Tue, 2 Apr 2024 22:11:50 -0700 Subject: [PATCH 104/131] Moondream WASM (#1999) * moondream wasm wip * examples, more * fix eos token check * README * cleanip * cleanup, clippy --- candle-wasm-examples/moondream/Cargo.toml | 32 ++ candle-wasm-examples/moondream/README.md | 24 ++ candle-wasm-examples/moondream/build-lib.sh | 2 + candle-wasm-examples/moondream/code.js | 262 +++++++++++++++ candle-wasm-examples/moondream/index.html | 312 ++++++++++++++++++ .../moondream/moondreamWorker.js | 201 +++++++++++ candle-wasm-examples/moondream/src/bin/m.rs | 279 ++++++++++++++++ candle-wasm-examples/moondream/src/lib.rs | 16 + 8 files changed, 1128 insertions(+) create mode 100644 candle-wasm-examples/moondream/Cargo.toml create mode 100644 candle-wasm-examples/moondream/README.md create mode 100644 candle-wasm-examples/moondream/build-lib.sh create mode 100644 candle-wasm-examples/moondream/code.js create mode 100644 candle-wasm-examples/moondream/index.html create mode 100644 candle-wasm-examples/moondream/moondreamWorker.js create mode 100644 candle-wasm-examples/moondream/src/bin/m.rs create mode 100644 candle-wasm-examples/moondream/src/lib.rs diff --git a/candle-wasm-examples/moondream/Cargo.toml b/candle-wasm-examples/moondream/Cargo.toml new file mode 100644 index 00000000..fc1b82ca --- /dev/null +++ b/candle-wasm-examples/moondream/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "candle-wasm-example-moondream" +version.workspace = true +edition.workspace = true +description.workspace = true +repository.workspace = true +keywords.workspace = true +categories.workspace = true +license.workspace = true + +[dependencies] +candle = { workspace = true } +candle-nn = { workspace = true } +candle-transformers = { workspace = true } +tokenizers = { workspace = true, features = ["unstable_wasm"] } +num-traits = { workspace = true } + +# App crates. +anyhow = { workspace = true } +byteorder = { workspace = true } +getrandom = { version = "0.2", features = ["js"] } +image = { workspace = true } +log = { workspace = true } +safetensors = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } + +# Wasm specific crates. +console_error_panic_hook = "0.1.7" +wasm-bindgen = "0.2.87" +js-sys = "0.3.64" +serde-wasm-bindgen = "0.6.5" diff --git a/candle-wasm-examples/moondream/README.md b/candle-wasm-examples/moondream/README.md new file mode 100644 index 00000000..ca7f7ced --- /dev/null +++ b/candle-wasm-examples/moondream/README.md @@ -0,0 +1,24 @@ +## Running [Moondream 2](https://huggingface.co/vikhyatk/moondream2) Model Example + +### Vanilla JS and WebWorkers + +To build and test the UI made in Vanilla JS and WebWorkers, first we need to build the WASM library: + +```bash +sh build-lib.sh +``` + +This will bundle the library under `./build` and we can import it inside our WebWorker like a normal JS module: + +```js +import init, { Model } from "./build/m.js"; +``` + +The full example can be found under `./index.html`. All needed assets are fetched from the web, so no need to download anything. +Finally, you can preview the example by running a local HTTP server. For example: + +```bash +python -m http.server +``` + +Then open `http://localhost:8000/index.html` in your browser. 
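+The snippet below is a minimal sketch of the worker protocol used by this
+example. The message fields mirror the destructuring in `moondreamWorker.js`,
+the weight and tokenizer URLs are the ones used in `code.js`, and the image
+URL is only a placeholder:
+
+```js
+const worker = new Worker("./moondreamWorker.js", { type: "module" });
+worker.addEventListener("message", (e) => console.log(e.data));
+worker.postMessage({
+  command: "start",
+  weightsURL:
+    "https://huggingface.co/santiagomed/candle-moondream/resolve/main/model-q4_0.gguf",
+  modelID: "moondream2_q4k",
+  tokenizerURL:
+    "https://huggingface.co/santiagomed/candle-moondream/resolve/main/tokenizer.json",
+  quantized: true,
+  imageURL: "https://example.com/some-image.jpg", // placeholder
+  prompt: "What is in this picture?",
+  seed: 299792458,
+  temp: 0,
+  top_p: 1,
+  repeatPenalty: 1.1,
+  maxSeqLen: 200,
+  verbose_prompt: false,
+});
+```
+
+The worker answers with `loading`/`embedding` status messages, streams
+`generating` updates token by token, and finishes with a `complete` message.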
diff --git a/candle-wasm-examples/moondream/build-lib.sh b/candle-wasm-examples/moondream/build-lib.sh new file mode 100644 index 00000000..b0ebb182 --- /dev/null +++ b/candle-wasm-examples/moondream/build-lib.sh @@ -0,0 +1,2 @@ +cargo build --target wasm32-unknown-unknown --release +wasm-bindgen ../../target/wasm32-unknown-unknown/release/m.wasm --out-dir build --target web diff --git a/candle-wasm-examples/moondream/code.js b/candle-wasm-examples/moondream/code.js new file mode 100644 index 00000000..c766196d --- /dev/null +++ b/candle-wasm-examples/moondream/code.js @@ -0,0 +1,262 @@ +import snarkdown from "https://cdn.skypack.dev/snarkdown"; +import hljs from "https://cdn.skypack.dev/highlight.js"; +// models base url +const MODELS = { + moondream2_q4k: { + base_url: + "https://huggingface.co/santiagomed/candle-moondream/resolve/main/", + model: "model-q4_0.gguf", + tokenizer: "tokenizer.json", + quantized: true, + size: "1.51 GB", + }, +}; + +const moodreamWorker = new Worker("./moondreamWorker.js", { + type: "module", +}); + +async function generateSequence(controller) { + const getValue = (id) => document.querySelector(`#${id}`).value; + const modelID = getValue("model"); + const model = MODELS[modelID]; + const weightsURL = + model.model instanceof Array + ? model.model.map((m) => model.base_url + m) + : model.base_url + model.model; + const tokenizerURL = model.base_url + model.tokenizer; + + const prompt = getValue("prompt").trim(); + const temperature = getValue("temperature"); + const topP = getValue("top-p"); + const repeatPenalty = getValue("repeat_penalty"); + const seed = getValue("seed"); + const maxSeqLen = getValue("max-seq"); + + if (prompt?.value?.trim() === "") { + return; + } + + function updateStatus(data) { + const outStatus = document.querySelector("#output-status"); + const outGen = document.querySelector("#output-generation"); + const outCounter = document.querySelector("#output-counter"); + + switch (data.status) { + case "loading": + outStatus.hidden = false; + outStatus.textContent = data.message; + outGen.hidden = true; + outCounter.hidden = true; + break; + case "generating": + const { message, prompt, sentence, tokensSec, totalTime } = data; + outStatus.hidden = true; + outCounter.hidden = false; + outGen.hidden = false; + outGen.innerHTML = snarkdown(prompt + sentence); + outCounter.innerHTML = `${(totalTime / 1000).toFixed( + 2 + )}s (${tokensSec.toFixed(2)} tok/s)`; + hljs.highlightAll(); + break; + case "complete": + outStatus.hidden = true; + outGen.hidden = false; + break; + } + } + + return new Promise((resolve, reject) => { + moodreamWorker.postMessage({ + weightsURL, + modelID, + tokenizerURL, + quantized: model.quantized, + imageURL: currentImageURL, + prompt, + temp: temperature, + top_p: topP, + repeatPenalty, + seed: seed, + maxSeqLen, + verbose_prompt: false, + command: "start", + }); + + const handleAbort = () => { + moodreamWorker.postMessage({ command: "abort" }); + }; + const handleMessage = (event) => { + const { status, error, message, prompt, sentence } = event.data; + if (status) updateStatus(event.data); + if (error) { + moodreamWorker.removeEventListener("message", handleMessage); + reject(new Error(error)); + } + if (status === "aborted") { + moodreamWorker.removeEventListener("message", handleMessage); + resolve(event.data); + } + if (status === "complete") { + moodreamWorker.removeEventListener("message", handleMessage); + resolve(event.data); + } + }; + + controller.signal.addEventListener("abort", handleAbort); + 
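+ // settle the promise once the worker reports completion, an abort, or an error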
moodreamWorker.addEventListener("message", handleMessage); + }); +} + +const form = document.querySelector("#form"); +const prompt = document.querySelector("#prompt"); +const runBtn = document.querySelector("#run"); +const modelSelect = document.querySelector("#model"); +const dropArea = document.querySelector("#drop-area"); +const canvas = document.querySelector("#canvas"); +const ctxCanvas = canvas.getContext("2d"); +const fileUpload = document.querySelector("#file-upload"); +const clearImgBtn = document.querySelector("#clear-img-btn"); +const imagesExamples = document.querySelector("#image-select"); + +let currentImageURL = null; +let runController = new AbortController(); +let isRunning = false; + +document.addEventListener("DOMContentLoaded", () => { + for (const [id, model] of Object.entries(MODELS)) { + const option = document.createElement("option"); + option.value = id; + option.innerText = `${id} (${model.size})`; + modelSelect.appendChild(option); + } + const query = new URLSearchParams(window.location.search); + const modelID = query.get("model"); + if (modelID) { + modelSelect.value = modelID; + } else { + modelSelect.value = "moondream2_q4k"; + } +}); + +imagesExamples.addEventListener("click", (e) => { + // if (isEmbedding || isSegmenting) { + // return; + // } + const target = e.target; + if (target.nodeName === "IMG") { + const href = target.src; + clearImageCanvas(); + currentImageURL = href; + drawImageCanvas(href); + } +}); +modelSelect.addEventListener("change", (e) => { + const query = new URLSearchParams(window.location.search); + query.set("model", e.target.value); + window.history.replaceState({}, "", `${window.location.pathname}?${query}`); + window.parent.postMessage({ queryString: "?" + query }, "*"); + const model = MODELS[e.target.value]; + document.querySelector("#max-seq").max = model.seq_len; + document.querySelector("#max-seq").nextElementSibling.value = 200; +}); + +clearImgBtn.addEventListener("click", () => { + clearImageCanvas(); +}); + +//add event listener to file input +fileUpload.addEventListener("input", async (e) => { + const target = e.target; + if (target.files.length > 0 && !target.files[0].type.includes("svg")) { + const href = URL.createObjectURL(target.files[0]); + clearImageCanvas(); + await drawImageCanvas(href); + } +}); +// add event listener to drop-area +dropArea.addEventListener("dragenter", (e) => { + e.preventDefault(); + dropArea.classList.add("border-blue-700"); +}); +dropArea.addEventListener("dragleave", (e) => { + e.preventDefault(); + dropArea.classList.remove("border-blue-700"); +}); +dropArea.addEventListener("dragover", (e) => { + e.preventDefault(); +}); +dropArea.addEventListener("drop", async (e) => { + e.preventDefault(); + dropArea.classList.remove("border-blue-700"); + const url = e.dataTransfer.getData("text/uri-list"); + const files = e.dataTransfer.files; + if (files.length > 0) { + const href = URL.createObjectURL(files[0]); + clearImageCanvas(); + await drawImageCanvas(href); + } else if (url) { + clearImageCanvas(); + await drawImageCanvas(url); + } +}); + +form.addEventListener("submit", async (e) => { + e.preventDefault(); + if (isRunning) { + stopRunning(); + } else { + startRunning(); + await generateSequence(runController); + stopRunning(); + } +}); + +async function drawImageCanvas(imgURL) { + if (!imgURL) { + throw new Error("No image URL provided"); + } + return new Promise((resolve, reject) => { + ctxCanvas.clearRect(0, 0, canvas.width, canvas.height); + ctxCanvas.clearRect(0, 0, canvas.width, 
canvas.height); + const img = new Image(); + img.crossOrigin = "anonymous"; + img.onload = () => { + canvas.width = img.width; + canvas.height = img.height; + ctxCanvas.drawImage(img, 0, 0); + clearImgBtn.disabled = false; + resolve(img); + }; + img.src = imgURL; + currentImageURL = imgURL; + }); +} + +function clearImageCanvas() { + ctxCanvas.clearRect(0, 0, canvas.width, canvas.height); + clearImgBtn.disabled = true; + canvas.parentElement.style.height = "auto"; + currentImageURL = null; + canvas.width = 0; + canvas.height = 0; +} + +function startRunning() { + isRunning = true; + runBtn.textContent = "Stop"; + prompt.disabled = true; +} + +function stopRunning() { + runController.abort(); + runController = new AbortController(); + runBtn.textContent = "Run"; + isRunning = false; + prompt.disabled = false; +} + +prompt.addEventListener("input", (e) => { + runBtn.disabled = false; +}); diff --git a/candle-wasm-examples/moondream/index.html b/candle-wasm-examples/moondream/index.html new file mode 100644 index 00000000..26bd6a40 --- /dev/null +++ b/candle-wasm-examples/moondream/index.html @@ -0,0 +1,312 @@ + + + + Candle Moondream Rust/WASM + + + + + + + + + + + + + + + + + + + diff --git a/candle-wasm-examples/moondream/moondreamWorker.js b/candle-wasm-examples/moondream/moondreamWorker.js new file mode 100644 index 00000000..cf85053f --- /dev/null +++ b/candle-wasm-examples/moondream/moondreamWorker.js @@ -0,0 +1,201 @@ +import init, { Model } from "./build/m.js"; + +async function fetchArrayBuffer(url, cacheModel = true) { + if (!cacheModel) + return new Uint8Array(await (await fetch(url)).arrayBuffer()); + const cacheName = "moondream-candle-cache"; + const cache = await caches.open(cacheName); + const cachedResponse = await cache.match(url); + if (cachedResponse) { + const data = await cachedResponse.arrayBuffer(); + return new Uint8Array(data); + } + const res = await fetch(url, { cache: "force-cache" }); + cache.put(url, res.clone()); + return new Uint8Array(await res.arrayBuffer()); +} + +async function concatenateArrayBuffers(urls) { + const arrayBuffers = await Promise.all( + urls.map((url) => fetchArrayBuffer(url)) + ); + + let totalLength = arrayBuffers.reduce( + (acc, arrayBuffer) => acc + arrayBuffer.byteLength, + 0 + ); + let concatenatedBuffer = new Uint8Array(totalLength); + + let offset = 0; + arrayBuffers.forEach((buffer) => { + concatenatedBuffer.set(new Uint8Array(buffer), offset); + offset += buffer.byteLength; + }); + return concatenatedBuffer; +} + +class Moondream { + static imageArrayHash = {}; + static instance = {}; + static currentModelID = null; + + static async getInstance(weightsURL, modelID, tokenizerURL, quantized) { + // load individual modelID only once + if (!this.instance[modelID]) { + await init(); + + self.postMessage({ status: "loading", message: "Loading Model" }); + const [weightsArrayU8, tokenizerArrayU8] = await Promise.all([ + weightsURL instanceof Array + ? 
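+ // sharded checkpoints arrive as a list of URLs that are fetched and concatenated into one buffer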
concatenateArrayBuffers(weightsURL) + : fetchArrayBuffer(weightsURL), + fetchArrayBuffer(tokenizerURL), + ]); + + this.instance[modelID] = new Model( + weightsArrayU8, + tokenizerArrayU8, + quantized + ); + } + this.currentModelID = modelID; + return this.instance[modelID]; + } + + // Remove the modelID parameter from setImageEmbeddings + static setImageEmbeddings(imageArrayU8) { + // check if image embeddings are already set for this image and model + const imageArrayHash = this.getSimpleHash(imageArrayU8); + if ( + this.imageArrayHash[this.currentModelID] === imageArrayHash && + this.instance[this.currentModelID] + ) { + self.postMessage({ + status: "embedding", + message: "Embeddings Already Set", + }); + return; + } + this.imageArrayHash[this.currentModelID] = imageArrayHash; + this.instance[this.currentModelID].set_image_embeddings(imageArrayU8); + self.postMessage({ status: "embedding", message: "Embeddings Set" }); + } + + static getSimpleHash(imageArrayU8) { + // get simple hash of imageArrayU8 + let imageArrayHash = 0; + for (let i = 0; i < imageArrayU8.length; i += 100) { + imageArrayHash ^= imageArrayU8[i]; + } + return imageArrayHash.toString(16); + } +} + +let controller = null; +self.addEventListener("message", (event) => { + if (event.data.command === "start") { + controller = new AbortController(); + generate(event.data); + } else if (event.data.command === "abort") { + controller.abort(); + } +}); + +async function generate(data) { + const { + weightsURL, + modelID, + tokenizerURL, + quantized, + imageURL, + prompt, + seed, + temp, + top_p, + repeatPenalty, + maxSeqLen, + verbose_prompt, + } = data; + try { + self.postMessage({ status: "loading", message: "Starting Moondream" }); + const model = await Moondream.getInstance( + weightsURL, + modelID, + tokenizerURL, + quantized + ); + + self.postMessage({ status: "loading", message: "Initializing model" }); + + self.postMessage({ status: "loading", message: "Loading Image" }); + const imageArrayU8 = await fetchArrayBuffer(imageURL, false); + + self.postMessage({ status: "embedding", message: "Creating Embeddings" }); + Moondream.setImageEmbeddings(imageArrayU8); + self.postMessage({ + status: "complete-embedding", + message: "Embeddings Complete", + }); + const { token, token_id } = model.init_with_image_prompt({ + prompt, + seed: BigInt(seed), + temp: parseFloat(temp), + top_p: parseFloat(top_p), + repeat_penalty: parseFloat(repeatPenalty), + repeat_last_n: 64, + verbose_prompt, + }); + + const seq_len = 2048; + + let sentence = token; + let maxTokens = maxSeqLen ? 
maxSeqLen : seq_len - prompt.length - 1; + let startTime = performance.now(); + let tokensCount = 0; + while (tokensCount < maxTokens) { + await new Promise(async (resolve) => { + if (controller && controller.signal.aborted) { + console.log("Aborted"); + self.postMessage({ + status: "aborted", + message: "Aborted", + output: prompt + sentence, + }); + return; + } + const { token, token_id } = await model.next_token(); + if (token_id === 50256) { + // <|endoftext|> + self.postMessage({ + status: "complete", + message: "complete", + output: prompt + sentence, + }); + return; + } + const tokensSec = + ((tokensCount + 1) / (performance.now() - startTime)) * 1000; + + sentence += token; + self.postMessage({ + status: "generating", + message: "Generating token", + token: token, + sentence: sentence, + totalTime: performance.now() - startTime, + tokensSec, + prompt: prompt, + }); + setTimeout(resolve, 0); + }); + tokensCount++; + } + self.postMessage({ + status: "complete", + message: "complete", + output: prompt + sentence, + }); + } catch (e) { + self.postMessage({ error: e }); + } +} diff --git a/candle-wasm-examples/moondream/src/bin/m.rs b/candle-wasm-examples/moondream/src/bin/m.rs new file mode 100644 index 00000000..2af6c0d2 --- /dev/null +++ b/candle-wasm-examples/moondream/src/bin/m.rs @@ -0,0 +1,279 @@ +use candle::{DType, Device, Tensor}; +use candle_nn::VarBuilder; +use candle_transformers::{ + generation::LogitsProcessor, + models::{moondream, quantized_moondream}, +}; +use candle_wasm_example_moondream::console_log; +use js_sys::Date; +use serde::{Deserialize, Serialize}; +use tokenizers::Tokenizer; +use wasm_bindgen::prelude::*; + +enum SelectedModel { + Moondream(moondream::Model), + Quantized(quantized_moondream::Model), +} + +#[wasm_bindgen] +pub struct Model { + model: SelectedModel, + tokenizer: Tokenizer, + logits_processor: LogitsProcessor, + tokens: Vec, + repeat_penalty: f32, + repeat_last_n: usize, + index: usize, + bos_token: Option, + image_embeddings: Option, +} + +#[derive(Serialize, Deserialize)] +struct Output { + token: String, + token_id: u32, +} +#[derive(Serialize, Deserialize)] +struct InitInput { + prompt: String, + seed: u64, + temp: f64, + top_p: f64, + repeat_penalty: f32, + repeat_last_n: usize, + verbose_prompt: bool, +} + +#[wasm_bindgen] +impl Model { + #[wasm_bindgen(constructor)] + pub fn load(weights: Vec, tokenizer: Vec, quantized: bool) -> Result { + console_error_panic_hook::set_once(); + console_log!("loading model"); + let device = Device::Cpu; + let config = moondream::Config::v2(); + + console_log!("config loaded in {:?}", Date::now()); + let tokenizer = + Tokenizer::from_bytes(&tokenizer).map_err(|m| JsError::new(&m.to_string()))?; + let start = Date::now(); + console_log!("weights len: {:?}", weights.len()); + let model = if quantized { + let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf_buffer( + &weights, &device, + )?; + console_log!("weights loaded"); + let model = quantized_moondream::Model::new(&config, vb)?; + SelectedModel::Quantized(model) + } else { + let device = &Device::Cpu; + let vb = VarBuilder::from_buffered_safetensors(weights, DType::F32, device)?; + let model = moondream::Model::new(&config, vb)?; + SelectedModel::Moondream(model) + }; + console_log!("model loaded in {:?}s", (Date::now() - start) / 1000.); + let logits_processor = LogitsProcessor::new(299792458, None, None); + Ok(Self { + model, + tokenizer, + tokens: vec![], + logits_processor, + repeat_penalty: 1., + repeat_last_n: 64, + 
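+ // the BOS token and image embeddings are filled in later from the JS side, before generation starts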
bos_token: None, + image_embeddings: None, + index: 0, + }) + } + + pub fn set_image_embeddings(&mut self, image: Vec) -> Result<(), JsError> { + let device = Device::Cpu; + + console_log!("loading image as tensor"); + let start = Date::now(); + let image: Tensor = self.load_image(image)?.to_device(&device)?; + console_log!("image loaded in {:?}s", (Date::now() - start) / 1000.); + let start = Date::now(); + let image_embeds = &image.unsqueeze(0)?; + let image_embeds = match &self.model { + SelectedModel::Moondream(ref m) => image_embeds.apply(m.vision_encoder())?, + SelectedModel::Quantized(ref m) => image_embeds.apply(m.vision_encoder())?, + }; + console_log!( + "loaded and encoded the image {image:?} in {:?}", + (Date::now() - start) / 1000. + ); + self.image_embeddings = Some(image_embeds); + Ok(()) + } + + #[wasm_bindgen] + pub fn init_with_image_prompt(&mut self, input: JsValue) -> Result { + let InitInput { + prompt, + seed, + temp, + top_p, + repeat_penalty, + repeat_last_n, + verbose_prompt, + } = serde_wasm_bindgen::from_value(input).map_err(|m| JsError::new(&m.to_string()))?; + + let device = Device::Cpu; + let prompt = format!("\n\nQuestion: {0}\n\nAnswer:", prompt); + match &mut self.model { + SelectedModel::Moondream(m) => m.text_model.clear_kv_cache(), + SelectedModel::Quantized(m) => m.text_model.clear_kv_cache(), + }; + + let temp = if temp <= 0. { None } else { Some(temp) }; + let top_p = if top_p <= 0. || top_p >= 1. { + None + } else { + Some(top_p) + }; + self.logits_processor = LogitsProcessor::new(seed, temp, top_p); + self.repeat_penalty = repeat_penalty; + self.repeat_last_n = repeat_last_n; + self.tokens.clear(); + self.index = 0; + + // Moondream tokenizer bos_token is "<|endoftext|>" + // https://huggingface.co/vikhyatk/moondream2/blob/main/special_tokens_map.json + let special_token = match self.tokenizer.get_vocab(true).get("<|endoftext|>") { + Some(token) => *token, + None => return Err(JsError::new("BOS token not found in the tokenizer.")), + }; + + self.bos_token = Some(Tensor::new(&[special_token], &device)?.unsqueeze(0)?); + + let tokens = self + .tokenizer + .encode(prompt, true) + .map_err(|m| JsError::new(&m.to_string()))?; + + if tokens.is_empty() { + return Err(JsError::new( + "Empty prompts are not supported in the Moondream model.", + )); + } + + if verbose_prompt { + for (token, id) in tokens.get_tokens().iter().zip(tokens.get_ids().iter()) { + let token = token.replace('▁', " ").replace("<0x0A>", "\n"); + println!("{id:7} -> '{token}'"); + } + } + let tokens = tokens.get_ids().to_vec(); + let text = match self.process(&tokens) { + Ok(text) => text, + Err(_e) => { + console_log!("error decoding token"); + Output { + token: "".to_string(), + token_id: 0, + } + } + }; + Ok(serde_wasm_bindgen::to_value(&text)?) + } + #[wasm_bindgen] + pub fn next_token(&mut self) -> Result { + let last_token = *self.tokens.last().unwrap(); + let text = match self.process(&[last_token]) { + Ok(text) => text, + Err(_e) => { + console_log!("error decoding token"); + Output { + token: "".to_string(), + token_id: 0, + } + } + }; + Ok(serde_wasm_bindgen::to_value(&text)?) + } +} +impl Model { + fn load_image(&self, image: Vec) -> Result { + let img = image::io::Reader::new(std::io::Cursor::new(image)) + .with_guessed_format()? + .decode() + .map_err(|e| JsError::new(&e.to_string()))? 
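+ // crop-and-resize so the decoded frame exactly fills the encoder's square input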
+            .resize_to_fill(378, 378, image::imageops::FilterType::Triangle); // Adjusted to 378x378
+        let img = img.to_rgb8();
+        let data = img.into_raw();
+        let data = Tensor::from_vec(data, (378, 378, 3), &Device::Cpu)?.permute((2, 0, 1))?;
+        let mean = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?;
+        let std = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?;
+        (data.to_dtype(candle::DType::F32)? / 255.)?
+            .broadcast_sub(&mean)?
+            .broadcast_div(&std)
+            .map_err(|e| JsError::new(&e.to_string()))
+    }
+}
+
+impl Model {
+    fn process(&mut self, tokens: &[u32]) -> Result<Output, JsError> {
+        let image_embeddings = match &self.image_embeddings {
+            Some(embeddings) => embeddings,
+            None => return Err(JsError::new("Image embeddings are not set.")),
+        };
+        let bos_token = match &self.bos_token {
+            Some(token) => token,
+            None => return Err(JsError::new("BOS token is not set.")),
+        };
+        let device = Device::Cpu;
+        let context_size = if self.index > 0 { 1 } else { tokens.len() };
+        let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
+        let input = Tensor::new(ctxt, &device)?.unsqueeze(0)?;
+        let logits = if self.index > 0 {
+            match self.model {
+                SelectedModel::Moondream(ref mut model) => model.text_model.forward(&input)?,
+                SelectedModel::Quantized(ref mut model) => model.text_model.forward(&input)?,
+            }
+        } else {
+            match self.model {
+                SelectedModel::Moondream(ref mut model) => {
+                    model
+                        .text_model
+                        .forward_with_img(bos_token, &input, image_embeddings)?
+                }
+                SelectedModel::Quantized(ref mut model) => {
+                    model
+                        .text_model
+                        .forward_with_img(bos_token, &input, image_embeddings)?
+                }
+            }
+        };
+
+        let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
+        let logits = if self.repeat_penalty == 1. {
+            logits
+        } else {
+            let start_at = tokens.len().saturating_sub(self.repeat_last_n);
+            candle_transformers::utils::apply_repeat_penalty(
+                &logits,
+                self.repeat_penalty,
+                &tokens[start_at..],
+            )?
+        };
+        let next_token = self.logits_processor.sample(&logits)?;
+        self.tokens.push(next_token);
+        let token = match self.tokenizer.decode(&[next_token], true) {
+            Ok(token) => token,
+            Err(e) => {
+                console_log!("error decoding token: {:?}", e);
+                "".to_string()
+            }
+        };
+        self.index += 1;
+        Ok(Output {
+            token,
+            token_id: next_token,
+        })
+    }
+}
+
+fn main() {
+    console_error_panic_hook::set_once();
+}
diff --git a/candle-wasm-examples/moondream/src/lib.rs b/candle-wasm-examples/moondream/src/lib.rs
new file mode 100644
index 00000000..cb15633c
--- /dev/null
+++ b/candle-wasm-examples/moondream/src/lib.rs
@@ -0,0 +1,16 @@
+use wasm_bindgen::prelude::*;
+
+#[wasm_bindgen]
+extern "C" {
+    // Use `js_namespace` here to bind `console.log(..)` instead of just
+    // `log(..)`
+    #[wasm_bindgen(js_namespace = console)]
+    pub fn log(s: &str);
+}
+
+#[macro_export]
+macro_rules! console_log {
+    // Note that this is using the `log` function imported above during
+    // `bare_bones`
+    ($($t:tt)*) => ($crate::log(&format_args!($($t)*).to_string()))
+}

From 2be1a357102d8f64feb694720e5528d4974ca141 Mon Sep 17 00:00:00 2001
From: Vishal Patil
Date: Wed, 3 Apr 2024 01:16:32 -0400
Subject: [PATCH 105/131] Added link to the Coursera ML algorithm
 implementations (#1989)

* Added link to the coursera ML algo implementations

* Fixed link

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 0fdcedca..b9e603b2 100644
--- a/README.md
+++ b/README.md
@@ -176,6 +176,7 @@ And then head over to
 - [`candle-vllm`](https://github.com/EricLBuehler/candle-vllm): Efficient platform for inference and
   serving local LLMs including an OpenAI compatible API server.
 - [`candle-ext`](https://github.com/mokeyish/candle-ext): An extension library to Candle that provides PyTorch functions not currently available in Candle.
+- [`candle-coursera-ml`](https://github.com/vishpat/candle-coursera-ml): Implementation of ML algorithms from Coursera's [Machine Learning Specialization](https://www.coursera.org/specializations/machine-learning-introduction) course.
 - [`kalosm`](https://github.com/floneum/floneum/tree/master/interfaces/kalosm): A multi-modal meta-framework in Rust for interfacing with local pre-trained models with support for controlled generation, custom samplers, in-memory vector databases, audio transcription, and more.
 - [`candle-sampling`](https://github.com/EricLBuehler/candle-sampling): Sampling techniques for Candle.
 - [`gpt-from-scratch-rs`](https://github.com/jeroenvlek/gpt-from-scratch-rs): A port of Andrej Karpathy's _Let's build GPT_ tutorial on YouTube showcasing the Candle API on a toy problem.

From 318d143224805e490d396874b9e1aaf28991393c Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Wed, 3 Apr 2024 09:02:38 +0200
Subject: [PATCH 106/131] Relax the contiguous check for cuda kernels. (#2000)

* Relax the contiguous check for cuda kernels.

* Ensure contiguity for RNNs.

* Unrelated fix for segment anything.

* Better error message + allow concatenating empty slices.

---
 candle-core/src/cuda_backend/mod.rs               | 7 ++++++-
 candle-kernels/src/cuda_utils.cuh                 | 2 +-
 candle-nn/src/rnn.rs                              | 2 +-
 .../src/models/segment_anything/prompt_encoder.rs | 3 ++-
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/candle-core/src/cuda_backend/mod.rs b/candle-core/src/cuda_backend/mod.rs
index 3690e0dc..6a9e73f8 100644
--- a/candle-core/src/cuda_backend/mod.rs
+++ b/candle-core/src/cuda_backend/mod.rs
@@ -99,7 +99,7 @@ pub trait WrapErr<O> {
 
 impl<O, E: Into<CudaError>> WrapErr<O> for std::result::Result<O, E> {
     fn w(self) -> std::result::Result<O, crate::Error> {
-        self.map_err(|e| crate::Error::Cuda(Box::new(e.into())))
+        self.map_err(|e| crate::Error::Cuda(Box::new(e.into())).bt())
     }
 }
 
@@ -1761,6 +1761,11 @@ impl BackendStorage for CudaStorage {
         let dev = &self.device;
         let d1 = d1 as u32;
         let d2 = d2 as u32;
+        // Nothing to copy so we exit early to avoid launching a kernel and some potential invalid
+        // argument with a null pointer.
+        if d1 == 0 || d2 == 0 {
+            return Ok(());
+        }
         let dst_s = dst_s as u32;
         let src_s = src_s as u32;
         let (src, dst, kname) = match (&self.slice, &mut dst.slice) {
diff --git a/candle-kernels/src/cuda_utils.cuh b/candle-kernels/src/cuda_utils.cuh
index b0a85249..2673b8aa 100644
--- a/candle-kernels/src/cuda_utils.cuh
+++ b/candle-kernels/src/cuda_utils.cuh
@@ -14,7 +14,7 @@ __device__ bool is_contiguous(
     size_t acc = 1;
     for (unsigned int d = 0; d < num_dims; d++) {
         unsigned int dim_idx = num_dims - 1 - d;
-        if (acc != strides[dim_idx]) {
+        if (dims[dim_idx] > 1 && acc != strides[dim_idx]) {
             return false;
         }
         acc *= dims[dim_idx];
diff --git a/candle-nn/src/rnn.rs b/candle-nn/src/rnn.rs
index 07795eda..dbfa639b 100644
--- a/candle-nn/src/rnn.rs
+++ b/candle-nn/src/rnn.rs
@@ -31,7 +31,7 @@ pub trait RNN {
         let (_b_size, seq_len, _features) = input.dims3()?;
         let mut output = Vec::with_capacity(seq_len);
         for seq_index in 0..seq_len {
-            let input = input.i((.., seq_index, ..))?;
+            let input = input.i((.., seq_index, ..))?.contiguous()?;
             let state = if seq_index == 0 {
                 self.step(&input, init_state)?
             } else {
diff --git a/candle-transformers/src/models/segment_anything/prompt_encoder.rs b/candle-transformers/src/models/segment_anything/prompt_encoder.rs
index 16e8a4e8..258fb5aa 100644
--- a/candle-transformers/src/models/segment_anything/prompt_encoder.rs
+++ b/candle-transformers/src/models/segment_anything/prompt_encoder.rs
@@ -218,7 +218,8 @@ impl PromptEncoder {
             (Some(se_points), None) => se_points,
             (None, Some(se_boxes)) => se_boxes,
             (None, None) => {
-                Tensor::zeros((1, 0, self.embed_dim), DType::F32, &candle::Device::Cpu)?
+                let dev = self.no_mask_embed.embeddings().device();
+                Tensor::zeros((1, 0, self.embed_dim), DType::F32, dev)?
             }
         };

From bd8db2a7712e14ea76a80475905db04bbf402aa6 Mon Sep 17 00:00:00 2001
From: Thomas Santerre
Date: Thu, 4 Apr 2024 02:13:12 -0400
Subject: [PATCH 107/131] refactor to reduce the amount of code wrapped in
 template syntax (#2002)

---
 candle-metal-kernels/src/reduce.metal | 629 +++++++++++++++-----------
 1 file changed, 368 insertions(+), 261 deletions(-)

diff --git a/candle-metal-kernels/src/reduce.metal b/candle-metal-kernels/src/reduce.metal
index be5a0921..d06efbf2 100644
--- a/candle-metal-kernels/src/reduce.metal
+++ b/candle-metal-kernels/src/reduce.metal
@@ -21,6 +21,59 @@ METAL_FUNC uint get_strided_index(
 
 constant int THREADGROUP_SIZE = 2048;
 
+template <typename T>
+METAL_FUNC void argmin(
+    constant size_t &num_dims,
+    constant size_t *dims,
+    constant size_t *strides,
+    constant size_t &el_to_sum_per_block,
+    device const T *src,
+    device uint *dst,
+    uint id,
+    uint tid,
+    uint dst_id,
+    uint block_dim,
+    threadgroup T *shared_memory,
+    threadgroup uint *shared_indices
+) {
+    bool notset = true;
+    /*
+    // Elements summed in this block range from dst_id * el_to_sum_per_block
+    // to (dst_id + 1) * el_to_sum_per_block.
+    */
+    size_t start_idx = dst_id * el_to_sum_per_block;
+    size_t stop_idx = start_idx + el_to_sum_per_block;
+    size_t idx = start_idx + tid;
+    while (idx < stop_idx) {
+        /*
+        // TODO: Fast version for the contiguous case.
+        */
+        size_t strided_i = get_strided_index(idx, num_dims, dims, strides);
+        if (notset || src[strided_i] < shared_memory[tid]) {
+            shared_memory[tid] = src[strided_i];
+            /* Assume that the reduction takes place over the last dimension which is contiguous. */
+            shared_indices[tid] = idx % dims[num_dims - 1];
+            notset = false;
+        }
+        idx += block_dim;
+    }
+
+    threadgroup_barrier(mem_flags::mem_none);
+    /*
+    // reduction in shared memory
+    */
+    for (uint s = block_dim / 2; s > 0; s >>= 1) {
+        if (tid < s && shared_memory[tid + s] < shared_memory[tid]) {
+            shared_indices[tid] = shared_indices[tid + s];
+            shared_memory[tid] = shared_memory[tid + s];
+        } \
+        threadgroup_barrier(mem_flags::mem_none);
+    }
+
+    if (tid == 0){
+        dst[dst_id] = shared_indices[0];
+    }
+}
 
 #define ARGMIN(NAME, T, MAXVALUE) \
 kernel void NAME( \
@@ -35,53 +88,71 @@ kernel void NAME( \
     uint dst_id [[ threadgroup_position_in_grid ]], \
     uint block_dim [[ threads_per_threadgroup ]] \
 ) { \
-   \
-   threadgroup T shared_memory[THREADGROUP_SIZE]; \
-   threadgroup uint shared_indices[THREADGROUP_SIZE]; \
-   \
-   shared_memory[tid] = MAXVALUE; \
-   shared_indices[tid] = 0xFFFFFFFF; \
-   bool notset = true; \
-   /* \
-   // Elements summed in this block range from dst_id * el_to_sum_per_block \
-   // to (dst_id + 1) * el_to_sum_per_block. \
-   */ \
-   size_t start_idx = dst_id * el_to_sum_per_block; \
-   size_t stop_idx = start_idx + el_to_sum_per_block; \
-   size_t idx = start_idx + tid; \
-   while (idx < stop_idx) { \
-       /* \
-       // TODO: Fast version for the contiguous case. \
-       */ \
-       size_t strided_i = get_strided_index(idx, num_dims, dims, strides); \
-       if (notset || src[strided_i] < shared_memory[tid]) { \
-           shared_memory[tid] = src[strided_i]; \
-           /* Assume that the reduction takes place over the last dimension which is contiguous. */ \
-           shared_indices[tid] = idx % dims[num_dims - 1]; \
-           notset = false; \
-       } \
-       idx += block_dim; \
-   } \
-   \
-   threadgroup_barrier(mem_flags::mem_none); \
-   \
-   /* \
-   // reduction in shared memory \
-   */ \
-   for (uint s = block_dim / 2; s > 0; s >>= 1) { \
-       if (tid < s && shared_memory[tid + s] < shared_memory[tid]) { \
-           shared_indices[tid] = shared_indices[tid + s]; \
-           shared_memory[tid] = shared_memory[tid + s]; \
-       } \
-       threadgroup_barrier(mem_flags::mem_none); \
-   } \
-   \
-   if (tid == 0){ \
-       dst[dst_id] = shared_indices[0]; \
-   } \
+   threadgroup T shared_memory[THREADGROUP_SIZE]; \
+   threadgroup uint shared_indices[THREADGROUP_SIZE]; \
+   shared_memory[tid] = MAXVALUE; \
+   shared_indices[tid] = 0xFFFFFFFF; \
+   argmin(num_dims, dims, strides, el_to_sum_per_block, src, dst, id, tid, dst_id, block_dim, shared_memory, shared_indices); \
 } \
 
+template <typename T>
+METAL_FUNC void argmax(
+    constant size_t & num_dims,
+    constant size_t * dims,
+    constant size_t * strides,
+    constant size_t & el_to_sum_per_block,
+    device const T * src,
+    device uint * dst,
+    uint id,
+    uint tid,
+    uint dst_id,
+    uint block_dim,
+    threadgroup T * shared_memory,
+    threadgroup uint * shared_indices
+  ) {
+    /*
+    // Elements summed in this block range from dst_id * el_to_sum_per_block
+    // to (dst_id + 1) * el_to_sum_per_block.
+    */
+    size_t start_idx = dst_id * el_to_sum_per_block;
+    size_t stop_idx = start_idx + el_to_sum_per_block;
+    size_t idx = start_idx + tid;
+    bool notset = true;
+    while (idx < stop_idx) {
+        /*
+        // TODO: Fast version for the contiguous case.
+        */
+        size_t strided_i = get_strided_index(idx, num_dims, dims, strides);
+        if (notset || shared_memory[tid] < src[strided_i]) {
+            shared_memory[tid] = src[strided_i];
+            shared_indices[tid] = idx % dims[num_dims - 1];
+            notset = false;
+        }
+        idx += block_dim;
+    }
+
+    threadgroup_barrier(mem_flags::mem_none);
+
+    /*
+    // reduction in shared memory
+    */
+    for (uint s = block_dim / 2; s > 0; s >>= 1) {
+        if (tid < s && shared_memory[tid + s] > shared_memory[tid]) {
+            shared_indices[tid] = shared_indices[tid + s];
+            shared_memory[tid] = shared_memory[tid + s];
+        }
+        threadgroup_barrier(mem_flags::mem_none);
+    }
+
+    /*
+    // Thread 0 writes the result of the reduction
+    */
+    if (tid == 0) {
+        dst[dst_id] = shared_indices[0];
+    }
+  }
+
 #define ARGMAX(NAME, T, MINVALUE) \
 kernel void NAME( \
     constant size_t &num_dims, \
     constant size_t *dims, \
     constant size_t *strides, \
     constant size_t &el_to_sum_per_block, \
     device const T *src, \
     device uint *dst, \
     uint id [[ thread_position_in_grid ]], \
     uint tid [[ thread_index_in_threadgroup ]], \
     uint dst_id [[ threadgroup_position_in_grid ]], \
     uint block_dim [[ threads_per_threadgroup ]] \
 ) { \
-   \
    threadgroup T shared_memory[THREADGROUP_SIZE]; \
    threadgroup uint shared_indices[THREADGROUP_SIZE]; \
-   \
    shared_memory[tid] = MINVALUE; \
    shared_indices[tid] = 0xFFFFFFFF; \
-   /* \
-   // Elements summed in this block range from dst_id * el_to_sum_per_block \
-   // to (dst_id + 1) * el_to_sum_per_block. \
-   */ \
-   size_t start_idx = dst_id * el_to_sum_per_block; \
-   size_t stop_idx = start_idx + el_to_sum_per_block; \
-   size_t idx = start_idx + tid; \
-   bool notset = true; \
-   while (idx < stop_idx) { \
-       /* \
-       // TODO: Fast version for the contiguous case. \
-       */ \
-       size_t strided_i = get_strided_index(idx, num_dims, dims, strides); \
-       if (notset || shared_memory[tid] < src[strided_i]) { \
-           shared_memory[tid] = src[strided_i]; \
-           shared_indices[tid] = idx % dims[num_dims - 1]; \
-           notset = false; \
-       } \
-       idx += block_dim; \
-   } \
-   \
-   threadgroup_barrier(mem_flags::mem_none); \
-   \
-   /* \
-   // reduction in shared memory \
-   */ \
-   for (uint s = block_dim / 2; s > 0; s >>= 1) { \
-       if (tid < s && shared_memory[tid + s] > shared_memory[tid]) { \
-           shared_indices[tid] = shared_indices[tid + s]; \
-           shared_memory[tid] = shared_memory[tid + s]; \
-       } \
-       threadgroup_barrier(mem_flags::mem_none); \
-   } \
-   \
-   if (tid == 0){ \
-       dst[dst_id] = shared_indices[0]; \
-   } \
+   argmax(num_dims, dims, strides, el_to_sum_per_block, src, dst, id, tid, dst_id, block_dim, shared_memory, shared_indices); \
 } \
 
+template <typename T>
+METAL_FUNC void reduce(
+    constant size_t & num_dims,
+    constant size_t * dims,
+    constant size_t * strides,
+    constant size_t & el_to_sum_per_block,
+    device const T * src,
+    device T * dst,
+    uint id,
+    uint tid,
+    uint dst_id,
+    uint block_dim,
+    threadgroup T * shared_memory,
+    T (*fn)(T, T)
+) {
+    /*
+    // Elements summed in this block range from dst_id * el_to_sum_per_block
+    // to (dst_id + 1) * el_to_sum_per_block.
+    */
+    size_t start_idx = dst_id * el_to_sum_per_block;
+    size_t stop_idx = start_idx + el_to_sum_per_block;
+    size_t idx = start_idx + tid;
+    while (idx < stop_idx) {
+        /*
+        // TODO: Fast version for the contiguous case.
+        */
+        size_t strided_i = get_strided_index(idx, num_dims, dims, strides);
+        T x = shared_memory[tid];
+        T y = src[strided_i];
+        shared_memory[tid] = fn(x, y);
+        idx += block_dim;
+    }
+
+    threadgroup_barrier(mem_flags::mem_none);
+
+    /*
+    // reduction in shared memory
+    */
+    for (uint s = block_dim / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            T x = shared_memory[tid];
+            T y = shared_memory[tid + s];
+            shared_memory[tid] = fn(x, y);
+        }
+        threadgroup_barrier(mem_flags::mem_none);
+    }
+
+    if (tid == 0) {
+        dst[dst_id] = shared_memory[0];
+    }
+}
+
 #define REDUCE(FN, NAME, T, START) \
+METAL_FUNC T NAME##_##op(T x, T y) { return FN; } \
 kernel void NAME( \
     constant size_t &num_dims, \
     constant size_t *dims, \
     constant size_t *strides, \
     constant size_t &el_to_sum_per_block, \
-    device const T *src, \
+    device const T *src,  \
     device T *dst, \
     uint id [[ thread_position_in_grid ]], \
     uint tid [[ thread_index_in_threadgroup ]], \
     uint dst_id [[ threadgroup_position_in_grid ]], \
     uint block_dim [[ threads_per_threadgroup ]] \
 ) { \
-   \
-    threadgroup T shared_memory[THREADGROUP_SIZE]; \
-   \
-    shared_memory[tid] = START; \
-   /* \
-   // Elements summed in this block range from dst_id * el_to_sum_per_block \
-   // to (dst_id + 1) * el_to_sum_per_block. \
-   */ \
-   size_t start_idx = dst_id * el_to_sum_per_block; \
-   size_t stop_idx = start_idx + el_to_sum_per_block; \
-   size_t idx = start_idx + tid; \
-   while (idx < stop_idx) { \
-       /* \
-       // TODO: Fast version for the contiguous case. \
-       */ \
-       size_t strided_i = get_strided_index(idx, num_dims, dims, strides); \
-       T x = shared_memory[tid]; \
-       T y = src[strided_i]; \
-       shared_memory[tid] = FN; \
-       idx += block_dim; \
-   } \
-   \
-   threadgroup_barrier(mem_flags::mem_none); \
-   \
-   /* \
-   // reduction in shared memory \
-   */ \
-   for (uint s = block_dim / 2; s > 0; s >>= 1) { \
-       if (tid < s) { \
-           T x = shared_memory[tid]; \
-           T y = shared_memory[tid + s]; \
-           shared_memory[tid] = FN; \
-       } \
-       threadgroup_barrier(mem_flags::mem_none); \
-   } \
-   \
-   dst[dst_id] = shared_memory[0]; \
+    threadgroup T shared_memory[THREADGROUP_SIZE]; \
+    shared_memory[tid] = START; \
+    reduce(num_dims, dims, strides, el_to_sum_per_block, src, dst, id, tid, dst_id, block_dim, shared_memory, NAME##_##op); \
 } \
 
+template <typename T>
+METAL_FUNC void softmax(
+    constant size_t & src_numel,
+    constant size_t & el_to_sum_per_block,
+    device const T * src,
+    device T * dst,
+    uint id,
+    uint tid,
+    uint dst_id,
+    uint block_dim,
+    threadgroup float * shared_memory
+) {
+    size_t start_idx = dst_id * el_to_sum_per_block;
+    size_t stop_idx = min(start_idx + el_to_sum_per_block, src_numel);
+    size_t idx = start_idx + tid;
 
-#define SOFTMAX(NAME, T) \
-kernel void NAME( \
-    constant size_t &src_numel, \
-    constant size_t &el_to_sum_per_block, \
-    device const T *src, \
-    device T *dst, \
-    \
-    uint id [[ thread_position_in_grid ]], \
-    uint tid [[ thread_index_in_threadgroup ]], \
-    uint dst_id [[ threadgroup_position_in_grid ]], \
-    uint block_dim [[ threads_per_threadgroup ]] \
-) { \
-    threadgroup float shared_memory[THREADGROUP_SIZE]; \
-    shared_memory[tid] = -INFINITY; \
-    size_t start_idx = dst_id * el_to_sum_per_block; \
-    size_t stop_idx = min(start_idx + el_to_sum_per_block, src_numel); \
-    size_t idx = start_idx + tid; \
-    \
-    \
-    float tmp = -INFINITY; \
-    while (idx < stop_idx) { \
-        tmp = MAX(tmp, float(src[idx])); \
-        idx += block_dim; \
-    } \
-    shared_memory[tid] = tmp; \
-    \
-    threadgroup_barrier(mem_flags::mem_threadgroup); \
-    \
-    for (uint s = block_dim / 2; s > 0; s >>= 1) { \
-        if 
(tid < s) { \ - shared_memory[tid] = MAX(shared_memory[tid], shared_memory[tid + s]); \ - } \ - threadgroup_barrier(mem_flags::mem_threadgroup); \ - } \ - \ - /* wait for shared_memory[0] to be filled */ \ - threadgroup_barrier(mem_flags::mem_threadgroup); \ - \ - float _max = shared_memory[0]; \ - \ - /* prevent tid=0 from overwriting _max before other threads have written it */ \ - threadgroup_barrier(mem_flags::mem_threadgroup); \ - shared_memory[tid] = 0; \ - \ - idx = start_idx + tid; \ - while (idx < stop_idx) { \ - const float val = exp(float(src[idx]) - _max); \ - dst[idx] = T(val); \ - shared_memory[tid] += val; \ - idx += block_dim; \ - } \ - threadgroup_barrier(mem_flags::mem_threadgroup); \ - for (uint s = block_dim / 2; s > 0; s >>= 1) { \ - if (tid < s) { \ - shared_memory[tid] += shared_memory[tid + s]; \ - } \ - threadgroup_barrier(mem_flags::mem_threadgroup); \ - } \ - \ - const T inv_acc = T(1.0/shared_memory[0]); \ - idx = start_idx + tid; \ - while (idx < stop_idx) { \ - dst[idx] *= inv_acc; \ - idx += block_dim; \ - } \ -} \ + float tmp = -INFINITY; + while (idx < stop_idx) { + tmp = MAX(tmp, float(src[idx])); + idx += block_dim; + } + shared_memory[tid] = tmp; -#define RMSNORM(NAME, T) \ -kernel void NAME( \ - constant size_t &src_numel, \ - constant size_t &el_to_sum_per_block, \ - device const T *src, \ - device T *dst, \ - device const T *alpha, \ - constant float &eps, \ - \ - uint id [[ thread_position_in_grid ]], \ - uint tid [[ thread_index_in_threadgroup ]], \ - uint dst_id [[ threadgroup_position_in_grid ]], \ - uint block_dim [[ threads_per_threadgroup ]] \ -) { \ - threadgroup float shared_memory[THREADGROUP_SIZE]; \ - shared_memory[tid] = 0; \ - size_t start_idx = dst_id * el_to_sum_per_block; \ - size_t stop_idx = min(start_idx + el_to_sum_per_block, src_numel); \ - size_t idx = start_idx + tid; \ - \ - \ - float tmp = 0; \ - while (idx < stop_idx) { \ - tmp = tmp + float(src[idx]) * float(src[idx]); \ - idx += block_dim; \ - } \ - shared_memory[tid] = tmp; \ - \ - threadgroup_barrier(mem_flags::mem_threadgroup); \ - \ - for (uint s = block_dim / 2; s > 0; s >>= 1) { \ - if (tid < s) { \ - shared_memory[tid] = shared_memory[tid] + shared_memory[tid + s]; \ - } \ - threadgroup_barrier(mem_flags::mem_threadgroup); \ - } \ - \ - /* wait for shared_memory[0] to be filled */ \ - threadgroup_barrier(mem_flags::mem_threadgroup); \ - \ - float norm = sqrt(shared_memory[0] / float(el_to_sum_per_block) + eps); \ - float inv_norm = 1.0f / norm; \ - idx = start_idx + tid; \ - while (idx < stop_idx) { \ - float val = float(src[idx]) * inv_norm; \ - if (alpha != nullptr) { \ - val *= float(alpha[idx - start_idx]); \ - } \ - dst[idx] = T(val); \ - idx += block_dim; \ - } \ -} \ + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint s = block_dim / 2; s > 0; s >>= 1) { + if (tid < s) { + shared_memory[tid] = MAX(shared_memory[tid], shared_memory[tid + s]);\ + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + /* wait for shared_memory[0] to be filled */ + \ + threadgroup_barrier(mem_flags::mem_threadgroup); + + float _max = shared_memory[0]; + + /* prevent tid=0 from overwriting _max before other threads have written it */ + threadgroup_barrier(mem_flags::mem_threadgroup); + shared_memory[tid] = 0; + + idx = start_idx + tid; + while (idx < stop_idx) { + const float val = exp(float(src[idx]) - _max); + dst[idx] = T(val); + shared_memory[tid] += val; + idx += block_dim; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint s = block_dim / 
2; s > 0; s >>= 1) {
+        if (tid < s) {
+            shared_memory[tid] += shared_memory[tid + s];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    const T inv_acc = T(1.0 / shared_memory[0]);
+    idx = start_idx + tid;
+    while (idx < stop_idx) {
+        dst[idx] *= inv_acc;
+        idx += block_dim;
+    }
+}
+
+#define SOFTMAX(NAME, T) \
+kernel void NAME( \
+    constant size_t &src_numel, \
+    constant size_t &el_to_sum_per_block, \
+    device const T *src, \
+    device T *dst, \
+    uint id [[ thread_position_in_grid ]], \
+    uint tid [[ thread_index_in_threadgroup ]], \
+    uint dst_id [[ threadgroup_position_in_grid ]], \
+    uint block_dim [[ threads_per_threadgroup ]] \
+) { \
+    threadgroup float shared_memory[THREADGROUP_SIZE]; \
+    shared_memory[tid] = -INFINITY; \
+    softmax(src_numel, el_to_sum_per_block, src, dst, id, tid, dst_id, block_dim, shared_memory); \
+} \
+
+template <typename T>
+METAL_FUNC void rmsnorm(
+    constant size_t & src_numel,
+    constant size_t & el_to_sum_per_block,
+    device const T * src,
+    device T * dst,
+    device const T * alpha,
+    constant float & eps,
+    uint id,
+    uint tid,
+    uint dst_id,
+    uint block_dim,
+    threadgroup float * shared_memory
+) {
+    size_t start_idx = dst_id * el_to_sum_per_block;
+    size_t stop_idx = min(start_idx + el_to_sum_per_block, src_numel);
+    size_t idx = start_idx + tid;
+
+    float tmp = 0;
+    while (idx < stop_idx) {
+        tmp = tmp + float(src[idx]) * float(src[idx]);
+        idx += block_dim;
+    }
+    shared_memory[tid] = tmp;
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    for (uint s = block_dim / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            shared_memory[tid] = shared_memory[tid] + shared_memory[tid + s];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    /* wait for shared_memory[0] to be filled */
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    float norm = sqrt(shared_memory[0] / float(el_to_sum_per_block) + eps);
+    float inv_norm = 1.0f / norm;
+    idx = start_idx + tid;
+    while (idx < stop_idx) {
+        float val = float(src[idx]) * inv_norm;
+        if (alpha != nullptr) {
+            val *= float(alpha[idx - start_idx]);
+        }
+        dst[idx] = T(val);
+        idx += block_dim;
+    }
+}
+
+#define RMSNORM(NAME, T) \
+kernel void NAME( \
+    constant size_t &src_numel, \
+    constant size_t &el_to_sum_per_block, \
+    device const T *src, \
+    device T *dst, \
+    device const T *alpha, \
+    constant float &eps, \
+    uint id [[ thread_position_in_grid ]], \
+    uint tid [[ thread_index_in_threadgroup ]], \
+    uint dst_id [[ threadgroup_position_in_grid ]], \
+    uint block_dim [[ threads_per_threadgroup ]] \
+) { \
+    threadgroup float shared_memory[THREADGROUP_SIZE]; \
+    shared_memory[tid] = 0; \
+    rmsnorm(src_numel, el_to_sum_per_block, src, dst, alpha, eps, id, tid, dst_id, block_dim, shared_memory); \
+} \
+
+template <typename T>
+METAL_FUNC void ropei(
+    constant size_t &bh,
+    constant size_t &td,
+    device const T *src,
+    device const T *cos,
+    device const T *sin,
+    device T *dst,
+    uint tid
+) {
+    if (2 * tid >= bh * td) {
+        return;
+    }
+    size_t rope_idx = tid % (td / 2);
+    T c = cos[rope_idx];
+    T s = sin[rope_idx];
+    dst[2 * tid] = src[2 * tid] * c - src[2 * tid + 1] * s;
+    dst[2 * tid + 1] = src[2 * tid] * s + src[2 * tid + 1] * c;
+}
+
+template <typename T>
+METAL_FUNC void rope(
+    constant size_t &bh,
+    constant size_t &td,
+    constant size_t &d,
+    device const T *src,
+    device const T *cos,
+    device const T *sin,
+    device T *dst,
+    uint idx
+) {
+    if (2 * idx >= bh * td) {
+        return;
+    }
+    size_t i_bh = idx / (td / 2);
+    size_t i_td = idx - (td / 2) * i_bh;
+    size_t i_t = i_td / (d / 2);
+    size_t i_d = i_td - (d / 2) * 
i_t; + size_t i1 = i_bh * td + i_t * d + i_d; + size_t i2 = i1 + d / 2; + size_t i_cs = i_t * (d / 2) + i_d; + T c = cos[i_cs]; + T s = sin[i_cs]; + dst[i1] = src[i1] * c - src[i2] * s; + dst[i2] = src[i1] * s + src[i2] * c; +} #define ROPEI(FN_NAME, FN_NAME_I, TYPENAME) \ kernel void FN_NAME_I( \ @@ -323,14 +450,7 @@ kernel void FN_NAME_I( \ device TYPENAME *dst, \ uint tid [[ thread_position_in_grid ]] \ ) { \ - if (2 * tid >= bh * td) { \ - return; \ - } \ - size_t rope_idx = tid % (td / 2); \ - TYPENAME c = cos[rope_idx]; \ - TYPENAME s = sin[rope_idx]; \ - dst[2 * tid] = src[2 * tid] * c - src[2 * tid + 1] * s; \ - dst[2 * tid + 1] = src[2 * tid] * s + src[2 * tid + 1] * c; \ + ropei(bh, td, src, cos, sin, dst, tid); \ }\ kernel void FN_NAME( \ constant size_t &bh, \ @@ -342,20 +462,7 @@ kernel void FN_NAME( \ device TYPENAME *dst, \ uint idx [[ thread_position_in_grid ]] \ ) { \ - if (2 * idx >= bh * td) { \ - return; \ - } \ - size_t i_bh = idx / (td / 2); \ - size_t i_td = idx - (td / 2) * i_bh; \ - size_t i_t = i_td / (d / 2); \ - size_t i_d = i_td - (d / 2) * i_t; \ - size_t i1 = i_bh * td + i_t * d + i_d; \ - size_t i2 = i1 + d / 2; \ - size_t i_cs = i_t * (d / 2) + i_d; \ - TYPENAME c = cos[i_cs]; \ - TYPENAME s = sin[i_cs]; \ - dst[i1] = src[i1] * c - src[i2] * s; \ - dst[i2] = src[i1] * s + src[i2] * c; \ + rope(bh, td, d, src, cos, sin, dst, idx); \ }\ REDUCE(x + y, fast_sum_f32_strided, float, 0) From 1e46cf8b1942d496f76e13c53e5bcb4cb73586a5 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 4 Apr 2024 08:26:02 +0200 Subject: [PATCH 108/131] Minor cleanups in reduce.metal. (#2004) --- candle-metal-kernels/src/reduce.metal | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/candle-metal-kernels/src/reduce.metal b/candle-metal-kernels/src/reduce.metal index d06efbf2..561d1744 100644 --- a/candle-metal-kernels/src/reduce.metal +++ b/candle-metal-kernels/src/reduce.metal @@ -37,17 +37,13 @@ METAL_FUNC void argmin( threadgroup uint *shared_indices ) { bool notset = true; - /* // Elements summed in this block range from dst_id * el_to_sum_per_block // to (dst_id + 1) * el_to_sum_per_block. - */ size_t start_idx = dst_id * el_to_sum_per_block; size_t stop_idx = start_idx + el_to_sum_per_block; size_t idx = start_idx + tid; while (idx < stop_idx) { - /* // TODO: Fast version for the contiguous case. - */ size_t strided_i = get_strided_index(idx, num_dims, dims, strides); if (notset || src[strided_i] < shared_memory[tid]) { shared_memory[tid] = src[strided_i]; @@ -59,9 +55,7 @@ METAL_FUNC void argmin( } threadgroup_barrier(mem_flags::mem_none); - /* // reduction in shared memory - */ for (uint s = block_dim / 2; s > 0; s >>= 1) { if (tid < s && shared_memory[tid + s] < shared_memory[tid]) { shared_indices[tid] = shared_indices[tid + s]; @@ -69,8 +63,7 @@ METAL_FUNC void argmin( } \ threadgroup_barrier(mem_flags::mem_none); } - - if (tid == 0){ + if (tid == 0) { dst[dst_id] = shared_indices[0]; } } @@ -111,18 +104,14 @@ METAL_FUNC void argmax( threadgroup T * shared_memory, threadgroup uint * shared_indices ) { - /* // Elements summed in this block range from dst_id * el_to_sum_per_block // to (dst_id + 1) * el_to_sum_per_block. - */ size_t start_idx = dst_id * el_to_sum_per_block; size_t stop_idx = start_idx + el_to_sum_per_block; size_t idx = start_idx + tid; bool notset = true; while (idx < stop_idx) { - /* // TODO: Fast version for the contiguous case. 
-        */
         size_t strided_i = get_strided_index(idx, num_dims, dims, strides);
         if (notset || shared_memory[tid] < src[strided_i]) {
             shared_memory[tid] = src[strided_i];
@@ -134,9 +123,7 @@ METAL_FUNC void argmax(
 
     threadgroup_barrier(mem_flags::mem_none);
 
-    /*
     // reduction in shared memory
-    */
     for (uint s = block_dim / 2; s > 0; s >>= 1) {
         if (tid < s && shared_memory[tid + s] > shared_memory[tid]) {
             shared_indices[tid] = shared_indices[tid + s];
@@ -145,9 +132,7 @@ METAL_FUNC void argmax(
         threadgroup_barrier(mem_flags::mem_none);
     }
 
-    /*
     // Thread 0 writes the result of the reduction
-    */
     if (tid == 0) {
         dst[dst_id] = shared_indices[0];
     }
@@ -188,17 +173,13 @@ METAL_FUNC void reduce(
     threadgroup T * shared_memory,
     T (*fn)(T, T)
 ) {
-    /*
     // Elements summed in this block range from dst_id * el_to_sum_per_block
     // to (dst_id + 1) * el_to_sum_per_block.
-    */
     size_t start_idx = dst_id * el_to_sum_per_block;
     size_t stop_idx = start_idx + el_to_sum_per_block;
     size_t idx = start_idx + tid;
     while (idx < stop_idx) {
-        /*
         // TODO: Fast version for the contiguous case.
-        */
         size_t strided_i = get_strided_index(idx, num_dims, dims, strides);
         T x = shared_memory[tid];
         T y = src[strided_i];
@@ -208,9 +189,7 @@ METAL_FUNC void reduce(
 
     threadgroup_barrier(mem_flags::mem_none);
 
-    /*
     // reduction in shared memory
-    */
     for (uint s = block_dim / 2; s > 0; s >>= 1) {
         if (tid < s) {
             T x = shared_memory[tid];
@@ -277,7 +256,6 @@ METAL_FUNC void softmax(
     }
 
     /* wait for shared_memory[0] to be filled */
-    \
     threadgroup_barrier(mem_flags::mem_threadgroup);
 
     float _max = shared_memory[0];

From 8967c46563221c01db4fc6a920231a9ef0d6f7bc Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Thu, 4 Apr 2024 08:27:23 +0200
Subject: [PATCH 109/131] Split the cuda error file. (#2003)

---
 candle-core/src/cuda_backend/error.rs | 62 +++++++++++++++++++++
 candle-core/src/cuda_backend/mod.rs   | 72 +++------------------------
 2 files changed, 68 insertions(+), 66 deletions(-)
 create mode 100644 candle-core/src/cuda_backend/error.rs

diff --git a/candle-core/src/cuda_backend/error.rs b/candle-core/src/cuda_backend/error.rs
new file mode 100644
index 00000000..bd6f8ac6
--- /dev/null
+++ b/candle-core/src/cuda_backend/error.rs
@@ -0,0 +1,62 @@
+use crate::{DType, Layout};
+
+/// cudarc related errors
+#[derive(thiserror::Error, Debug)]
+pub enum CudaError {
+    #[error(transparent)]
+    Cuda(#[from] cudarc::driver::DriverError),
+
+    #[error(transparent)]
+    Compiler(#[from] cudarc::nvrtc::CompileError),
+
+    #[error(transparent)]
+    Cublas(#[from] cudarc::cublas::result::CublasError),
+
+    #[error(transparent)]
+    Curand(#[from] cudarc::curand::result::CurandError),
+
+    #[error("missing kernel '{module_name}'")]
+    MissingKernel { module_name: String },
+
+    #[error("unsupported dtype {dtype:?} for {op}")]
+    UnsupportedDtype { dtype: DType, op: &'static str },
+
+    #[error("internal error '{0}'")]
+    InternalError(&'static str),
+
+    #[error("matmul is only supported for contiguous tensors lstride: {lhs_stride:?} rstride: {rhs_stride:?} mnk: {mnk:?}")]
+    MatMulNonContiguous {
+        lhs_stride: Layout,
+        rhs_stride: Layout,
+        mnk: (usize, usize, usize),
+    },
+
+    #[error("{msg}, expected: {expected:?}, got: {got:?}")]
+    UnexpectedDType {
+        msg: &'static str,
+        expected: DType,
+        got: DType,
+    },
+
+    #[error("{cuda} when loading {module_name}")]
+    Load {
+        cuda: cudarc::driver::DriverError,
+        module_name: String,
+    },
+}
+
+impl From<CudaError> for crate::Error {
+    fn from(val: CudaError) -> Self {
+        crate::Error::Cuda(Box::new(val)).bt()
+    }
+}
+
+pub trait WrapErr<O> {
+    fn w(self) -> std::result::Result<O, crate::Error>;
+}
+
+impl<O, E: Into<CudaError>> WrapErr<O> for std::result::Result<O, E> {
+    fn w(self) -> std::result::Result<O, crate::Error> {
+        self.map_err(|e| crate::Error::Cuda(Box::new(e.into())).bt())
+    }
+}
diff --git a/candle-core/src/cuda_backend/mod.rs b/candle-core/src/cuda_backend/mod.rs
index 6a9e73f8..6fecf7c7 100644
--- a/candle-core/src/cuda_backend/mod.rs
+++ b/candle-core/src/cuda_backend/mod.rs
@@ -9,13 +9,14 @@ use cudarc::driver::{
 };
 use half::{bf16, f16};
 
-mod device;
-pub use device::{CudaDevice, DeviceId};
-mod utils;
-pub use utils::{Map1, Map1Any, Map2, Map2Any, Map2InPlace, S};
-
 #[cfg(feature = "cudnn")]
 pub mod cudnn;
+mod device;
+mod error;
+mod utils;
+pub use device::{CudaDevice, DeviceId};
+pub use error::{CudaError, WrapErr};
+pub use utils::{Map1, Map1Any, Map2, Map2Any, Map2InPlace, S};
 
 enum SlicePtrOrNull<T> {
     Ptr(CudaSlice<T>),
@@ -42,67 +43,6 @@ impl SlicePtrOrNull<usize> {
     }
 }
 
-/// cudarc related errors
-#[derive(thiserror::Error, Debug)]
-pub enum CudaError {
-    #[error(transparent)]
-    Cuda(#[from] cudarc::driver::DriverError),
-
-    #[error(transparent)]
-    Compiler(#[from] cudarc::nvrtc::CompileError),
-
-    #[error(transparent)]
-    Cublas(#[from] cudarc::cublas::result::CublasError),
-
-    #[error(transparent)]
-    Curand(#[from] cudarc::curand::result::CurandError),
-
-    #[error("missing kernel '{module_name}'")]
-    MissingKernel { module_name: String },
-
-    #[error("unsupported dtype {dtype:?} for {op}")]
-    UnsupportedDtype { dtype: DType, op: &'static str },
-
-    #[error("internal error '{0}'")]
-    InternalError(&'static str),
-
-    #[error("matmul is only supported for contiguous tensors lstride: {lhs_stride:?} rstride: {rhs_stride:?} mnk: {mnk:?}")]
-    MatMulNonContiguous {
-        lhs_stride: Layout,
-        rhs_stride: Layout,
-        mnk: (usize, usize, usize),
-    },
-
-    #[error("{msg}, expected: {expected:?}, got: {got:?}")]
-    UnexpectedDType {
-        msg: &'static str,
-        expected: DType,
-        got: DType,
-    },
-
-    #[error("{cuda} when loading {module_name}")]
-    Load {
-        cuda: cudarc::driver::DriverError,
-        module_name: String,
-    },
-}
-
-impl From<CudaError> for crate::Error {
-    fn from(val: CudaError) -> Self {
-        crate::Error::Cuda(Box::new(val)).bt()
-    }
-}
-
-pub trait WrapErr<O> {
-    fn w(self) -> std::result::Result<O, crate::Error>;
-}
-
-impl<O, E: Into<CudaError>> WrapErr<O> for std::result::Result<O, E> {
-    fn w(self) -> std::result::Result<O, crate::Error> {
-        self.map_err(|e| crate::Error::Cuda(Box::new(e.into())).bt())
-    }
-}
-
 #[derive(Debug)]
 pub enum CudaStorageSlice {
     U8(CudaSlice<u8>),

From f48c07e2428a6d777ffdea57a2d1ac6a7d58a8ee Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Thu, 4 Apr 2024 09:27:54 +0200
Subject: [PATCH 110/131] Include topk sampling in the quantized example.
 (#2005)

* Include topk sampling in the quantized example.

* Also sample with top-k on the mistral side.
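
For reference, a rough standalone sketch of how the combined top-k then
top-p selection is meant to behave. This is illustrative only: the
top_k_then_top_p helper and the probability values below are this note's
own assumptions, not code from the patch itself.

    // Keep the k most probable entries, then apply nucleus (top-p)
    // filtering to that reduced set.
    fn top_k_then_top_p(mut probs: Vec<(usize, f32)>, k: usize, p: f32) -> Vec<(usize, f32)> {
        // Sort by descending probability and truncate to the top k.
        probs.sort_by(|a, b| b.1.total_cmp(&a.1));
        probs.truncate(k);
        // Keep the smallest prefix whose cumulative mass reaches p.
        let mut cumsum = 0.0;
        let mut keep = 0;
        for (i, &(_, pr)) in probs.iter().enumerate() {
            cumsum += pr;
            keep = i + 1;
            if cumsum >= p {
                break;
            }
        }
        probs.truncate(keep);
        probs
    }

    fn main() {
        let probs = vec![(0, 0.5), (1, 0.2), (2, 0.15), (3, 0.1), (4, 0.05)];
        // With k = 3 and p = 0.6, only tokens 0 and 1 survive.
        println!("{:?}", top_k_then_top_p(probs, 3, 0.6));
    }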
---
 candle-examples/examples/mistral/main.rs   | 24 ++++++++++++++++++--
 candle-examples/examples/quantized/main.rs | 26 ++++++++++++++++------
 candle-transformers/src/generation/mod.rs  | 26 +++++++++++++++++++++-
 3 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/candle-examples/examples/mistral/main.rs b/candle-examples/examples/mistral/main.rs
index c00af3fe..6aa3f51e 100644
--- a/candle-examples/examples/mistral/main.rs
+++ b/candle-examples/examples/mistral/main.rs
@@ -13,7 +13,7 @@ use candle_transformers::models::quantized_mistral::Model as QMistral;
 use candle::{DType, Device, Tensor};
 use candle_examples::token_output_stream::TokenOutputStream;
 use candle_nn::VarBuilder;
-use candle_transformers::generation::LogitsProcessor;
+use candle_transformers::generation::{LogitsProcessor, Sampling};
 use hf_hub::{api::sync::Api, Repo, RepoType};
 use tokenizers::Tokenizer;
 
@@ -39,11 +39,26 @@ impl TextGeneration {
         seed: u64,
         temp: Option<f64>,
         top_p: Option<f64>,
+        top_k: Option<usize>,
         repeat_penalty: f32,
         repeat_last_n: usize,
         device: &Device,
     ) -> Self {
-        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
+        let logits_processor = {
+            let temperature = temp.unwrap_or(0.);
+            let sampling = if temperature <= 0. {
+                Sampling::ArgMax
+            } else {
+                match (top_k, top_p) {
+                    (None, None) => Sampling::All { temperature },
+                    (Some(k), None) => Sampling::TopK { k, temperature },
+                    (None, Some(p)) => Sampling::TopP { p, temperature },
+                    (Some(k), Some(p)) => Sampling::TopKThenTopP { k, p, temperature },
+                }
+            };
+            LogitsProcessor::from_sampling(seed, sampling)
+        };
+
         Self {
             model,
             tokenizer: TokenOutputStream::new(tokenizer),
@@ -159,6 +174,10 @@ struct Args {
     #[arg(long)]
     top_p: Option<f64>,
 
+    /// Only sample among the top K samples.
+    #[arg(long)]
+    top_k: Option<usize>,
+
     /// The seed to use when generating random samples.
     #[arg(long, default_value_t = 299792458)]
     seed: u64,
@@ -314,6 +333,7 @@ fn main() -> Result<()> {
         args.seed,
         args.temperature,
         args.top_p,
+        args.top_k,
         args.repeat_penalty,
         args.repeat_last_n,
         &device,
diff --git a/candle-examples/examples/quantized/main.rs b/candle-examples/examples/quantized/main.rs
index b03768ed..ea7f70eb 100644
--- a/candle-examples/examples/quantized/main.rs
+++ b/candle-examples/examples/quantized/main.rs
@@ -10,7 +10,7 @@ use tokenizers::Tokenizer;
 
 use candle::quantized::{ggml_file, gguf_file};
 use candle::Tensor;
-use candle_transformers::generation::LogitsProcessor;
+use candle_transformers::generation::{LogitsProcessor, Sampling};
 
 use candle_examples::token_output_stream::TokenOutputStream;
 use candle_transformers::models::quantized_llama as model;
@@ -200,6 +200,10 @@ struct Args {
     #[arg(long)]
     top_p: Option<f64>,
 
+    /// Only sample among the top K samples.
+    #[arg(long)]
+    top_k: Option<usize>,
+
     /// The seed to use when generating random samples.
     #[arg(long, default_value_t = 299792458)]
     seed: u64,
@@ -349,11 +353,6 @@ fn main() -> anyhow::Result<()> {
     #[cfg(feature = "cuda")]
     candle::quantized::cuda::set_force_dmmv(args.force_dmmv);
 
-    let temperature = if args.temperature == 0. {
-        None
-    } else {
-        Some(args.temperature)
-    };
     let _guard = if args.tracing {
         let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
         tracing_subscriber::registry().with(chrome_layer).init();
@@ -500,7 +499,20 @@ fn main() -> anyhow::Result<()> {
         prompt_tokens
     };
     let mut all_tokens = vec![];
-    let mut logits_processor = LogitsProcessor::new(args.seed, temperature, args.top_p);
+    let mut logits_processor = {
+        let temperature = args.temperature;
+        let sampling = if temperature <= 0. {
+            Sampling::ArgMax
+        } else {
+            match (args.top_k, args.top_p) {
+                (None, None) => Sampling::All { temperature },
+                (Some(k), None) => Sampling::TopK { k, temperature },
+                (None, Some(p)) => Sampling::TopP { p, temperature },
+                (Some(k), Some(p)) => Sampling::TopKThenTopP { k, p, temperature },
+            }
+        };
+        LogitsProcessor::from_sampling(args.seed, sampling)
+    };
 
     let start_prompt_processing = std::time::Instant::now();
     let mut next_token = if !args.split_prompt {
diff --git a/candle-transformers/src/generation/mod.rs b/candle-transformers/src/generation/mod.rs
index 257d9171..c250a186 100644
--- a/candle-transformers/src/generation/mod.rs
+++ b/candle-transformers/src/generation/mod.rs
@@ -7,6 +7,7 @@ pub enum Sampling {
     All { temperature: f64 },
     TopK { k: usize, temperature: f64 },
     TopP { p: f64, temperature: f64 },
+    TopKThenTopP { k: usize, p: f64, temperature: f64 },
 }
 
 pub struct LogitsProcessor {
@@ -77,7 +78,6 @@ impl LogitsProcessor {
             self.sample_multinomial(prs)
         } else {
             let mut argsort_indices = (0..prs.len()).collect::<Vec<_>>();
-            // Sort by descending probability.
             let (indices, _, _) =
                 argsort_indices.select_nth_unstable_by(top_k, |&i, &j| prs[j].total_cmp(&prs[i]));
             let prs = indices.iter().map(|&i| prs[i]).collect::<Vec<_>>();
@@ -86,6 +86,26 @@ impl LogitsProcessor {
         }
     }
 
+    // top-k sampling samples from the k tokens with the largest probabilities.
+    // then top-p sampling.
+    fn sample_topk_topp(&mut self, prs: &mut Vec<f32>, top_k: usize, top_p: f32) -> Result<u32> {
+        if top_k >= prs.len() {
+            self.sample_topp(prs, top_p)
+        } else {
+            let mut argsort_indices = (0..prs.len()).collect::<Vec<_>>();
+            let (indices, _, _) =
+                argsort_indices.select_nth_unstable_by(top_k, |&i, &j| prs[j].total_cmp(&prs[i]));
+            let mut prs = indices.iter().map(|&i| prs[i]).collect::<Vec<_>>();
+            let sum_p = prs.iter().sum::<f32>();
+            let index = if top_p <= 0.0 || top_p >= sum_p {
+                self.sample_multinomial(&prs)?
+            } else {
+                self.sample_topp(&mut prs, top_p)?
+            };
+            Ok(indices[index as usize] as u32)
+        }
+    }
+
     pub fn sample(&mut self, logits: &Tensor) -> Result<u32> {
         self.sample_f(logits, |_| {})
     }
@@ -120,6 +140,10 @@ impl LogitsProcessor {
                 let mut prs = prs(*temperature)?;
                 self.sample_topk(&mut prs, *k)?
             }
+            Sampling::TopKThenTopP { k, p, temperature } => {
+                let mut prs = prs(*temperature)?;
+                self.sample_topk_topp(&mut prs, *k, *p as f32)?
+            }
         };
         Ok(next_token)
     }

From 30b145150f47cc21b51e04adf03ce41995ff729f Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Thu, 4 Apr 2024 16:28:23 +0200
Subject: [PATCH 111/131] Optimize the gelu f16 opt. (#2008)

* Optimize the gelu f16 opt.

* And add a test.
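
For reference, the tanh-based approximation being optimized here is
gelu(v) = 0.5 * v * (1 + tanh(sqrt(2/pi) * v * (1 + 0.044715 * v^2))).
The patch hardcodes sqrt(2/pi) instead of recomputing it, which also
avoids evaluating the square root in half precision (see issue #1982
referenced in the code). A minimal f32 sketch, illustrative only:

    const SQRT_TWO_OVER_PI: f32 = 0.79788456;

    fn gelu_tanh(v: f32) -> f32 {
        0.5 * v * (1.0 + f32::tanh(SQRT_TWO_OVER_PI * v * (1.0 + 0.044715 * v * v)))
    }

    fn main() {
        // gelu(1.0) is roughly 0.841.
        println!("{}", gelu_tanh(1.0));
    }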
--- candle-core/src/op.rs | 19 +++++++++++-------- candle-core/tests/tensor_tests.rs | 8 ++++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/candle-core/src/op.rs b/candle-core/src/op.rs index 3b34eb75..776f5182 100644 --- a/candle-core/src/op.rs +++ b/candle-core/src/op.rs @@ -457,6 +457,13 @@ unary_op!(Recip, "recip", v, v.recip()); unary_op!(Sqr, "sqr", v, v * v, vs_sqr, vd_sqr); unary_op!(Sqrt, "sqrt", v, v.sqrt(), vs_sqrt, vd_sqrt); +// Hardcode the value for sqrt(2/pi) +// https://github.com/huggingface/candle/issues/1982 +#[allow(clippy::excessive_precision)] +const SQRT_TWO_OVER_PI_F32: f32 = 0.79788456080286535587989211986876373; +#[allow(clippy::excessive_precision)] +const SQRT_TWO_OVER_PI_F64: f64 = 0.79788456080286535587989211986876373; + /// Tanh based approximation of the `gelu` operation /// GeluErf is the more precise one. /// @@ -469,7 +476,7 @@ impl UnaryOpT for Gelu { * v * (bf16::ONE + bf16::tanh( - (bf16::from_f32_const(2.0) / bf16::PI).sqrt() + bf16::from_f32_const(SQRT_TWO_OVER_PI_F32) * v * (bf16::ONE + bf16::from_f32_const(0.044715) * v * v), )) @@ -480,22 +487,18 @@ impl UnaryOpT for Gelu { * v * (f16::ONE + f16::tanh( - (f16::from_f32_const(2.0) / f16::PI).sqrt() + f16::from_f32_const(SQRT_TWO_OVER_PI_F32) * v * (f16::ONE + f16::from_f32_const(0.044715) * v * v), )) } #[inline(always)] fn f32(v: f32) -> f32 { - 0.5 * v - * (1.0 - + f32::tanh((2.0f32 / std::f32::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v))) + 0.5 * v * (1.0 + f32::tanh(SQRT_TWO_OVER_PI_F32 * v * (1.0 + 0.044715 * v * v))) } #[inline(always)] fn f64(v: f64) -> f64 { - 0.5 * v - * (1.0 - + f64::tanh((2.0f64 / std::f64::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v))) + 0.5 * v * (1.0 + f64::tanh(SQRT_TWO_OVER_PI_F64 * v * (1.0 + 0.044715 * v * v))) } #[inline(always)] fn u8(_: u8) -> u8 { diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs index 902b84f7..1e2c1c77 100644 --- a/candle-core/tests/tensor_tests.rs +++ b/candle-core/tests/tensor_tests.rs @@ -106,6 +106,14 @@ fn unary_op(device: &Device) -> Result<()> { [2.6911, -0.0647, -0.1091, 1.7353, 2.7933] ] ); + let t_f16 = tensor.to_dtype(DType::F16)?.gelu()?.to_dtype(DType::F32)?; + assert_eq!( + test_utils::to_vec2_round(&t_f16, 2)?, + [ + [-0.0, 0.84, 4.0, -0.05, 0.35], + [2.69, -0.07, -0.11, 1.73, 2.79] + ], + ); assert_eq!( test_utils::to_vec2_round(&tensor.gelu_erf()?, 4)?, [ From f76bb7794aa8659c5023797979a3392cdfc01f32 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 4 Apr 2024 17:48:45 +0200 Subject: [PATCH 112/131] Bumping the version number to 0.5.0. (#2009) --- Cargo.toml | 18 +++++++++--------- candle-flash-attn/Cargo.toml | 4 ++-- candle-kernels/Cargo.toml | 2 +- candle-metal-kernels/Cargo.toml | 2 +- candle-onnx/Cargo.toml | 6 +++--- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 313c68f9..fe50b356 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ exclude = [ resolver = "2" [workspace.package] -version = "0.4.2" +version = "0.5.0" edition = "2021" description = "Minimalist ML framework." 
repository = "https://github.com/huggingface/candle" @@ -33,14 +33,14 @@ ab_glyph = "0.2.23" accelerate-src = { version = "0.3.2" } anyhow = { version = "1", features = ["backtrace"] } byteorder = "1.4.3" -candle = { path = "./candle-core", package = "candle-core", version = "0.4.2" } -candle-datasets = { path = "./candle-datasets", version = "0.4.2" } -candle-flash-attn = { path = "./candle-flash-attn", version = "0.4.2" } -candle-kernels = { path = "./candle-kernels", version = "0.4.2" } -candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.4.2" } -candle-nn = { path = "./candle-nn", version = "0.4.2" } -candle-onnx = { path = "./candle-onnx", version = "0.4.2" } -candle-transformers = { path = "./candle-transformers", version = "0.4.2" } +candle = { path = "./candle-core", package = "candle-core", version = "0.5.0" } +candle-datasets = { path = "./candle-datasets", version = "0.5.0" } +candle-flash-attn = { path = "./candle-flash-attn", version = "0.5.0" } +candle-kernels = { path = "./candle-kernels", version = "0.5.0" } +candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.5.0" } +candle-nn = { path = "./candle-nn", version = "0.5.0" } +candle-onnx = { path = "./candle-onnx", version = "0.5.0" } +candle-transformers = { path = "./candle-transformers", version = "0.5.0" } clap = { version = "4.2.4", features = ["derive"] } criterion = { version = "0.5.1", default-features=false } cudarc = { version = "0.10.0", features = ["f16"] } diff --git a/candle-flash-attn/Cargo.toml b/candle-flash-attn/Cargo.toml index 3b570776..827cf970 100644 --- a/candle-flash-attn/Cargo.toml +++ b/candle-flash-attn/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "candle-flash-attn" -version = "0.4.2" +version = "0.5.0" edition = "2021" description = "Flash attention layer for the candle ML framework." 
@@ -11,7 +11,7 @@ license = "MIT OR Apache-2.0" readme = "README.md" [dependencies] -candle = { path = "../candle-core", features = ["cuda"], package = "candle-core", version = "0.4.2" } +candle = { path = "../candle-core", features = ["cuda"], package = "candle-core", version = "0.5.0" } half = { version = "2.3.1", features = ["num-traits"] } [build-dependencies] diff --git a/candle-kernels/Cargo.toml b/candle-kernels/Cargo.toml index ceee23fe..5cedb7d3 100644 --- a/candle-kernels/Cargo.toml +++ b/candle-kernels/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "candle-kernels" -version = "0.4.2" +version = "0.5.0" edition = "2021" description = "CUDA kernels for Candle" diff --git a/candle-metal-kernels/Cargo.toml b/candle-metal-kernels/Cargo.toml index 569ad032..65e00bbc 100644 --- a/candle-metal-kernels/Cargo.toml +++ b/candle-metal-kernels/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "candle-metal-kernels" -version = "0.4.2" +version = "0.5.0" edition = "2021" description = "Metal kernels for Candle" diff --git a/candle-onnx/Cargo.toml b/candle-onnx/Cargo.toml index 01472a5f..2f438cda 100644 --- a/candle-onnx/Cargo.toml +++ b/candle-onnx/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "candle-onnx" -version = "0.4.2" +version = "0.5.0" edition = "2021" description = "ONNX support for Candle" @@ -10,8 +10,8 @@ categories = ["science"] license = "MIT OR Apache-2.0" [dependencies] -candle = { path = "../candle-core", package = "candle-core", version = "0.4.2" } -candle-nn = { path = "../candle-nn", version = "0.4.2" } +candle = { path = "../candle-core", package = "candle-core", version = "0.5.0" } +candle-nn = { path = "../candle-nn", version = "0.5.0" } prost = "0.12.1" [build-dependencies] From 5aebe53dd2470db731bd9ce2d65914f86f1542f7 Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Thu, 4 Apr 2024 12:39:06 -0400 Subject: [PATCH 113/131] update dtypes checks for several metal operations (#2010) --- candle-core/src/metal_backend/mod.rs | 80 ++++++++++++++++----------- candle-metal-kernels/src/binary.metal | 22 ++++++-- candle-metal-kernels/src/reduce.metal | 4 ++ 3 files changed, 69 insertions(+), 37 deletions(-) diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs index deb7a401..fa6973b4 100644 --- a/candle-core/src/metal_backend/mod.rs +++ b/candle-core/src/metal_backend/mod.rs @@ -443,42 +443,60 @@ impl BackendStorage for MetalStorage { use candle_metal_kernels::unary::contiguous; let kernel_name = match (B::KERNEL, dtype) { - ("ucos", DType::F32) => contiguous::cos::FLOAT, - ("usin", DType::F32) => contiguous::sin::FLOAT, - ("usqr", DType::F32) => contiguous::sqr::FLOAT, - ("usqrt", DType::F32) => contiguous::sqrt::FLOAT, - ("uneg", DType::F32) => contiguous::neg::FLOAT, - ("uexp", DType::F32) => contiguous::exp::FLOAT, - ("ulog", DType::F32) => contiguous::log::FLOAT, - ("ugelu", DType::F32) => contiguous::gelu::FLOAT, - ("ugelu_erf", DType::F32) => contiguous::gelu_erf::FLOAT, - ("uerf", DType::F32) => contiguous::erf::FLOAT, - ("usilu", DType::F32) => contiguous::silu::FLOAT, - ("uabs", DType::F32) => contiguous::abs::FLOAT, - ("uceil", DType::F32) => contiguous::ceil::FLOAT, - ("ufloor", DType::F32) => contiguous::floor::FLOAT, - ("uround", DType::F32) => contiguous::round::FLOAT, - ("urecip", DType::F32) => contiguous::recip::FLOAT, - ("utanh", DType::F32) => contiguous::tanh::FLOAT, - ("urelu", DType::F32) => contiguous::relu::FLOAT, - ("ucos", DType::F16) => contiguous::cos::HALF, - ("usin", DType::F16) => contiguous::sin::HALF, - ("usqr", 
DType::F16) => contiguous::sqr::HALF, - ("usqrt", DType::F16) => contiguous::sqrt::HALF, - ("uneg", DType::F16) => contiguous::neg::HALF, - ("uexp", DType::F16) => contiguous::exp::HALF, - ("ulog", DType::F16) => contiguous::log::HALF, - ("ugelu", DType::F16) => contiguous::gelu::HALF, - ("ugelu_erf", DType::F16) => contiguous::gelu_erf::HALF, - ("uerf", DType::F16) => contiguous::erf::HALF, - ("usilu", DType::F16) => contiguous::silu::HALF, ("uabs", DType::F16) => contiguous::abs::HALF, + ("uabs", DType::F32) => contiguous::abs::FLOAT, + ("uabs", DType::BF16) => contiguous::abs::BFLOAT, ("uceil", DType::F16) => contiguous::ceil::HALF, + ("uceil", DType::F32) => contiguous::ceil::FLOAT, + ("uceil", DType::BF16) => contiguous::ceil::BFLOAT, + ("ucos", DType::F16) => contiguous::cos::HALF, + ("ucos", DType::F32) => contiguous::cos::FLOAT, + ("ucos", DType::BF16) => contiguous::cos::BFLOAT, + ("uerf", DType::F16) => contiguous::erf::HALF, + ("uerf", DType::F32) => contiguous::erf::FLOAT, + ("uerf", DType::BF16) => contiguous::erf::BFLOAT, + ("uexp", DType::F16) => contiguous::exp::HALF, + ("uexp", DType::F32) => contiguous::exp::FLOAT, + ("uexp", DType::BF16) => contiguous::exp::BFLOAT, ("ufloor", DType::F16) => contiguous::floor::HALF, - ("uround", DType::F16) => contiguous::round::HALF, + ("ufloor", DType::F32) => contiguous::floor::FLOAT, + ("ufloor", DType::BF16) => contiguous::floor::BFLOAT, + ("ugelu_erf", DType::F16) => contiguous::gelu_erf::HALF, + ("ugelu_erf", DType::F32) => contiguous::gelu_erf::FLOAT, + ("ugelu_erf", DType::BF16) => contiguous::gelu_erf::BFLOAT, + ("ugelu", DType::F16) => contiguous::gelu::HALF, + ("ugelu", DType::F32) => contiguous::gelu::FLOAT, + ("ugelu", DType::BF16) => contiguous::gelu::BFLOAT, + ("ulog", DType::F16) => contiguous::log::HALF, + ("ulog", DType::F32) => contiguous::log::FLOAT, + ("ulog", DType::BF16) => contiguous::log::BFLOAT, + ("uneg", DType::F16) => contiguous::neg::HALF, + ("uneg", DType::F32) => contiguous::neg::FLOAT, + ("uneg", DType::BF16) => contiguous::neg::BFLOAT, ("urecip", DType::F16) => contiguous::recip::HALF, - ("utanh", DType::F16) => contiguous::tanh::HALF, + ("urecip", DType::F32) => contiguous::recip::FLOAT, + ("urecip", DType::BF16) => contiguous::recip::BFLOAT, ("urelu", DType::F16) => contiguous::relu::HALF, + ("urelu", DType::F32) => contiguous::relu::FLOAT, + ("urelu", DType::BF16) => contiguous::relu::BFLOAT, + ("uround", DType::F16) => contiguous::round::HALF, + ("uround", DType::F32) => contiguous::round::FLOAT, + ("uround", DType::BF16) => contiguous::round::BFLOAT, + ("usilu", DType::F16) => contiguous::silu::HALF, + ("usilu", DType::F32) => contiguous::silu::FLOAT, + ("usilu", DType::BF16) => contiguous::silu::BFLOAT, + ("usin", DType::F16) => contiguous::sin::HALF, + ("usin", DType::F32) => contiguous::sin::FLOAT, + ("usin", DType::BF16) => contiguous::sin::BFLOAT, + ("usqr", DType::F16) => contiguous::sqr::HALF, + ("usqr", DType::F32) => contiguous::sqr::FLOAT, + ("usqr", DType::BF16) => contiguous::sqr::BFLOAT, + ("usqrt", DType::F16) => contiguous::sqrt::HALF, + ("usqrt", DType::F32) => contiguous::sqrt::FLOAT, + ("usqrt", DType::BF16) => contiguous::sqrt::BFLOAT, + ("utanh", DType::F16) => contiguous::tanh::HALF, + ("utanh", DType::F32) => contiguous::tanh::FLOAT, + ("utanh", DType::BF16) => contiguous::tanh::BFLOAT, (name, dtype) => { crate::bail!("Metal contiguous unary {name} {dtype:?} not implemented") } diff --git a/candle-metal-kernels/src/binary.metal b/candle-metal-kernels/src/binary.metal index 
ae11286a..e83498e4 100644 --- a/candle-metal-kernels/src/binary.metal +++ b/candle-metal-kernels/src/binary.metal @@ -60,21 +60,24 @@ BINARY(FN, half, half, NAME##_f16, NAME##_f16_strided); \ BINARY(FN, uint32_t, uint32_t, NAME##_u32, NAME##_u32_strided); \ BINARY(FN, uint8_t, uint8_t, NAME##_u8, NAME##_u8_strided); -#define INT64_BINARY_OP(NAME, FN) \ -BINARY(FN, int64_t, int64_t, NAME##_i64, NAME##_i64_strided); - -#define BFLOAT_BINARY_OP(FN, NAME) \ -BINARY(FN, bfloat, bfloat, NAME##_bf16, NAME##_bf16_strided); - #define BINARY_OP_OUT(NAME, FN) \ BINARY(FN, float, uint8_t, NAME##_f32, NAME##_f32_strided); \ BINARY(FN, half, uint8_t, NAME##_f16, NAME##_f16_strided); \ BINARY(FN, uint32_t, uint8_t, NAME##_u32, NAME##_u32_strided); \ BINARY(FN, uint8_t, uint8_t, NAME##_u8, NAME##_u8_strided); +#define INT64_BINARY_OP(NAME, FN) \ +BINARY(FN, int64_t, int64_t, NAME##_i64, NAME##_i64_strided); + #define INT64_BINARY_OP_OUT(NAME, FN) \ BINARY(FN, int64_t, uint8_t, NAME##_i64, NAME##_i64_strided); +#define BFLOAT_BINARY_OP(FN, NAME) \ +BINARY(FN, bfloat, bfloat, NAME##_bf16, NAME##_bf16_strided); + +#define BFLOAT_BINARY_OP_OUT(NAME, FN) \ +BINARY(FN, bfloat, uint8_t, NAME##_bf16, NAME##_bf16_strided); + BINARY_OP(x + y, add) BINARY_OP(x - y, sub) BINARY_OP(x * y, mul) @@ -112,4 +115,11 @@ BFLOAT_BINARY_OP(x * y, mul) BFLOAT_BINARY_OP(x / y, div) BFLOAT_BINARY_OP(MIN(x, y), min) BFLOAT_BINARY_OP(MAX(x, y), max) + +BFLOAT_BINARY_OP_OUT(eq, x == y) +BFLOAT_BINARY_OP_OUT(ne, x != y) +BFLOAT_BINARY_OP_OUT(le, x <= y) +BFLOAT_BINARY_OP_OUT(lt, x < y) +BFLOAT_BINARY_OP_OUT(ge, x >= y) +BFLOAT_BINARY_OP_OUT(gt, x > y) #endif diff --git a/candle-metal-kernels/src/reduce.metal b/candle-metal-kernels/src/reduce.metal index 561d1744..acb69299 100644 --- a/candle-metal-kernels/src/reduce.metal +++ b/candle-metal-kernels/src/reduce.metal @@ -484,9 +484,13 @@ ARGMAX(fast_argmax_i64_strided, int64_t, INT_MIN) #if defined(__HAVE_BFLOAT__) REDUCE(x + y, fast_sum_bf16, bfloat, 0) +REDUCE(x + y, fast_sum_bf16_strided, half, 0) REDUCE(x * y, fast_mul_bf16, bfloat, 1) +REDUCE(x * y, fast_mul_bf16_strided, bfloat, 1) REDUCE(MAX(x, y), fast_max_bf16, bfloat, -HUGE_VALBF) +REDUCE(MAX(x, y), fast_max_bf16_strided, bfloat, -HUGE_VALBF) REDUCE(MIN(x, y), fast_min_bf16, bfloat, HUGE_VALBF) +REDUCE(MIN(x, y), fast_min_bf16_strided, bfloat, HUGE_VALBF) ARGMIN(fast_argmin_bf16, bfloat, HUGE_VALBF) ARGMAX(fast_argmax_bf16, bfloat, -HUGE_VALBF) SOFTMAX(softmax_bf16, bfloat) From e6a5b82ba6507e7e21d5a5d45241bd8f005609b7 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Thu, 4 Apr 2024 19:18:03 +0200 Subject: [PATCH 114/131] Fix the matmul layout for accelerate & mkl. (#2011) * Fix the matmul layout for accelerate & mkl. * Reduce the required precision for pow (because of accelerate). * And a fix the gelu f16 test. 
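
For reference, the relaxed check below rests on the observation that a
stride is irrelevant along a dimension of size 1, so a single-row or
single-column operand can still be handed to gemm as a plain 'N' operand.
A small illustrative predicate (the names here are this note's own
assumptions, not code from the patch):

    // Can a (rows, cols) view with the given strides be treated as a
    // plain row-major gemm operand?
    fn is_row_major(rows: usize, cols: usize, row_stride: usize, col_stride: usize) -> bool {
        // The column stride only matters when there is more than one column,
        // and the row stride only matters when there is more than one row.
        (col_stride == 1 || cols == 1) && (row_stride == cols || rows == 1)
    }

    fn main() {
        // A single row never steps along the row stride, so any value works.
        assert!(is_row_major(1, 4, 100, 1));
        // A single column tolerates any column stride.
        assert!(is_row_major(4, 1, 1, 7));
    }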
---
 candle-core/src/cpu_backend/mod.rs | 8 ++++----
 candle-core/tests/matmul_tests.rs  | 13 -------------
 candle-core/tests/tensor_tests.rs  | 13 ++++---------
 3 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/candle-core/src/cpu_backend/mod.rs b/candle-core/src/cpu_backend/mod.rs
index 09226b58..6f8250f0 100644
--- a/candle-core/src/cpu_backend/mod.rs
+++ b/candle-core/src/cpu_backend/mod.rs
@@ -1330,7 +1330,7 @@ impl Map2 for MatMul {
         let lhs_m1 = lhs_stride[lhs_stride.len() - 1];
         let lhs_m2 = lhs_stride[lhs_stride.len() - 2];
 
-        let (lda, transa) = if rhs_m1 == 1 && rhs_m2 == n {
+        let (lda, transa) = if (rhs_m1 == 1 || n == 1) && (rhs_m2 == n || k == 1) {
             (n as i32, b'N')
         } else if rhs_m1 == k && rhs_m2 == 1 {
             (k as i32, b'T')
@@ -1338,7 +1338,7 @@ impl Map2 for MatMul {
             Err(self.striding_error(lhs_l, rhs_l, "non-contiguous rhs"))?
         };
         // The b tensor has dims batching, m, k (lhs)
-        let (ldb, transb) = if lhs_m1 == 1 && lhs_m2 == k {
+        let (ldb, transb) = if (lhs_m1 == 1 || k == 1) && (lhs_m2 == k || m == 1) {
             (k as i32, b'N')
         } else if lhs_m1 == m && lhs_m2 == 1 {
             (m as i32, b'T')
@@ -1421,7 +1421,7 @@ impl Map2 for MatMul {
         let lhs_m1 = lhs_stride[lhs_stride.len() - 1];
         let lhs_m2 = lhs_stride[lhs_stride.len() - 2];
 
-        let (lda, transa) = if rhs_m1 == 1 && rhs_m2 == n {
+        let (lda, transa) = if (rhs_m1 == 1 || n == 1) && (rhs_m2 == n || k == 1) {
             (n as i32, b'N')
         } else if rhs_m1 == k && rhs_m2 == 1 {
             (k as i32, b'T')
@@ -1429,7 +1429,7 @@ impl Map2 for MatMul {
             Err(self.striding_error(lhs_l, rhs_l, "non-contiguous rhs"))?
         };
         // The b tensor has dims batching, m, k (lhs)
-        let (ldb, transb) = if lhs_m1 == 1 && lhs_m2 == k {
+        let (ldb, transb) = if (lhs_m1 == 1 || k == 1) && (lhs_m2 == k || m == 1) {
             (k as i32, b'N')
         } else if lhs_m1 == m && lhs_m2 == 1 {
             (m as i32, b'T')
diff --git a/candle-core/tests/matmul_tests.rs b/candle-core/tests/matmul_tests.rs
index 834da29a..e3e18107 100644
--- a/candle-core/tests/matmul_tests.rs
+++ b/candle-core/tests/matmul_tests.rs
@@ -73,20 +73,7 @@ fn squeeze_mm(device: &Device) -> Result<()> {
     let seq_len = 8_usize;
     let a = Tensor::zeros((1, seq_len, 16), DType::F32, device)?;
     let x = a.i((.., seq_len - 1, ..))?;
-    println!(
-        "x shape:{:?}, stride:{:?}, is_contiguous:{}",
-        x.shape(),
-        x.stride(),
-        x.is_contiguous()
-    );
-
     let w = Tensor::zeros((32, 16), DType::F32, device)?.t()?;
-    println!(
-        "w shape:{:?}, stride:{:?}, is_contiguous:{}",
-        w.shape(),
-        w.stride(),
-        w.is_contiguous()
-    );
     let x = x.matmul(&w)?;
     assert_eq!(x.dims(), &[1, 32]);
     Ok(())
diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs
index 1e2c1c77..b3275804 100644
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@@ -107,13 +107,8 @@ fn unary_op(device: &Device) -> Result<()> {
         ]
     );
     let t_f16 = tensor.to_dtype(DType::F16)?.gelu()?.to_dtype(DType::F32)?;
-    assert_eq!(
-        test_utils::to_vec2_round(&t_f16, 2)?,
-        [
-            [-0.0, 0.84, 4.0, -0.05, 0.35],
-            [2.69, -0.07, -0.11, 1.73, 2.79]
-        ],
-    );
+    let max_diff = (tensor.gelu()? - t_f16)?.flatten_all()?.max(0)?;
+    assert!(max_diff.to_vec0::<f32>()? 
< 5e-3); assert_eq!( test_utils::to_vec2_round(&tensor.gelu_erf()?, 4)?, [ @@ -1255,8 +1250,8 @@ fn pow() -> Result<()> { let rhs = (&lhs - 2.)?; let res = lhs.pow(&rhs)?; assert_eq!( - test_utils::to_vec2_round(&res, 4)?, - [[1.0, 1.0, 3.0], [16.0, 125.0, 1296.0001]] + test_utils::to_vec2_round(&res, 3)?, + [[1.0, 1.0, 3.0], [16.0, 125.0, 1296.0]] ); Ok(()) } From c5626b827147e5029c6bd3e37352ec8ac501cfc3 Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Thu, 4 Apr 2024 16:32:47 -0400 Subject: [PATCH 115/131] Add support for "sign" on tensors (#2012) * add the sign unary operator * remove uneeded import * remove uneeded import * undo formatting * undo formatting * remove unnecessary redefintion * allow gradient to flow through for sign and round * fix cpu ops to ensure that negzero and positive zero are handled properly * clippy fixes * Properly avoid gradient tracking. * Use a branchless version. --------- Co-authored-by: laurent --- candle-core/src/backprop.rs | 18 +++++++------- candle-core/src/metal_backend/mod.rs | 4 ++++ candle-core/src/op.rs | 36 ++++++++++++++++++++++++++++ candle-core/src/tensor.rs | 1 + candle-core/tests/tensor_tests.rs | 8 +++++++ candle-kernels/src/unary.cu | 9 +++++++ candle-metal-kernels/src/lib.rs | 2 +- candle-metal-kernels/src/unary.metal | 2 ++ 8 files changed, 69 insertions(+), 11 deletions(-) diff --git a/candle-core/src/backprop.rs b/candle-core/src/backprop.rs index f39eedbb..65d91849 100644 --- a/candle-core/src/backprop.rs +++ b/candle-core/src/backprop.rs @@ -112,7 +112,8 @@ impl Tensor { } Op::Unary(_node, UnaryOp::Ceil) | Op::Unary(_node, UnaryOp::Floor) - | Op::Unary(_node, UnaryOp::Round) => nodes, + | Op::Unary(_node, UnaryOp::Round) + | Op::Unary(_node, UnaryOp::Sign) => nodes, Op::Reshape(node) | Op::UpsampleNearest1D { arg: node, .. } | Op::UpsampleNearest2D { arg: node, .. } @@ -488,7 +489,6 @@ impl Tensor { let sum_grad = grads.or_insert(arg)?; *sum_grad = sum_grad.add(&grad)?; } - Op::Cmp(_args, _) => {} Op::Reduce(arg, ReduceOp::Max, reduced_dims) => { let node = broadcast_back(arg, node, reduced_dims)?; let grad = broadcast_back(arg, &grad, reduced_dims)?; @@ -578,20 +578,18 @@ impl Tensor { let sum_grad = grads.or_insert(arg)?; *sum_grad = sum_grad.add(&arg_grad)? } - Op::Reduce(_, ReduceOp::ArgMin, _) => {} - Op::Reduce(_, ReduceOp::ArgMax, _) => {} + Op::Unary(_, UnaryOp::Floor) + | Op::Unary(_, UnaryOp::Round) + | Op::Reduce(_, ReduceOp::ArgMin, _) + | Op::Reduce(_, ReduceOp::ArgMax, _) + | Op::Unary(_, UnaryOp::Sign) + | Op::Cmp(_, _) => {} Op::Reshape(arg) => { let arg_grad = grad.reshape(arg.dims())?; let sum_grad = grads.or_insert(arg)?; *sum_grad = sum_grad.add(&arg_grad)? } Op::Unary(_, UnaryOp::Ceil) => Err(Error::BackwardNotSupported { op: "ceil" })?, - Op::Unary(_, UnaryOp::Floor) => { - Err(Error::BackwardNotSupported { op: "floor" })? - } - Op::Unary(_, UnaryOp::Round) => { - Err(Error::BackwardNotSupported { op: "round" })? 
- } Op::Unary(arg, UnaryOp::Gelu) => { let sum_grad = grads.or_insert(arg)?; let cube = arg.powf(3.)?; diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs index fa6973b4..0e058b45 100644 --- a/candle-core/src/metal_backend/mod.rs +++ b/candle-core/src/metal_backend/mod.rs @@ -497,6 +497,10 @@ impl BackendStorage for MetalStorage { ("utanh", DType::F16) => contiguous::tanh::HALF, ("utanh", DType::F32) => contiguous::tanh::FLOAT, ("utanh", DType::BF16) => contiguous::tanh::BFLOAT, + ("usign", DType::F16) => contiguous::sign::HALF, + ("usign", DType::F32) => contiguous::sign::FLOAT, + ("usign", DType::BF16) => contiguous::sign::BFLOAT, + ("usign", DType::I64) => contiguous::sign::I64, (name, dtype) => { crate::bail!("Metal contiguous unary {name} {dtype:?} not implemented") } diff --git a/candle-core/src/op.rs b/candle-core/src/op.rs index 776f5182..49ba44be 100644 --- a/candle-core/src/op.rs +++ b/candle-core/src/op.rs @@ -66,6 +66,7 @@ pub enum UnaryOp { Floor, Ceil, Round, + Sign, } #[derive(Clone)] @@ -254,6 +255,7 @@ pub(crate) struct Tanh; pub(crate) struct Floor; pub(crate) struct Ceil; pub(crate) struct Round; +pub(crate) struct Sign; macro_rules! bin_op { ($op:ident, $name: literal, $e: expr, $f32_vec: ident, $f64_vec: ident) => { @@ -925,3 +927,37 @@ impl std::ops::Deref for BackpropOp { &self.0 } } + +impl UnaryOpT for Sign { + const NAME: &'static str = "sign"; + const KERNEL: &'static str = "usign"; + const V: Self = Sign; + #[inline(always)] + fn bf16(v: bf16) -> bf16 { + bf16::from((v > bf16::ZERO) as i8) - bf16::from((v < bf16::ZERO) as i8) + } + #[inline(always)] + fn f16(v: f16) -> f16 { + f16::from((v > f16::ZERO) as i8) - f16::from((v < f16::ZERO) as i8) + } + #[inline(always)] + fn f32(v: f32) -> f32 { + f32::from(v > 0.) - f32::from(v < 0.) + } + #[inline(always)] + fn f64(v: f64) -> f64 { + f64::from(v > 0.) - f64::from(v < 0.) + } + #[inline(always)] + fn u8(v: u8) -> u8 { + u8::min(1, v) + } + #[inline(always)] + fn u32(v: u32) -> u32 { + u32::min(1, v) + } + #[inline(always)] + fn i64(v: i64) -> i64 { + (v > 0) as i64 - (v < 0) as i64 + } +} diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs index b53b0419..a5a9dbb1 100644 --- a/candle-core/src/tensor.rs +++ b/candle-core/src/tensor.rs @@ -510,6 +510,7 @@ impl Tensor { unary_op!(ceil, Ceil); unary_op!(floor, Floor); unary_op!(round, Round); + unary_op!(sign, Sign); /// Round element of the input tensor to the nearest integer. /// diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs index b3275804..78841779 100644 --- a/candle-core/tests/tensor_tests.rs +++ b/candle-core/tests/tensor_tests.rs @@ -151,6 +151,14 @@ fn unary_op(device: &Device) -> Result<()> { test_utils::to_vec1_round(&tensor.round_to(-2)?, 4)?, [3000.0, 300.] ); + let tensor = Tensor::new( + &[-1.01f32, -0.9, -0.1, 0.0, -0.0, 0.1, 0.9, 1.0, 1.1], + device, + )?; + assert_eq!( + tensor.sign()?.to_vec1::()?, + [-1., -1., -1., 0., 0., 1., 1., 1., 1.] 
+    );
     Ok(())
 }
diff --git a/candle-kernels/src/unary.cu b/candle-kernels/src/unary.cu
index 13489897..a234304a 100644
--- a/candle-kernels/src/unary.cu
+++ b/candle-kernels/src/unary.cu
@@ -86,6 +86,11 @@ extern "C" __global__ void FN_NAME( \
   } \
 } \
 
+template<typename T>
+__device__ T sign_(T t) {
+  return static_cast<T>(t > static_cast<T>(0)) - static_cast<T>(t < static_cast<T>(0));
+}
+
 #if __CUDA_ARCH__ >= 800
 UNARY_OP(__nv_bfloat16, ucopy_bf16, x)
@@ -110,6 +115,7 @@ UNARY_OP(__nv_bfloat16, urelu_bf16, relu_fwd(x))
 UNARY_OP1(__nv_bfloat16, uelu_bf16, elu_fwd(x, param))
 UNARY_OP(__nv_bfloat16, usilu_bf16, silu_fwd(x))
 UNARY_OP1(__nv_bfloat16, upowf_bf16, powg(x, param))
+UNARY_OP(__nv_bfloat16, usign_bf16, sign_(x))
 #endif
 
 #if __CUDA_ARCH__ >= 530
@@ -135,6 +141,7 @@ UNARY_OP(__half, urelu_f16, relu_fwd(x))
 UNARY_OP1(__half, uelu_f16, elu_fwd(x, param))
 UNARY_OP(__half, usilu_f16, silu_fwd(x))
 UNARY_OP1(__half, upowf_f16, powg(x, param))
+UNARY_OP(__half, usign_f16, sign_(x))
 #endif
 
 UNARY_OP(uint8_t, ucopy_u8, x)
@@ -184,3 +191,5 @@ UNARY_OP(float, usilu_f32, silu_fwd(x))
 UNARY_OP(double, usilu_f64, silu_fwd(x))
 UNARY_OP1(float, upowf_f32, powg(x, param))
 UNARY_OP1(double, upowf_f64, powg(x, param))
+UNARY_OP(float, usign_f32, sign_(x))
+UNARY_OP(double, usign_f64, sign_(x))
diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs
index 140927e3..5af48fae 100644
--- a/candle-metal-kernels/src/lib.rs
+++ b/candle-metal-kernels/src/lib.rs
@@ -193,7 +193,7 @@ macro_rules! ops{
 pub mod unary {
     ops!(
         cos, sin, exp, sqr, sqrt, neg, log, gelu, abs, ceil, floor, relu, round, erf, gelu_erf,
-        tanh, recip, silu
+        tanh, recip, silu, sign
     );
 }
 pub mod binary {
diff --git a/candle-metal-kernels/src/unary.metal b/candle-metal-kernels/src/unary.metal
index bdc13f9e..809522d7 100644
--- a/candle-metal-kernels/src/unary.metal
+++ b/candle-metal-kernels/src/unary.metal
@@ -145,6 +145,7 @@ UNARY_OP(erf)
 UNARY_OP(tanh)
 UNARY_OP(recip)
 UNARY_OP(relu)
+UNARY_OP(sign)
 UNARY(id, float, copy_f32, copy_f32_strided)
 UNARY(id, half, copy_f16, copy_f16_strided)
 UNARY(id, uint8_t, copy_u8, copy_u8_strided)
@@ -174,6 +175,7 @@ BFLOAT_UNARY_OP(erf)
 BFLOAT_UNARY_OP(tanh)
 BFLOAT_UNARY_OP(recip)
 BFLOAT_UNARY_OP(relu)
+BFLOAT_UNARY_OP(sign)
 UNARY(id, bfloat, copy_bf16, copy_bf16_strided)
From c87381fc9643ca15648c2e8379e44a596ba1854b Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Thu, 4 Apr 2024 23:30:10 +0200
Subject: [PATCH 116/131] Use F16 for moondream on cuda. (#2013)

---
 candle-examples/examples/moondream/main.rs  | 12 +++++++++---
 candle-transformers/src/models/mixformer.rs | 13 ++++++++-----
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/candle-examples/examples/moondream/main.rs b/candle-examples/examples/moondream/main.rs
index bcc21337..dfd83037 100644
--- a/candle-examples/examples/moondream/main.rs
+++ b/candle-examples/examples/moondream/main.rs
@@ -283,6 +283,11 @@ async fn main() -> anyhow::Result<()> {
     let start = std::time::Instant::now();
     let device = candle_examples::device(args.cpu)?;
     let config = moondream::Config::v2();
+    let dtype = if device.is_cuda() && !args.quantized {
+        DType::F16
+    } else {
+        DType::F32
+    };
     let model = if args.quantized {
         let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
             &model_file,
@@ -291,15 +296,16 @@ async fn main() -> anyhow::Result<()> {
         let model = quantized_moondream::Model::new(&config, vb)?;
         Model::Quantized(model)
     } else {
-        let vb =
-            unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)?
}; + let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], dtype, &device)? }; let model = moondream::Model::new(&config, vb)?; Model::Moondream(model) }; println!("loaded the model in {:?}", start.elapsed()); let start = std::time::Instant::now(); - let image = load_image(args.image)?.to_device(&device)?; + let image = load_image(args.image)? + .to_device(&device)? + .to_dtype(dtype)?; let image_embeds = image.unsqueeze(0)?; let image_embeds = match model { Model::Moondream(ref m) => image_embeds.apply(m.vision_encoder())?, diff --git a/candle-transformers/src/models/mixformer.rs b/candle-transformers/src/models/mixformer.rs index 65a1665a..de15c3a5 100644 --- a/candle-transformers/src/models/mixformer.rs +++ b/candle-transformers/src/models/mixformer.rs @@ -135,7 +135,9 @@ fn get_mask(size: usize, device: &Device) -> Result { fn masked_fill(on_false: &Tensor, mask: &Tensor, on_true: f32) -> Result { let shape = mask.shape(); - let on_true = Tensor::new(on_true, on_false.device())?.broadcast_as(shape.dims())?; + let on_true = Tensor::new(on_true, on_false.device())? + .to_dtype(on_false.dtype())? + .broadcast_as(shape.dims())?; let m = mask.where_cond(&on_true, on_false)?; Ok(m) } @@ -147,7 +149,7 @@ struct RotaryEmbedding { } impl RotaryEmbedding { - fn new(dim: usize, max_seq_len: usize, dev: &Device) -> Result { + fn new(dim: usize, max_seq_len: usize, dtype: DType, dev: &Device) -> Result { let inv_freq: Vec<_> = (0..dim) .step_by(2) .map(|i| 1f32 / 10000f32.powf(i as f32 / dim as f32)) @@ -159,8 +161,8 @@ impl RotaryEmbedding { .reshape((max_seq_len, 1))?; let freqs = t.matmul(&inv_freq)?; Ok(Self { - sin: freqs.sin()?, - cos: freqs.cos()?, + sin: freqs.sin()?.to_dtype(dtype)?, + cos: freqs.cos()?.to_dtype(dtype)?, }) } @@ -274,7 +276,8 @@ impl MHA { let op_size = cfg.n_embd; let wqkv = linear(cfg.n_embd, 3 * op_size, vb.pp("Wqkv"))?; let out_proj = linear(op_size, cfg.n_embd, vb.pp("out_proj"))?; - let rotary_emb = RotaryEmbedding::new(cfg.rotary_dim, MAX_SEQ_LEN, vb.device())?; + let rotary_emb = + RotaryEmbedding::new(cfg.rotary_dim, MAX_SEQ_LEN, vb.dtype(), vb.device())?; let softmax_scale = 1f64 / (head_dim as f64).sqrt(); Ok(Self { wqkv, From ace282e5c2ef24ca2fb90683babb852936d4df17 Mon Sep 17 00:00:00 2001 From: Santiago Medina Date: Thu, 4 Apr 2024 22:03:33 -0700 Subject: [PATCH 117/131] Add flag to run Moondream in f16 precision (#2015) * moondream implementation * add moondream example * change config default activation * Add assets and integrate phi mixformer with example * Make use of kv cache and fix seq_len bug; Clean up example code * Add README link to example * Remove pos_embed scaling; Remove assets; Add to README; Expand VisionConfig * Delete image * Use apply instead of forward * Use latest release special token; Fix token/s accuracy; Use GeluPytorchTanh in VisionConfig v2 * Add flag to use f16 * Avoid breaking the quantized version on cuda. --------- Co-authored-by: laurent --- candle-examples/examples/moondream/main.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/candle-examples/examples/moondream/main.rs b/candle-examples/examples/moondream/main.rs index dfd83037..c7500ed9 100644 --- a/candle-examples/examples/moondream/main.rs +++ b/candle-examples/examples/moondream/main.rs @@ -194,6 +194,10 @@ struct Args { #[arg(long)] quantized: bool, + /// Use f16 precision for all the computations rather than f32. 
+ #[arg(long)] + f16: bool, + #[arg(long)] model_file: Option, @@ -283,7 +287,12 @@ async fn main() -> anyhow::Result<()> { let start = std::time::Instant::now(); let device = candle_examples::device(args.cpu)?; let config = moondream::Config::v2(); - let dtype = if device.is_cuda() && !args.quantized { + let dtype = if args.quantized { + if args.f16 { + anyhow::bail!("Quantized model does not support f16"); + } + DType::F32 + } else if device.is_cuda() || args.f16 { DType::F16 } else { DType::F32 From 2ac302a5d170953a1d2fe850645563fc55d1567f Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Fri, 5 Apr 2024 08:32:58 +0200 Subject: [PATCH 118/131] Add the rope THD kernel. (#2014) * Add the rope THD kernel. * Cuda kernel for rope-thd. * Add the metal kernels. * Add a dedicated test. --- candle-kernels/src/reduce.cu | 48 +++- candle-metal-kernels/src/lib.rs | 45 ++++ candle-metal-kernels/src/reduce.metal | 48 +++- candle-nn/src/rotary_emb.rs | 231 ++++++++++++++++++++ candle-nn/tests/ops.rs | 31 +++ candle-transformers/src/models/mixformer.rs | 28 +-- 6 files changed, 400 insertions(+), 31 deletions(-) diff --git a/candle-kernels/src/reduce.cu b/candle-kernels/src/reduce.cu index 2af81c42..4dbd8dcc 100644 --- a/candle-kernels/src/reduce.cu +++ b/candle-kernels/src/reduce.cu @@ -179,6 +179,33 @@ __device__ void rope(const T * src, const T * cos, const T * sin, T * dst, const dst[i2] = src[i1] * s + src[i2] * c; } +template +__device__ void rope_thd( + const T * src, + const T * cos, + const T * sin, + T * dst, + const uint32_t b, + const uint32_t t, + const uint32_t h, + const uint32_t d +) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (2 * idx >= b * t * h * d) return; + + uint32_t i_bth = idx / (d / 2); + uint32_t i_d = idx - (d / 2) * i_bth; + uint32_t i_t = (i_bth / h) % t; + uint32_t i1 = i_bth * d + i_d; + uint32_t i2 = i1 + d / 2; + uint32_t i_cs = i_t * (d / 2) + i_d; + T c = cos[i_cs]; + T s = sin[i_cs]; + + dst[i1] = src[i1] * c - src[i2] * s; + dst[i2] = src[i1] * s + src[i2] * c; +} + template __device__ void fast_max(const size_t src_numel, const size_t el_to_sum_per_block, @@ -434,7 +461,7 @@ fast_argmax(const size_t src_numel, const size_t el_to_sum_per_block, rmsnorm(src, dst, alpha, n_cols, eps); \ } \ -#define ROPE_OP(TYPENAME, FN_NAME, FN_NAME_I) \ +#define ROPE_OP(TYPENAME, FN_NAME, FN_NAME_I, FN_NAME_THD) \ extern "C" __global__ void FN_NAME_I( \ const TYPENAME *src, \ const TYPENAME *cos, \ @@ -454,11 +481,22 @@ fast_argmax(const size_t src_numel, const size_t el_to_sum_per_block, const uint32_t d) { \ rope(src, cos, sin, dst, bh, td, d); \ } \ + extern "C" __global__ void FN_NAME_THD( \ + const TYPENAME *src, \ + const TYPENAME *cos, \ + const TYPENAME *sin, \ + TYPENAME *dst, \ + const uint32_t b, \ + const uint32_t t, \ + const uint32_t h, \ + const uint32_t d) { \ + rope_thd(src, cos, sin, dst, b, t, h, d); \ + } \ #if __CUDA_ARCH__ >= 800 SOFTMAX_OP(__nv_bfloat16, float, softmax_bf16) RMSNORM_OP(__nv_bfloat16, rmsnorm_bf16) -ROPE_OP(__nv_bfloat16, rope_bf16, rope_i_bf16) +ROPE_OP(__nv_bfloat16, rope_bf16, rope_i_bf16, rope_thd_bf16) SUM_OP(__nv_bfloat16, sum_bf16) FAST_OP(__nv_bfloat16, fast_min_bf16, fast_max_bf16, fast_argmin_bf16, fast_argmax_bf16, fast_sum_bf16) #endif @@ -466,7 +504,7 @@ FAST_OP(__nv_bfloat16, fast_min_bf16, fast_max_bf16, fast_argmin_bf16, fast_argm #if __CUDA_ARCH__ >= 530 SOFTMAX_OP(__half, float, softmax_f16) RMSNORM_OP(__half, rmsnorm_f16) -ROPE_OP(__half, rope_f16, rope_i_f16) +ROPE_OP(__half, rope_f16, rope_i_f16, 
rope_thd_f16) SUM_OP(__half, sum_f16) FAST_OP(__half, fast_min_f16, fast_max_f16, fast_argmin_f16, fast_argmax_f16, fast_sum_f16) #endif @@ -478,8 +516,8 @@ SOFTMAX_OP(float, float, softmax_f32) SOFTMAX_OP(double, double, softmax_f64) RMSNORM_OP(float, rmsnorm_f32) RMSNORM_OP(double, rmsnorm_f64) -ROPE_OP(float, rope_f32, rope_i_f32) -ROPE_OP(double, rope_f64, rope_i_f64) +ROPE_OP(float, rope_f32, rope_i_f32, rope_thd_f32) +ROPE_OP(double, rope_f64, rope_i_f64, rope_thd_f64) FAST_OP(float, fast_min_f32, fast_max_f32, fast_argmin_f32, fast_argmax_f32, fast_sum_f32) FAST_OP(double, fast_min_f64, fast_max_f64, fast_argmin_f64, fast_argmax_f64, fast_sum_f64) diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index 5af48fae..4cff9bda 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -849,6 +849,51 @@ pub fn call_rope_i( Ok(()) } +#[allow(clippy::too_many_arguments)] +pub fn call_rope_thd( + device: &Device, + command_buffer: &CommandBufferRef, + kernels: &Kernels, + kernel_name: &'static str, + b: usize, + t: usize, + h: usize, + d: usize, + src: &Buffer, + src_offset: usize, + cos: &Buffer, + cos_offset: usize, + sin: &Buffer, + sin_offset: usize, + output: &Buffer, +) -> Result<(), MetalKernelError> { + let pipeline = kernels.load_pipeline(device, Source::Reduce, kernel_name)?; + let encoder = command_buffer.new_compute_command_encoder(); + encoder.set_compute_pipeline_state(&pipeline); + + set_params!( + encoder, + ( + b, + t, + h, + d, + (src, src_offset), + (cos, cos_offset), + (sin, sin_offset), + output + ) + ); + let (thread_group_count, thread_group_size) = linear_split(&pipeline, (b * t * h * d) / 2); + encoder.use_resource(src, metal::MTLResourceUsage::Read); + encoder.use_resource(cos, metal::MTLResourceUsage::Read); + encoder.use_resource(sin, metal::MTLResourceUsage::Read); + encoder.use_resource(output, metal::MTLResourceUsage::Write); + encoder.dispatch_thread_groups(thread_group_count, thread_group_size); + encoder.end_encoding(); + Ok(()) +} + #[allow(clippy::too_many_arguments)] pub fn call_rope( device: &Device, diff --git a/candle-metal-kernels/src/reduce.metal b/candle-metal-kernels/src/reduce.metal index acb69299..14bfb297 100644 --- a/candle-metal-kernels/src/reduce.metal +++ b/candle-metal-kernels/src/reduce.metal @@ -418,7 +418,34 @@ METAL_FUNC void rope( dst[i2] = src[i1] * s + src[i2] * c; } -#define ROPEI(FN_NAME, FN_NAME_I, TYPENAME) \ +template +METAL_FUNC void rope_thd( + constant size_t &b, + constant size_t &t, + constant size_t &h, + constant size_t &d, + device const T *src, + device const T *cos, + device const T *sin, + device T *dst, + uint idx +) { + if (2 * idx >= b * t * h * d) { + return; + } + const size_t i_bth = idx / (d / 2); + const size_t i_d = idx - (d / 2) * i_bth; + const size_t i_t = (i_bth / h) % t; + const size_t i1 = i_bth * d + i_d; + const size_t i2 = i1 + d / 2; + const size_t i_cs = i_t * (d / 2) + i_d; + T c = cos[i_cs]; + T s = sin[i_cs]; + dst[i1] = src[i1] * c - src[i2] * s; + dst[i2] = src[i1] * s + src[i2] * c; +} + +#define ROPE(FN_NAME, FN_NAME_I, FN_NAME_THD, TYPENAME) \ kernel void FN_NAME_I( \ constant size_t &bh, \ constant size_t &td, \ @@ -442,6 +469,19 @@ kernel void FN_NAME( \ ) { \ rope(bh, td, d, src, cos, sin, dst, idx); \ }\ +kernel void FN_NAME_THD( \ + constant size_t &b, \ + constant size_t &t, \ + constant size_t &h, \ + constant size_t &d, \ + device const TYPENAME *src, \ + device const TYPENAME *cos, \ + device const TYPENAME *sin, \ + device 
TYPENAME *dst, \ + uint idx [[ thread_position_in_grid ]] \ +) { \ + rope_thd(b, t, h, d, src, cos, sin, dst, idx); \ +}\ REDUCE(x + y, fast_sum_f32_strided, float, 0) REDUCE(x + y, fast_sum_u32_strided, uint, 0) @@ -471,8 +511,8 @@ SOFTMAX(softmax_f32, float) SOFTMAX(softmax_f16, half) RMSNORM(rmsnorm_f32, float) RMSNORM(rmsnorm_f16, half) -ROPEI(rope_f32, rope_i_f32, float) -ROPEI(rope_f16, rope_i_f16, half) +ROPE(rope_f32, rope_i_f32, rope_thd_f32, float) +ROPE(rope_f16, rope_i_f16, rope_thd_f16, half) #if __METAL_VERSION__ >= 220 REDUCE(x + y, fast_sum_i64_strided, int64_t, 0) @@ -495,5 +535,5 @@ ARGMIN(fast_argmin_bf16, bfloat, HUGE_VALBF) ARGMAX(fast_argmax_bf16, bfloat, -HUGE_VALBF) SOFTMAX(softmax_bf16, bfloat) RMSNORM(rmsnorm_bf16, bfloat) -ROPEI(rope_bf16, rope_i_bf16, bfloat) +ROPE(rope_bf16, rope_i_bf16, rope_thd_bf16, bfloat) #endif diff --git a/candle-nn/src/rotary_emb.rs b/candle-nn/src/rotary_emb.rs index c2b41482..1084cfb5 100644 --- a/candle-nn/src/rotary_emb.rs +++ b/candle-nn/src/rotary_emb.rs @@ -497,3 +497,234 @@ pub fn rope_slow(x: &Tensor, cos: &Tensor, sin: &Tensor) -> Result { let sin = sin.unsqueeze(0)?.unsqueeze(0)?; x.broadcast_mul(&cos)? + rotate_half(x)?.broadcast_mul(&sin)? } + +/// T (seqlen)/H (num-heads)/D (head-dim) contiguous variant of rope embeddings. +#[derive(Debug, Clone)] +struct RotaryEmbThd; + +impl candle::CustomOp3 for RotaryEmbThd { + fn name(&self) -> &'static str { + "rotary-emb" + } + + fn cpu_fwd( + &self, + s1: &CpuStorage, + l1: &Layout, + s2: &CpuStorage, + l2: &Layout, + s3: &CpuStorage, + l3: &Layout, + ) -> Result<(CpuStorage, Shape)> { + fn inner( + src: &[T], + l_src: &Layout, + cos: &[T], + l_cos: &Layout, + sin: &[T], + l_sin: &Layout, + ) -> Result<(CpuStorage, Shape)> { + let src = match l_src.contiguous_offsets() { + None => candle::bail!("input src has to be contiguous"), + Some((o1, o2)) => &src[o1..o2], + }; + let cos = match l_cos.contiguous_offsets() { + None => candle::bail!("input cos has to be contiguous"), + Some((o1, o2)) => &cos[o1..o2], + }; + let sin = match l_sin.contiguous_offsets() { + None => candle::bail!("input sin has to be contiguous"), + Some((o1, o2)) => &sin[o1..o2], + }; + let (b, t, h, d) = l_src.shape().dims4()?; + let el_count = b * h * t * d; + let mut dst = vec![T::zero(); el_count]; + src.par_chunks(t * h * d) + .zip(dst.par_chunks_mut(t * h * d)) + .for_each(|(src, dst)| { + for i_t in 0..t { + for i_d in 0..d / 2 { + let i_cs = i_t * (d / 2) + i_d; + for i_h in 0..h { + let i1 = i_t * h * d + i_h * d + i_d; + let i2 = i1 + d / 2; + dst[i1] = src[i1] * cos[i_cs] - src[i2] * sin[i_cs]; + dst[i2] = src[i1] * sin[i_cs] + src[i2] * cos[i_cs]; + } + } + } + }); + let storage = candle::WithDType::to_cpu_storage_owned(dst); + Ok((storage, (b, t, h, d).into())) + } + + use candle::backend::BackendStorage; + use CpuStorage::{BF16, F16, F32, F64}; + match (s1, s2, s3) { + (BF16(s1), BF16(s2), BF16(s3)) => inner(s1, l1, s2, l2, s3, l3), + (F16(s1), F16(s2), F16(s3)) => inner(s1, l1, s2, l2, s3, l3), + (F32(s1), F32(s2), F32(s3)) => inner(s1, l1, s2, l2, s3, l3), + (F64(s1), F64(s2), F64(s3)) => inner(s1, l1, s2, l2, s3, l3), + _ => candle::bail!( + "unsupported dtype for rope {:?} {:?} {:?}", + s1.dtype(), + s2.dtype(), + s3.dtype() + ), + } + } + + #[cfg(feature = "cuda")] + fn cuda_fwd( + &self, + s1: &candle::CudaStorage, + l1: &Layout, + s2: &candle::CudaStorage, + l2: &Layout, + s3: &candle::CudaStorage, + l3: &Layout, + ) -> Result<(candle::CudaStorage, Shape)> { + use 
candle::cuda_backend::cudarc::driver::{ + CudaSlice, DeviceRepr, LaunchAsync, LaunchConfig, + }; + use candle::cuda_backend::{kernel_name, kernels, WrapErr}; + use candle::{CudaDevice, WithDType}; + + fn inner( + src: &CudaSlice, + l_src: &Layout, + cos: &CudaSlice, + l_cos: &Layout, + sin: &CudaSlice, + l_sin: &Layout, + dev: &CudaDevice, + ) -> Result> { + let src = match l_src.contiguous_offsets() { + None => candle::bail!("src input has to be contiguous"), + Some((o1, o2)) => src.slice(o1..o2), + }; + let cos = match l_cos.contiguous_offsets() { + None => candle::bail!("cos input has to be contiguous"), + Some((o1, o2)) => cos.slice(o1..o2), + }; + let sin = match l_sin.contiguous_offsets() { + None => candle::bail!("sin input has to be contiguous"), + Some((o1, o2)) => sin.slice(o1..o2), + }; + let (b, t, h, d) = l_src.shape().dims4()?; + let el = b * h * t * d; + let cfg = LaunchConfig::for_num_elems((el / 2) as u32); + let func = dev.get_or_load_func(&kernel_name::("rope_thd"), kernels::REDUCE)?; + // SAFETY: Set later by running the kernel. + let dst = unsafe { dev.alloc::(el) }.w()?; + let params = ( + &src, &cos, &sin, &dst, b as u32, t as u32, h as u32, d as u32, + ); + // SAFETY: ffi. + unsafe { func.launch(cfg, params) }.w()?; + Ok(dst) + } + + use candle::backend::BackendStorage; + use candle::cuda_backend::CudaStorageSlice::{BF16, F16, F32, F64}; + let dev = s1.device(); + let slice = match (&s1.slice, &s2.slice, &s3.slice) { + (BF16(s1), BF16(s2), BF16(s3)) => BF16(inner(s1, l1, s2, l2, s3, l3, dev)?), + (F16(s1), F16(s2), F16(s3)) => F16(inner(s1, l1, s2, l2, s3, l3, dev)?), + (F32(s1), F32(s2), F32(s3)) => F32(inner(s1, l1, s2, l2, s3, l3, dev)?), + (F64(s1), F64(s2), F64(s3)) => F64(inner(s1, l1, s2, l2, s3, l3, dev)?), + _ => candle::bail!( + "unsupported dtype for rope {:?} {:?} {:?}", + s1.dtype(), + s2.dtype(), + s3.dtype() + ), + }; + let dst = candle::cuda_backend::CudaStorage { + slice, + device: dev.clone(), + }; + Ok((dst, l1.shape().clone())) + } + + #[cfg(feature = "metal")] + fn metal_fwd( + &self, + src: &candle::MetalStorage, + l_src: &Layout, + cos: &candle::MetalStorage, + l_cos: &Layout, + sin: &candle::MetalStorage, + l_sin: &Layout, + ) -> Result<(candle::MetalStorage, Shape)> { + use candle::backend::BackendStorage; + let device = src.device(); + let command_buffer = device.command_buffer()?; + let kernels = device.kernels(); + if cos.dtype() != src.dtype() || sin.dtype() != src.dtype() { + candle::bail!( + "dtype mismatch in rope {:?} {:?} {:?}", + src.dtype(), + cos.dtype(), + sin.dtype() + ) + } + let name = match src.dtype() { + candle::DType::F32 => "rope_thd_f32", + candle::DType::F16 => "rope_thd_f16", + candle::DType::BF16 => "rope_thd_bf16", + dtype => candle::bail!("rope_thd is not implemented for {dtype:?}"), + }; + let (b, t, h, d) = l_src.shape().dims4()?; + let el = b * h * t * d; + let output = device.new_buffer(el, src.dtype(), "rope-thd")?; + candle_metal_kernels::call_rope_thd( + device.metal_device(), + &command_buffer, + kernels, + name, + b, + t, + h, + d, + src.buffer(), + l_src.start_offset() * src.dtype().size_in_bytes(), + cos.buffer(), + l_cos.start_offset() * cos.dtype().size_in_bytes(), + sin.buffer(), + l_sin.start_offset() * sin.dtype().size_in_bytes(), + &output, + ) + .map_err(candle::Error::wrap)?; + let out = candle::MetalStorage::new(output, device.clone(), el, src.dtype()); + Ok((out, l_src.shape().clone())) + } +} + +pub fn rope_thd(xs: &Tensor, cos: &Tensor, sin: &Tensor) -> Result { + let (_b_sz, seq_len, 
_n_head, n_embd) = xs.dims4()?;
+    let (cos_seq_len, cos_n_embd) = cos.dims2()?;
+    let (sin_seq_len, sin_n_embd) = sin.dims2()?;
+    if cos_n_embd * 2 != n_embd
+        || sin_n_embd * 2 != n_embd
+        || seq_len > cos_seq_len
+        || seq_len > sin_seq_len
+    {
+        candle::bail!(
+            "inconsistent last dim size in rope {:?} {:?} {:?}",
+            xs.shape(),
+            cos.shape(),
+            sin.shape()
+        )
+    }
+    if !xs.is_contiguous() {
+        candle::bail!("xs has to be contiguous in rope")
+    }
+    if !cos.is_contiguous() {
+        candle::bail!("cos has to be contiguous in rope")
+    }
+    if !sin.is_contiguous() {
+        candle::bail!("sin has to be contiguous in rope")
+    }
+    xs.apply_op3_no_bwd(cos, sin, &RotaryEmbThd)
+}
diff --git a/candle-nn/tests/ops.rs b/candle-nn/tests/ops.rs
index 20a66e75..24a49d06 100644
--- a/candle-nn/tests/ops.rs
+++ b/candle-nn/tests/ops.rs
@@ -140,7 +140,38 @@ fn rope(device: &Device) -> Result<()> {
     Ok(())
 }
 
+fn rope_thd(device: &Device) -> Result<()> {
+    use rand::{rngs::StdRng, Rng, SeedableRng};
+
+    let (b_size, num_head, seq_len, head_dim) = (2, 5, 10, 16);
+    let el_count = b_size * num_head * seq_len * head_dim;
+    let mut rng = StdRng::seed_from_u64(299792458);
+    let src: Vec<f32> = (0..el_count).map(|_| rng.gen::<f32>()).collect();
+    let cos: Vec<f32> = (0..seq_len * head_dim / 2)
+        .map(|_| rng.gen::<f32>())
+        .collect();
+    let sin: Vec<f32> = (0..seq_len * head_dim / 2)
+        .map(|_| rng.gen::<f32>())
+        .collect();
+    let src = Tensor::from_vec(src, (b_size, num_head, seq_len, head_dim), device)?;
+    let cos = Tensor::from_vec(cos, (seq_len, head_dim / 2), device)?;
+    let sin = Tensor::from_vec(sin, (seq_len, head_dim / 2), device)?;
+    let rope1 = {
+        let src = src.transpose(1, 2)?.contiguous()?;
+        candle_nn::rotary_emb::rope_thd(&src, &cos, &sin)?.transpose(1, 2)?
+    };
+    let rope2 = candle_nn::rotary_emb::rope_slow(&src, &cos, &sin)?;
+    let sum_diff = (rope1 - rope2)?.abs()?.sum_all()?.to_vec0::<f32>()?;
+    if device.is_cpu() {
+        assert_eq!(sum_diff, 0.);
+    } else {
+        assert!(sum_diff < 1e-4);
+    }
+    Ok(())
+}
+
 test_device!(ropei, ropei_cpu, ropei_gpu, ropei_metal);
 test_device!(rope, rope_cpu, rope_gpu, rope_metal);
+test_device!(rope_thd, rope_thd_cpu, rope_thd_gpu, rope_thd_metal);
 test_device!(softmax, softmax_cpu, softmax_gpu, softmax_metal);
 test_device!(rms_norm, rms_norm_cpu, rms_norm_gpu, rms_norm_metal);
diff --git a/candle-transformers/src/models/mixformer.rs b/candle-transformers/src/models/mixformer.rs
index de15c3a5..e9451f0e 100644
--- a/candle-transformers/src/models/mixformer.rs
+++ b/candle-transformers/src/models/mixformer.rs
@@ -177,30 +177,14 @@ impl RotaryEmbedding {
         }
         let (_rotary_seqlen, rotary_dim) = self.cos.dims2()?;
         let rotary_dim = rotary_dim * 2;
-        let q_rot = qkv.i((.., .., 0, .., ..rotary_dim))?;
+        let q_rot = qkv.i((.., .., 0, .., ..rotary_dim))?.contiguous()?;
         let q_pass = qkv.i((.., .., 0, .., rotary_dim..))?;
-        let k_rot = qkv.i((.., .., 1, .., ..rotary_dim))?;
+        let k_rot = qkv.i((.., .., 1, .., ..rotary_dim))?.contiguous()?;
         let k_pass = qkv.i((.., .., 1, .., rotary_dim..))?;
-        let q12 = q_rot.chunk(2, D::Minus1)?;
-        let k12 = k_rot.chunk(2, D::Minus1)?;
-        let (q1, q2) = (&q12[0], &q12[1]);
-        let (k1, k2) = (&k12[0], &k12[1]);
-        let c = self.cos.narrow(0, seqlen_offset, seqlen)?.unsqueeze(1)?;
-        let s = self.sin.narrow(0, seqlen_offset, seqlen)?.unsqueeze(1)?;
-        let q_rot = Tensor::cat(
-            &[
-                (q1.broadcast_mul(&c)? - q2.broadcast_mul(&s)?)?,
-                (q1.broadcast_mul(&s)? + q2.broadcast_mul(&c)?)?,
-            ],
-            D::Minus1,
-        )?;
-        let k_rot = Tensor::cat(
-            &[
-                (k1.broadcast_mul(&c)?
- k2.broadcast_mul(&s)?)?, - (k1.broadcast_mul(&s)? + k2.broadcast_mul(&c)?)?, - ], - D::Minus1, - )?; + let c = self.cos.narrow(0, seqlen_offset, seqlen)?; + let s = self.sin.narrow(0, seqlen_offset, seqlen)?; + let q_rot = candle_nn::rotary_emb::rope_thd(&q_rot, &c, &s)?; + let k_rot = candle_nn::rotary_emb::rope_thd(&k_rot, &c, &s)?; let q = Tensor::cat(&[&q_rot, &q_pass], D::Minus1)?; let k = Tensor::cat(&[&k_rot, &k_pass], D::Minus1)?; let v = qkv.i((.., .., 2))?; From 88f77935980f55b55e516cfd1feec946cc2f76b6 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Fri, 5 Apr 2024 09:11:08 +0200 Subject: [PATCH 119/131] Moondream tracing. (#2016) * Moondream tracing. * A bit more tracing. --- candle-transformers/src/models/mixformer.rs | 32 ++++++++++++++++----- candle-transformers/src/models/moondream.rs | 29 +++++++++++++++---- 2 files changed, 48 insertions(+), 13 deletions(-) diff --git a/candle-transformers/src/models/mixformer.rs b/candle-transformers/src/models/mixformer.rs index e9451f0e..d9676a35 100644 --- a/candle-transformers/src/models/mixformer.rs +++ b/candle-transformers/src/models/mixformer.rs @@ -198,6 +198,7 @@ struct MLP { fc1: Linear, fc2: Linear, act: Activation, + span: tracing::Span, } impl MLP { @@ -209,12 +210,14 @@ impl MLP { fc1, fc2, act: cfg.activation_function, + span: tracing::span!(tracing::Level::TRACE, "mlp"), }) } } impl Module for MLP { fn forward(&self, xs: &Tensor) -> Result { + let _enter = self.span.enter(); xs.apply(&self.fc1)?.apply(&self.act)?.apply(&self.fc2) } } @@ -252,6 +255,9 @@ struct MHA { n_head: usize, softmax_scale: f64, span: tracing::Span, + span_rope: tracing::Span, + span_mask: tracing::Span, + span_softmax: tracing::Span, } impl MHA { @@ -272,6 +278,9 @@ impl MHA { rotary_emb, softmax_scale, span: tracing::span!(tracing::Level::TRACE, "mha"), + span_rope: tracing::span!(tracing::Level::TRACE, "rope"), + span_mask: tracing::span!(tracing::Level::TRACE, "mask"), + span_softmax: tracing::span!(tracing::Level::TRACE, "softmax"), }) } @@ -287,7 +296,10 @@ impl MHA { Some((prev_k, _)) => prev_k.dim(1)?, }; // In the python implementation, a single tensor is returned with the third axis of size 3. - let (q, k, v) = self.rotary_emb.apply_rotary_emb_qkv(&qkv, seqlen_offset)?; + let (q, k, v) = { + let _enter = self.span_rope.enter(); + self.rotary_emb.apply_rotary_emb_qkv(&qkv, seqlen_offset)? + }; let (k, v) = match &self.kv_cache { None => (k, v), Some((prev_k, prev_v)) => { @@ -307,13 +319,19 @@ impl MHA { // scores = scores + causal_mask.to(dtype=scores.dtype) let attn_weights = match mask { None => attn_weights, - Some(mask) => masked_fill( - &attn_weights, - &mask.broadcast_left(b_size * self.n_head)?, - f32::NEG_INFINITY, - )?, + Some(mask) => { + let _enter = self.span_mask.enter(); + masked_fill( + &attn_weights, + &mask.broadcast_left(b_size * self.n_head)?, + f32::NEG_INFINITY, + )? + } + }; + let attn_weights = { + let _enter = self.span_softmax.enter(); + candle_nn::ops::softmax_last_dim(&attn_weights)? 
}; - let attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?; // output = torch.einsum('bhts,bshd->bthd', attention_drop, v) // attn_weights: b*h,t,s, v: b*h,s,d diff --git a/candle-transformers/src/models/moondream.rs b/candle-transformers/src/models/moondream.rs index 717f3bb4..7ad8c921 100644 --- a/candle-transformers/src/models/moondream.rs +++ b/candle-transformers/src/models/moondream.rs @@ -1,6 +1,7 @@ use crate::models::mixformer::{Config as PhiConfig, MixFormerSequentialForCausalLM as PhiModel}; -use candle::{IndexOp, Result, Tensor, D}; -use candle_nn::{layer_norm, linear_b, Linear, Module, VarBuilder}; +use crate::models::with_tracing::{layer_norm, linear_b, LayerNorm, Linear}; +use candle::{IndexOp, Module, Result, Tensor, D}; +use candle_nn::VarBuilder; pub struct Config { pub phi_config: PhiConfig, @@ -76,6 +77,7 @@ struct Attention { head_dim: usize, qkv: Linear, proj: Linear, + span: tracing::Span, } impl Attention { @@ -87,12 +89,14 @@ impl Attention { head_dim: dim / num_heads, qkv, proj, + span: tracing::span!(tracing::Level::TRACE, "vit-attn"), }) } } impl Module for Attention { fn forward(&self, xs: &Tensor) -> Result { + let _enter = self.span.enter(); let (b, n, c) = xs.dims3()?; let qkv = xs .apply(&self.qkv)? @@ -114,8 +118,9 @@ impl Module for Attention { struct VitBlock { attn: Attention, mlp: Mlp, - norm1: candle_nn::LayerNorm, - norm2: candle_nn::LayerNorm, + norm1: LayerNorm, + norm2: LayerNorm, + span: tracing::Span, } impl VitBlock { @@ -129,12 +134,14 @@ impl VitBlock { mlp, norm1, norm2, + span: tracing::span!(tracing::Level::TRACE, "vit-block"), }) } } impl Module for VitBlock { fn forward(&self, xs: &Tensor) -> Result { + let _enter = self.span.enter(); let ys = xs.apply(&self.norm1)?.apply(&self.attn)?; let xs = (xs + &ys)?; let ys = xs.apply(&self.norm2)?.apply(&self.mlp)?; @@ -148,7 +155,8 @@ struct VisionTransformer { patch_embed: LinearPatchEmbedding, pos_embed: Tensor, blocks: Vec, - norm: candle_nn::LayerNorm, + norm: LayerNorm, + span: tracing::Span, } impl VisionTransformer { @@ -171,12 +179,14 @@ impl VisionTransformer { pos_embed, blocks, norm, + span: tracing::span!(tracing::Level::TRACE, "vit"), }) } } impl Module for VisionTransformer { fn forward(&self, xs: &Tensor) -> Result { + let _enter = self.span.enter(); let mut xs = (&xs.apply(&self.patch_embed)? + &self.pos_embed)?; for block in self.blocks.iter() { xs = xs.apply(block)?; @@ -208,6 +218,7 @@ struct Mlp { fc1: Linear, act: candle_nn::Activation, fc2: Linear, + span: tracing::Span, } impl Mlp { @@ -220,12 +231,18 @@ impl Mlp { ) -> Result { let fc1 = linear_b(in_features, hidden_features, true, vb.pp("fc1"))?; let fc2 = linear_b(hidden_features, out_features, true, vb.pp("fc2"))?; - Ok(Self { fc1, act, fc2 }) + Ok(Self { + fc1, + act, + fc2, + span: tracing::span!(tracing::Level::TRACE, "mlp"), + }) } } impl Module for Mlp { fn forward(&self, xs: &Tensor) -> Result { + let _enter = self.span.enter(); xs.apply(&self.fc1)?.apply(&self.act)?.apply(&self.fc2) } } From b869a659ec678763b4ba03dc73044be2ba9ad562 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Fri, 5 Apr 2024 09:38:26 +0200 Subject: [PATCH 120/131] Faster mask implementation for mixformers. (#2017) * Faster mask implementation for mixformers. * Clippy. 
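Editor's sketch (not part of the patch): the commit that follows replaces the
boolean mask plus masked_fill/where_cond with a 0/-inf mask added straight
onto the attention scores. A standalone illustration of the trick using
public candle APIs, under the assumption that the (size, size) mask
broadcasts over the batched score tensor, as it does in the patch:

// Adding -inf before the softmax zeroes the masked positions, so no
// where_cond pass or full-shape boolean mask is needed.
use candle::{DType, Device, Result, Tensor};

fn main() -> Result<()> {
    let dev = Device::Cpu;
    let size = 3usize;
    let mask: Vec<f32> = (0..size)
        .flat_map(|i| (0..size).map(move |j| if j > i { f32::NEG_INFINITY } else { 0. }))
        .collect();
    let mask = Tensor::from_slice(&mask, (size, size), &dev)?;
    let scores = Tensor::ones((1, size, size), DType::F32, &dev)?;
    // The (size, size) mask broadcasts over the leading batch*head dimension.
    let probs = candle_nn::ops::softmax_last_dim(&scores.broadcast_add(&mask)?)?;
    // Row 0 attends only to position 0, row 1 to positions 0..=1, and so on.
    println!("{probs}");
    Ok(())
}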
--- candle-transformers/src/models/mixformer.rs | 27 +++++---------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/candle-transformers/src/models/mixformer.rs b/candle-transformers/src/models/mixformer.rs index d9676a35..700829e3 100644 --- a/candle-transformers/src/models/mixformer.rs +++ b/candle-transformers/src/models/mixformer.rs @@ -126,20 +126,11 @@ impl Module for Embedding { } } -fn get_mask(size: usize, device: &Device) -> Result { +fn get_mask(size: usize, dtype: DType, device: &Device) -> Result { let mask: Vec<_> = (0..size) - .flat_map(|i| (0..size).map(move |j| u8::from(j > i))) + .flat_map(|i| (0..size).map(move |j| if j > i { f32::NEG_INFINITY } else { 0. })) .collect(); - Tensor::from_slice(&mask, (size, size), device) -} - -fn masked_fill(on_false: &Tensor, mask: &Tensor, on_true: f32) -> Result { - let shape = mask.shape(); - let on_true = Tensor::new(on_true, on_false.device())? - .to_dtype(on_false.dtype())? - .broadcast_as(shape.dims())?; - let m = mask.where_cond(&on_true, on_false)?; - Ok(m) + Tensor::from_slice(&mask, (size, size), device)?.to_dtype(dtype) } #[derive(Debug, Clone)] @@ -252,7 +243,6 @@ struct MHA { rotary_emb: RotaryEmbedding, kv_cache: Option<(Tensor, Tensor)>, head_dim: usize, - n_head: usize, softmax_scale: f64, span: tracing::Span, span_rope: tracing::Span, @@ -273,7 +263,6 @@ impl MHA { wqkv, out_proj, head_dim, - n_head: cfg.n_head, kv_cache: None, rotary_emb, softmax_scale, @@ -321,11 +310,7 @@ impl MHA { None => attn_weights, Some(mask) => { let _enter = self.span_mask.enter(); - masked_fill( - &attn_weights, - &mask.broadcast_left(b_size * self.n_head)?, - f32::NEG_INFINITY, - )? + attn_weights.broadcast_add(mask)? } }; let attn_weights = { @@ -435,7 +420,7 @@ impl MixFormerSequentialForCausalLM { let mask = if seq_len <= 1 { None } else { - Some(get_mask(seq_len, xs.device())?) + Some(get_mask(seq_len, xs.dtype(), xs.device())?) }; for block in self.blocks.iter_mut() { xs = block.forward(&xs, mask.as_ref())? @@ -456,7 +441,7 @@ impl MixFormerSequentialForCausalLM { // https://github.com/vikhyat/moondream/blob/a9d788a20d1543fb1479edc54106e88cff7759d3/moondream/moondream.py#L43-L56 let mut xs = Tensor::cat(&[bos_token, img_embeds.clone(), xs], 1)?; let (_b_size, seq_len, _embds) = xs.dims3()?; - let mask = Some(get_mask(seq_len, xs.device())?); + let mask = Some(get_mask(seq_len, xs.dtype(), xs.device())?); for block in self.blocks.iter_mut() { xs = block.forward(&xs, mask.as_ref())? 
} From ab892274d1ec5ca0c11c25e61337e33e208fab7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Ant=C3=B3nio?= Date: Fri, 5 Apr 2024 14:20:28 +0100 Subject: [PATCH 121/131] first commit (#2018) --- candle-core/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs index ca73bb1c..1f57ca9b 100644 --- a/candle-core/src/lib.rs +++ b/candle-core/src/lib.rs @@ -77,7 +77,7 @@ pub use cuda_backend::cudnn; pub use cpu_backend::CpuStorage; pub use custom_op::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3}; pub use device::{Device, DeviceLocation, NdArray}; -pub use dtype::{DType, FloatDType, IntDType, WithDType}; +pub use dtype::{DType, DTypeParseError, FloatDType, IntDType, WithDType}; pub use error::{Error, Result}; pub use indexer::IndexOp; pub use layout::Layout; From e662431acf30e167e2d63af01090b015f89b6e3b Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sat, 6 Apr 2024 19:35:01 +0200 Subject: [PATCH 122/131] Fix the final rmsnorm for quantized-metavoice. (#2021) --- candle-transformers/src/models/quantized_metavoice.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/candle-transformers/src/models/quantized_metavoice.rs b/candle-transformers/src/models/quantized_metavoice.rs index 84c0388c..947ab750 100644 --- a/candle-transformers/src/models/quantized_metavoice.rs +++ b/candle-transformers/src/models/quantized_metavoice.rs @@ -235,6 +235,7 @@ pub mod transformer { xs = layer.forward(&xs, pos, &mask)? } xs.narrow(1, seqlen - 1, 1)? + .contiguous()? .apply(&self.norm)? .apply(&self.output) } From 9fd52b3b71b37049d4f824379ce74b3f4d1b4eeb Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sat, 6 Apr 2024 20:02:24 +0200 Subject: [PATCH 123/131] Handle the batch dimension in quantized MMV on metal. (#2022) --- candle-core/src/quantized/metal.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/candle-core/src/quantized/metal.rs b/candle-core/src/quantized/metal.rs index 7be0f74e..c310d766 100644 --- a/candle-core/src/quantized/metal.rs +++ b/candle-core/src/quantized/metal.rs @@ -149,8 +149,11 @@ impl QMetalStorage { let (n, k) = self_shape.dims2()?; let mut dst_shape = src_shape.dims().to_vec(); + // We always use a single batch dimension and stack all the tensors in the batch on the + // second dimension as the implementation in candle-metal-kernels doesn't handle batch + // properly. let (b, m) = match dst_shape.len() { - 3 => (dst_shape[0], dst_shape[1]), + 3 => (1, dst_shape[0] * dst_shape[1]), 2 => (1, dst_shape[0]), n => crate::bail!("Invalid rank {n} for quantized matmul metal"), }; From 33c9b6655459bd1086574cef9ba8f2e72a8804c8 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sat, 6 Apr 2024 21:25:38 +0200 Subject: [PATCH 124/131] Add the new gemma models. (#2023) * Add the new gemma models. * Revert the lightning changes. * Support for the 1.1 models. 
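Editor's sketch (not part of the patch): the one-line gemma.rs change below
relies on a serde field alias. The Gemma 1.1 checkpoints renamed hidden_act
to hidden_activation in config.json, and the alias lets a single field accept
both spellings. A self-contained illustration, where String stands in for the
real candle_nn::Activation enum and serde/serde_json are assumed as
dependencies:

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Cfg {
    // Accepts both the old and the new key name.
    #[serde(alias = "hidden_activation")]
    hidden_act: String,
}

fn main() {
    let old: Cfg = serde_json::from_str(r#"{"hidden_act": "gelu"}"#).unwrap();
    let new: Cfg = serde_json::from_str(r#"{"hidden_activation": "gelu_pytorch_tanh"}"#).unwrap();
    println!("{old:?} {new:?}");
}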
--- candle-examples/examples/gemma/main.rs | 35 ++++++++++++++++++++----- candle-transformers/src/models/gemma.rs | 1 + 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/candle-examples/examples/gemma/main.rs b/candle-examples/examples/gemma/main.rs index e1df8790..0e37f5cd 100644 --- a/candle-examples/examples/gemma/main.rs +++ b/candle-examples/examples/gemma/main.rs @@ -16,6 +16,22 @@ use candle_transformers::generation::LogitsProcessor; use hf_hub::{api::sync::Api, Repo, RepoType}; use tokenizers::Tokenizer; +#[derive(Clone, Debug, Copy, PartialEq, Eq, clap::ValueEnum)] +enum Which { + #[value(name = "2b")] + Base2B, + #[value(name = "7b")] + Base7B, + #[value(name = "2b-it")] + Instruct2B, + #[value(name = "7b-it")] + Instruct7B, + #[value(name = "1.1-2b-it")] + InstructV1_1_2B, + #[value(name = "1.1-7b-it")] + InstructV1_1_7B, +} + struct TextGeneration { model: Model, device: Device, @@ -165,6 +181,10 @@ struct Args { /// The context size to consider for the repeat penalty. #[arg(long, default_value_t = 64)] repeat_last_n: usize, + + /// The model to use. + #[arg(long, default_value = "2b")] + which: Which, } fn main() -> Result<()> { @@ -196,14 +216,15 @@ fn main() -> Result<()> { let start = std::time::Instant::now(); let api = Api::new()?; let model_id = match &args.model_id { - Some(model_id) => match model_id.as_str() { - "7b-it" => "google/gemma-7b-it".to_string(), - "7b" => "google/gemma-7b".to_string(), - "2b-it" => "google/gemma-2b-it".to_string(), - "2b" => "google/gemma-2b".to_string(), - _ => model_id.to_string(), + Some(model_id) => model_id.to_string(), + None => match args.which { + Which::InstructV1_1_2B => "google/gemma-1.1-2b-it".to_string(), + Which::InstructV1_1_7B => "google/gemma-1.1-7b-it".to_string(), + Which::Base2B => "google/gemma-2b".to_string(), + Which::Base7B => "google/gemma-7b".to_string(), + Which::Instruct2B => "google/gemma-2b-it".to_string(), + Which::Instruct7B => "google/gemma-7b-it".to_string(), }, - None => "google/gemma-2b".to_string(), }; let repo = api.repo(Repo::with_revision( model_id, diff --git a/candle-transformers/src/models/gemma.rs b/candle-transformers/src/models/gemma.rs index 282d5eb2..ab2a9582 100644 --- a/candle-transformers/src/models/gemma.rs +++ b/candle-transformers/src/models/gemma.rs @@ -11,6 +11,7 @@ fn default_max_position_embeddings() -> usize { pub struct Config { pub attention_bias: bool, pub head_dim: usize, + #[serde(alias = "hidden_activation")] pub hidden_act: candle_nn::Activation, pub hidden_size: usize, pub intermediate_size: usize, From 7f354473cf495db4554e08f84be44ed498f1aa5e Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 7 Apr 2024 12:34:16 +0200 Subject: [PATCH 125/131] Optimize copy-2d for metal. (#2024) * Optimize copy-2d for metal. * Add a hacky stopping rule for moondream. 
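Editor's sketch (not part of the patch): the copy-2d change below stops
flattening the copy into a 1-D dispatch and instead launches a 2-D grid whose
threadgroup shape comes from the MLX-style get_block_dims helper added in
this commit. A 2-D rendition of that routine (the real helper also takes a
third dimension): it hands out at most 2^10 = 1024 threads per threadgroup,
growing each axis by powers of two in round-robin fashion while the axis
still has room.

fn block_dims(dim0: u64, dim1: u64) -> (u64, u64) {
    let (mut p0, mut p1) = (0u64, 0u64);
    let mut sum = 0u64;
    loop {
        let before = sum;
        if dim0 >= (1u64 << (p0 + 1)) {
            p0 += 1;
            sum += 1;
        }
        if sum == 10 {
            break;
        }
        if dim1 >= (1u64 << (p1 + 1)) {
            p1 += 1;
            sum += 1;
        }
        // Stop when both axes are exhausted or the 1024-thread cap is hit.
        if sum == before || sum == 10 {
            break;
        }
    }
    (1u64 << p0, 1u64 << p1)
}

fn main() {
    // A 3 x 100_000 copy gets a thin, wide threadgroup instead of a 1-D one.
    assert_eq!(block_dims(3, 100_000), (2, 512));
}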
--- candle-examples/examples/moondream/main.rs | 2 +- candle-metal-kernels/src/lib.rs | 57 +++++++++++++++++++--- candle-metal-kernels/src/unary.metal | 20 +++----- 3 files changed, 58 insertions(+), 21 deletions(-) diff --git a/candle-examples/examples/moondream/main.rs b/candle-examples/examples/moondream/main.rs index c7500ed9..646ef258 100644 --- a/candle-examples/examples/moondream/main.rs +++ b/candle-examples/examples/moondream/main.rs @@ -123,7 +123,7 @@ impl TextGeneration { let next_token = self.logits_processor.sample(&logits)?; tokens.push(next_token); generated_tokens += 1; - if next_token == eos_token { + if next_token == eos_token || tokens.ends_with(&[27, 10619, 29] /* */) { break; } let token = self.tokenizer.decode(&[next_token], true).map_err(E::msg)?; diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index 4cff9bda..8b9be670 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -40,6 +40,44 @@ fn linear_split(pipeline: &ComputePipelineState, length: usize) -> (MTLSize, MTL (thread_group_count, thread_group_size) } +// https://github.com/ml-explore/mlx/blob/bddf23f175726a57f0e443cd45518c0757daa166/mlx/backend/metal/utils.h#L96 +fn get_block_dims(dim0: u64, dim1: u64, dim2: u64) -> MTLSize { + let mut pows0 = 0u64; + let mut pows1 = 0u64; + let mut pows2 = 0u64; + let mut sum = 0u64; + loop { + let presum = sum; + // Check all the pows + if dim0 >= (1 << (pows0 + 1)) { + pows0 += 1; + sum += 1; + } + if sum == 10 { + break; + } + if dim1 >= (1 << (pows1 + 1)) { + pows1 += 1; + sum += 1; + } + if sum == 10 { + break; + } + if dim2 >= (1 << (pows2 + 1)) { + pows2 += 1; + sum += 1; + } + if sum == presum || sum == 10 { + break; + } + } + MTLSize { + width: 1 << pows0, + height: 1 << pows1, + depth: 1 << pows2, + } +} + fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: P) {
<P as EncoderParam>
::set_param(encoder, position, data) } @@ -396,21 +434,24 @@ pub fn call_copy2d( set_params!( encoder, ( - d1, - d2, - src_s, - dst_s, + d1 as i64, + d2 as i64, + src_s as i64, + dst_s as i64, (input, src_o_in_bytes), (output, dst_o_in_bytes) ) ); - let width: usize = d1 * d2; - let (thread_group_count, thread_group_size) = linear_split(&pipeline, width); - + let grid_dims = MTLSize { + width: d1 as u64, + height: d2 as u64, + depth: 1, + }; + let group_dims = get_block_dims(d1 as u64, d2 as u64, 1); encoder.use_resource(input, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); - encoder.dispatch_thread_groups(thread_group_count, thread_group_size); + encoder.dispatch_threads(grid_dims, group_dims); encoder.end_encoding(); Ok(()) } diff --git a/candle-metal-kernels/src/unary.metal b/candle-metal-kernels/src/unary.metal index 809522d7..4b6363ed 100644 --- a/candle-metal-kernels/src/unary.metal +++ b/candle-metal-kernels/src/unary.metal @@ -104,21 +104,17 @@ UNARY(NAME, bfloat, NAME##_bf16, NAME##_bf16_strided); #define COPY2D(FN_NAME, TYPENAME) \ kernel void FN_NAME( \ - constant size_t &d1, \ - constant size_t &d2, \ - constant size_t &src_s, \ - constant size_t &dst_s, \ + constant int64_t &d1, \ + constant int64_t &d2, \ + constant int64_t &src_s, \ + constant int64_t &dst_s, \ device const TYPENAME *input, \ device TYPENAME *output, \ - uint tid [[ thread_position_in_grid ]] \ + uint2 idx [[thread_position_in_grid]] \ ) { \ - if (tid >= d1 * d2) { \ - return; \ - } \ - size_t idx1 = tid / d2; \ - size_t idx2 = tid - idx1 * d2; \ - size_t src_idx = idx1 * src_s + idx2; \ - size_t dst_idx = idx1 * dst_s + idx2; \ + if (idx.x >= d1 || idx.y >= d2) return; \ + int64_t src_idx = idx.x * src_s + idx.y; \ + int64_t dst_idx = idx.x * dst_s + idx.y; \ output[dst_idx] = input[src_idx]; \ } From c5fe4a7f8983ae7c9641fa923f26ef60538aef06 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 7 Apr 2024 22:37:53 +0200 Subject: [PATCH 126/131] Rework the buffer offset logic for metal kernels (#2028) * Move the metal kernels utils in a separate module. * Use the BufferOffset for unary ops. * Fix clippy lints. * Use the new BufferOffset. * Adapt the binary ops. * Affine. * More ops (powf, elu, cast). 
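Editor's sketch (not part of the patch): the core of the rework below is a
small BufferOffset value that travels with its buffer, instead of a loose
(buffer, offset) pair threaded through every kernel call. A stub
illustration, where Buffer stands in for metal::Buffer and the real struct
lives in the new candle-metal-kernels utils module:

struct Buffer; // stand-in for metal::Buffer in this sketch

struct BufferOffset<'a> {
    buffer: &'a Buffer,
    offset_in_bytes: usize,
}

impl<'a> BufferOffset<'a> {
    fn zero_offset(buffer: &'a Buffer) -> Self {
        Self { buffer, offset_in_bytes: 0 }
    }
}

fn main() {
    let buf = Buffer;
    // Callers build the offset once, so it cannot be dropped or applied twice.
    let src = BufferOffset { buffer: &buf, offset_in_bytes: 128 };
    let dst = BufferOffset::zero_offset(&buf);
    assert_eq!(src.offset_in_bytes - dst.offset_in_bytes, 128);
}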
--- candle-core/src/metal_backend/mod.rs | 82 ++++---- candle-metal-kernels/src/lib.rs | 289 ++++++--------------------- candle-metal-kernels/src/tests.rs | 58 +++--- candle-metal-kernels/src/utils.rs | 162 +++++++++++++++ 4 files changed, 305 insertions(+), 286 deletions(-) create mode 100644 candle-metal-kernels/src/utils.rs diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs index 0e058b45..4adcda05 100644 --- a/candle-core/src/metal_backend/mod.rs +++ b/candle-core/src/metal_backend/mod.rs @@ -2,8 +2,7 @@ use crate::backend::{BackendDevice, BackendStorage}; use crate::conv::{ParamsConv1D, ParamsConv2D, ParamsConvTranspose1D, ParamsConvTranspose2D}; use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT}; use crate::{CpuStorage, DType, Layout, Result, Shape}; -use candle_metal_kernels::CallConvTranspose2dCfg; -use candle_metal_kernels::Kernels; +use candle_metal_kernels::{BufferOffset, CallConvTranspose2dCfg, Kernels}; use metal::{Buffer, MTLResourceOptions, NSUInteger}; use std::collections::HashMap; use std::ffi::c_void; @@ -12,6 +11,12 @@ use std::sync::{Arc, Mutex, RwLock, TryLockError}; mod device; pub use device::{DeviceId, MetalDevice}; +fn buffer_o<'a>(buffer: &'a Buffer, l: &Layout, dtype: DType) -> BufferOffset<'a> { + BufferOffset { + buffer, + offset_in_bytes: l.start_offset() * dtype.size_in_bytes(), + } +} /// Simple way to catch lock error without /// depending on T #[derive(thiserror::Error, Debug)] @@ -102,7 +107,8 @@ impl BackendStorage for MetalStorage { let buffer = device.new_buffer(el, self.dtype, "affine")?; let command_buffer = self.device.command_buffer()?; - if layout.is_contiguous() && layout.start_offset() == 0 { + let src = buffer_o(&self.buffer, layout, dtype); + if layout.is_contiguous() { let name = match self.dtype { DType::F32 => "affine_f32", DType::F16 => "affine_f16", @@ -115,7 +121,7 @@ impl BackendStorage for MetalStorage { &device.kernels, name, el, - &self.buffer, + src, &buffer, mul as f32, add as f32, @@ -134,9 +140,8 @@ impl BackendStorage for MetalStorage { &device.kernels, name, layout.dims(), - &self.buffer, + src, layout.stride(), - layout.start_offset() * dtype.size_in_bytes(), &buffer, mul as f32, add as f32, @@ -155,7 +160,8 @@ impl BackendStorage for MetalStorage { let buffer = device.new_buffer(el, self.dtype, "powf")?; let command_buffer = self.device.command_buffer()?; - if layout.is_contiguous() && layout.start_offset() == 0 { + let src = buffer_o(&self.buffer, layout, dtype); + if layout.is_contiguous() { let name = match self.dtype { DType::F32 => "powf_f32", DType::F16 => "powf_f16", @@ -168,7 +174,7 @@ impl BackendStorage for MetalStorage { &device.kernels, name, el, - &self.buffer, + src, &buffer, pow as f32, ) @@ -186,9 +192,8 @@ impl BackendStorage for MetalStorage { &device.kernels, name, layout.dims(), - &self.buffer, + src, layout.stride(), - layout.start_offset() * dtype.size_in_bytes(), &buffer, pow as f32, ) @@ -206,7 +211,8 @@ impl BackendStorage for MetalStorage { let buffer = device.new_buffer(el, self.dtype, "elu")?; let command_buffer = self.device.command_buffer()?; - if layout.is_contiguous() && layout.start_offset() == 0 { + let src = buffer_o(&self.buffer, layout, self.dtype); + if layout.is_contiguous() { let name = match self.dtype { DType::F32 => "elu_f32", DType::F16 => "elu_f16", @@ -219,7 +225,7 @@ impl BackendStorage for MetalStorage { &device.kernels, name, el, - &self.buffer, + src, &buffer, alpha as f32, ) @@ -237,9 +243,8 @@ impl BackendStorage for MetalStorage { 
&device.kernels, name, layout.dims(), - &self.buffer, + src, layout.stride(), - layout.start_offset() * dtype.size_in_bytes(), &buffer, alpha as f32, ) @@ -344,7 +349,8 @@ impl BackendStorage for MetalStorage { let el_count = shape.elem_count(); let buffer = device.new_buffer(el_count, dtype, "todtype")?; let command_buffer = device.command_buffer()?; - if layout.is_contiguous() && layout.start_offset() == 0 { + let src = buffer_o(&self.buffer, layout, self.dtype); + if layout.is_contiguous() { let kernel_name = match (self.dtype, dtype) { (DType::U32, DType::BF16) => "cast_u32_bf16", (DType::U32, DType::F16) => "cast_u32_f16", @@ -392,8 +398,7 @@ impl BackendStorage for MetalStorage { &device.kernels, kernel_name, el_count, - &self.buffer, - layout.start_offset() * self.dtype.size_in_bytes(), + src, &buffer, ) .map_err(MetalError::from)?; @@ -420,9 +425,8 @@ impl BackendStorage for MetalStorage { &device.kernels, kernel_name, layout.dims(), - &self.buffer, + src, layout.stride(), - layout.start_offset() * self.dtype.size_in_bytes(), &buffer, ) .map_err(MetalError::from)?; @@ -439,7 +443,8 @@ impl BackendStorage for MetalStorage { let buffer = device.new_buffer(el_count, dtype, B::KERNEL)?; let command_buffer = device.command_buffer()?; command_buffer.set_label(B::KERNEL); - if layout.is_contiguous() && layout.start_offset() == 0 { + let src = buffer_o(&self.buffer, layout, self.dtype); + if layout.is_contiguous() { use candle_metal_kernels::unary::contiguous; let kernel_name = match (B::KERNEL, dtype) { @@ -511,7 +516,7 @@ impl BackendStorage for MetalStorage { &device.kernels, kernel_name, el_count, - &self.buffer, + src, &buffer, ) .map_err(MetalError::from)?; @@ -556,17 +561,16 @@ impl BackendStorage for MetalStorage { crate::bail!("Metal strided unary {name} {dtype:?} not implemented") } }; + let dst = BufferOffset::zero_offset(&buffer); candle_metal_kernels::call_unary_strided( &device.device, &command_buffer, &device.kernels, kernel_name, layout.dims(), - &self.buffer, + src, layout.stride(), - layout.start_offset() * self.dtype.size_in_bytes(), - &buffer, - 0, + dst, ) .map_err(MetalError::from)?; } @@ -1358,17 +1362,20 @@ impl BackendStorage for MetalStorage { DType::U8 => candle_metal_kernels::unary::strided::copy::U8, dtype => crate::bail!("Metal copy_strided {dtype:?} not implemented"), }; + let src = buffer_o(&self.buffer, src_l, self.dtype); + let dst = BufferOffset { + buffer: &dst.buffer, + offset_in_bytes: dst_offset * dst.dtype.size_in_bytes(), + }; candle_metal_kernels::call_unary_strided( &self.device.device, &command_buffer, &self.device.kernels, kernel_name, src_l.dims(), - &self.buffer, + src, src_l.stride(), - src_l.start_offset() * self.dtype.size_in_bytes(), - &dst.buffer, - dst_offset * dst.dtype.size_in_bytes(), + dst, ) .map_err(MetalError::from)?; command_buffer.set_label("copy_strided"); @@ -1402,10 +1409,9 @@ impl MetalStorage { let shape = lhs_l.shape(); let el_count = shape.elem_count(); let command_buffer = device.command_buffer()?; - let (buffer, dtype) = if (lhs_l.is_contiguous() && lhs_l.start_offset() == 0) - && (rhs_l.is_contiguous() && rhs_l.start_offset() == 0) - && &op[..1] != "b" - { + let lhs = buffer_o(&self.buffer, lhs_l, self.dtype); + let rhs = buffer_o(&rhs.buffer, rhs_l, rhs.dtype); + let (buffer, dtype) = if lhs_l.is_contiguous() && rhs_l.is_contiguous() && &op[..1] != "b" { use candle_metal_kernels::binary::contiguous; let (kernel_name, dtype) = match (op, self.dtype) { @@ -1486,8 +1492,8 @@ impl MetalStorage { &device.kernels, 
kernel_name, el_count, - &self.buffer, - &rhs.buffer, + lhs, + rhs, &buffer, ) .map_err(MetalError::from)?; @@ -1585,12 +1591,10 @@ impl MetalStorage { &device.kernels, kernel_name, lhs_l.dims(), - &self.buffer, + lhs, lhs_l.stride(), - lhs_l.start_offset() * self.dtype.size_in_bytes(), - &rhs.buffer, + rhs, rhs_l.stride(), - rhs_l.start_offset() * rhs.dtype.size_in_bytes(), &buffer, ) .map_err(MetalError::from)?; diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index 8b9be670..23c072af 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -1,11 +1,15 @@ use metal::{ - Buffer, CommandBufferRef, CompileOptions, ComputeCommandEncoderRef, ComputePipelineState, - Device, Function, FunctionConstantValues, Library, MTLDataType, MTLSize, NSUInteger, + Buffer, CommandBufferRef, CompileOptions, ComputePipelineState, Device, Function, + FunctionConstantValues, Library, MTLDataType, MTLSize, NSUInteger, }; use std::collections::HashMap; use std::ffi::c_void; use std::sync::RwLock; +mod utils; +pub use utils::BufferOffset; +use utils::{get_block_dims, linear_split}; + const AFFINE: &str = include_str!("affine.metal"); const INDEXING: &str = include_str!("indexing.metal"); const UNARY: &str = include_str!("unary.metal"); @@ -18,138 +22,6 @@ const RANDOM: &str = include_str!("random.metal"); const MFA: &[u8] = include_bytes!("libMetalFlashAttention.metallib"); const QUANTIZED: &str = include_str!("quantized.metal"); -/// Most kernels apply similarly across the tensors -/// This creates a strategy that uses the maximum amount of threads per threadgroup (capped at the -/// actual total buffer length). -/// Then kernels can just do their op on their single point in the buffer. -fn linear_split(pipeline: &ComputePipelineState, length: usize) -> (MTLSize, MTLSize) { - let size = length as u64; - let width = std::cmp::min(pipeline.max_total_threads_per_threadgroup(), size); - let count = (size + width - 1) / width; - let thread_group_count = MTLSize { - width: count, - height: 1, - depth: 1, - }; - - let thread_group_size = MTLSize { - width, - height: 1, - depth: 1, - }; - (thread_group_count, thread_group_size) -} - -// https://github.com/ml-explore/mlx/blob/bddf23f175726a57f0e443cd45518c0757daa166/mlx/backend/metal/utils.h#L96 -fn get_block_dims(dim0: u64, dim1: u64, dim2: u64) -> MTLSize { - let mut pows0 = 0u64; - let mut pows1 = 0u64; - let mut pows2 = 0u64; - let mut sum = 0u64; - loop { - let presum = sum; - // Check all the pows - if dim0 >= (1 << (pows0 + 1)) { - pows0 += 1; - sum += 1; - } - if sum == 10 { - break; - } - if dim1 >= (1 << (pows1 + 1)) { - pows1 += 1; - sum += 1; - } - if sum == 10 { - break; - } - if dim2 >= (1 << (pows2 + 1)) { - pows2 += 1; - sum += 1; - } - if sum == presum || sum == 10 { - break; - } - } - MTLSize { - width: 1 << pows0, - height: 1 << pows1, - depth: 1 << pows2, - } -} - -fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: P) { -
<P as EncoderParam>
::set_param(encoder, position, data) -} - -/// Helper functions to create the various objects on the compute command encoder -/// on a single line. -/// Prevents getting wrong some arguments number and mixing length and size in bytes. -trait EncoderParam { - fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self); -} -macro_rules! primitive { - ($type:ty) => { - impl EncoderParam for $type { - fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { - encoder.set_bytes( - position, - core::mem::size_of::<$type>() as u64, - &data as *const $type as *const c_void, - ); - } - } - }; -} -primitive!(bool); -primitive!(usize); -primitive!(i32); -primitive!(i64); -primitive!(u32); -primitive!(u64); -primitive!(f32); - -impl EncoderParam for &[T] { - fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { - encoder.set_bytes( - position, - core::mem::size_of_val(data) as u64, - data.as_ptr() as *const c_void, - ); - } -} - -impl EncoderParam for &Buffer { - fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { - encoder.set_buffer(position, Some(data), 0); - } -} -impl EncoderParam for (&Buffer, usize) { - fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { - encoder.set_buffer(position, Some(data.0), data.1 as u64); - } -} -impl EncoderParam for &mut Buffer { - fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { - encoder.set_buffer(position, Some(data), 0); - } -} -impl EncoderParam for (&mut Buffer, usize) { - fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { - encoder.set_buffer(position, Some(data.0), data.1 as u64); - } -} - -macro_rules! set_params { - ($encoder:ident, ($($param:expr),+)) => ( - let mut _index = 0; - $( - set_param($encoder, _index, $param); - _index += 1; - )* - ); -} - #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Source { Affine, @@ -273,6 +145,12 @@ pub struct Kernels { pipelines: RwLock, } +impl Default for Kernels { + fn default() -> Self { + Self::new() + } +} + impl Kernels { pub fn new() -> Self { let libraries = RwLock::new(Libraries::new()); @@ -396,17 +274,17 @@ pub fn call_unary_contiguous( kernels: &Kernels, kernel_name: unary::contiguous::Kernel, length: usize, - input: &Buffer, + input: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let pipeline = kernels.load_pipeline(device, Source::Unary, kernel_name.0)?; let encoder = command_buffer.new_compute_command_encoder(); encoder.set_compute_pipeline_state(&pipeline); - set_params!(encoder, (length, input, output)); + set_params!(encoder, (length, &input, output)); let (thread_group_count, thread_group_size) = linear_split(&pipeline, length); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -463,11 +341,9 @@ pub fn call_unary_strided( kernels: &Kernels, name: unary::strided::Kernel, shape: &[usize], - input: &Buffer, + input: BufferOffset, strides: &[usize], - offset: usize, - output: &Buffer, - output_offset: usize, + output: BufferOffset, ) -> Result<(), MetalKernelError> { let pipeline = kernels.load_pipeline(device, Source::Unary, name.0)?; @@ -476,23 +352,13 @@ pub fn call_unary_strided( encoder.set_compute_pipeline_state(&pipeline); let length: usize = 
shape.iter().product(); - set_params!( - encoder, - ( - length, - num_dims, - shape, - strides, - (input, offset), - (output, output_offset) - ) - ); + set_params!(encoder, (length, num_dims, shape, strides, &input, &output)); let width: usize = shape.iter().product(); let (thread_group_count, thread_group_size) = linear_split(&pipeline, width); - encoder.use_resource(input, metal::MTLResourceUsage::Read); - encoder.use_resource(output, metal::MTLResourceUsage::Write); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); + encoder.use_resource(output.buffer, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); Ok(()) @@ -505,8 +371,8 @@ pub fn call_binary_contiguous( kernels: &Kernels, kernel_name: binary::contiguous::Kernel, length: usize, - left: &Buffer, - right: &Buffer, + left: BufferOffset, + right: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let pipeline = kernels.load_pipeline(device, Source::Binary, kernel_name.0)?; @@ -514,12 +380,12 @@ pub fn call_binary_contiguous( let encoder = command_buffer.new_compute_command_encoder(); encoder.set_compute_pipeline_state(&pipeline); - set_params!(encoder, (length, left, right, output)); + set_params!(encoder, (length, &left, &right, output)); let (thread_group_count, thread_group_size) = linear_split(&pipeline, length); - encoder.use_resource(left, metal::MTLResourceUsage::Read); - encoder.use_resource(right, metal::MTLResourceUsage::Read); + encoder.use_resource(left.buffer, metal::MTLResourceUsage::Read); + encoder.use_resource(right.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -533,12 +399,10 @@ pub fn call_binary_strided( kernels: &Kernels, name: binary::strided::Kernel, shape: &[usize], - left_input: &Buffer, + left_input: BufferOffset, left_strides: &[usize], - left_offset: usize, - right_input: &Buffer, + right_input: BufferOffset, right_strides: &[usize], - right_offset: usize, output: &Buffer, ) -> Result<(), MetalKernelError> { let pipeline = kernels.load_pipeline(device, Source::Binary, name.0)?; @@ -558,16 +422,16 @@ pub fn call_binary_strided( shape, left_strides, right_strides, - (left_input, left_offset), - (right_input, right_offset), + &left_input, + &right_input, output ) ); let (thread_group_count, thread_group_size) = linear_split(&pipeline, width); - encoder.use_resource(left_input, metal::MTLResourceUsage::Read); - encoder.use_resource(right_input, metal::MTLResourceUsage::Read); + encoder.use_resource(left_input.buffer, metal::MTLResourceUsage::Read); + encoder.use_resource(right_input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -581,8 +445,7 @@ pub fn call_cast_contiguous( kernels: &Kernels, kernel_name: &'static str, length: usize, - input: &Buffer, - input_offset: usize, + input: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let pipeline = kernels.load_pipeline(device, Source::Cast, kernel_name)?; @@ -590,10 +453,10 @@ pub fn call_cast_contiguous( let encoder = command_buffer.new_compute_command_encoder(); encoder.set_compute_pipeline_state(&pipeline); - set_params!(encoder, (length, (input, input_offset), output)); + set_params!(encoder, (length, &input, output)); let 
(thread_group_count, thread_group_size) = linear_split(&pipeline, length); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -607,9 +470,8 @@ pub fn call_cast_strided( kernels: &Kernels, kernel_name: &'static str, shape: &[usize], - input: &Buffer, + input: BufferOffset, input_strides: &[usize], - input_offset: usize, output: &Buffer, ) -> Result<(), MetalKernelError> { let pipeline = kernels.load_pipeline(device, Source::Cast, kernel_name)?; @@ -621,25 +483,19 @@ pub fn call_cast_strided( set_params!( encoder, - ( - length, - shape.len(), - shape, - input_strides, - (input, input_offset), - output - ) + (length, shape.len(), shape, input_strides, &input, output) ); let (thread_group_count, thread_group_size) = linear_split(&pipeline, length); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); Ok(()) } +#[allow(clippy::too_many_arguments)] pub fn call_reduce_contiguous( device: &Device, command_buffer: &CommandBufferRef, @@ -687,6 +543,7 @@ pub fn call_reduce_contiguous( Ok(()) } +#[allow(clippy::too_many_arguments)] pub fn call_reduce_strided( device: &Device, command_buffer: &CommandBufferRef, @@ -985,7 +842,7 @@ pub fn call_affine( kernels: &Kernels, name: &'static str, size: usize, - input: &Buffer, + input: BufferOffset, output: &Buffer, mul: f32, add: f32, @@ -995,10 +852,10 @@ pub fn call_affine( let encoder = command_buffer.new_compute_command_encoder(); encoder.set_compute_pipeline_state(&pipeline); - set_params!(encoder, (size, mul, add, input, output)); + set_params!(encoder, (size, mul, add, &input, output)); let (thread_group_count, thread_group_size) = linear_split(&pipeline, size); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1012,9 +869,8 @@ pub fn call_affine_strided( kernels: &Kernels, name: &'static str, shape: &[usize], - input: &Buffer, + input: BufferOffset, input_stride: &[usize], - input_offset: usize, output: &Buffer, mul: f32, add: f32, @@ -1034,13 +890,13 @@ pub fn call_affine_strided( input_stride, mul, add, - (input, input_offset), + &input, output ) ); let (thread_group_count, thread_group_size) = linear_split(&pipeline, size); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1054,7 +910,7 @@ pub fn call_powf( kernels: &Kernels, name: &'static str, size: usize, - input: &Buffer, + input: BufferOffset, output: &Buffer, mul: f32, ) -> Result<(), MetalKernelError> { @@ -1063,10 +919,10 @@ pub fn call_powf( let encoder = command_buffer.new_compute_command_encoder(); encoder.set_compute_pipeline_state(&pipeline); - set_params!(encoder, (size, mul, input, output)); + set_params!(encoder, (size, mul, &input, output)); 
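// Note on argument order (annotation, not part of the patch): set_params! binds each
// tuple element to the next Metal argument index starting at 0, so (size, mul, input,
// output) here must line up with the buffer indices the powf kernel declares.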
let (thread_group_count, thread_group_size) = linear_split(&pipeline, size); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1080,9 +936,8 @@ pub fn call_powf_strided( kernels: &Kernels, name: &'static str, shape: &[usize], - input: &Buffer, + input: BufferOffset, input_stride: &[usize], - input_offset: usize, output: &Buffer, mul: f32, ) -> Result<(), MetalKernelError> { @@ -1094,19 +949,11 @@ pub fn call_powf_strided( set_params!( encoder, - ( - size, - shape.len(), - shape, - input_stride, - mul, - (input, input_offset), - output - ) + (size, shape.len(), shape, input_stride, mul, &input, output) ); let (thread_group_count, thread_group_size) = linear_split(&pipeline, size); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1120,7 +967,7 @@ pub fn call_elu( kernels: &Kernels, name: &'static str, size: usize, - input: &Buffer, + input: BufferOffset, output: &Buffer, mul: f32, ) -> Result<(), MetalKernelError> { @@ -1129,10 +976,10 @@ pub fn call_elu( let encoder = command_buffer.new_compute_command_encoder(); encoder.set_compute_pipeline_state(&pipeline); - set_params!(encoder, (size, mul, input, output)); + set_params!(encoder, (size, mul, &input, output)); let (thread_group_count, thread_group_size) = linear_split(&pipeline, size); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1146,9 +993,8 @@ pub fn call_elu_strided( kernels: &Kernels, name: &'static str, shape: &[usize], - input: &Buffer, + input: BufferOffset, input_stride: &[usize], - input_offset: usize, output: &Buffer, mul: f32, ) -> Result<(), MetalKernelError> { @@ -1160,25 +1006,18 @@ pub fn call_elu_strided( set_params!( encoder, - ( - size, - shape.len(), - shape, - input_stride, - mul, - (input, input_offset), - output - ) + (size, shape.len(), shape, input_stride, mul, &input, output) ); let (thread_group_count, thread_group_size) = linear_split(&pipeline, size); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); Ok(()) } +#[allow(clippy::too_many_arguments)] pub fn call_where_cond_strided( device: &Device, command_buffer: &CommandBufferRef, @@ -1334,6 +1173,7 @@ pub fn call_gather( Ok(()) } +#[allow(clippy::too_many_arguments)] pub fn call_scatter_add( device: &Device, command_buffer: &CommandBufferRef, @@ -1384,6 +1224,7 @@ pub fn call_scatter_add( Ok(()) } +#[allow(clippy::too_many_arguments)] pub fn call_index_add( device: &Device, command_buffer: &CommandBufferRef, @@ -1910,6 +1751,7 @@ pub enum GgmlDType { F32, } +#[allow(clippy::too_many_arguments)] pub fn call_quantized_matmul_t( device: &Device, command_buffer: &CommandBufferRef, @@ -1925,16 +1767,16 @@ pub fn 
call_quantized_matmul_t( let ne00 = k as i64; let ne01 = n as i64; let ne02 = b as i64; - let ne03 = 1 as i64; + let ne03 = 1i64; let nb00 = 0i64; - let nb01 = 0 as i64; - let nb02 = 0 as i64; + let nb01 = 0i64; + let nb02 = 0i64; let ne10 = k as i64; let ne11 = m as i64; let ne12 = b as i64; - let ne13 = 1 as i64; + let ne13 = 1i64; let nb10 = 0i64; let nb11 = 0i64; @@ -2169,6 +2011,7 @@ pub struct CallConvTranspose2dCfg<'a> { pub kernel_offset: usize, } +#[allow(clippy::too_many_arguments)] pub fn call_conv_transpose2d( device: &Device, command_buffer: &CommandBufferRef, diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index b15d9b36..b91c92d8 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -12,7 +12,7 @@ fn read_to_vec(buffer: &Buffer, n: usize) -> Vec { fn new_buffer(device: &Device, data: &[T]) -> Buffer { let options = MTLResourceOptions::StorageModeManaged; let ptr = data.as_ptr() as *const c_void; - let size = (data.len() * std::mem::size_of::()) as u64; + let size = std::mem::size_of_val(data) as u64; device.new_buffer_with_data(ptr, size, options) } @@ -41,6 +41,10 @@ fn run(v: &[T], name: unary::contiguous::Kernel) -> Vec { let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); let input = new_buffer(&device, v); + let input = BufferOffset { + buffer: &input, + offset_in_bytes: 0, + }; let output = new_buffer(&device, v); call_unary_contiguous( &device, @@ -48,7 +52,7 @@ fn run(v: &[T], name: unary::contiguous::Kernel) -> Vec { &kernels, name, v.len(), - &input, + input, &output, ) .unwrap(); @@ -72,8 +76,8 @@ fn run_binary(x: &[T], y: &[T], name: binary::contiguous::Kernel) -> V &kernels, name, x.len(), - &left, - &right, + BufferOffset::zero_offset(&left), + BufferOffset::zero_offset(&right), &output, ) .unwrap(); @@ -93,7 +97,15 @@ fn run_strided( let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); let input = new_buffer(&device, v); - let output = new_buffer(&device, v); + let input = BufferOffset { + buffer: &input, + offset_in_bytes: offset, + }; + let output_b = new_buffer(&device, v); + let output = BufferOffset { + buffer: &output_b, + offset_in_bytes: 0, + }; let kernels = Kernels::new(); call_unary_strided( &device, @@ -101,16 +113,14 @@ fn run_strided( &kernels, kernel, shape, - &input, + input, strides, - offset, - &output, - 0, + output, ) .unwrap(); command_buffer.commit(); command_buffer.wait_until_completed(); - read_to_vec(&output, v.len()) + read_to_vec(&output_b, v.len()) } #[test] @@ -308,8 +318,7 @@ fn run_cast(v: &[T], name: &'static str) -> Vec { &kernels, name, v.len(), - &input, - 0, + BufferOffset::zero_offset(&input), &output, ) .unwrap(); @@ -521,7 +530,7 @@ fn run_affine(v: &[T], mul: f64, add: f64) -> Vec { &kernels, "affine_f32", size, - &input, + BufferOffset::zero_offset(&input), &output, mul as f32, add as f32, @@ -554,9 +563,8 @@ fn run_affine_strided( &kernels, "affine_f32_strided", shape, - &input, + BufferOffset::zero_offset(&input), strides, - 0, &output, mul as f32, add as f32, @@ -633,7 +641,7 @@ fn index_select_strided() { fn index_select_f16() { let embedding: Vec<_> = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] .into_iter() - .map(|x| f16::from_f32(x)) + .map(f16::from_f32) .collect(); let shape = [5, 2]; let stride = [2, 1]; @@ -700,8 +708,8 @@ fn run_index_select( let command_queue = device.new_command_queue(); let command_buffer = 
command_queue.new_command_buffer(); - let embeddings_buffer = new_buffer(&device, &embeddings); - let ids_buffer = new_buffer(&device, &ids); + let embeddings_buffer = new_buffer(&device, embeddings); + let ids_buffer = new_buffer(&device, ids); let left_size: usize = shape[..dim].iter().product(); let right_size: usize = shape[dim + 1..].iter().product(); @@ -711,7 +719,7 @@ fn run_index_select( let kernels = Kernels::new(); call_index_select( &device, - &command_buffer, + command_buffer, &kernels, name, shape, @@ -746,8 +754,8 @@ fn run_index_select_strided( let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); - let embeddings_buffer = new_buffer(&device, &embeddings); - let ids_buffer = new_buffer(&device, &ids); + let embeddings_buffer = new_buffer(&device, embeddings); + let ids_buffer = new_buffer(&device, ids); let left_size: usize = shape[..dim].iter().product(); let right_size: usize = shape[dim + 1..].iter().product(); @@ -757,7 +765,7 @@ fn run_index_select_strided( let kernels = Kernels::new(); call_index_select( &device, - &command_buffer, + command_buffer, &kernels, name, shape, @@ -931,6 +939,7 @@ fn softmax() { ); } +#[allow(clippy::too_many_arguments)] fn run_where_cond( shape: &[usize], cond: &[I], @@ -1148,7 +1157,7 @@ fn run_random(name: &'static str, seed: u32, length: usize, a: f32, b: #[test] fn random() { fn calc_mean(data: &[f32]) -> f32 { - let sum = data.iter().sum::() as f32; + let sum = data.iter().sum::(); let count = data.len(); assert!(count > 0); sum / count as f32 @@ -1162,7 +1171,7 @@ fn random() { let variance = data .iter() .map(|value| { - let diff = mean - (*value as f32); + let diff = mean - *value; diff * diff }) .sum::() @@ -1787,6 +1796,7 @@ fn avg_pool2d_u32() { assert_eq!(results, expected); } +#[allow(clippy::too_many_arguments)] fn run_conv_transpose1d( input: &[T], input_shape: &[usize], diff --git a/candle-metal-kernels/src/utils.rs b/candle-metal-kernels/src/utils.rs new file mode 100644 index 00000000..194cddf4 --- /dev/null +++ b/candle-metal-kernels/src/utils.rs @@ -0,0 +1,162 @@ +use metal::{Buffer, ComputeCommandEncoderRef, ComputePipelineState, MTLSize}; +use std::ffi::c_void; + +/// Most kernels apply similarly across the tensors +/// This creates a strategy that uses the maximum amount of threads per threadgroup (capped at the +/// actual total buffer length). +/// Then kernels can just do their op on their single point in the buffer. 
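/// For example: with length = 5000 and a pipeline reporting 1024 max threads per
/// threadgroup, width = min(1024, 5000) = 1024 and count = (5000 + 1023) / 1024 = 5,
/// i.e. five threadgroups of 1024 threads are dispatched.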
+pub(crate) fn linear_split(pipeline: &ComputePipelineState, length: usize) -> (MTLSize, MTLSize) { + let size = length as u64; + let width = std::cmp::min(pipeline.max_total_threads_per_threadgroup(), size); + let count = (size + width - 1) / width; + let thread_group_count = MTLSize { + width: count, + height: 1, + depth: 1, + }; + + let thread_group_size = MTLSize { + width, + height: 1, + depth: 1, + }; + (thread_group_count, thread_group_size) +} + +// https://github.com/ml-explore/mlx/blob/bddf23f175726a57f0e443cd45518c0757daa166/mlx/backend/metal/utils.h#L96 +pub(crate) fn get_block_dims(dim0: u64, dim1: u64, dim2: u64) -> MTLSize { + let mut pows0 = 0u64; + let mut pows1 = 0u64; + let mut pows2 = 0u64; + let mut sum = 0u64; + loop { + let presum = sum; + // Check all the pows + if dim0 >= (1 << (pows0 + 1)) { + pows0 += 1; + sum += 1; + } + if sum == 10 { + break; + } + if dim1 >= (1 << (pows1 + 1)) { + pows1 += 1; + sum += 1; + } + if sum == 10 { + break; + } + if dim2 >= (1 << (pows2 + 1)) { + pows2 += 1; + sum += 1; + } + if sum == presum || sum == 10 { + break; + } + } + MTLSize { + width: 1 << pows0, + height: 1 << pows1, + depth: 1 << pows2, + } +} + +pub(crate) fn set_param( + encoder: &ComputeCommandEncoderRef, + position: u64, + data: P, +) { +
<P as EncoderParam>
::set_param(encoder, position, data) +} + +/// Helper functions to create the various objects on the compute command encoder +/// on a single line. +/// Prevents getting wrong some arguments number and mixing length and size in bytes. +pub(crate) trait EncoderParam { + fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self); +} +macro_rules! primitive { + ($type:ty) => { + impl EncoderParam for $type { + fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { + encoder.set_bytes( + position, + core::mem::size_of::<$type>() as u64, + &data as *const $type as *const c_void, + ); + } + } + }; +} +primitive!(bool); +primitive!(usize); +primitive!(i32); +primitive!(i64); +primitive!(u32); +primitive!(u64); +primitive!(f32); + +pub struct BufferOffset<'a> { + pub buffer: &'a Buffer, + pub offset_in_bytes: usize, +} + +impl<'a> BufferOffset<'a> { + pub fn zero_offset(buffer: &'a Buffer) -> Self { + Self { + buffer, + offset_in_bytes: 0, + } + } +} + +impl EncoderParam for &[T] { + fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { + encoder.set_bytes( + position, + core::mem::size_of_val(data) as u64, + data.as_ptr() as *const c_void, + ); + } +} + +impl EncoderParam for &Buffer { + fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { + encoder.set_buffer(position, Some(data), 0); + } +} + +impl EncoderParam for (&Buffer, usize) { + fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { + encoder.set_buffer(position, Some(data.0), data.1 as u64); + } +} + +impl<'a> EncoderParam for &BufferOffset<'a> { + fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { + encoder.set_buffer(position, Some(data.buffer), data.offset_in_bytes as u64); + } +} + +impl EncoderParam for &mut Buffer { + fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { + encoder.set_buffer(position, Some(data), 0); + } +} + +impl EncoderParam for (&mut Buffer, usize) { + fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self) { + encoder.set_buffer(position, Some(data.0), data.1 as u64); + } +} + +#[macro_export] +macro_rules! set_params { + ($encoder:ident, ($($param:expr),+)) => ( + let mut _index = 0; + $( + $crate::utils::set_param($encoder, _index, $param); + _index += 1; + )* + ); +} From 718671a0d5b751458033fb6425fb518ca4dc3b5f Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 8 Apr 2024 09:37:25 +0200 Subject: [PATCH 127/131] Use BufferOffset in metal backend ops. (#2029) * Use BufferOffset in the metal backend. * More BufferOffset usage. * Use in where-cond. 
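Context for the diff below: rather than threading a raw buffer reference plus a separate byte-offset argument through every kernel call, call sites now bundle the two into a single BufferOffset (defined in the new utils.rs above). The backend changes lean on a small buffer_o helper whose definition is not visible in this excerpt; a plausible sketch, assuming it merely pairs a storage buffer with the layout's start offset converted from elements to bytes:

    // Sketch of the buffer_o helper used throughout this patch; its actual
    // definition lies outside this excerpt and may differ in detail.
    fn buffer_o<'a>(buffer: &'a metal::Buffer, l: &Layout, dtype: DType) -> BufferOffset<'a> {
        BufferOffset {
            buffer,
            // Layout offsets are counted in elements; Metal wants bytes.
            offset_in_bytes: l.start_offset() * dtype.size_in_bytes(),
        }
    }

Keeping the element-to-byte conversion in a single helper means it cannot silently diverge between call sites.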
--- candle-core/src/metal_backend/mod.rs | 89 +++++++-------- candle-metal-kernels/src/lib.rs | 155 +++++++++------------------ candle-metal-kernels/src/tests.rs | 51 ++++----- 3 files changed, 117 insertions(+), 178 deletions(-) diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs index 4adcda05..50149a9d 100644 --- a/candle-core/src/metal_backend/mod.rs +++ b/candle-core/src/metal_backend/mod.rs @@ -314,6 +314,7 @@ impl BackendStorage for MetalStorage { let dtype = if return_index { DType::U32 } else { self.dtype }; let buffer = device.new_buffer(dst_el, dtype, "reduce")?; let command_buffer = self.device.command_buffer()?; + let src = buffer_o(&self.buffer, layout, self.dtype); candle_metal_kernels::call_reduce_strided( &device.device, &command_buffer, @@ -322,8 +323,7 @@ impl BackendStorage for MetalStorage { &dims, &stride, dst_el, - &self.buffer, - layout.start_offset() * self.dtype.size_in_bytes(), + src, &buffer, ) .map_err(MetalError::from)?; @@ -617,21 +617,21 @@ impl BackendStorage for MetalStorage { (DType::U8, DType::U8) => "where_u8_u8", (left, right) => crate::bail!("Metal where_cond {left:?} {right:?} not implemented"), }; + let src = buffer_o(&self.buffer, layout, self.dtype); + let t = buffer_o(&t.buffer, t_l, t.dtype); + let f = buffer_o(&f.buffer, f_l, f.dtype); candle_metal_kernels::call_where_cond_strided( &device.device, &command_buffer, &device.kernels, name, dims, - &self.buffer, - ( - layout.stride(), - layout.start_offset() * self.dtype.size_in_bytes(), - ), - &t.buffer, - (t_l.stride(), t_l.start_offset() * t.dtype.size_in_bytes()), - &f.buffer, - (f_l.stride(), f_l.start_offset() * f.dtype.size_in_bytes()), + src, + layout.stride(), + t, + t_l.stride(), + f, + f_l.stride(), &buffer, ) .map_err(MetalError::from)?; @@ -664,6 +664,7 @@ impl BackendStorage for MetalStorage { DType::F32 => "im2col1d_f32", dtype => crate::bail!("Metal conv1d {dtype:?} not implemented"), }; + let src = buffer_o(&self.buffer, layout, self.dtype); candle_metal_kernels::call_im2col1d_strided( &self.device.device, &command_buffer, @@ -672,8 +673,7 @@ impl BackendStorage for MetalStorage { layout.shape().dims(), strides, (k_size, stride, padding, dilation), - &self.buffer, - layout.start_offset() * self.dtype.size_in_bytes(), + src, &dst, ) .map_err(MetalError::from)?; @@ -791,6 +791,7 @@ impl BackendStorage for MetalStorage { DType::U32 => "im2col_u32", dtype => crate::bail!("Metal conv2d {dtype:?} not implemented"), }; + let src = buffer_o(&self.buffer, layout, self.dtype); candle_metal_kernels::call_im2col_strided( &self.device.device, &command_buffer, @@ -799,8 +800,7 @@ impl BackendStorage for MetalStorage { layout.shape().dims(), layout.stride(), (h_k, w_k, stride, padding, dilation), - &self.buffer, - layout.start_offset() * self.dtype.size_in_bytes(), + src, &dst, ) .map_err(MetalError::from)?; @@ -1013,6 +1013,7 @@ impl BackendStorage for MetalStorage { .device .new_buffer(dst_el, self.dtype, "upsample_nearest2d")?; let command_buffer = self.device.command_buffer()?; + let src = buffer_o(&self.buffer, inp_l, self.dtype); candle_metal_kernels::call_upsample_nearest_2d( &self.device.device, &command_buffer, @@ -1022,8 +1023,7 @@ impl BackendStorage for MetalStorage { strides, out_w, out_h, - &self.buffer, - inp_l.start_offset() * self.dtype.size_in_bytes(), + src, &buffer, ) .map_err(MetalError::from)?; @@ -1031,9 +1031,8 @@ impl BackendStorage for MetalStorage { } fn gather(&self, src_l: &Layout, ids: &Self, ids_l: &Layout, dim: usize) -> Result 
{ - let (ids_o1, _) = match ids_l.contiguous_offsets() { - Some(o12) => o12, - None => Err(crate::Error::RequiresContiguous { op: "gather" }.bt())?, + if !ids_l.is_contiguous() { + return Err(crate::Error::RequiresContiguous { op: "gather" }.bt()); }; let ids_el = ids_l.dims()[dim]; let dst_el = ids_l.shape().elem_count(); @@ -1046,6 +1045,8 @@ impl BackendStorage for MetalStorage { (left, right) => crate::bail!("Metal gather {left:?} {right:?} not implemented"), }; let command_buffer = self.device.command_buffer()?; + let src = buffer_o(&self.buffer, src_l, dtype); + let ids = buffer_o(&ids.buffer, ids_l, ids.dtype); candle_metal_kernels::call_gather( &device.device, &command_buffer, @@ -1054,10 +1055,8 @@ impl BackendStorage for MetalStorage { src_l.dims(), ids_el, dim, - &self.buffer, - src_l.start_offset() * dtype.size_in_bytes(), - &ids.buffer, - ids_o1 * ids.dtype.size_in_bytes(), + src, + ids, &buffer, ) .map_err(MetalError::from)?; @@ -1075,13 +1074,8 @@ impl BackendStorage for MetalStorage { ) -> Result { let mut acc = self.device.zeros_impl(l.shape(), self.dtype())?; self.copy_strided_src(&mut acc, 0, l)?; - let (ids_offset, _) = match ids_l.contiguous_offsets() { - Some(o12) => o12, - None => Err(crate::Error::RequiresContiguous { op: "scatter-add" }.bt())?, - }; - let src_offset = match src_l.contiguous_offsets() { - Some((o1, _)) => o1, - None => Err(crate::Error::RequiresContiguous { op: "scatter-add" }.bt())?, + if !ids_l.is_contiguous() || !src_l.is_contiguous() { + return Err(crate::Error::RequiresContiguous { op: "scatter-add" }.bt()); }; let name = match (ids.dtype, self.dtype) { (DType::U8, DType::F32) => "sa_u8_f32", @@ -1100,6 +1094,8 @@ impl BackendStorage for MetalStorage { })?, }; let command_buffer = self.device.command_buffer()?; + let src = buffer_o(&src.buffer, src_l, src.dtype); + let ids = buffer_o(&ids.buffer, ids_l, ids.dtype); candle_metal_kernels::call_scatter_add( &self.device.device, &command_buffer, @@ -1108,10 +1104,8 @@ impl BackendStorage for MetalStorage { src_l.dims(), l.dims(), dim, - &src.buffer, - src_offset * src.dtype.size_in_bytes(), - &ids.buffer, - ids_offset * ids.dtype.size_in_bytes(), + src, + ids, &acc.buffer, ) .map_err(MetalError::from)?; @@ -1147,6 +1141,8 @@ impl BackendStorage for MetalStorage { } }; let command_buffer = self.device.command_buffer()?; + let src = buffer_o(&self.buffer, src_l, dtype); + let ids = buffer_o(&ids.buffer, ids_l, ids.dtype); candle_metal_kernels::call_index_select( &device.device, &command_buffer, @@ -1158,10 +1154,8 @@ impl BackendStorage for MetalStorage { src_l.is_contiguous(), src_l.dims(), src_l.stride(), - &self.buffer, - src_l.start_offset() * dtype.size_in_bytes(), - &ids.buffer, - ids_l.start_offset() * ids.dtype.size_in_bytes(), + src, + ids, &buffer, ) .map_err(MetalError::from)?; @@ -1179,13 +1173,8 @@ impl BackendStorage for MetalStorage { ) -> Result { let mut acc = self.device.zeros_impl(l.shape(), self.dtype())?; self.copy_strided_src(&mut acc, 0, l)?; - let (ids_offset, _) = match ids_l.contiguous_offsets() { - Some(o12) => o12, - None => Err(crate::Error::RequiresContiguous { op: "index-add" }.bt())?, - }; - let src_offset = match src_l.contiguous_offsets() { - Some((o1, _)) => o1, - None => Err(crate::Error::RequiresContiguous { op: "index-add" }.bt())?, + if !ids_l.is_contiguous() || !src_l.is_contiguous() { + return Err(crate::Error::RequiresContiguous { op: "index-add" }.bt()); }; let name = match (ids.dtype, self.dtype) { (DType::I64, DType::BF16) => "ia_i64_bf16", @@ -1216,6 
+1205,8 @@ impl BackendStorage for MetalStorage { })?, }; let command_buffer = self.device.command_buffer()?; + let src = buffer_o(&src.buffer, src_l, src.dtype); + let ids = buffer_o(&ids.buffer, ids_l, ids.dtype); candle_metal_kernels::call_index_add( &self.device.device, &command_buffer, @@ -1225,10 +1216,8 @@ impl BackendStorage for MetalStorage { l.dims(), ids_l.dims(), dim, - &src.buffer, - src_offset * src.dtype.size_in_bytes(), - &ids.buffer, - ids_offset * ids.dtype.size_in_bytes(), + src, + ids, &acc.buffer, ) .map_err(MetalError::from)?; diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index 23c072af..78108127 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -503,8 +503,7 @@ pub fn call_reduce_contiguous( kernel_name: &'static str, length: usize, out_length: usize, - input: &Buffer, - input_offset: usize, + input: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let pipeline = kernels.load_pipeline(device, Source::Reduce, kernel_name)?; @@ -513,10 +512,7 @@ pub fn call_reduce_contiguous( let encoder = command_buffer.new_compute_command_encoder(); encoder.set_compute_pipeline_state(&pipeline); - set_params!( - encoder, - (length, elements_to_sum, (input, input_offset), output) - ); + set_params!(encoder, (length, elements_to_sum, &input, output)); let thread_group_count = MTLSize { width: out_length as u64, @@ -536,7 +532,7 @@ pub fn call_reduce_contiguous( depth: 1, }; - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -552,8 +548,7 @@ pub fn call_reduce_strided( shape: &[usize], strides: &[usize], out_length: usize, - input: &Buffer, - input_offset: usize, + input: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let length: usize = shape.iter().product(); @@ -565,14 +560,7 @@ pub fn call_reduce_strided( set_params!( encoder, - ( - shape.len(), - shape, - strides, - elements_to_sum, - (input, input_offset), - output - ) + (shape.len(), shape, strides, elements_to_sum, &input, output) ); let thread_group_count = MTLSize { @@ -593,7 +581,7 @@ pub fn call_reduce_strided( depth: 1, }; - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1024,12 +1012,12 @@ pub fn call_where_cond_strided( kernels: &Kernels, name: &'static str, shape: &[usize], - cond: &Buffer, - (cond_stride, cond_offset): (&[usize], usize), - left: &Buffer, - (left_stride, left_offset): (&[usize], usize), - right: &Buffer, - (right_stride, right_offset): (&[usize], usize), + cond: BufferOffset, + cond_stride: &[usize], + left: BufferOffset, + left_stride: &[usize], + right: BufferOffset, + right_stride: &[usize], output: &Buffer, ) -> Result<(), MetalKernelError> { let pipeline = kernels.load_pipeline(device, Source::Ternary, name)?; @@ -1049,18 +1037,18 @@ pub fn call_where_cond_strided( cond_stride, left_stride, right_stride, - (cond, cond_offset), - (left, left_offset), - (right, right_offset), + &cond, + &left, + &right, output ) ); let (thread_group_count, thread_group_size) = linear_split(&pipeline, size); - encoder.use_resource(cond, 
metal::MTLResourceUsage::Read); - encoder.use_resource(left, metal::MTLResourceUsage::Read); - encoder.use_resource(right, metal::MTLResourceUsage::Read); + encoder.use_resource(cond.buffer, metal::MTLResourceUsage::Read); + encoder.use_resource(left.buffer, metal::MTLResourceUsage::Read); + encoder.use_resource(right.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1079,10 +1067,8 @@ pub fn call_index_select( contiguous: bool, src_dims: &[usize], src_strides: &[usize], - input: &Buffer, - src_offset: usize, - ids: &Buffer, - ids_offset: usize, + input: BufferOffset, + ids: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let left_size: usize = shape[..dim].iter().product(); @@ -1107,16 +1093,16 @@ pub fn call_index_select( contiguous, src_dims, src_strides, - (input, src_offset), - (ids, ids_offset), + &input, + &ids, output ) ); let (thread_group_count, thread_group_size) = linear_split(&pipeline, dst_el); - encoder.use_resource(input, metal::MTLResourceUsage::Read); - encoder.use_resource(ids, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); + encoder.use_resource(ids.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1132,10 +1118,8 @@ pub fn call_gather( shape: &[usize], ids_size: usize, dim: usize, - input: &Buffer, - input_offset: usize, - ids: &Buffer, - ids_offset: usize, + input: BufferOffset, + ids: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let left_size: usize = shape[..dim].iter().product(); @@ -1157,16 +1141,16 @@ pub fn call_gather( src_dim_size, right_size, ids_size, - (input, input_offset), - (ids, ids_offset), + &input, + &ids, output ) ); let (thread_group_count, thread_group_size) = linear_split(&pipeline, dst_el); - encoder.use_resource(input, metal::MTLResourceUsage::Read); - encoder.use_resource(ids, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); + encoder.use_resource(ids.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1182,10 +1166,8 @@ pub fn call_scatter_add( src_shape: &[usize], dst_shape: &[usize], dim: usize, - input: &Buffer, - input_offset: usize, - ids: &Buffer, - ids_offset: usize, + input: BufferOffset, + ids: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let left_size: usize = src_shape[..dim].iter().product(); @@ -1208,16 +1190,16 @@ pub fn call_scatter_add( src_dim_size, right_size, dst_dim_size, - (input, input_offset), - (ids, ids_offset), + &input, + &ids, output ) ); let (thread_group_count, thread_group_size) = linear_split(&pipeline, dst_el); - encoder.use_resource(input, metal::MTLResourceUsage::Read); - encoder.use_resource(ids, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); + encoder.use_resource(ids.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1234,10 +1216,8 @@ pub fn call_index_add( dst_shape: &[usize], ids_shape: 
&[usize], dim: usize, - input: &Buffer, - input_offset: usize, - ids: &Buffer, - ids_offset: usize, + input: BufferOffset, + ids: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let left_size: usize = src_shape[..dim].iter().product(); @@ -1261,16 +1241,16 @@ pub fn call_index_add( right_size, dst_dim_size, ids_dim_size, - (input, input_offset), - (ids, ids_offset), + &input, + &ids, output ) ); let (thread_group_count, thread_group_size) = linear_split(&pipeline, dst_el); - encoder.use_resource(input, metal::MTLResourceUsage::Read); - encoder.use_resource(ids, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); + encoder.use_resource(ids.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1536,8 +1516,7 @@ pub fn call_im2col1d_strided( shape: &[usize], strides: &[usize], (k_size, stride, padding, dilation): (usize, usize, usize, usize), - input: &Buffer, - input_offset: usize, + input: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let pipeline = kernels.load_pipeline(device, Source::Conv, name)?; @@ -1549,20 +1528,9 @@ pub fn call_im2col1d_strided( encoder.set_compute_pipeline_state(&pipeline); set_params!( encoder, - ( - dst_el, - l_out, - k_size, - stride, - padding, - dilation, - shape, - strides, - (input, input_offset), - output - ) + (dst_el, l_out, k_size, stride, padding, dilation, shape, strides, &input, output) ); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1579,8 +1547,7 @@ pub fn call_im2col_strided( shape: &[usize], strides: &[usize], (h_k, w_k, stride, padding, dilation): (usize, usize, usize, usize, usize), - input: &Buffer, - input_offset: usize, + input: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let pipeline = kernels.load_pipeline(device, Source::Conv, name)?; @@ -1598,21 +1565,11 @@ pub fn call_im2col_strided( set_params!( encoder, ( - dst_el, - h_out, - w_out, - h_k, - w_k, - stride, - padding, - dilation, - shape, - strides, - (input, input_offset), + dst_el, h_out, w_out, h_k, w_k, stride, padding, dilation, shape, strides, &input, output ) ); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); @@ -1630,8 +1587,7 @@ pub fn call_upsample_nearest_2d( strides: &[usize], out_w: usize, out_h: usize, - input: &Buffer, - input_offset: usize, + input: BufferOffset, output: &Buffer, ) -> Result<(), MetalKernelError> { let pipeline = kernels.load_pipeline(device, Source::Conv, name)?; @@ -1643,18 +1599,9 @@ pub fn call_upsample_nearest_2d( encoder.set_compute_pipeline_state(&pipeline); set_params!( encoder, - ( - out_w, - out_h, - scale_w, - scale_h, - shape, - strides, - (input, input_offset), - output - ) + (out_w, out_h, scale_w, scale_h, shape, strides, &input, output) ); - encoder.use_resource(input, metal::MTLResourceUsage::Read); + encoder.use_resource(input.buffer, metal::MTLResourceUsage::Read); encoder.use_resource(output, 
metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); encoder.end_encoding(); diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index b91c92d8..960ae1df 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -728,10 +728,8 @@ fn run_index_select( true, shape, stride, - &embeddings_buffer, - 0, - &ids_buffer, - 0, + BufferOffset::zero_offset(&embeddings_buffer), + BufferOffset::zero_offset(&ids_buffer), &dst_buffer, ) .unwrap(); @@ -774,10 +772,8 @@ fn run_index_select_strided( false, shape, stride, - &embeddings_buffer, - 0, - &ids_buffer, - 0, + BufferOffset::zero_offset(&embeddings_buffer), + BufferOffset::zero_offset(&ids_buffer), &dst_buffer, ) .unwrap(); @@ -819,8 +815,7 @@ fn run_reduce(v: &[T], out_length: usize, name: &'static str) -> Vec( ); let output = device.new_buffer((length * core::mem::size_of::()) as u64, options); + let cond = BufferOffset { + buffer: &cond, + offset_in_bytes: cond_offset, + }; + let left = BufferOffset { + buffer: &left, + offset_in_bytes: left_offset, + }; + let right = BufferOffset { + buffer: &right, + offset_in_bytes: cond_offset, + }; call_where_cond_strided( &device, command_buffer, &kernels, name, shape, - &cond, - (&cond_stride, cond_offset), - &left, - (&left_stride, left_offset), - &right, - (&cond_stride, cond_offset), + cond, + &cond_stride, + left, + &left_stride, + right, + &cond_stride, &output, ) .unwrap(); @@ -1250,10 +1257,8 @@ fn run_scatter_add( shape, shape, dim, - &input_buffer, - 0, - &ids_buffer, - 0, + BufferOffset::zero_offset(&input_buffer), + BufferOffset::zero_offset(&ids_buffer), &output, ) .unwrap(); @@ -1355,10 +1360,8 @@ fn run_index_add( shape, shape, dim, - &input_buffer, - 0, - &indices_buffer, - 0, + BufferOffset::zero_offset(&input_buffer), + BufferOffset::zero_offset(&indices_buffer), &output, ) .unwrap(); From 798e0335cd2c4661f0fd0429cdf06abe3b45f4ea Mon Sep 17 00:00:00 2001 From: Gabriel <45515538+gabotechs@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:06:14 +0200 Subject: [PATCH 128/131] Handle more tensor shapes in onnx "Gather" operation (#2026) * Handle more tensor shapes in onnx "Gather" operation * Add more tests * Add comment * Fix typo --- candle-onnx/src/eval.rs | 30 ++++++--- candle-onnx/tests/ops.rs | 131 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 152 insertions(+), 9 deletions(-) diff --git a/candle-onnx/src/eval.rs b/candle-onnx/src/eval.rs index 15cadf1d..f7cae31c 100644 --- a/candle-onnx/src/eval.rs +++ b/candle-onnx/src/eval.rs @@ -508,17 +508,33 @@ pub fn simple_eval( values.insert(node.output[0].clone(), xs); } "Gather" => { + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gather let xs = get(&node.input[0])?; let indices = get(&node.input[1])?; let axis = get_attr_opt::(node, "axis")?.copied().unwrap_or(0); let axis = xs.normalize_axis(axis)?; - // TODO: Provide an op to handle the ONNX generalized gather op ideally in a - // differentiable way. - let xs = if indices.rank() == 0 { - let index = indices.to_vec0::()? as usize; - xs.narrow(axis, index, 1)?.squeeze(axis)? - } else { - todo!("implement gather for {xs:?} {indices:?} axis {axis}") + + // In Pytorch or Numpy this can be done by indexing the xs tensor using the indices + // tensor directly, but candle does not support tensor indexing at the moment, so + // some workarounds must be done. + let xs = match indices.dims() { + [] => { + let index = indices.to_vec0::()? 
as usize; + xs.narrow(axis, index, 1)?.squeeze(axis)? + } + [_] => xs.index_select(indices, axis)?, + [first, _] => { + let mut v = Vec::with_capacity(*first); + for i in 0..*first { + v.push(xs.index_select(&indices.get(i)?, axis)?) + } + Tensor::stack(&v, axis)? + } + _ => { + // TODO: Provide an op to handle the ONNX generalized gather op ideally in a + // differentiable way. + todo!("implement gather for {xs:?} {indices:?} axis {axis}") + } }; values.insert(node.output[0].clone(), xs); } diff --git a/candle-onnx/tests/ops.rs b/candle-onnx/tests/ops.rs index a686f198..18cd53c9 100644 --- a/candle-onnx/tests/ops.rs +++ b/candle-onnx/tests/ops.rs @@ -4,7 +4,7 @@ extern crate intel_mkl_src; #[cfg(feature = "accelerate")] extern crate accelerate_src; -use candle::{Device, Result, Tensor}; +use candle::{Device, NdArray, Result, Tensor}; use candle_onnx::onnx::{AttributeProto, GraphProto, ModelProto, NodeProto, ValueInfoProto}; use std::collections::HashMap; @@ -829,7 +829,134 @@ fn test_flatten_operation() -> Result<()> { // #[test] // "Gather" -// #[test] +#[test] +fn test_gather_operation() -> Result<()> { + // test taken from https://onnx.ai/onnx/operators/onnx__Gather.html#summary. + test( + &[[1.0, 1.2], [2.3, 3.4], [4.5, 5.7]], + &[[0i64, 1], [1, 2]], + 0, + &[[[1.0, 1.2], [2.3, 3.4]], [[2.3, 3.4], [4.5, 5.7]]], + )?; + + // test taken from https://onnx.ai/onnx/operators/onnx__Gather.html#summary. + test( + &[[1.0, 1.2, 1.9], [2.3, 3.4, 3.9], [4.5, 5.7, 5.9]], + &[[0i64, 2]], + 1, + &[[[1.0, 1.9]], [[2.3, 3.9]], [[4.5, 5.9]]], + )?; + + // all the tests below are generated from numpy.take, which works like + // onnx's Gather operation. + test(&[1.0, 2.0, 3.0, 4.0], 3i64, 0, 4.0)?; + + test(&[[1.0, 2.0, 3.0, 4.0]], 3i64, 1, &[4.0])?; + + test( + &[[1.0], [2.0], [3.0], [4.0]], + &[3i64, 2], + 0, + &[[4.0], [3.0]], + )?; + + test( + &[ + [[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]], + [[9.0, 10.0], [11.0, 12.0]], + [[13.0, 14.0], [15.0, 16.0]], + ], + 1i64, + 0, + &[[5.0, 6.0], [7.0, 8.0]], + )?; + + test( + &[ + [[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]], + [[9.0, 10.0], [11.0, 12.0]], + [[13.0, 14.0], [15.0, 16.0]], + ], + &[1i64, 0], + 0, + &[[[5.0, 6.0], [7.0, 8.0]], [[1.0, 2.0], [3.0, 4.0]]], + )?; + + fn test( + data: impl NdArray, + indices: impl NdArray, + axis: i64, + expected: impl NdArray, + ) -> Result<()> { + let att_axis = AttributeProto { + name: "axis".to_string(), + ref_attr_name: "axis".to_string(), + i: axis, + doc_string: "axis".to_string(), + r#type: 2, + f: 0.0, + s: vec![], + t: None, + g: None, + sparse_tensor: None, + tp: None, + floats: vec![], + ints: vec![], + strings: vec![], + tensors: vec![], + graphs: vec![], + sparse_tensors: vec![], + type_protos: vec![], + }; + + let manual_graph = create_model_proto_with_graph(Some(GraphProto { + node: vec![NodeProto { + op_type: "Gather".to_string(), + domain: "".to_string(), + attribute: vec![att_axis], + input: vec![INPUT_X.to_string(), INPUT_Y.to_string()], + output: vec![OUTPUT_Z.to_string()], + name: "".to_string(), + doc_string: "".to_string(), + }], + name: "".to_string(), + initializer: vec![], + input: vec![], + output: vec![ValueInfoProto { + name: OUTPUT_Z.to_string(), + doc_string: "".to_string(), + r#type: None, + }], + value_info: vec![], + doc_string: "".to_string(), + sparse_initializer: vec![], + quantization_annotation: vec![], + })); + + let mut inputs: HashMap = HashMap::new(); + inputs.insert(INPUT_X.to_string(), Tensor::new(data, &Device::Cpu)?); + 
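// Semantics exercised by this test (ONNX Gather with 2-D indices on axis 0):
// output[i][j] = data[indices[i][j]], so indices [[0, 1], [1, 2]] over
// [[1.0, 1.2], [2.3, 3.4], [4.5, 5.7]] pick rows (0, 1) and (1, 2), matching
// the first expected value listed at the top of test_gather_operation.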
inputs.insert(INPUT_Y.to_string(), Tensor::new(indices, &Device::Cpu)?); + + let eval = candle_onnx::simple_eval(&manual_graph, inputs)?; + assert_eq!(eval.len(), 1); + + let z = eval.get(OUTPUT_Z).expect("Output 'z' not found"); + + let expected = Tensor::new(expected, &Device::Cpu)?; + match expected.dims().len() { + 0 => assert_eq!(z.to_vec0::()?, expected.to_vec0::()?), + 1 => assert_eq!(z.to_vec1::()?, expected.to_vec1::()?), + 2 => assert_eq!(z.to_vec2::()?, expected.to_vec2::()?), + 3 => assert_eq!(z.to_vec3::()?, expected.to_vec3::()?), + _ => unreachable!(), + }; + + Ok(()) + } + Ok(()) +} // "Shape" #[test] From a4d5a414e3ae79642ecfd6b7bb410c26a8a62a06 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 10 Apr 2024 12:49:25 +0200 Subject: [PATCH 129/131] Support gather on bf16 for metal. (#2035) --- candle-core/src/metal_backend/mod.rs | 1 + candle-metal-kernels/src/indexing.metal | 3 +++ 2 files changed, 4 insertions(+) diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs index 50149a9d..158eb8e0 100644 --- a/candle-core/src/metal_backend/mod.rs +++ b/candle-core/src/metal_backend/mod.rs @@ -1042,6 +1042,7 @@ impl BackendStorage for MetalStorage { let name = match (ids.dtype, self.dtype) { (DType::U32, DType::F32) => "gather_u32_f32", (DType::U32, DType::F16) => "gather_u32_f16", + (DType::U32, DType::BF16) => "gather_u32_bf16", (left, right) => crate::bail!("Metal gather {left:?} {right:?} not implemented"), }; let command_buffer = self.device.command_buffer()?; diff --git a/candle-metal-kernels/src/indexing.metal b/candle-metal-kernels/src/indexing.metal index 762b42be..9eee97ca 100644 --- a/candle-metal-kernels/src/indexing.metal +++ b/candle-metal-kernels/src/indexing.metal @@ -207,6 +207,9 @@ INDEX_OP(is_u8_bf16, uint8_t, bfloat) GATHER_OP(gather_u32_f32, uint, float) GATHER_OP(gather_u32_f16, uint, half) +#if defined(__HAVE_BFLOAT__) +GATHER_OP(gather_u32_bf16, uint, bfloat) +#endif SCATTER_ADD_OP(sa_u32_f32, uint32_t, float) SCATTER_ADD_OP(sa_u8_f32, uint8_t, float) From b81ecf712d1854598d6c9f9cfa06fbf0093f3bc9 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 10 Apr 2024 18:10:01 +0200 Subject: [PATCH 130/131] Support alternative dtypes for mamba (#2036) * Allow different dtypes in mamba. * Add a dtype flag. 
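The new flag feeds straight into DType::from_str in the diff below, so the example can now run at reduced precision from the command line. An illustrative invocation from the repository root (the prompt text is arbitrary; f16 and bf16 are the values the flag is most useful for):

    cargo run --example mamba --release -- --prompt "Mamba is the" --dtype bf16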
--- candle-examples/examples/mamba/main.rs | 12 +++++++++--- .../examples/yolo-v8/assets/bike.pp.jpg | Bin 0 -> 178789 bytes candle-transformers/src/models/falcon.rs | 4 +++- candle-transformers/src/models/mamba.rs | 17 +++++++++++------ candle-transformers/src/utils.rs | 2 +- 5 files changed, 24 insertions(+), 11 deletions(-) create mode 100644 candle-examples/examples/yolo-v8/assets/bike.pp.jpg diff --git a/candle-examples/examples/mamba/main.rs b/candle-examples/examples/mamba/main.rs index 4802f960..b8c8bb70 100644 --- a/candle-examples/examples/mamba/main.rs +++ b/candle-examples/examples/mamba/main.rs @@ -54,6 +54,7 @@ impl TextGeneration { fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> { use std::io::Write; self.tokenizer.clear(); + let dtype = self.model.dtype(); let mut tokens = self .tokenizer .tokenizer() @@ -66,7 +67,7 @@ impl TextGeneration { Some(token) => token, None => anyhow::bail!("cannot find the token"), }; - let mut state = State::new(1, &self.config, &self.device)?; + let mut state = State::new(1, &self.config, dtype, &self.device)?; let mut next_logits = None; for &t in tokens.iter() { let input = Tensor::new(&[t], &self.device)?; @@ -84,7 +85,7 @@ impl TextGeneration { Some(logits) => logits, None => anyhow::bail!("cannot work on an empty prompt"), }; - let logits = logits.squeeze(0)?.to_dtype(DType::F32)?; + let logits = logits.squeeze(0)?.to_dtype(dtype)?; let logits = if self.repeat_penalty == 1. { logits } else { @@ -210,6 +211,9 @@ struct Args { #[arg(long)] config_file: Option, + #[arg(long, default_value = "f32")] + dtype: String, + /// Penalty to be applied for repeating tokens, 1. means no penalty. #[arg(long, default_value_t = 1.1)] repeat_penalty: f32, @@ -220,6 +224,7 @@ struct Args { } fn main() -> Result<()> { + use std::str::FromStr; use tracing_chrome::ChromeLayerBuilder; use tracing_subscriber::prelude::*; @@ -279,7 +284,8 @@ fn main() -> Result<()> { let start = std::time::Instant::now(); let config: Config = serde_json::from_slice(&std::fs::read(config_filename)?)?; let device = candle_examples::device(args.cpu)?; - let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, DType::F32, &device)? }; + let dtype = DType::from_str(&args.dtype)?; + let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? 
}; let model = Model::new(&config, vb.pp("backbone"))?; println!("loaded the model in {:?}", start.elapsed());

diff --git a/candle-examples/examples/yolo-v8/assets/bike.pp.jpg b/candle-examples/examples/yolo-v8/assets/bike.pp.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a46b8e8413a90295ad20ef90f60a44ddee46d7cc
GIT binary patch
literal 178789
[178789 bytes of base85-encoded image data omitted]
zfwg^8w!VV^a8=;pJcZum{=6Opg^-K{sEAN5z2Ew zXEBurjnU1(1dibK$c&`G!W!ra^3B^vVrw6gbbK3MQ42j_>aHCoxYHpO%Y{|A&05d%Z7=G70D&^vk!x{KKH+4r~(9~z(Tn}Y_UvK zWJwvtD5D`|2yk;VU|&y@=Mzp7(BH=X1yDt(gessXlfzc-!Xn>mNz*k^Eq@-=?bi{F zr{3h_I9sQ_iDaqJeqS=o=J5SxbsmH}HrW%Q{wypUu|+!jd?_xP4rS(-SVe~Q_HT5; z0_Dhs?B}ZvKS=v?OQS@a*j=P)o~rryqE)6!-I^HgC%^D(=oYJh(t;vTa+(Kkl$kuw$Xgdf5>>1j9*8iIy{g{fJVL2M95i zU#rTonED}>?b-`9dNY@ODTW32d(uL$wlc%j*bEjGQ$)Z>ywpa7i1@e%Xlnv@-6A9@ zki*M4d3IduZnd(vwq%oF22}&jAmok8WANk);J4C1-9ufADle|`e(mt~7pz*|D_fz} z>lwChA4bC|cRmr)VM-qszFrdf4$_uba0erZ0v-$l$4Plw384GXkDX&vu9m;HdHw-j zkB{3`aepz#Q}&|l=UaD-J_W!OE5rJ0gmac(#ND49TEvi*9Ot(}R| z?UG5Ur{JumYSe(UN4c742kYstlTUlRasFI3ne=LM{bF7Z62Js`1TIeMvkviPHEkulq>2_k#i34E1^eH zJw`@?P790Lf+IiToB6!X-PkS3JU>=PoRClG9Vc<|)BImM6NQf|t#^-`J+TdX&Hwlq8@S zeVu@<{t*K$L#O6y$Dh|Xv!t`1-&0A1&)OaqLWb;PDEwo@kwIw5*~O+I7rh#)v&+oP zOwLtJ=hQ{@6mP%QWepMzfYJ_ol8sX}>8K?&Fjy?D%jS-ksO&-SnTU;vh)x}D3-^O% z$0z+th7t?l{?pi&Ege;$W5DT|p&N(kyi*>j5|CVNq=OydG3qb$Yuyy3i8_6!|GL8gJ#1gkqWox~xN*{h zM3y(3hg66X--8@Qn?!?T6jbgI$imi?OMAZ1h{`58bBjzjJbb%q34Q(-^;}gXk3`?wSMPgeV z$&rI}L)8zEj^)1Ss{>tW`}u(SL3Z*H^I$=0e>CvoZhxF^%G}!vC`nGo^Zh`w8|M1y zcR)pOD|IGa{OZ>5I8aSQz5YhTf6s)_tm{i#fFQHR+?|q_TAgGKT2wSaF=?Za1x{T^ zp;o>`M|MX1nH5#x5@$?#tF|WLH-FBy@`eCEM;Xi?E!_yWsih{C4gyg6qEE1lx%j0s z9lmn!w&`B4Wrv}#tZh)xDW)^fFqmF#1J}&_g{X8|CLw&-&?)UP-&Z5XAO61`eY^Jt ziEORRCCo%}Mrgmu$l3QDj=B07|j7(BTGP*V?@`pLiUX3Bcv05ln<{A zS?d5pi^QUHl(6effkMQwy zrPdEDtA`tsW-B1)d%oxE(bF+{;>f_k=iowu?7vQQ^I~R-xs4 zd;QAu?%|$2oNAK0sYet3X@`}DwR6D0G!ZfQ^b=4o$hQ&Sx`N{p zBX_n@#XE!%Ow%MU5{xua5GD8kPSb=9tw%s=?(@+tbEK_}Qk%oXiV091!2|dBg{9}G zat;vLMCF+vpQh2FS`9)bJOpHD2+c}{4tswN%Sla=lM7h>x&HN?(XYWb#eGiF53azL zm-u~Qsf~QBebkz}B#b8JE-WPXq%rja&sbXq%oK!_A93v`!D&fq{m~8{HHCeKx91l! z64v3JkGA{ULg7JN%*GkiU}rEwq+L5g;XcaTvCU*>k6ZV+ z;yzzHQv`!(jij5WgG-}3{i3_peu{utfs>DT^bmiSalTbnkId(@XSpy1@1H;1bzj{D zy|Wz0^S5VSrcDC5oPL>enN*@Jc0g)tPGbU&&H5cgEGS)bQe!hu&oPbPA^kZqp>Iq% zH{*wR+4Rt4?)??aOQc=+j1-OXZF11Wo>YzZo%T&I_ijr^HXe4G$Z+2rCarNyTnM&W zSD9Fg{o|4n78hnR~~ zXA1@33pyL*O2p#VkJn8YyH6@-t|OVcHSJ}yqAP#G`q*s7E=7kc?xAY{n(ov$(4j&hTuzP_Ffm6We$>YW#r;rO7S^%#MqkvVl6sQ018 z9P8|qe$(3oi^8PeUgacP>(_rH_GE2>>#YX2rMut)On)F5+=M$ZNomx68vL^Uwlk9f zMi4itk5XniPY{b#pS9RRizz?LGP|WC@qI`I{UBHZ*1hqveCQJQ^M0ewu0;E6W@-FY zu2C)$l}p4VDF>VZ-5G4Si{QU>alR+Mp^Y(mGd(hL z83B@Ios+WnFxjjasbAW3SG|?|V`Hz65hM`SV?@*Uhh9lxl#_?|m$bh;38KgFFMZ^F zn4xETcjSW6>5{l1XXe2qB9n%C1w)(>5d==nVWC%LKM-KTvWk@4I)$4BU8drA!sG_B zIG)=|_NT|IC)H*=DLy*1=tTy2iqp#mev`51iq{;RW->&q0`R42;huut=248m2pu zq-kNcZSG(nxgX@2wL z{%F=ZSu-5{YH{(Jf9bMa9bNYW2jmM!4nh!H6exO^h_Z;{r+22-=T;Hj3yCpZS*6#~ zP#3;d%CvhVgxA-0S~`=}Sf?Uz*Ux%Qnb>WBOi>02zaHKYhMT+K+x-kv6?FzS=acMA z&M1iS9CwZ3m~y1&V>1kZx+_D3Do)efsKY6y#3%gpP7_3Ow?IIyr2r9Oi^ob6{{V6L zJn)Pm0@UW|ARj4Q#N4>i4s>?w@D5Kjlb6Mg@=ReTF_$zA>Qnvt)3=_LTYn?5aCdth zCqJ3%9l5{?H)Y-X@xpl_SE8wjn$c2&De((2z8^T&eGwr)`Ji}&{`c~Ros7)m?SG2H zFL=taS9TLQkFEIJvNv5!9q1C9H9(G_%?k|VygG-UclCM2B#oe2beSpJbKz~1aV4M} zGdD8u0%WFV(`^=x9xWpfo&<0>`7WK|=9s9tAD2!W4m!>+1TT*jKTm_~Lb;dLI$a~Q zp3I*1c;4^(uZl)<+%fVQfIi0N{=! 
zeAnXrAzyvOXDbN^Vx@&dx1fSG6pK<<)NgVs_no(tXomuomvb$g2oKW5>R4s%@I#OC z%bu&6*{VC#489X&M%i|L)E9KhE`9rs-|m-2_Z|6$#4a(_$o$%6=C94DQ0eGwP~Q^X zR@ZT@*mO;e3cV;YO?S$8*&z0nR8WmDLYlhk&F>kskw#H~G;bEA7Vnk+>B|*a|M_Dy z?4zX+iNndFUiFnSvv?Dl$S@w#Er*K2w$%p7GQoA_ayK0q2y02PCpUljnl&z83G4I% z#pf=wu}f1poIw+Q=_+m`b>>xw8qFBV)75V3=V5xi^)GvTK14rz?u?Je;LXh)LwbY| za5a$>D^B{>t<&bj!mrLf5d+~S=+fi;QuobGJ>B!xktwj}N{@o_5{&$FZ1c@FT5ZcZ zXeE#APt8g6p~g)_!_a}1GOwz;Ftw%iTVnr~F7GTfd%4vNmnS&bd$Mw8YhCQpyX~gR zR)Xk{cAK>i<`aj~wO z{gmTn-x>*wZ0A_g>{5xaMfYK~!OFt+s7B zJqhmlsfpn@WBXlQ8|Tk$(A6GqNYIg{yX}4Y5V(g1?jZ9G#1CvGG(Ec!imvn;mKkEc z(E zeW$jM^lnqInLgpPVR^xsyKO&#dd021&@U|UGKwF>I}TWmtBMM;moq3f;MUeW30Acy!pFNNQ}SP8(NetEn{D7qPCT#Kl0ixEXyCk zBJOxVSd~M6qUfUq>Ho&kSqHS$H2*#nij_hO#kD0sad&ru7I!Fa4enkjP%J>OVgZV~ zI}|54#oda#LxDhVp7;L#%}zGEXV1x*o$t(NKaJbe&H|KO2%E^T9j|w$6f{!DwDLp@iH-ucyz4A(0 zNXx%Wm%d7(1F%B4=L;MhU{Z;`GDRM^LvqgfTeQEgzdX2N zCX+Pvr0nZl521aw`o{<&q2>fuvqqvx%st`wFnmX=?k;O_4ew)H1>kYas{Qx?Sa>}A zL4Ug`HjQ`~)teQHi-+{j4?wGUe{?uJ94v}}9Kcv0ejCV%QRj(_BG3B zhA?U-o~bX|jSxHLL9}0SdF?Ph$%2&@<*DDvR*V-zp)VN=&#`YDf|kSBK#e|;%kGT- zbP3BxE?Urt09x*h>w`gAMwYO{mZf(^VNz!1k#A^faL5NW9Ql#XAHQR)8?;Ij-?i<| zu>UcXr)+r1N`>#d^x!G*GV$oLcH_Bzs>22uoARw(U^3mx-a%RZ1AB$}qd3L~DS^av zN{QaN{GB`ZOxPVCU74=hdg~oJy3)u`RGSoY{tnKRqwzhPC>MUrn(F>4E4^( z4Wo_Jv!JbFC_0EDyBjanc$#PQlFvT&)S0h!GOiW+=KlJUBEtOJ8#yWt+yKAK7#mVS zvf1($A7!UE|3$6DlcH~(hl>fnVv|^+Kyp6E+0SJr-}}Ohax404;>M;3A;+f)-zPAt zY4&1Xc<;MR_^uw+pckx0fjkpdK57X`r)7ApdEbROaxQ9qT|j1lDo)K{lT{>+c(V29 z0iA%HgHarV3ZGEA1%cb!mu}3PO_C&4lA}Lfe7bIBf+b zYHDde*=tEG?IKdbKj8s4;_*l+o|t6#mMG7#PwtzTg9+Wn0oO+d*~1n1l76&CARu(u zM4S8-zdyd<_V)F=a;`cl8QaIsIeY-Tko!aes~#t$SKx+vOYsk(0{Ms+#pf-G-<Sl@3B0Al*C4*?E zbdh@k0^`Eh-CGs76x}rqh5APJQ*f^m)wx#bwpOvtRyh%Kjzt=md6JU}gDDu%ZbBSq zoyW#e+cE=m+<7}@Adzi<;kKJ(DX??6>@YKfY#1s}{CX~jwTQsD7KF)p?-MY%IF300 z*oFw~HM*$_0B;4eCLO!CZBKvAGt2Y1BwsGP$)n^9?V?CrP9LyyvlYk_Q(>I@nACGF z0KmEEWT1I?BX~CSKmo?!yFrUTVpcG}D*F8FD?T{f^-+9vz=r|y_XzGgrxFup1+ijF z_!#M$CxDY<0oZU>6S=$PP5P#R&y#;I?Sj-ImKQupZYC59Bk|RDlqXBGxkb<2t{*M~ zlU#$3zE24xlkeDUp9&1ZDYhU;QMdFn-0te0>62oinAClAwsNqLO^MiSe*AlzuTT)# zj8*f#4?G70Dk?aJeXRS@$i!NTueQ`B;!to1&1D+KEUUU5+nl6u!SWejj|c-6m{kX- z{J2v+xZP8omH1a-fh1vq;wt;O7Z0m0-Dv7I9mwlK<(}yybH@Xi(VY_d`IC;zTi#_R`wQ zuINx~2Fl^WxN3l$ukXyFk!7Pg*Kr&&AK#hZ^XUGGDRV&8VvVn~bW=b_RQOVY;>ke_ zEY)iPv1l%h=L~Ji41$GH+!is0Me8UZ9v_xcJECvZzYuksZmDeNzWZhxZWw9xLXgJ$ z)+)`2g1YO{SseSi{1lbWZ=W=9vPQRwc8}tkbgz*KT1`J1J!C-!uydH0HQFo9*8 zr_-FAksLeE#KDi-zL!I(Hr?PM_s5qi$tAeiFNJN(FcML8 zH4>3O4GYEn3gt^Zl%8Ow#h1(djFD@&fF&{Op^cAcP!8_j*mkP~9~T#vGt^+1ADWk{ zoH)dcio26rFXGiy14%3GbWt;IN>|%UCNvQF2I+$KVizPTcp4TOc(;Q~ku=2ak@O zyoG_ph8nF%k*S(<#e^$D4LoGLtG-s@LxafePP&p_rhn{P1pxd!Ua`ipFL^+mthZ!t zr{Mdkx~}SUfj;`*O>=qYfiIu4i^Rh|NL^~TB#u)DWJPZks$#--gP3f?RV*i-G6GPz zeDcfCjG#MSV3O-6fwIrio?dz~-zfTl-$}n7&7hoJ`95eRfxUGlLQS%b8tmM-4)7;z zwa%wXbf~gIaeZx;A}BG4U^}EA9HPe*m3L=0i7}v_Az0U4dob=JeOA(Zi?Nmjo&>$c zs>MpfG3O&^l#QzMEvojt#3!by08mYkTrE;1%spnYOi|;NiGC})+VfFhuZZGyfSE4H z{$w$*2rT>M_5Tz!8wXE$m6lZ1gr86cSFB{bPjXVQMasWN>x2#SZsBuW*lu&MCutoa z%$TX~y#Pbh#2DWxzXgLEUW0=Q(hoI3Yd9f?E?Dg?&BuIaIm>|QZ?=*w1lYroK| z)N$8$Zy{<%vuj*4-T9DA@786FS}@<~@`?ILuiRg=?Cqizia3KbDdC?Vyo@*x7J zzgh^O9oRxkTESi_;3MNOm>0HXJFrNbT z{22NhOzYL3a#Z3SqU0f%f|VEdie{PYop2MLhSXIi&h>7AsO)eId0C5fO?_R1F4Siy zeu$pSa=GhJ0DziFC9(Ie@T_!2k&o|7XC2LjjIO9Vy!CA!bIn3o_g)KA-(K~T4|&?-Sr#N=6A$|zz_J_6SLNi#!%vxOLgTM?Mly_ju+1zpQ@Ev#VT*0+&Z0^kv3iS z6AI2TUzIhr;C5!F-v)`7X4ciQayfDOdwvHq0&}Gbxt0FiJTyAbycbUJ_HRkP#amMwkvpk?D^(lPm|>vMa`Yi!>5Oi$hnGvv~y5cOGz-V+DQV 
z7y37w3pXsff&;EX0Wayp!zEofk}#L|zI2u5LGb_(@drd)1<|WKLz&>2Ak!ZD4Ik7nN3srA{hneAJJr&kCthTT3Tbm(V%8IHzs(~hXht9G(k^A(m zgnR@sHjq$JyLcXg-+NQRRJfj319n_0B%jkAe&j~+Cl@{zM}ZiATa=VDHU3;$IUTXn zD2s|=LI|xF$&H~D*gJUKzBTh?Q#`#CNm*~I^xP7t6!VQ;c<4NCuS-QHI^JjBCmuAW zLe3cLTcAtpZsS8(;IHVasseWpvzl(1<>Ml7_U=!;!&WjEV>8^Xwbn9fk!YH029n*nTc_NU09Wi&P=3`_T^&|CqaWC3TaD*U)ne>?7Smw# zPWX~4cKEp4zzS?}aOil+(~QEBj>@+1rlEd4@D`zDKorVSy=n@)0D(!po8&6>dy;`w zfESV*oEia4bQmLU@QK5cPqD)Kt72K%H-#|kq-Wwbu6&RS7Bej{8Y&Xbq=(UU6y}&l zLDrzMaIZv4>`83kKe@A+MHg!cQAlfS-hK;3ImF!d6d0o@?fNOI%x7P#Skt(nVCBl; zmT-M2qq|uMwav!LZM5I923n<5&0Fkp*1XV$A6X zOnIeeW+*UvKu~QpxO0g_oVb`kLlh2*n90|u+oI+M1af^W-`9MI2VvNbdv^i$4&Zmi zT5zVAiPrPo=n4m+*kv^jipsL;o{<{W$wQ$`n^2=!=k{eT0_8mqHC!WaHR<;DE7}e{ zZS71-$hY8k$wMnxVx8*!ZX&eFDzYP%pKXPf)&|Qym|`X?ECegI?Bxm!;Cc&^(07DM zz15KyPE@Y(D(VGX{gFlkUnVkTl`eD*2vd0rcEb&>YP5V_#eCoC+qI+G9xVMzf-(Xr zmn7YSaB)bI(iS+gs$fqsFAiS4CB?;EZNXWCL-IOFNRE_Kw1la49Qn^(pafxv)(s0p zdi__d;%2@AvUk`OEp**0Y7D>!Do#Tw!07%d$@ke{8D*5PF41$fz!u)JuXO!Ub{c9K zJ1oR5;+$V@Z>UE;6iL*)g5xV3=A<}M2bq4c=|rzrVBuH$pylzgI;1>rd&qH13<0^` zj(7)RF>Ph~v7(g_J|G;gHba&aWvdO*--}|&k)+~sNn8QoA~U%Qb!ezo&7dmF-=dT- z&vCm?RRf;Jq|9G=`Z$WLj_ONHnBY}0d`*vBu$&xK#VMN4If0(zWe~WP*O2e0Ce8Y(5q3G$5O? z7@dZilf+r;)n*c%#oUhj?#-8!?vOf?=*UMp5^%Qk!k>;Ig4ZH2KkS3vfhhtWuxvqQ z)2r=#PQ}ODQfFt(Q&lX114velNqT7_Fpsk_efPiwZ500b-mTufYup)^yKBG~mK<|L zOisvv1t>{$fF{V~K)<&WF~z?W90r9^T2=(B(!vg%`tJ4}|xAettV(E;VY*>}A++Nbg9EBE@1S3JlTr)iU^>}+}mw_bN zy*&0|Ut-MFMBo#%ZPj}^YY#*A8&*`c=ijaR@f=}%pYO3h)#4-!eyQGSzNxU6TE#W+ zE?`<4?TOF|AeXfKF7`NNIA%GnPVjZ2h4o%bHm~5Ze?&N~2y&!X_d?(ct2$}AGS~t> zh}ZNFP&GSju5O!$cJ&k@7j8k4*4|>|tnei(=kWsOtHrpuFrS!abjs67pu*xy9nPZ! zf+@M0__(#P_$I=FKgZ_G#i=OYq(sPD`8gDKT`nn*T7WB?2T=BVe^aZvNG9;#_XFq^ zwytu}FI51*?8CU+t($jEsC~M05WDOi!~% zD_bHnWjUUtE@fh=I9C$>B?cI=q~5eHpIi>QXhoaLG^%GDKO3$o2`iJPeGOZ@<8uZu zFkGDa&L|qB575V!D+Lv|T-Db>Ul_TC7_z7zkFNhhSH`pGp$=<`y z2z%dPpw%W^izNX+h<+I31^$ZfOkuTnn)-2BOK7g=1Ozy{{Z34nN@SJWsx)hDouOJi z{RuH;HQ+PafVH6uQ9uPZ%uAq>-UGsg;VZNsfo9peUE@Lc&4@67X2T2{m@S{^%cpuL zOwOA7q-_FNKJ{GI^@mUXOJqWYr8Ra1;0)*HtPp7CXBT9~!G_yXZDWxdf19}|s#c7W z^Yo%2hqr(A5Ozq~gxFrCJ+sy~uv^v3;Iio5TQIX>ka*&;;l=Bz^;H8uw&l1M}M(E-8uF@c=FZJh>>WZ=;hx-X)8j-`x8~sX!+hD5l1ie2&rB#*;-jIAp z&G9uc#)5PY<}(Hc$BB0Eah#+qYc^XZ+K*4|4DWm@basPakZhyCr!pG}Bb{zGmNF+x zWm(grk8|E(yxrzzN}Og8fjE29^11}>r%5SI1_a)5$#QCIW0x*RF>L^p2;k~}A2rOG z5Wjrh<`18B;G&3d&;op8wjVzY==<$g6ud$mzF>H0N|36dL2eG*Nl8T}&4g37=j4j2 zM5nrHSaxtG>YJD_M*_SlTTc34zhL+kzOrn<1g!bcGv!ENIcg#e<-U`ka27n%8#k(P zJ@z!?_y^FhBI9p(KvZh3@X6$QexFSO15q=tZuTzv-JYC`8MFpVYR6a5wt;Zv1urd ztQg+ze5yI%K-i>Dy$qR`Ry00`YbK!~jP z0O=XN!fAaRCoQL$*SjYI!xS==hIq<)J&sB{P(xs-0^5CN7m3z-g z&Qv=tODM>530)ZHQB!Y-KVM28hxS5yedHOdpul@wyDokF87ySDEbMdH_I!|NMc zc0FsXous^HswiBo7BS*c*O(Y?pGBPgcrnLx2N6N zPeznVZ0ZL=TQE^*4+xck6eoY%5@o)~_lh>wv8g7#j%mUz8qHAvHS~Ty=Y1_ecfEwC zk5it~B~I~+!uok~pH3if`m}PF5l#1F-(Jo*2vLTe&Ro~V9-8uAPPl>ni7sRAnZB_h z#z5YFcg7$fljo-(wU}M1fE9C0rUljgm~%-%?k%)OQ7x^*DX;|W2-$hH{+4HMp31z% z$BC|$r4LEQl9K@7Zcr@IQYT|M;en~ZK#plu3SJPB?6 zwqmIetg^ngqtp!8}?v)Xg$XEGi;ma_Pn5vO8xDJj~@+1 zJK&m#Qns~T{gfBOH{^AlOaB1x(>+@P3Jpdmi^HVLQP+lm94X5+7xhNP_duR)L zwhK5yL>R8D7_6|I8%?|`%T5=>hT^FwjPa^hL_AlS1*MxJ3jk=r?H1*to2@CfJBjuv zy93lOF^1)iF9jdA_jqgjRp+R0(aG-%?y4?@#Bset{=DW=l1m~z?G=IYM71yo0BQg( z&;Zbb?iXq<<@*>FI>qvwMwOgt#~Wd#`6#pRxsoDbmKwPv+mu~f2NS$?if|!be`8l( zyw}pMG)s_uBPytDFjwHW+-}PL&(kdh{N#(7_;%?rjnQ;jE9zY!<8sAp=`M__nhMSA zMefX!0Z+@{rvCsv;&eL&eJ@V!(7fXnwLYAT1G@KE=a-sY2n)Q>XlTQ4u zk55+0J^FEXg?VMo$x$Xu5L=m7_w#s+Q-A%QVS&`=GGHecvs+i%<%UnL9DVF5wmR4-fc1fMM01VUH0! 
z?Z-~1#?D`V>jU-AHMIM_ z-{#MA8(Q(WoD}AWShg_Y@hoOwYd7VZ(h3RnZAa>l%leyo-zPHOerln-6qvZqTXXzV zXNXo)E>p)9pY!c?Ory8p?H+ZOYNY+}&aXc}vr4mM=Pr~xXEmyv2lwmpCNH|mUMFo6 z3GK$v;E|aS?qKh&4!J0qrBTh!MGftqcizhodwgyG`B>f(xb--1f0d-Y#Fxr<@DDKv6~bt zVbnQju=O-hRUurK; z9Zj?YS0)g{aNSNCRKczR^vUOUZe0HWHPxm=e><=E!IfwguU0z8Y79^BXl$rWnopLC zUf2wn=aPBsVIF6gMiu}i^j22#Z}ZIWYy0=CB?>~&X)b2IPKboYXu3@8NQKz}x6r4T z<%{+A@9=-*^7H)zbbeQiO2tMvEzY_jBp?D4Cdu|(YI4fevx)6F&%1U9jVY<4=Z9pw z*#)K5YnCZ|iBw-fUE(IN^)1HU$_rI)Eb*^nBtp57tPS982-3IKOl%_mNE7z$p zQ3In=y;Xc2{9Bqz&^@%chcvP;YHq>5Rvl>~<>Y*REF} zQS0~RRogpVB^(=0d1s{S9-1}TJVDJI?6$As1t*XyU;}{y_M*QJU(_El#pf{`5I@WD z-W*q1p=dsCF^_y;`Ue=13bg2Y%#Zmx-QiXw`_ym;gZ6B_D=$%8-wfJTC%Td*CsAOe z8BGulp|ClE|DF+?4!=~}jU{?Tz<^*3+F(b-wvoq*8_pyhX|5dHR`*EBf{8US08|n~ z4-d}F{6aya(R#|J2FU5?2sEHikKWD`c@B=s&1GU~%j+!9JPFUT@@CI}8Y=F+kPuHckX~m%Zt2_U6Xlh)v82x0_&v5bI zcQvE?=NL;&jOzL6==BnG?YCcQ2=29rC$m>hAH$z3&B6b(vgVx+hj2O-7n0Kef1oa( z1u>}+;~~e#e&a=qm_&NvIO%)6Cl)^V-2uSFfw%rs`7|q7aIv+$1C0xpD>f0)NW)=q zBvpczS63k#)DiwlAOF-H6H1Vb>I{6i9z^(QJy{DMf}cXapHX-G10=i*EvP4E`~ba@ z-VUcY(a5Lzg;Z;UjN;O=A37udXA*xTyd3R)_C9gMz3m&E*@A7I!))l9w&}~1cQb!Q zkBE4rsr`Sy_+dN@&G%be;gQMvM5Sp$DuvY#KAk=ZUmJWaZ5Woyjy?s-X%5zCJ-;j1 z{|6W!AXVzDCl>rdogD4ACmv@|Rjzqz@e~{Ttkxg&Bi8zafFD$sxkvZ)P*n7_?wp+$ z*^o=;(BG;n8lqG#{?P&eiH4rdL3hDTy3UGt&Hr!0|I?#++4rsVvc>EKWX@Q;wD}YV z`LDmUDQ|WV7rYE4`c0Rs_k{?j5~nP1xYVs7!dmo*Y&uM)#iI#XlYG zVa0Aro1dt3e(Y(G*k`bjfA$aj7E!v~W_7J9%0)TP9@fzj^#!11;d1}kFFgMFW4 z%jW{P%)HHnVwtVg`D2A+)5kf(>fA%qOz5{#e2c+;98@jet65Y4MOK6>6Np zSuY1_NlyG4uejm5DsXmFA?PTb9HECvfAR8ry@Tz^XdJ^L)Y|zQJKc|%U-s8>ydu9An3@=( z#nk#DN@1u=@MK;rQ#w=Ly`J?TwqVV>NH+YHgX#NILg1&d4;(C*0%ikpZjg=x?HC`P zOT^gaeZ%*5Q&7+W%VbmQq|T|*JLaX0(bvq44Svc}ave|pFu@jh zRtNs&{P@H(O5SZOPH$nFO4RX3Y@lJd|4)8FY;u&_Q=N5~vrBgwT$;+@F01P9{Z2C7pmkzj6J%TFbrIPBg3MD^t9h)y4-B*@kL; ztckO}?nS<>P?(n|HM^#Y%E$YEfbI{4R2mgebEnTo26s`ER zNXVo3`_2V7LxX^Vq+;94cW*#AEf|!|9~!hN4Rn8v1=^mUpMT?=eNPE3?Ysn$;6u~U zTpe<`OS5~N+6~(YxbtnyC$Q^-&q985H5Ju?M%=Fyi|ko^N}m_E2-NPfwv zPjI4IAsxMR8@l&I)*2p7wf}N*Wgdj{&6DaPG!P51wkG2%L-m`z=%`F6BBA>daLT&* zbkCYyy%gl}4-HLo?n8Ucen*B@jr0pK`2vyL^)_fwTzLwJogzrfI4D10hm0$QDBrxr z(C=sv%s&b~8G41@Ns_*Ny&Kwv@1z22dFcpn;ud1C9*nzyrC+Jy8S+KnAlK`^d34&y@HsZ_C(9= zX*KvX>W1ig3^Dhhs`@M3u4+L^#S;xy41SbJc79@3QxAdEB^ery_J_KBj4p+SLxM_9 zOhnDL=L#E+@JDGRzviv~cf=zl`)zvmpP;lL*$X39w_@_0*bmLEAscelEF;@F9+%Fm zVmbh7d$$r2f8|U*+(!E6Vg9D{Cne`2g8+EwPD6Wpl5sfeI}E?iFW*ZDz+X3&9AwK5 zFcZI8tay#|sJUOQY9cJ2EqG2GRN~aa1&x0uNUm|HjjhDsKEwW2BBA*5HusqxwK&7B zyPuGTR?%6SvM~Nt*37Sc&(=p8R!%E95f1N$H@i=TD9(Mh`m~FLAWBYuF{XEZh^SKI zVY{jDI=1?&v>jE(gzP@{j`Qq;Pd2b9Zbwghcd?1@awC%XU-<4U7f}%CHN9r>eOvV1 zXk6tOc`tc95vQSN6)#Ry7!SD-!4iC|1yDkGw8^{CUc}^YiBu0!UPXZ_!HQ*yE`>>k zoyMk4TF+e+O|_Zz>AR|{;Lb%vIfZIVvgQnajZbylH()j^YA?K;2h*5F?=r;{u|<9A-G z>osu{vdILyBn&&^sp6{lWi^`%mc<%>L|T?-s!yIpKUSA}806MBH&i!(<+x;L!vE}$ zE)WJ}|1w_*f3K=58UaYmK=TZIcRrU}wyNY-lBaJ+P-5s{1BVU22ooI80z?jNCu1D> z*ZM_p$;_^%&$Gk4Yi>SzlRI`EZ&#cG7l@zt>M~-|Pchw(~xnO*Y&?H;2cWH3w=5r6Pjf z!m)8l$pT}rxEI6Ad>&Co3FYDMUerXa$ zyIGyk+A+;KsT67Py-R`-_hdPerp!O~Yv^wq(vof7huO7g6{4pTC*yhb^j~2_8E8|| z-qZEOFV;T)K1PX3d8xXUaG9+(5GmEK-X|1GU*X|Vg+Rrx6AB3yHasDat7~gmi+T_F zqu?kd0@3|q7Mz4Dz31ua-;6QUAARK{#k;BuX$-F!hoAt14X!3A-Z?K*r&%mOFHu0`lL&m&3|8s}{ z&7Aztbo+g)_Zf{kDRDVG{{Yq@f6T23L{h~JC0@J;lXG?j9pbut4IQ(E?g%oe+E=kI z85k<98wXXewv@2N=F?{tDfFPw8XLfAfh!>LJW{Otp^aef1`ZI#ZKd9#Kg7eNE;BW$ z^~>~h(c!6^L4L}msRAq5yeXVZU5DR#1x1{bMNI&%j4OmauJbFxgN}}H^z_*@S2cg8 zBQGty1gps%6bLR@)cva_wSs;v)?6BUmKz&rYJK1M+==@vZvp0@V!pz+dgLVPv`6>u z%VZea<=mX1xDS4t8MAS{C?0hB+&Ow>Lw)PVTh#%kY&uv(GMpU46+!4Fn1;3l0fV^U2t5S>%>zl;xAn7}^5QAD#K+cO`q{9Tr!EIeu 
zYjk>9U$}}h1_pMwXI|F5#IbtDN0%Fq2%Qc40^Jr_)JmNL2e73)8@D1=fcU-uG2sq; zn`y9ya2st0|C=b;z9_wvRRm)QQZjLUNoH*KWK>uxFr%m@fAv?_!NZ!btuTvQ(_1Pm zW_y6&a)Z+(;Ic@xL!e(~%i4YO@KM3E>KYlQf@@^D4pFubL%GT`)MuzjYy`xkLe;(e zD5=KTM9$pj|D=(AqSZMCQI8<(ObuW6yE6K8{swbN1q&&-8?)^DdR$Mhuq3vz^g6?F zy1?Xazj1DkjNcYvpq-U;T@Z@a=UQ1ct=M?Cfda@iw5MsC9T?S3Fj~LwDhE54a}+@X z#9;aFRSoO4->2GtP+c11&E}t`gyQT^dUq3&D!7L| zZA8GPdYlHBacOpDw;IQr`1At6nlDI#*M#r2W``Ui_0V(VX(0RW-n}LltKMs&1Co4a z6G^m;&=*}*7O-#C3A(hfasVlkM_{rtD+j6>&XDbF z=np732=Vl>v~ahvBK5K(;dAu{W}@!n(MF|Yb5abE2x-k5$WFFm7JwD>CHJvzX^T27 zqL$bOGd{@C{t(s8;sJf={V#IJ?9mq-YE%nuxc6m-50l3=chD+uhFOTDUU&w_8`-nw^LSP-$N?FF8B5wdi6P#D z42@#RoW0%Gcqf@QSIE8~w2vy^F|ymjDLndhXG?B*^at3HJC>Kv@>cnVc4Hem9j!`k zUhfmRjg|L{g+8JPWrhw2lU0>z99xNOQBDPgAm10zmNwgHIK>e+JLtn>{u8O>pSZHA z!>`tV_v-)aNyc-2Dpkx=;TV9*6JQeOG^#OnvC8n5{Zcp%E2u|QBr3h6J_Upw7G)~R zmwC++K^<#fT~9U{$TIS1!=Yx7{VizlW<)#!hTVeqeyRr!%rHkkH0}kUqd6YC>9z9; zAA_IjVw>Q|4n=PFdTzOn_GGK4t<$X%hm(3h-V2;)h;d>xiwYHG5iNok58~T7{{Kx- zt|2?DU6c%1f+{*}Dg#OEd;H{68k6rx13>~Plkp2kmRE&1?qO{C_jIBx4|J3F6gM4n zQBBy6vN26R2VY#sCG=Qbf%YvdS9dmS9EI+7Tc+cTCvuXQ4Od%Z5z7r7n(GtiT&Fj$ zNRHWw2G+)auv_DtJ~~De_2BZOcAy(@ztFd*nY}7f;}1?hp5A z#v0==6q#t}i6ptb_tKxmR)?3b#&HFBz4t9fOs*Q9{{ci(8kemr_DU$y!!y8d6LeLP zdkxDMI)+-d(N2k;v&{AWFg*d(c$XvDsT;B)4b`RMquF&UhogW~x7O^%fij<4k$}{R zAO=c4N8uo49oL3c$db|8xX9z8QJIFVyBI-h$oRA1sS|A!i48<2O3r0kyxq8+n_WON z^K{>*g0Vhpu|MWp5e7+FWBS3JIK#^1`Dq(CS3jUu|7UBmIvOXj^HHRO z?auS9Q4#T5jb80o>xZCTak$~rQ-l@x zYzCEvw7F}f?wPNA70JK4R$Cf)u^x3X*dlFqwkANAmoF4xB~VmB7M%WK#MC^`Go@&VS|?=MZP_)?zW#_G#4;($H}RO&&~RWJ7`&b z)StKABiXy3Q#ACYmX7V4i|Q*LE7JnmUz@g1`q(PprsSr{=3~SJbcw%x=Jg-E_|^9P zA0Qk0xES##ys367br6Dt=Pbp%LGQUk7I}wxm_bwaQz3o-024z2$6Ff6hvE)O4IXIg zNERDRj{U#SkFUkq^pG;#n~ILMvG*w)UKC(;Z&16#jSF7j$=>SInjcY79$`QgLT??7 z4t@1s>r0N&Coj!DfbNWAecBm|mJQb1Q)}ybc}LZgqU$mD>LH$}?qFStQW~SgeYcZP&Ed*~rpA&H z!2>Hl|F*V&0OJv;x8oFJLA>F*+M6f_$Ql+(LT?+}h9YV`;Q%&j3hK)I2QIym`s67u z-;R3S#z({=*I%hQO&15WS`{}6U9RXjF`jQM8t>4;Dalc@dPyff#923*ShjyjE}*mW zF4J!+?cCgV`v+Jsh*t{dTlNok_ps6ak~0#Vztad&MS=THBxGgl6}eY+bW;9$4t8gi zVa>hN^jMTc3v_h~Sa&}eKlV$A^A#isFwgF&T;4w^g26!!IYvX9Wj`$I8zEYemQl__ zHCJ3ndA{C}2ByzviOW6XmakPlf79s}@n&DuIVqK|!oG68&Cj)=$nQD4;5L}&>622PKAWrl?pOUkkH(D6wqrsKFQQi~g1;{IiMZPO zf(f?We*lf%po!+E!J`i?7c5J!4!8Q!E4B3p7?N^&!1OLP7$hu}yQ%rx>(ReTr z*giAXny>>a8zp*$lQk>wG??tK{RjBi`dr+pG=ycAfG_9YNMjRk=hKovjN$)Z(h<~N;^Fh<6qF9Z7;e*FQx z<}>W?1*1;A$1(3MX+!5b#O=h6<4S*b+!L?Fs^BYinCdsw!yc^3fqIB}>sQ~*F{**s zS%!k#IK1LzBPopHYTR<`N-Z5>)_PNDq6|sLvVs`Fu7*^>fywonHgF-_Ija*sN})NL z@hPLUH^R9BpA-(FerP8BjFaXlqOP0^T1(v>^bpRHyQgog_C7(9qtx|6mJ?7>zsC4? zv~KT~Na?t!SxolM<%SojYMU?HAI=>xvh7+JIH&JtsAY|2*8>z}$ERfQgi8ZrrI+Sm|?uT!DSTLea?J)5avZqbw#~D{>)I(Y%7lxq=Wq5Ue zvUQP%Z0f8^NV!9?nI$SYNlG0zRMvvULY8T{o(4cD0b`!aJ(jOizvD<0U}JN$`x z)_Za4zi6!Ikpz8{`xRB2+x3r%pj)v{&X+EojiP?D0ZNbSU!(uXWOZc!mms#&poaAi zV6|djExIL=4u&Ux`0lKaw;KO?6<1d*jnY1_WLNVKv*Dd-90FBXev*EcmlSN}VoPI# zUmwaEHo(y_G*PuBL6ptKoT9m!L;`P_vnB2_5pVu~)Fz)7g zmVQ36Nl`pC{ewJ>XGiyj5rqf|k-{sCYHw5fk=ESE^gUhv^M)Iwx!XbB*~RbkzEnfSb>8@FYiHEAZH_?{HXTyk z#{Q;X+y0Ovbpm7>-ns)d)|dKd^v;=f<66S^{JfDCuwikZ{f`fcoqfvQkzYyz#m}*Q z)p{PmCghB)2npGSV*(>Mv;OC>$VoxtC&cKZ#V_x1JtZymPWcB)2T|m+k@*Chidt_? 
zO-<-OYE`mji8oe|7PvAO7D&Jv*pL))SwC2U_B>w!6ghPUdMSB#6@ zcJ8WAawO=uBC}?R_c}-HL8Go>ADR$t$V@;iA2cH8G?l5rmLsaUXO#yn5i(PU^Ot={ zmHfY{0FFe}fvru`)dpN;DQ3U<$AX2O?FspZe+8vYi#XiYsv9&A-}Q^n98@Cw9DEVb&$=Yt-Eh^MK;AJ!?3tGRQ-81zNb4@*NGQ3TxK- zQ|GFi##zniBL>x4ef7)^!^c@6`;RkS8V>i!jWK_DAdN|7^qw*DEsvt)^T~fe?Ka8A z88!23cBKPTnt`@vWKXhezN%Bt+t-S>w%d)hN!;WaWgQX<+Al5*ZQ*FOvH6$CGKhtO zcZE~OotiVnNVfzzyZ@@SB9bj!yhD#u4U%4HQaIsA@g;lhp-7JZ!Q~E?xcyV_Qumv2 z?wd|=Tcu)&6Xkej`tcjtoe(xa3JQY+XAGW%5LDDnMUs~6N1AellL*HM&q%%roCgvn z?z*flK|Y9q$|4s4V4z6vz?I+Cj8#24$U^&nEPZuAQ}6pW3W}(JARtIgrKGz{P#7Vj zyQI52C!lnvv~-SUFhW4OVf03KjqaH5KHuN_|F(0UbI#8FT=#v2)c#R8zYJcNWz;q< z?RgQI1us93!KZjhF&HyNHT(x{j0^!cNbrL8aVifJg9m>zB)KZc2#CRN5nFp4%$qZF zb^7o9`&>7O>Mf~Esef8c^R{-%)ru^F;Ae*3ZY2&s3v9_*s9dNx6$@U$IyyRpa87L_ z+}>pkB(Xc#1aMAV00^Xu^ygkEg#9oXbZ`71t4viFB)c4+Es~f%Q^9n~NPXDO>$$ny z3tlP3fCkx-5Tr8fcsd!mm+oWTSSu>!TU)j_)Uk&VS<3dkTaTEB>8XYDyUcJf+lGtP zLx1b#wfbVr#-LhsHQ2b{m2!`I%v~yMY+v`3?`Wkqs&!E+vLukCJ_@9C`qXhClIaSs z4}&R*I!z2}1X9P|kP1xIyQf$nj#g3Qt-m@i^)mHP`Zo!rtVB(#0Vk+L#aw<@Xv&g% z3nxx(BFqJ=5BL6og@3-@BvPhxIG(K~pG+S*b6319A)jqwwaZ}RrP&KNB1a-@h?_g; zP6wi>>duw9Mg@qjHEX8+dg$dSR-i%EGXRsNI13f0(&Xrj#HN2(Uh6{VV$6CqDRX) zn3KHaRi2b<<~jL2OYCcrzl(17qxZG#cbbTjk%#d_(F{;Hho2&T+!xLo=DS}dh6LI< zHko42KzIW*A*Si3AJ@tze;gzgq(qu6$HT3K142EZL*8~XdzGrz>6(;*TAkED4Qkx& z^xg6wgp1}r<+$=$=doLEK;GW= z8*fZ60l?M_+!74;_^?eMyRN8~-^j7`p(bWZoMw+Isb(187b>FdCn2WTpGi`oTYi*5VE;)r#NhGvbd#4j4~09 z(}>+1#og}--zcOeD^{fuO*|W-jE6+5Ur!CDZvlfro+T@Nb>HwDE44D$qdF9 z!yraNL()RGyz~h+@0Diuk;KByVX(!r0M{#Z+{Iv2_C~htA95@COL6Z?^%1#-Fo{~k zd>TT#!{p~vI4;JW^7+~OpEg`M#p|YvNsbw75{CllM4qTRI~(Q$%4c^O)J{*8E=czg zed!^1;uF*l9pnU(2w^P!x1(SZF|ds1YJ0HJ{fmEC3W4MFQ{#hhqzI7C!CO@l(|B43 z1se_u9JIf;03S`a_hAsBef=_`S#m0TVH2u#G8URf)V&>^Y;27!GK{CT0T{i^Erw*BgTYT5p{b<4VHoznN`*r)7Vvlw6RTP=Sop0pJhmB!}g z48*zF+7qz4y=nAKYxecpsJ@8+!m3|dV&i-}q?#Rpm#l|n znq65fvv&jWWC}fLY~`6}p?d*sF-972Eg;cfL?n;C_~hHZz?k3Jwv(4E`C?v{_qWZM z>i@WO@>kKBlMCllt=p~Qd4qW#uWhY=SgGLH`$z8mM!*=6+<@8FgGdlY&J{7gnD}sD ziYUv%72@*`4Ijyu$^_K4bGKDNAs(kN!qAHrFZ&CQhORBTl>NstX77m2wD!1SUZLH# zr6)af7r}8mrt6WxB`Bp71cEoAfi@4qODmPAbqz*u zTX%$j!ZG-Gu8@=Z`893HJz`YmKZ}L%LCH;n1gIGrH(JXBh?*d)&0bh9DV$$@3O5Hy! 
zJe(QbuO6zBROS$bV4;Wku;P0YsqL4Zj|F^vU#jb8VqOF2DwI&63Y()X1m&pqSq(Nz z_tTw!!m>Vu32~_q`~K3q)_y2q;ZYo?utlzhnn%p)lh7iVxr$xNzM1-;P_2{4_?px- z8;GAQ+mhJzV6Vh8Q>xd2J6P1E12FEAnJ#!NN4=2`x_u?-!$gA|imoM8u{8l9(a!y; z=gnTnH?>O*iDXQt+@Q;F2`=G7T5q87KE!g8gNzAj#{TUaoujni#<3SoGCLbfJ-`5i zxceV=><}`AjxR>Uvph*;xet%I%VJ%Cq*+3}l}U>ae7MnF95>#8Hhj!u+j91<5t?qD zlxwvvHT-DBj8GN0#R3G`$YI7x?u}`_($ztW10Jr~&UNOIOQ+ma&SoR-S=f zFZ-=L!5_b{YxLC3 z8)cb-zD2zRnW|kP-3ZX3R!*TvhUDULdD*7+r|bui!c<_o{U}L>EmTU6;w~Y#bJT|$ zVlKd5RGq+MZY@f?qUdtywGpb`{0~b~!@cWEl+&`*C)4XnL5Nl7$A$C;wFF=`@8mOD z*h^RZ_7RNF9`E0a$v;HNCaI4~>pIujvcYu&NH{uJ)0;y3NjFbK5n} z*13)sUW}8K??Nbon@J4T1Cfc#ZJ!HLx;003K^X=het)q+*sITNV^qiaNU?$Y(x(1n z|F*?L{fS&*=?%5PLGor&4ZC$)?E9b2CMh-OKT}Et{d+xs^Yk$Ed*^8pd#Mw9Kl#7j zFFg?TEUYf)M4(@){xzqG2Yqm8A?;A#ggP=wmH@IwNvhD8zP4DHqCH9M_oLToo`TMG zKk9O2R-KWiL7NLW+-}P!)>hXiC znj-q*mSZG4=7YQ!Gk7g$6T1q=b&c#$)zht3<7SQjE#YD|`sjQQI8kSyYM?~Jk>Vi~ zi?w&AyXz_Ji>qVjpt^1$D7+uuH<%2W4C`q-Hexse=V`>%~w2#8L-n)C6{tdTD zuokD+6O;TT-Ii6HhSf{{Z1gpq z%Hj)g=99s)IFJC13?5^|lCz`hPaY8+bO&t8VdDXA4cC1(r@QMA5ksKmK_qFSy%O9{ zEm(1&#T5+B`&>otV5SMJjwXHA7_ta~OO=u4Y*7e_$&P}&^De1}VZ8Y4ODKM1NetaW zPcShi#76uVBO=i!S~kK!oW-72y*^@%qi#@ji{4^`+kx&;%#YV&Cd^g8_1AMfAIqv| z3Ibx7MDY0|XWr*G_|_u70 zJOo3o2k-V%@Q;;xdFs_oaa2@(@G>LL8;;7Yc8$%w%**m|z1Hb^^E!93%H9D&=Y^Xz z^4=A1=$j=Y4nc6W=sGD+q@{)sEjRiifg$9Z2|-sX4q4|8&$W96+?lG7#me*5PO)Jf z+-%ArB(li*QsE^rUrWCkt7i(^_Xv>daf4kk@I-bDnY>#$@jhe_HTKh*)NEIgoYZ{ zO_dtWwy3VAIWv53fx8V(fMNU431p{yA~sL(OyX*9c|CP&1fNkkRj$>EduH~<<_Jaa zGsfTxYX7ie-RA{DpbyJo+ZjMO>t=-h&55y&uel`celJVUU*(6rv#HBEz)b{(j8gs& z%Z0}0w7x2ZF)=}j??{G;=GnBL;B(oHmuTLOrMfnPO_L{cjaa6Eb7l`rff(0%o0(WH zb-l1a#16OMwu5V8uYcf?N{!qC=prY;0GBX>R8yOA23$x7)9{ zryMk;kK!BIIdKZU5v+d{x_z*5s@*Cko#*J^87W0l=={U-G0G4b!#4!?h~&vtw*W1f zNmOR9B5-FL{Cc?)y-L|p0YxOHe)vWwU8f~ZsXT_Zf9PGbQQ<=(RB>4A7rTLmD8S<1 z;dEzPQ!=a}E7Ri}kQbd?^x9Ho)#6y526!$wWDmDz=zxH5g$ zpnuuStBfWHxvxJ;9JW8OTYQMWq^QaCu347=Lb8*!@*iFrqc_zs)vkHRg zW?;%F?kvSxrJ}U{R=(sKT|V{57=a;Q;hD?`R2iuR8<3cRoxHj{*Ols4b<>M#1eyf; zT`ll)M!VoK!WjNo$ScETCOtbQ|6x&KN~8G$Gd*m|{vEhU|LKbH_knk}8-WJbAMfNA z))H$5tpeCuCoQJv(ua*esUs}qrHqx&$Z@eTK-e&b^I>8wXW>k4%}LKp zsioPI)>@vzme^)0OKOqVyNJx{!$N@BQc*Snz9lmj?daI5QMCJJXlqHtnTN>Srq2&uz4vIjkx%nLJM8_D$BJZX1k1wGpRF0@_fA*{)jR5_IW zWcuKUx9W}d4*{~76W*8v{~3KjbAUndZoKY*;fIt25j;^&K3g`P1J^FcV%-8&oxEJT z5-AdXCG!4_*E#YV5T$JPxJj;tEao#MHEL9uqUnd)0Zuy^k|^W4{xOi(RV5XHVYPKV zf2H?mv~SThNN}Y(%i*wF^+k9-?o%LBy>DQh)%M9o#JL`*!YJ{90Kxg^c^-I-a4zl%p(dR9A+w+#U_+@~c zoTE#4GHLL(YRXU_t0X33qJf%xH|@#bU1#=s5w#Jis@GV`Ys06v&$Sr3@EonM+QsyQZFY^3Ro%;bbtz|ZcbKNjWG>8DP} zZm-|r;yttA%tSl+T_&bxm;Y721WjwTe?rX;}`N zSdrC=7h-eEa#wwKr(}|V{ zvD-SaD0JLred`_-NN?4vkhyL{@I8~Jl@Q)UOIeqvS}WGxbsObkb!q)P45d3yWTN)B zh?B%$hz7RR=v*u0T{)T$oOFw{)?8awg38_30MbhYlCd2P?_UCR2NvMmu!){cX$P(b zTIr4h62c7@k+%ys+iM1heH!XF82ujGXfDgP#KJ<;EfyB!cm4aqD%X%H))cnqI_oR` z*_>SxhiA955P`c48nL&d+A%UF*8KSHG=8bh?GB$tj6;g?*eyKPY)<(~vUUHo&^JZL z9)%&Vr#dCqSO-g4RJUP}H6FIChRo*>Lbmwvb3+NEnWP0%)x6jP(Lf+~#v1 zFyC{^6YH$|P$!6eO1h%n&R;xTv;Z+3qD%5F;FV#*U6Oi6o4s<%Vz%n&rk4MapS@3x z9{?&Yyn|LVuS2(oYVV`iOs4I4dz;L`)rRbs4l*;tiq|jm0pb_UhJvJu%z|5oA^zoQu!UDVZR}uXf3yHeW&&fYa-L$GK<-IyQyyR)WD99Ww)>U zjNhB0J%Cr}qhADA?!EqhzBP=oQB2~xd7J;3*i&N2@_ds)+8yv^H3lx9SRd=uBq6DD zq=JD!S#?GSn4UwQ#9&(0>KGvae*t8Fl&v7-)A@mb-0gkfJ~F-^vd(Ban$k1h%4hCf zMXkArz}pI6YYBSZjB#itkFbQ6um_t7J#(<>&|AH8tbKW zq#G~YN4S=+JcKAP$9mN#mOSr`SRsj7T!#_TS$xvb3SrF%q_d>hUTTf8XaFj z_2c1~5ft=+uf|07UdfxgkL{xSv7-(SH1%i2gqc;%(~tA>*cc85Hq*}M$C7~0h{P=E*%RInzzY>a-dQv@+RFCz>sZ#L z8xp@(=Xy^NM%T|VoZr2j-h|UFNvbMm9@z?~gN^u(qwAc}y352Zhof8A(s8|5l-7ra zW$Ae``+H!e?w1eH8FzA1oVfT!;lOKb4V2~LuLm`Pe^{G{yZBc#4I0+kHlDdfU~W}L 
zyB1T*jBF6r#sta+%yZYZ=LP;B@1yWx!;;BUICGojJr&x`SLe#~7V_ls%|a-&w1l(E zXp!iM2OLc^<3ERn05JrMu%C+^%+lSzs}@DZ0P?O~YFP(C~Cn*^%?WC6yiL>obL45x?lE(=c?t=a=j9$LCr;ZXy1kgshj|hp6whJfeN~N)kIU z$5<*W?WY~B?gxBaou))(SWR+a6W2hPTvUyCdB>=Ktf-b!gCdn`!Fn!I$7`;kU`D`W zZS}1L8)W-sKJI202RXG*3^P5cis5?@k=j)ho9AjzPZ-bcRiv}=hnNF0WYp#WPVLEe zAS3O`Is`?JOXj2FF00ntt1xzIDdN5=lEH7v_Yl`8J+xq<7~%N@8J!U>MEz)e$M)x> zZRy2vuZ*wsGBjt~u{*+u@F7F-PZR7<5KE*;uHDS0enPj<>UY|#OxIXd!NHAM``Q%7 zbmu4&gw56D(0yksj|Y7SefE5zZy2QD)5tHnIO5#vK8rh8FFsY2c;gEJOIv}xluHcC zFwt`)L@GG?wrkWAf13E1L?QZHyP7%RYhuU7OW#RlO`k;8uZhvWFWPN)9VM>mw8bfwfBX{ z*mTRdPO3C+Z-vAyPVC(0-KL2#xU_uC_VWF`Dg4CwD0=uT%5(DeZa7O%9yU=WKIA6u z68Ryn0;Vb``0d&Kr;;RngR}{mdRt(+#O#7fDL*f-0jEuKSflqB6;?-9jDx3!YF{Zf zBl*TvS(JBH6wCCb@|OIO^i|eO{K^RbLG6N`z~K1LhrPzZm1@6pKdfgjhAboIL5_=# zwgBvinO5ZCKP)bb3Es})9{-oN?Bh5&u*HyfmYL2$&Nn_MW6X&g%@$7t?ICZCpG6BO zieHfrSTKq`JczMSXymZ34tuxsU#7Qk(UtcDR}_7{z^gsp9Q{Yb|FD+J!7)Y|d||{4 zzLupJZo~(u;X%xM^hcO&?92J5xSWe}#{$>Yt8?f{U zkrZe(xHJAI^~J)l%5xTC;l!vYg6;bR7F^1#ldL>*8$|LV@`X-T`U_zEH>1PH^Mbe8 zwqJR{PHjXw^;OXo?QNI}OB3!Z8t_ItY30#J{A`beP?}|rCTB0ReWrW#KKNJl#6K*P zQnX-z#)|*|Kh*lA04Hye(vm!BwN6HS_BB?_#r8Tq6XW24`|9tM?ebkRgmOCN92;$8 zmJDeemhgTL+ej;Rg2d`M%F+MTJWUUX7ooWj}9f3xO$tj9=gfB^dqop zepFnY4$RS;PmCz?p}=|FZQzE-z_rgUF28x!xqCf54##D1NbDY{`!mT7N$)(|#M0G8 zGGz?yIe~D+2Hv^uqYT}S|ID;}-6}15`ak8t#~qBn4C11=CDDey)xwxug8AFeb@A`n zt(g{WHiA}avCZC-QyD7l;%~3RDanWs`qZm^0J=2%;ALAgv_@XtLP^Wt4I1(KbbAhk z=?xqMq=@$7Ky2%vON^N0c204j$j1YcrSz=(=7QVhzxyqBc$*6SY%^#)T9=Ag0P&+8 z`7~VW5^KW~sh>AD>>PjD;t?mP3XTeJ?x~3v|E}1U>#fjAd?1ubx~bLxT;<+;T}l6krCd$49LW6l zRFs%3tOP%>4Ud+hbHg3$R0zV@5F@X)nZv^n`gOS?w>dECr%#M~+w_}b+C@rbQC zR4ju3cN~e6;2!oxNo~^FGI%0gN-3r7KM>`_?I*Mrp37wFT7x?IvHnD9%7YIvrCTSe znYb_wYYsU?_WKXUKOrV)bz2Y%;Rfov9oj|`$}@ANCWsAdare&H-fZ5~ii~)pzBM&Q z-9(DZ$_U9`Gb5d+Na=yqG}F_PRn{Cv6V%$j8s(D;@2fDT;qjr)u9xHytyEX@;G7421qioYD)@bSBb#@f8f@pQ)H-WZ{fv92@?Tp*HXB zezxim728C$a!BVd+oyYTl`n#F^8s|(TcQxObMiOMwXB~GClWL7zPsK;Ag(akx0bGj z*R_BKwx)=)6l8n9i0dKvdFr=0a8#=WTYBGTXja13niay6DD`X zg_JZ$eaL;39fnkwVm%*HS7VVGlyJDKHb1Sk{EBx{(qzkGn=_nd!3E-ljEc9@&B=|7@%C+?kB4T!?p z&g5s22krHZYQ?f}*vh?7;xD&(uDj<%F(YxawP8FNvwHPP?AdvF88J_X;O16`+?uL%&mTfQiRf96ij4{FN)__|d2^l2YCt|9pvk-l zV7aR*YgZjZHZW0q)%~&t6xlN8dCh}jCO@0qGQ|i9X>)rd%++p;)_}8||FF(3X>>m; zIwEf`m=<(w-{3hE%eBya4Zw_zfgWSSO3rU(j9EzzGIFB77`Gig)-) zPz*DFcK7cTOM*A#tPxVkxTtE-x^EJmqALgC;h$7KkWzk*O?4df0p#}SCd=67^OnZs z9TW;yrAvK&*+}<{)?01`M|F8HhpvUJkPMbM-Y6bfhWupNG|7RrfRZ%t0TqQ`=jwD6ZaKQ`)+hoTK z(FAz8y1M=$>Z+;}39>Zr?XyP(VP>gF=mtiws&_-S9%a9%4R&?I7U4i~Z|c;}p)hPt9y0Pmn)yHgVb?4e({h}tGR*vJlka3S=oJ-(>M zKWsTCVT&QFnhLBhpiqmM$XtUF{+s;l8Qz=4N|Mr#A#x&p*ia>Vxa~LuaD#_hYS7P0OELD>3Q#eL;&) zBq|;^V&d}-zaj$O5Q<-D=WMU6$6O>sGW>X!ni6&Y2vVv zZ8tjEVUUyU+Fr5*7B=jKq}!gpnLU0kFOTV!3$^^D}=9u78 z3bgxACD*laIK}8=@uT{CdbEuBNGr7Q4X1ogD-_Ib5bC8l%9Y&DD4sN>CwC=moVIF7 zAN?xDd~2=i4Zo`+WV?N%hL#=kkFi0n;-78Ozw(U`-cimXldaG`B5S z1;Gf;0AGA4a_%l1PlnD+5KFBT0v}0*aAK5b@@r@T5L-&QOaWd|X;xJ))uF+6wTr`* zA|d4HUzN;m3+5mLjJI%AY>vRptj&KTFe+2$*g}uSBkhx_I*NgBV!cc|WaD%2w1i{w zg<{k11Yqo(naDhQmJC&x9t$igN^elm(nR|t>W-kR7@k$hmz$v3dH*9t;TPB?H{-)6 z0^Cnd&ZunWieT03TuSy_7b>E=zdrwEY18VAH|G;d-%!J{(CwI!R{#b^x+(NV4665D z$WY*p|HHcI(cdjgjPNhK=}y-W;>7H*S8vpX1(6S;+zukko-_|4fs@p}TB9bWg9~~h zI|^wGo~00mRNLmm!A%uq{tb6Cp$nEtzO&Q6%ykocKHNJx@I$r+FFx5yWl@1B%`dn? 
zMRe=g_cZ>}B4aPt)+g`hiBv)UDGZ%&hlOS zVwLayIN)e`OY?Qt<=dY4Y3L@;8`+G3Kc52jqeI(oPzK5QUa5&%>g9#BYMOBRSDo%} zsYeOaV?czC4#60Iw<&7`5R$P%t)O0{Wbf+D_%lN>Lclj#YE!hHTAtxPaO`W~jg~)e zmW~AdP2D!vD_cB}STUv-4D5=kDjl%c_}CVtpr1Y5wK)n#JchzL`RqXnR(icOxX-kN zN2BBVfg>LXo?VOq((h(mJ^Y4@>4#X}(on;U$h~y!xh;pch#c8{f?9hx#!an{Mug~X zAwP~N$j}ta^$h>8x`e-T4Ut`V{rYxb+|Y4x;NNSdst?8Mg@2vCHZIx|r)Zh_-PY)p zreVsq`~R!)~!bRO~4iQBV+P)EG#i0@nrS`Aru?I(%;)9 zXYZ2GFEx#UUo7}nNnXYWj(^7`-*HwwgfjIQsg2&dt_hu2;aVq1^)OD`|LvFDfeisl z$pK$y>eKhS6Rgf!#DMYd#l-b*!I_$_GxbJ^_Y+7EMv7PdAGF6O1pJ)xdsDpMHSEj_ zvG+(5c><4OZ5mth>ZOd4FGUR{+2@V{er&n5$=gPM5X^sW*7%oL$V9l5VjX#0DIY(z zloS&z+q~#c)NWs$;4&;pw=TFY>hzRFhz=HFZIb`u!enNHiMGcc?UZv(_9dX5)@181FXcFTXS+J2?su ze0fTW?InJcxEtK}6lM)35RW_+4<)5s+uOawC~OJ%xW&?c;$l+Eg86NZo2LWX^MsvI zKzz(fhyGzT%8s<8RXS{?U0d3%c%0uiYfVZE=C9FBpR=qk27RPU{E5AASz$A{0%_p` z$TyME7sq_DTKr4=Bb>xVR_tVK#de@1-O~z@xz1;G2sCFyCwirNRMEXIUUYvcfsIl? zG_lYE6$hw!4JA&*hUidLPL0)f-F;3HUoBu3HwF=UiPow1p2u5zue1R3 zs>PBiR)zSp^V(EC7U3c+k*QaW{OH#PZ;lrZfVVBsTmK$Dtt)92iX8j~TZooH2I}5~%mXDnrQ8m`8Gt7;&*DZ)aVmU)j*TS`ua0;Cl9Hum;z` zHAaL6tUFl19!g1V7{7|={t87k=?64)GIn$InSCZ#fK*&OX(U zM5Bdslm%~Rl2)daB)Er}V2Ggc@-?KMy{4F>63qBDC8Zj+>0=?HEtbhfjJd3T4xgM4 z$4KU2ehhJD&Kbr)S+2;7#G_FlU85Utx`a>-;TIDU6T$9VkFe4IQSAEI$~?#*p?hb3 zM-qT;-Y?c=qZnnKz%XQB)#>10+>ZyFBsgi3qjk|2@dY0%^^%TWL5p2ThVhDKc{GQo_$1^P~bGn_hy3~eRjdoDHKoXI}1AnnLmM7vikrU{U zskPkTQpWr7`y$^E$noKlqTuI((Qy*23e|*@6za19wevA}qnfj*b-Oi7T>^CC@EqBN zzac|W_qV3MuY|VbRzjhg9f$)2=zmL)LW@pEV zAIzQT%e*J0eF|ghk>{93+g)%i_eO#TF#Bo6?v+;X@nCPaT$!~CFPTF>15{nL8UrB` zC%maVZruh|GXc$-WW43OJiqGdquHv?pA}@wSW8#Gz>I=rN4=;Usn(GUjfD(*64JMk z#E>7wj@vn#U(fEoU8Z{S*O^l$nTvg9I=Ug!`rxE=^%JFeshI8?Iw0!@HM;yvg?%&i z1=Xh`&k7%R2OOOH=5c>8;&CiJkl<#f*@gg#27X#4i)xg6#j1rFf_ljpxTRW|bCksb z8p^Xn6Tjz!3bjURFr6yZmev!4l#JpL?)G0LTeJff4$k{ASYtkzpeJ#Xb*)7)ku}m* z53*a$G-JwVD?j4~7)Rby^iocxVBofkb+%}RB{g5I60_plT9h(B4d@e90@u{WsbCq5 zDwS-DG5!sTj@TU6+2IE31R1NlUG}h?em}98{CBz#{XFOPOPG-))H7jCw%is*Nzn z!FY{tkT+ib_W`!5u!A+(AFmczdWY&y<^xB0rc7g1z4`KnnLNO=!($JI7&k)RfGadm z@=f>9m!>3WDq|}BGR9gvnOW-_O{W?lTA_F1;2##@*^{oOvHv^eaia)?eib9aZ73+A zXG46vnCpHX;QI%-DYtHQiTPFV-X+KXPLHj6p|CQHgUsj(FGk;Kan#bo^3iOo>>ZQg+>+JVwP(Ur0-Ql{FgC7 zDk`?FQ!fLJN)sO=5P0?p=;uL4G~31(U1*P4HD@Z}I%WI+R-;$t2bvK5oHtgaVC)y< zl7Y0L`p{hW=#0RLKx{u9%0R4Z1luP&*SGRG0i0ACUN3gZu3cS z5>1~={Vd~tYZPUNV$+;Keogv9@hc@htrn!6LYEy-?<&=o!#b*BKex}zL1~Th2$;Z$ zV}N^RX0@GQ%I@aCkgLA9BZXY4m8&?{2?-)LVg{zbCuibD%}d{`S@S=zn?lbgPiwAOm#V9IhZjlyV;Hw<;tAXD$HW#DuEl) zR03p^OpMI*GRm%#I2e1tn@aiN&E}ec-NOOfSGYkDz!HpncNQ(}G{6uOHHuL$L(oD2 zb}el$oVdNVOUW-W1*szDsR6tIzvMl$QYuvfG4M(~TjF;5r^h0HNLnYN?{{xcRep*! 
z;qgt+Q@P81BQ3dhb7~*j@!`}v_x}KlMPXJnt7{MIsW__C#36Hv)j&bo-}_p4 z^;d#aUpdH;ddMA3V##7ixl%A)#Qf{H-`f;|Vt4K(h2oADEkF#FL@d)!C;)mljhQm7 zWK_YMk$Q6{Y_*J&`8+lrB_f03zeYh|=1`Zs1~CyiIBNq7W$oqO713sL=%AJ;b|0n@ z?`oYLW!>9HRI#k&z`!JycdJLHnw^uEVavN$Z_6r2h2XkQi2N8d{@ zhoH)Xpi9ObxTaLt!j-z>L{Ihw{Mm{R{Nq-{eMUsPv3u{%fwCK>%+3l7VDcm zslvshkgq7XrK#9F%xC#D&2GmSzNQ)HfYV!iM3WtQ11PjLOcwG3I@Y?EOm5nKnQYDb zGnGAZliX_W%ilyM@CcbgSj@l;rrdmEZb20LW%Xtz&HYqHir0 z8EX9x%lGz%C&l2gf|qF%Y*W;9YI108#^n3Ukl%8W|2ejh7OUAq^)o-~d5`UP)m124<1z4QXd8>#VDGO@>U zw`!6g`E!1d-pwpv=Wu%b^!bbBgZ(X$CECQ-G%K%o#)_U@l4hFCHeb@j@RP%@3mw*5 zVB@^L~-j4-Z#_rA9- z?Mt6w>ZEUKc1K757D|8dBSSUy!Td6pa|yQpy(R4asjo#%bW5(1UaaPCxtGcd%**fZ(U z;_n!NbvxmpMZqmaXy=~~7?y&HjZST_UVWmqG@|M(lh~W-Xws}@({Mh6aVRR#RBW@4VjK;+@ixaM>#HRJkUlD&0l;pV?81I!>fBduI!%iuw@=>97i-5SYr@pp15GA7(0_t!RI+3mJP_$=>E!)T8iVZ5plr zrShbol3=mraz_R@KFiKcl*tRUFrt_&DVSGNZim}DkQz^J$Pz}y<9PS<>Vr) zHP|BWVYvvz2UrxlP$DD!OE78k6rY9S*bkMye&-gZ`9AAGmKalU=FE{iPYz#@wfw-x z`e}v0>*$8S)xg2#QV6E2C47Xtr@W}|pKzWx;P?V5uf568=z67;t15{*9%q|U;aDo$ zzPczZ_CmS!xVJ!Sphb&cfK~9@enS%A$dt<0p9SZpo;N$18yu2yw(GPnEQYXafo2ki zHkq@;?-AgNuVtN+G?So-bbdW8cje{<^Ezj(lfQsvRITR7a~%G32yW#e(XGs{xGq=u z)nzh1LZ0CP7wEs~yqNF%v#K*0aQ*=P;%WNkPMjKyA(8L8G9LT4HaCVBpV&d8Nt+2- zK;eOVPkY{ZXsAS;4;)26M_F{Vm0n}W=%!-Pjkhx6X55?uO-)ItvP$K^Kk^?axnU9(NUsgMd#mV*PxRcxm#CO2sz0m#<@h@J3ZG>-=d^>@|Eif2#Al zXX>p_#z31gVBzhmC(V2zg-0YyNcO5ip=`18ASYk{%JJLV$KdCuCjb)12j2<{Q_@Tmpj7lyf->X(5rVz$g`+EZcCA{MiTkE)J zy)xJ7t&q^h<%;K+nbW0yuUgMo;ymy(MzZwdx9&c-_5gW@DY@NQ%Yu&wgzYeMpb-*7 z+ap|XYC9`#xBM=GkFgjN6oa95$kYTk==gAq0&@i$1OE>=tXeb}2w!Z5#p53@&(aR# zxIS7yGbafU~_pSd!U$sfb9mgc;STv`sYY!)WwR6 z?Q_3{$nkTYr+*s%lPu29difgMk)l&ScFEZ=*jZK{m*3YJs1sk-oP*PaHoaGD^lmVd z^$SHnFVrXUhKDJz|3-l-*r2v>?nJk?#LL5O?dToP^Ztt}A7RsWW4rw~ zWlNeJic%=ic-vB8#ht18&3k?+AM^KeShDt_FFq7L z=6a((9GVjObC)_{`mi~SX-3V?JAI}gX4q?F&9$lf^E_IJADz&pLb@kk13Nh1ic*#N zeWG7#=TmUD_UEC&-Hu-up%pWdC)%B+9O3wz`+8amDSkj`X(1>j=Z-#jACo3ptKOTp zv0>f?eS}x2<(a{rV$6-wYub4^u;pmFKT{X}qbiilQypF0e@^%#vyqAlZMKxr1u!?o zmQKl)^9ZM9(&tH}%2t}D4W(!lj8d?G!0nRxxiLhi@(AZF>8Q=N*H3#Frmk%t=BAdK zyZ&i`yRp~QY)fEvN^ZzV$YDU0K5@VFq6mv|*T|3JadNF-G9DT(yuNr=P~Trv^uAdQ ztZS8cW^BbU!qJJPSiy2ycG3II%PSy_EZ8FbaGzdQmqp6p{0FvTFzYxO+>>bg3aU1R z;J&Z`b(kBO`s^}%WAjvpIK#1ZH7*eD10|1Ks8B6;pDy(oTwSznX&+a)AS^VX*Z1){ z>qC9Tq=>Jp?5o%pVae#?-e>_gy%06(iB)C_akO@WM6X9rf_4>CM*WyVoET%GVTE z{5-xDIXpSB_OTipdb!vk9PX>9UjFXQEE(4pq|)bYzD6oPhyF%hNoxMjSXqBI!Zsq4 zm-o>l=U4_Y^CTl+k1GWGl58i)i;+yRE^Imr@k4}~bu{<=CzX|NFOHt#aKfq6sw-z- zKQa8gzWoE-f{@gux=FOQ0y*1KuFgv%r_lEMSK&-ajU=FfF;QZ%3M#3;Ab}^5f%9st zVtrVT*waB!3Py6Mbk1>GTXxA`+(Z2?2$C`tjf|=%Ocoy7l}+ z3uVhZ>szE4S|?trG|~!`BtLhNo~M{x@#wUcFU7aBfRVP?vS6=!PK^0B&CxNK1NRl4 z9wkDkVZ^saO4kWfw3ML1y7P{nr#MG58}+E7UuTcIKp)?C2-VBzv=bA~J8hWIpY+y4 z0k^L3*Yt-#pyP#tuT+zCAqn`SoeAZf#YF=Cg6`QJ8MJ+vD^8v2WZGp72T8{=ewow$XoIuL1-0(%(Kd=^K8dB(9M`eO!(heKp2w6=22`ysarFUTsn6Sn>D8&ZjT2Q8d~DGW!{4TN?@cJE;%H&gTl3mN_*M1Ko5K zJBxh%f#WK>h8Ky|yPQnbHhC>8wF068g)(02?pFvoFKy;(2k}1@^5^vp78d2OYcT}l z3o2MrdDu1uH-)cHrj`0g+AZeh?Eh_Qm<-qURShv14@Ac{2+#3I3bukD%C(fT&}=Tz z2@kk$`L(m{9hTZhKLmXuLzV3ei|Z0C?~xuO$=UL2itd1Q90C_-IpXzlR@CY8*$v~q zy03m+{#Eafpnzcgv^*kVim6(I8e1Q#fwj<|$b=F215fW7Y&LZelUGCZZZm%sv#-ead#a)D2nA9}-!qmzu!Au`872e7cBQphq~C~pT>e`niDu(hrzF>Hi9FZ4_cGJicswU8l3rG80NSGum}kDkBQI1b zzgc`AH(K5vrQ8}wb>F)Sm|(s<{PXUq&eWPyLn>8948a1M#5Eb~qvp1BWWzgo>6Ptt zsR9;tUiVD>()`x#V(Zt?8Gr3`D-?{d%mYtY*$FsWX@*llfVPUILVko?=S#0V5~W2r zR0A56!Ur)xM)wxqH!~#V;HV>Z8dBJ+;vBZBO&8TknXeIMYe<>~nS)y&=CV=j(ipub zUm_gr=>FI#JOZ;#ZJ0T*04-|mXlpTj+lmV(X(NYhcg)8S1BUKr92XO#lh;X(L3;=g 
zAx(kI#lg0?sTP{x)#camjvx@WX!qw3RvB?O6?FxT%b*vo!M0PS0pZi>!1)A9fT%jRwgBToEzauwzTK++b3ElAgsg? zZm&}GnW0clcPoWi+lH7&t7B4oLrHazgC>(Wi6hMdpO46#7vbxeEa_Lvu}bw~5BEtY z-w(7@aUj&J-RTe8?0?>=sGW^8$Atz6xc(gH9?Y-+uXZV0SPP!IKJgmO6Aw~e?Fbc) z(s$`@S3ZfeSY`a#kWKZ~myY7<(W-m_%&1fAwg$RT#EPRi z91~0H26CF5oxr0WRKqJ?(Z<%-C&Es`MX=Rd} za9PQ{8FPL~TA^eIfys#ueMJ^W;}E>nkD~QlYGlf4YjodET9HKv=Fk{hYj#RDt0lwg z;!gR1MQ8g_01Fc0@o@e*>U|LFMLJ&!v zK#FC2(2OtPpoi(^m2_U;RFhu**&`N&46b;+5Bc|V1-S!V2(*mS=wdRpF|`smZkCeM z?K67LKXBC~$QUf)aO&l|bWkPUD{qv%DG3&EJ|#FysYN5$r_Lgj+QmnM2*=9D7+eYL-!{9LA0X+AR{4xn_E?B_Uh>#p9xkTKo4 z;A8-6V^ee7a8n!g80+EfZ&^k-00hlO`qgt<$2uXhC|9rKC=O8}{A-W2EJYC-zh92n zL&iz;Z(hoig|&SW@+d%$@^3Q3Jiz(E(d2-ikB4wMQ`JBj$f2j{#Z{1;ogU^b_)u}x zz`@FV96I!RX~sna?HY=+0_>MYA${7pp+#wzDi@z|s+`cmVK?`eX7uMFZmPR6kRGBS zu0o|aCV`p{E50}DUrDU^q{rWZBlhg0OJUjZOS+W>i{APrCs1Da4*as;j5An~jw^+m zPI6rb0zoNXv4NM3rq$Q!h(}$H%umOXKbD9q*&>S7C-h=tzZhc0y!){Mv(*Kc zIAq<`$fd-0y2wu^#4|0)P*U}%=A)c=sMmRl`DdtWj^9{wGZ8cRvF8)6Qls;!9qt!^ z4JjG+ppz}-U-eJD;&{ss*w_();63%%;~AV(ccZZvL)BdGU@3U&+HGE@U{6w>C)F7y zPhzuo3W}dx8bbMg?ZNjR=()x`FsoEbSq|D&0anan+K!TGlT`o&@@3Y0bs#bFa4^tWU|^(O1Ij!RV(le{fHl1@k@!a1n@f1`M;YRy z)TO9gU`&`_2-@tx`c^`g#}gmKLbF&v*Ezy-d*a=6HalUHMcnY`N~F!#V{&TKl&*>V z{+REotX{~CLY4|zfz#aE3`Ozt3tvOj8v@SKAbO5wK;MDb{1cC%2T)u_bPgJgc) z=}5=*@O#uxRcw#tQ!}$v>LFQ}v9ihvqJ?Xq+l)en0F0@biNq zugBaE*J##|7fYqz>3jvSAb z#)?tVscIPrk4-ycHlNmYgsB*$8OmjEF7Ubdrw<9-60f!SRB-P|dyGlyb#rHnUM2#U z%u_8)8`XWa47aX`qB=@tS_sC^RgR!QO@d303V2Sop>4(~Pr4a(l9G|o4$R&&CnQ5s zr+}zZZh9B?p?xH1_r(%7W0laxBEixOiYg6yh9qnur~;35zSy0A^xU@teJX1ZI9Ka>-=8q`UM1g&FNc_)H{DIk`PwrA5fc{G6Fy zT>l}# zw3&Tx;D9OJj6oxyKz&^5*PY0p;XSoJSQ#1{d3yd(L-9lV_x(G~VPwP_>sTGZJ`Jz! zxMiR>tXib2GW%EWGok>O|4-nM*FV8W8V<+fx~#IRNW zXusV0eL87GqA7R@Z;*F4M#G2SzMN_I?(2>D9dI0Cdvpoq4F^yb?_9jXK|W3c_h4;% z1Ofo1n|fM38ZqAfz-}9l=oCx4JUS)_zb!|$VfmmHsaZRABe{KMB6711Q*^4tB{2wP3vb35&B`m z7<9+&4nePqECQgCM%wqgh+k~*8K40n0kS@oaFxSc%|Xp}frW=9YAYtI*!oR{U1=Md zc0Qto$=uVsVxTk?x4Jl8kk%#FD_mrk=2a6hAJq7hwDIHx#1kQJh}J0ewJgnTp(Gex zUDV9$1w`t%ug&4+a;u!#v)DSzlCb(@U5V*)d;od%zT&Ig2uS9gNV=-(UwW=&VfpS-$1qCy6~VAlq_=;0OHWSR8xE&6yQ;w2Da z^b1v^n#^RdRxKMx9*&9tfv$fFR5tNn(4;fm!P1A|&HfhJhU5G_pmF27vdCG^fA9xl zq!gQ?uAam^IYIwkK zjGU&QzFFWSQhb%$PR43{=0`8>vyQZrAJMX-A*AqoS@;t^9^lXtKRED$DT;i#(Ynwt z3eenmFIZOa@{ezT!nF5gEcKmuB~+IOHB~d<`_0UyBBR!FCbubAL2}%|b z5YeWia#s$QdRdBoD>IF06~3_D`6*QjwPH+&-BziemTWPl1(#FF^<*a^q_o~=R zY{Td#=zSf{sa#-(a2N0H8g&$V3-#}b_2m06v}BgxH@r84Tr0%NAAb4~$Ka&tE2rB# zjNSNP%2R^qUN4B;E?u}k(H$FMNS@f3%#Uu^FZsuEjJLEmA<2FDjfljBr=H%=vEaLj z49_*HrT^bv2H^ElCGy+Jwxu`&LWI2ubPjIwqi%OGK!QCE8x;Kfpv@82Mr!g8M#Fyde_Ia;>maVTTi>)KSt;0M zX0WCq(5fx^og;VYpXO&tt9qPYGqZ@mO;Enm>5SdHV`brp{J5W+(d936K@L+59t~$% zc6XPU-t?~r5kfeOc=6n|0Vp0gU>Ds0bl8Dd8 zQMbj|_Bq7(wI>mF`nT!D;|}Mw-X;etnxQ|oEW#jJL3N4Icd^@QT;@3qWMc$sP4Mq} zgk)TGf{Nae)1h$nw#y0|Z&nAkJ+}ZaM)&$euj&q@n41J760PU#8Hy3zej?Oe21o&^ z#GN)%rXK$7d83w&i_?+#Flw9N9^Am;OjWV*N*Q0LNqfeFUjZ5Ih1@7@pdt& zsV{X+PU}yB1N(8CoiT-S_BrjTax6;oUt0=RAjCaA;a`ivwvpnYygO<2PjBT2ll7`^KheOmj`cbZlAS#O2J53C9Vylb? 
zp|SOBmBr~?cQDX6MadsA^F$RzjlAW&rNCfGPoPaRLgtADa5AO~w*-1x{yEH4*aGu8 zN6~jjFh<4Tr9nYgaDFkD)7VQ9N=*2hGtS?4JRC)B2i^Him*1D;QBdFylMcUX-Z1m@ zA^>p6rygmh{qfJqENkiCD6w9AI(KT3z)?7Hj%}+|pRlj$KSJykf>Sb|cpx92Fem}jlVl>Inv4oswZ)9j_au+h_ViE#R* z^`wCjb5}2eQm~QQZKI(PZbXK&^NYCih2ja*5m#@$9M@~a0>bVi9driwCrL}KK-WYW zS0h(_QmUxFuD_tyn?YQ5L5{E#(?Tus2996G>F#k8bWevx8P&U`oK?)l37p>%yhL)Q z_YLaJo>yKGODpiHZBlGIWPv-5HYaXNlAAso-^$oj{^s60Q_Gh&DdrlgEoYIsubC+g zE}3?|FTNM`piR_^{O+<}H+a1#_hh$S-C1kR5$H&XxlK+j))VgvH3#2g55S7mXHmf( zG(lZ0kBwWTe)<`7X_yPw_P(YXzexow662*s1;5@tBjVW@V_Zou0C}0E|GWh}@=&KU zvbRLG_?r0r$avU4xA8Qilj`HHNN~RD&+{h~;?j)1OLN@EVQoxT_X_ENauEsIY5H(IO6@n~MOWIT`a$5%U5hnm6JfqQXB zt=>=fIWIr~*}zRl1cZIsK-!;S+Pp{sZs>FUhlF5~in~O=DF}J=qy4Gab2=EvtzZFV zKPLBt1R^n`L%ZO1Krg3j>LH7ptK7uz}_U}XjsM}_;e)o~&}q<4YG3G4aksNAExmR>&-kWCAfe ziKN;U>G>}cen9f8k%KFB# zb5ttb@D_7*gOp1$zJ1C?7H{*iY+I2pb>J|7&mDl$hE+m|aG9*}7vJD%2d5}=3=+L? z)iKCy;IgpSs^rxQWLD^i{cf50Xz!n{Dm%$wZX*PD+s`gQ0Mp)cpNV6bU?FSlDM?5a zDu%-yVOq9e6`UdHS8Hf>&q&C2s92Px;u!R|Z_1ceMPJLsr$AoH(eKYUET0Xsxnh;i zET`^B4StY}9W^)lJvl_#!Fdrl4@D*^UMi_9NZ9UOBN(zd@L$_N3tG=juUIyu9{Q(2TKxyq8wBZhgezK=ZH7g9lzF60m0Fokwa)L z2{l{}#|k;d3#u*#-(X?~<36w$6sdigdXrwib!-^?nixq4RDlH~u{|8bw>QM&8sGC1 z0qxf*?AFthz0ie1a`>`^f+Cs>#Zt2G(4cH(Av%2up95E`x}0wn6;9?+D;5a_Nohgc zVS7k2AM~mIf^0RaXR2ZE^T8CuF?(=G~RTSZr_+NZv` znQGVS751fac0Q~m|T`F~EF z$J-`yqJRJt8@Mrc+XDlgnO)9k-8u#2#z47aoEdT`JjFnSZ`aAI5Md`i-ah4zs|^3H zlH0Z&Kzi`{ca;c*-m1;eL?2D*acZFc9H*c!nt(^byN^mW#^s|que1{!2IPY+zweFw zahPdO&w%;qP(v~mVfxr|7V$7Ti*ejR!!B|@g_Lc5^K8^ZS$zlaqmKm$Bgv=OUjq}5 zsf}@&NiNGNkG}9nv;N}kbY-AcaLoU4H)G{-5yDKKTq>lff_L|*g5##U)GTYn-b=Iz zK_*7`>nq-ihge^Mikq|Obx4`dgjIrQz$=K1|0Kh?iIc#QrI+rXYtMZv zvcM|4?^rSiuh4{wIp(rqj#OIutc=ZtzDUH`g82($mQr*K@nkwMS){@y6pSlwQoevv%Kjv zvMH!7VXN(!^IdTW$uEuxDOkQg;w*m;{51aj(D0AlO)AbB$E(y@>ItIjGLP7NdeBkg zv75XK&Qd%MKvek$ne$X9CsVK-n;{gXp=sjdDp(m8M(LyV*h|&%U<8-Q$NL}`-%Z8p zVA-c}`Fn`{@&6j*l4EVgs;o6n6ql=UH%FhTrB_o^T=XNmh-z99`fgt0Z4{}Aw=mto4)|KBCra=_Z`kJA=MdtI zdzLvX3 zTxL6Wa+?^KxHt>ptt9W^uvF!qT=*q>1$xvRT3r-QeA@65Gl5eJ4kAqv-v%To$BwO! 
z1xC*~@;D~8=s{3+-rZrv5II3NX}C?uv@Nyk>N%~g@yGfp#*@U8x6$7#k$RjGeO-x4 zNn1N9W3KS~#e}4|l@@3sJb8QB_vd~K*fvyuubdU73V;3XT3xKfW{SOfTa-O|Z!I+8 znD_efy$+KKn4}|L$c~Fey5kNPz)IS8-_%!77%m$!HCJvcJ%vFp2TA3dzj69bI#*A= z2E&0^*DuHKfsKX^HcPm0MD=%(o94BRw4I^mA8fiQ`>7gXC+zl|$* zs{T<5tpzB*pxk7|WyZ)#W@IX~dnqkJp@qumf+X4H z1SNY#W1;p<#j%NUHI1>mKu;bS*5?F>@2O$7a=}mCZ+(q2GIX?tq{-Z3>s#-|kf#gz z7;@WW-Zs6MQ7salB6Pz;8|=m)iT2}HGTL-O2hlIUE^KPYH6uCkK*zpD#5x9B=g2j- z*Y83`xl<>Q_o5sk1qck*$roETx8>{c_^b9 zy*WU%3*9vbBH7rLDbAc|j+NQTWsBV%&Czr2Ijz2f*HAmy!02EFMo!^Su4?>iyyLZp z*$-LyWkL!X=k6xKR?3qFP6?HG9s`Z_0h=!iS5++7c+x)|0l$+_Yms^?N!`<5(ABK@ z)YXlDEGBnr4{l8A+A`7g)ka;S(w@)3?%3J~N=PvFvi#7huIy%Pb%=Ns?~}WZyv{OA zvv4R&MI_bOb!qA8w&J9jXq1Nm+aitFb8+5oT2k397(SXO?(eb1U>%O@r(iC}YB$yYuCKe?(oI#Qcb$#< z_cm`6g)l7TWIk=1`QCdT8c!hjcJ)ThXVMOAuJSoRI1 zP1%;3+Wp)p=n8jFMc*4^$~K55Q)1TP^krN?h6{{8 zmL)i2$4FDSr*K(XB3xo*CXs?#9_^XI1l6fZFU zvybrAVpDQ?3m&dlqW)orEu}AewI;~xQx?|!i`FRbyxejVqdn(j`-JK%+@-mkUYJe0 z5n7y1dlVDWtd{coq*~99nsC>YNu!2|O=CZn_znZ5H}*3Gl2wOtJCPgs@UQW(vlVow zQ*!I8!|aX>kmF^yFl98ZfZk(!ybSJU{Y5{f)H$$hLPabd@>Rc{!`##QIgfK zP@$e~2%W+bt@gzYZd-EA;Cqnon-j6lPW-LijH9{{Hcn0B@Svu1c&8r;w&3tpp8Ep9 zZ-Jr!a&CY0jqXm}WWIalg5ARhtRC|Gw@XUrH6c3B9meDg(qy0 zt8bWh#XSTn(MQ?>rLCoT<4K-xE6r(UJS_Jwa>fN=U`zudm!J7F=WW*A{6%3NWjB+C zy<^o9uI4VpMed29IDwS$?NdXaG^)YTr-28-#M&Gf<4vK+MB0%tZ~dfI>d9oqRq=Q6 zUk6zOTVazfDkKGQ4wpuTll~CKgQ4R7u^^~a+OfK?HVj4C@R+&GD`wq-M0SjoegkYP zJ2-p#iGzKa4)r(YdC2SCL^!;G-I%zcQWW}KvEn<4p2gLK5A_*^v$gXhzxnSYO@vTD z_@a1wDnJsKi(>+eR8V@^qmec1@%9;&adOpQ6eC`Su#1HXtjsp!F)qY=4RDfwhsiIo zJ+OpvjoLaflTTM!>ss4bvG(GXgJmg%r2WyJ?Wm};`^+_r&H``4J%*X&)|=`x^fIXA z!tui=yy2EezPR71;d7q|8>9t@`~jjwyleKq=zGmqP`x=ES=PSdoAH8N;I8{@uIV`s z+!iRBX0A&djwu_XytKiHsFRYoC}EVJzR@_z9v~~D%Nthnl!lfn(`#|kD+D`k5Hbj~ zVJR_UZb2z;y{&g$@pVG_O2Om*CakZo`Kqo;O#KfTG93?2aUu6C;fwxfn(;ttsi1I__TeRF zqkFED-}B^uL~kqVjzDjx!5w+nLC}euYM|_XmUc~c9KS|#l-q%t;=C`R&0u$yb@BbE zM&iWK$cX(QiDLOKF>X@aSaHT2Mf~#AlYP0(3GDY^8*Xe8od7#;zn_;dWVXPfD2b{l zPG;tmd`6gP&})=>MwjaI$}bI%r~*?Lv0+TMasBiG*D0C!gVY3k#N))+h@Din$>2p2 z=yY{1hn$nw3u+6~k4TAzQ==stJUj$HAVha)Y{gGtm}E7wvxu$Ha;q@c0RP#Tve?Rd zC+B~PXVRv;dh%Z(xgSoKl-v}*k6cPR@q`y#2P!8XwCF@h9NC_=^5NXf4&fiUo)q6}?wjZJ_I4(7`iHY)t? 
zNEwy|4@M@bvp4Zrx9d0Q6~S8cDc7xT{S`Z_J;W=|OV~O(A`6TNQDO0M%wdtEH8!{2 z1Oe5-HoE(He8rgW+BM?eI*6RQp1@UH-@n0M?1c_Cb?`^61lDWR3$-<$e0pLmew7sL zLh1f4b?aJ4?o{_gom}Xhavvdo;Bvfd>c*RQo%eb&;zF`Fqvq;JS33@la?00lpiE7^ z$mc*ZUt6adRuJtvR?Q!bWxcXAwRDt^3-Q6{-{kcYo;jE}5_z>f=GyM-ku|2Uem z*9tt|X(9f=#+eIH$;Afcit`mLsbK6c=B2Rjt9ShNhlk4pHRT3@MX!=Z$5?atP#P&W zot^uW{}=@p-io})Qs}^q4j%!oyU$fG+AiL=tJ)a{s1SqS$eIC8T?6C9kv|;`?$iEB zdy|@-!Q+5M-#e)XZ^e~+lL{R|AJ+=1Z~P`SjwAe=+I7ForF ziD{9oHOPLf{+8~jAa1VEUB?qEcB|od>uFp9;moaAe`fov{PFO9^Lgy0qx7JeCM;ud z(y4-((J@aWv14JMd>5~OXHdxh_SldkTN;o(FG*{<4A#?LkUH++Q(#vMoJOX)oP%Tx zgb}gw;o2k-9i-XL3hUB%Wf0iUvc=`mjyAyD##p}dJ=uH*kL0uHt2V=W{?YrF$UkG2 zcAB{VvdkPA@fJGels;;&&7{GK&+B5ymp}FC)saTrdaWXp2Ldbl&jjH2e2Fx9-1XIJ z_?1K#4{*|&`*Kq^4rF3}nuph_&3&Gg0bxw|X{$)&F`x7c?M6a&0emZh>!^#&qLrbQ zV~4l!OMCJ8&qe%@T^jhgLNOf*Q!#*FmhVK&j7P3_d@TzL%t!IF#;4%B=~GM@QtlgI zF}_%#{yH5DH`y=x>1ywt+Yq;a?eO;+)LX@QVOeBS4~yONQmawjFuj@7e#*N3ePXxX zD`@e=R4$sW9B_WEp33@~I{%&DD?KM>O92WCJ!MJT&RsBaz@um7;gw(xTjJ8NovQrT zw;`RZ{bHx1Z=$-Tx430UzGxN6c`aB#B%^2-6I`Vp;{#?#goz4Vv ziP!IEO(0X%@!x?8uZ)Xb4+$4=Iv>$4BzIf$XoZ2|E9}B*-w_Qd&I=F4U)AT4vY4Een_H%HE&%l#;uFg_c%9S~ta5 zZd@5%Elw;dpwMupEs?>;TYjGa6UiV3>-1E^Xar~2o09nq^hY;)NOXLz&Rm>j zL1r>Y#>om8Hkbe7LgLLe8)6QV%RTnf1kISq5kcAkPP^S0MeO;9DTVSO3-uo%s@5VO zLE!8GzH6l1$bps5iV|-dW}cl@hZ~YdmTb+ zO)q>;AIlVkn)(T!oAHZcuiiwd+Qg3prl+o5YSZ{PBD+y(UfMC9E%!bs<+ViH;U~h` zcQ=K^QcSUgdKRhZT*HnDw|Q*)^F&@`raBhnVE>lys}vnZ?C_3u@=9Jjikdmi=tO=w zTMQOQRtG%!Y)9ASU!)}H7W0YS`>0Q3xNi8D0@Y4xJ5t3fQPxZM#E4*Ag_LQ)+I2SM z3$d-i{Xc6xTV`)_GS!sAaHy%$Wzf~1?^y_Vdwj%vhj-re75Rf6xb7M~esuuTEAncI zq5PZ0njpJeQv;&oI@}%7izgJE6f_1r)Z%gnOy^+1hrmEZqW!#LL#m{%R(Z|*SHx|U zh1JSO{6Pc=l2#zQlHJod8ePqJTCW^Vg+1eFo#OZTEK`r;2{*voYHw<@9Uy@;j%oBt zLTV?z#o%K|KWpH_PR)U%5@9pKH;djLHhWtg%BC*uTj+*RjlArGVACV_^~3>-!&RHU zWjL=@Dr@8Ama_YD)vXMX=KN7Mo^n+3>d{yFsr9X&C2+c1O-bYi z??3Yi*X7IQZ&`Z`8S{iM>71uj#BSWz>pP`0=Q8?7E-x>po@IY!osuWUA@d>})1)V%qD&{8#B|5}(JyC){uZq4 z7wD?TMKM0&sBSoHVQFJb@O$dXk#5@B_v$9aRNB(gz-!)EimExD+dU@arKzIyk|px~ zp}dT=%qL^oWY!AuCZM;mQK~1#QS`7DW7#y2&g_FB?tfbl`Rk0AZIx$o2~H8=ajofq zD~lB;e4bbAF**qTo;ywZdXFN2=lp^u|2uViUGV+CTEDO7W^m5sIn3&+eIMz47FR7S zLPSXW%lza43ns5iPFK|nB4E|)>^u9KXRKSm!Y*DVfSl0v5d_-06;Uztn-lW3`e`XA z)mu{-KzREAL0u+$bn0Aj z1x%7))H^r{yU`X5S@ez4aG64)tLSqiDX5uo&L^UJSgRZcX1%ntw7ZqRNV#1 z5v{H4beX`IibQnh=67xzn|?aP;0_f@1qRD8N}t)5O}y)4Si4;Tr^Cs}-2dc&Eh8*t zJTP)n&ftW;!P8t<%3#IlIpP!AhmSM%2IIJgLG8d+5ktNfHq&-5`efF&@?Tx!-CRwa zr50lXF&5H+^dVsE08kJ9$Pt#ZH-Z!2kB?JwcGka70My&0&-Zfz-qu33g0ODrqp^QB z-#8E7H@uu?=t<59N`^{W()3fD0V%-DHMzbHK9LAb<%yDx?I+--?^$+*wIQ6uA;!Qj ze*bnfJ<|F$xOE-l$cGbdb>Hzh8v{Zmfe|k$;fdhZND(j156-^SHKHgZAYN&&pM$)C zF>PXv`oP@Se~!F;g0h6V;2{idCL$}_(w3q*T>J&$%EXGurM37D=a>SH*|=S4!gb@w zB`%ZWlz;B>@6CdvZ!Lncdu{K_;|>kLLslmW^tnZH(1_RHJZw zmY8-;t2+5Ir+sVK<9Sj3Lp;~DKX8=Jj<*G~s8-=a;}@KcSYLt%ufnaeBT z_rHP+y-A>#d9YX50aYCmnw0?KMsAjRiaB=XEtXPkSvr>2RB$BxvRB1NVRikT`GDq% z+2gK^6YoT6!J2B3R6nhu!kz{8-ihV4#RBbLa-3g3FX2(1=O~9|Kr^;!gJ7cgr55SOk}A2Y=gXxx^Y;(@;sI~v^E8Sw zpzS3i5d6(aqNgaZ&hk67H6J5H8q@_MaZ{OUb3pa=1#*>cR;L-9BEYY{YRY=4aC`?T z_I7@I5=DoSOOxdnL@adJ{y;*b!L;kIH}v}kA?!qCu+mm}WKZ)X)_;C?=9sBSj>0<} zlt)G5tGGq#e#F=3J|w>zD9HJo#=-uhC1c7GvZw>r+B@(iF1^HC;M7crwty6iR4ACr z!~8uS{R_Tx$Qp85y4E1aRr)<@>RJE^(sQYAOfiY3PS)eb#h@U=nOg1MZY8Rjs z%Ny&EP1ra7M1_8MjSO;MXal&od}TK#K->FAe?e>@swwgW7ktqfIyU8n!VF>kQ0*OQ z3U_4|OkZhNf2Z^B6AL3c$MiMRUO*l993|7Wba^!yRNqC{7*rVZ5&Csp`uW~=QF@Ns zjSkh`L_Gpq7vfsmb(;>b%FBw$Q~L6qOix3zR=456&El;15p8r9{I`v z#w~yf!5IFx;R13qow$ENj2tUWD#9L5!P6iAg8I~*8ZN4U|L6r-?*D%b=T%F}G|E8S zM;()N5{w9q`PdbPL{EAX%YP>;RHo}dv{yLJ@`MLRZ%ziegv4Qdh;4mYmyt^!Q^^K%9Bzr} 
z+$BpUc8Yy|N7<#a(Z(!}S-`)mN8Q@QANTA2k_HTvpTJMUXshi1LCPZ+V`{`WVKU?JD~TQ^j>7xu+LK$UJRXPP-R;!ty8GO+GSLgT@si z+w!b06RlERpkuSbv;-(V@>F$`x?rB+|ys+%s-#!T!-vl-ae?C@h}#G{nMGIz5cvsB9FWF zB|}43mk!Fd+ZP=xZ_0Ct@Rm&oG7ke2%<2{7}*6dH;B3u*=E zCZ#VCI-Ak{Rp>reO!Aa53s9e;?M1fT0aBOc5zxvdH{XGPt3Jc>`QH4cf2I-KiTZ{T z*DhGC5uwQ7* zZ~Q})4}OB zhF~^38J9mi)7=~%E+U;xupMlsD^F-?vK5>Uk%?p$|Dg$q(omvb>2Is7t?_>iEeBchPi=;P;-|h^E#6Q`oI~)xsrr2CMXixX& zQYADtOglN{NhBV$=oEW7SjvvI)#MR0)*=eM%Yu zRWT5n*7^^scGk;|v1RctI9IQ0mGf7n0{8n{!~Ji6Zt7r1-o|U>-B5c1gK2sa-qxO`P#(sl>DUDvz*f{;4yfaW{6YTJ%^5>TAF9K@hJJCPh= z@qN3DlQL*r=%5gx&^;nE}d!3xIcOc+K*13`oPn~9#6@Z@9inJp>& zdtR7l17tQi&sJMJoPJh@fTfl?&3Nbx!LJ?Mfh=`F+0^#@V%9R1JHuvLI1oL3(n0#s zfTT4808z9>ktpyqp~PYX(14w$vnwsUi+nwm?B%h$FMtqo_L5XwPMo@=fgC%cit`kr z`1q?rEoEFJ;uRkrc%B4GQK3uh{nhDd)LfWA~%=|5nN-DHEqC5$^E?)``Vg=tE7*T)f``X0x?2;X-rL zN>$Mr_Q6-#3^A9^X!^CQTq7t6(|vi<4#RQtie45c3k@`4OIbe zY&$sN`h<%&eSNSF;?tPk!QUS^)e1LhF^)U{ywD8)h8V0mQf<>zV8Z!;Uzh!R2VDuB zI@`KLq&3Wkz_qBL0cyOnW72x143;IWEKT01CiEBuPTSG#+kpI$Z+Rkz@xV7 zLkuXGpbj_BN}upDi%Y2+%1X|b{X9UvJXq0SH@q+EKe8TEpGuOrlCz@tsTwUKJy=); zWBj82R6I%jTjl@GXh$mS$RJG~73g?pZ2!Uo8SBHqNE(A!Z_rY_(74Z|lJF<>*(z=@ z91?g%!{hvFXl$)*pk7`c=nI&`XIzKB0E>cdh_ohmL2ZQG-dk6NURT1?X5 z?b9>;a;03bZkPhN?}Ty0*nhai1{LKa9v;odID8>`2M+T7Hz&I(WEiL=OsygOpZooH zu_ZG^7W`s|25?SjmXJItWU8NN(D2Si*vN=VM}w&C1vS|XZOsHwxJ~{ zOzkwZ=`wOQv@{LFbHW}khKC)s2}R){V!}@w5dtI-!goq6tHk{^f7XuLaMrX^2R(*c zieB0a3b% z{~F`huBqOou`Vqp6i+4B%QP$L3hrDFqHbiPpNgy68#iFKUjS~Q|5p0kvfta6xzUwH zQmmGK*-+J&n8PZ`Aq$33f=sVL68*28FlKt4lmPl1zOE(D?5!7r8zU=xB}9NNB9b?0jBVmVGA4G%5){ZTtMbA?#)1N` zAcX774S&T_zQJ#|#c0{rpNrzp`6+XUh~vy{Cv9;eWm;0=KC4n5F%9un??QsYxZR^w zlQWo8TRj-@gpblF{3HSzgs+j1p{uz)dBZszJ#APz6?;^oW7Hf6y8>bnr~*$13_pEr z@@Ip{QY8wB(z%XNRhV8KU0GZx=VY{UNQ1QEi+q*sh2M;}SHu*isR^FaV#XJv%oiE$ zmCnS4`+v()h?Sa0-HN57tx-;KUuE?}njqYP@b^299J|jXn+oM{TwtD$?bX*v)Dm1a z^=P_?BcKc@t-`>eewods`jR#aC4rK%zZ#pI1s|@Bew`dSzvaN?^<4*?xJLtWev9hP+yqY)|IGtV8y>?t)o9_}SL3&zT!7AB*wHHFQ${P~( z#=+w`>X2Ds=hC$S^Y&(-2dbIibPFZy=2?hL&yj^kbz*sd|%J!%*DM* zPmD1}uQyK>qL;GYH~ua2%+c>&_~^!EvewZCCkKCB!-x zE%!VLs~(%~%NPJ;iaBmkf_P)``8?&pb9*x2$XXz;$>wb1jl5hhi!2O8C5> z-}l4XG<0Ng$m}qIiW%w ziS_#_b*ilsWrYWXEK&Pf1d#R~V+67NG!{xut2OTn>ezD6{=;j=#Tc?F!lzOXKT!G- zH79?d3+V!4EX+wM?5Y&Vy#(8ji(k}Lx`f!Rr;1dhuD_|RJRPuly zpvW@cw)OZ(@-4#xx6iAbk7QxbNF)kxP9z>X%jOc*6SX4$k-?gs0h*n#NOTRYlrAX` z@0TV&9!Km!fA>=HH=$8cX8*-0{VB%M1+%J?zzGR20J%nY(UB*$~T?c!#?U6jHyk{#*&cT z_(W<={@r>xCf!yr)4`f}D*4_BmSUIL0qLNNNL_H%S+X?^i1OO`45bMkyYrbK5*W9K zR2cIyihz(!PuO@B-d7jwkF_dvU707w?g{z3c2#s_iH7=&XiSA|JDMrKS)pLHrD_N_ z9t}?>J5~uDgmhh#!yE9-X8-DDU^4H@w9?rAGn87{O6L?PGb`5jn=(a^Wn4V#)fQYx z0kt% zBLVwyr^YRE({l`^F_UhMs2~Gj$%7#IgI3wY;=vU|^T(?YF{FnZr$1jwwiy9z) zBQ@In^;-PxtDug3ZIk1<@knTLDlk;?qhEgOR@okhQhG%|bMd1|ZL3h3>Ov-p_Tp7> z9Xr}PJ53sc6C)UTM~iT_rp~u_%zf*his{_2@otMZQH0;@2N%5`aQ0!)3{b@ESCHV# zZz1enc9Rk5{Jg-+@CbSy^vY?T3sb4moqWvB#BDeRo* ztmLuzfu@t)_8WK2t~%=tevLziu?a_RF7*7pdRN<1;1TORK)yt>A@RJ04L=G=&BPBcb?e8 zhb~O6XxUU#w&< z=dE7Wf*LtFm3g6*Gee@>*F@3}sS+rYALzpd z=W9!&$~A2&UKi#&>qVV()4Fd6o0B)dcm!mp2p=?eD~k$i#@?xs6lo@Td^e*aXXi-7 z-j<^^F~58$e$%$$ZcD8|Ril$$kTec-uvkctcdCh?nLlX0y;1x69;#@t^(6Uj zZ0niPKjIf>(dyLpvxMx3rYx1X7#p(nLQGi}xrdsXH&1RdNPdC4>t2#bw}18Z`17uWg>xX`JR8Vt& zrmn+`UHM39{$0e@EpexydCJ|b>8)PkPXuq#sms$tL5cuarC#)eVA^7(uPoqUx zQk}1-h&|D{_PZnX&Bs`R$y~(u+x@cx?aXEt%Xz&nEzaeD0a<5}x;Az(Z+UUI&zB+t z_5((9nmEMUhXp!$Zy^S47miO zW%WxDp#1!VF8Rpc6S$+bMZl2Ik~5aB+jV(n?H_DZSk-*+engx?8iu6r~T6!J*d@ZWt z9$(EfnP)ywZk&_7mwz!N+|7FD&0HRqtezbPxgS*aD+k?*(XjCN7;Hq> zn>0iu(jgnJSvgK;HgTJtWfUktsr|b6F1cnT+k+zRrBq0a@AU~@-!wk1+?}~nppxu_ 
z_?cAb&aLaps*c3I@Zu`LC@4{mHt8&Tw%HdaeHhpWsI|^k9dq|LM6-9mqm^Kk{zdv#1D%p9*LRE()zuuILkB#03!uq-rQ7`IO&t)s>vXFjWMgQ60UNjow1>ECAy#b z-JIwa81~PYgzEd3()rAPd?#(W>-V%KFfP)`s87s*8&LyFY{JD>LO>1?@+41j&H6O# zj-2*ksed(xowFkn+EwnE!H7;Pn=gL(8+5;~1SsRe%@&7~)7^xxjR%1q>U0^`Pn{gr z-DNJ17h$I;R|EFix`_o=pL(>DyqDj>fa)$urzMNtjAUPLUBjI(U4EzfPPs#VE>4qZ z(uR|6Y(0y#j^T>pF6LWmtQ*3(U!uT?!s|5Z8{Jk6Lg$rc?k|^tYEw27oi}foavh-CWYva8)-HhFwWQx&caggBrN+(PcXEDTj{Huan43OMddn%Nsy3M{>5IrR=gYHDk~ zU>#-hrB9l8`}rMa8RGy~(xhd)pZrItO;#rfr9!3EJQsCDOqZqDH#VwgpNg#6A8Z~M zvBomFQor8l9&eE2-zoK+3`QhK`mdWM00&b>VXG<3bEqpJ6u|_WRQD&kU+lT zCjn*Y=?cu$ZTQ~25L5A5Bid` zCDBh`Xst{^AApT%?vH9GoYnZe{XM_`Fyq4G54KfLhVC9Sa;?6A;&}X2#yP;)|x$E#UY>*7)&-PcEX<;mvqjKG3wC3lN!Pu7jnz%oNxAfCBWd<0X z?{?O{C&ivv~`6feEJut11FCCWVjYJPRGajicsH8~S_iIjbSXPEZ z#GXIwp9=Gtl4a6*yAg!$ojAeRO}1%4TLj)?qoi-^uf{nH#?d!?uT}ds+}iYiEE%`? zw*}{d@tVS^QZG?0XANmuA?qa8`hoqRGC~59h}9Qd!?0o*?i3MkMxBBD<+xJbo8yz> zOgIXcF4@*cGVQO|D?(A>V?9iYw_sALLQ88Z1v6$BW2&G#(s;dfkh7jKI+f>g)+$+N|84@>W7$5c$o_f6#om6HB|Q(o0*t zzIiOiiP^zRf2-c+C;0WqoGu6OeVHY*HfCmp$ zXAMnc4z0}8i1gwAHejyJ=K!Tpd1HU|xSn`N|Kt5A23=#_R;ALh?fHt~K^l9=WL?fw z8=lEZ^aOyceV9|wV1Zc;eQ%5FSv4GelR>>RnC*0XgV(QX{}(`!iY`2R)E~_FS8!o% zhed5Or&}RMBeMKep{oUWG^@}kxPBL zukcNd?O19V#r3T!%t+EQ7bMTVEyfyBInG$_qcDdD)rSsb4(Ka~FU$D8XJM|8JNEiP zbSRBRzp)(RFxINMg9-ARt)US}t73W|%Nk);3?;AS^cW++LYv$kv!>x{@tTBbzi3K-VRb>(YftxV6|RV;4& z)ARC|zAtuFat$oL{LzYHcGj^8`ZR%c^Q5Ir`455=7+e_YBqZDEEsQNb_`R&T!rOl9 z{mc#Fv~Dr)j5y}}H0uKEp3T&(c?69s4Ox)-di$z#$A^DYVyAaJ-+h=(_6xpjEfFgp z+cR#C?fu~J3SslhfQ#>r`wg@~;!X>O<@;zWz=_Hyv9bxyOAf{Ou+@>Fj42YuGo}2N zpER)uzn;z*PsesaZN*56*8KxQy30@j(Q=Cb#)hr%+X1G@A~ig0MsTuGMC`9>(;x1R z9*GpEF#F&Q`Uhes+7grGQxr%r^J8$5t{ic3Mh-Kh&Q@ry-#~w{D2c5q=O^X1Vze>k z4@PWm3~Cb4y-x_xIGbH52G-qC5-mqi-_L=MPAHo~(>?wEA0F6-j>atD9lF7O07{3A z^z+0vL3~bvbWJD$0;$Et4Ual_l$eDnd$diIk77FSP)4;RCY9w|tJM(f%3{1rz>;V= zP?3(NEJxJB(LUB)H1{2QsHxQ}iR^51n)vzrq&wsCr=^+n!MpM&cHr)$w(YGv@bdal zqlDh~MNOD&kn9!ep|alW)wCrRul4i^E8FR|bw`Kungmx%+ofPgaPR!LSn9dPtWqSA z{;`IR-*>K@X}+?rLs19VePB&R1h1mnS==Vumkqo75*l|=Zjfd@o*1rtV+N$N=hf`^ zrj3jh?1+YySz=OmV6;eG+u?^OQbjxD!-C(LXw~O*WpFr${hsDB zpOcitY?eqLKQUWi@e3GnqKppCx70AODh5ph(Ij_${|@*g!f)f!6kLDUw7xHW75}pJ zK8-7JgaiSf`6C6F$wk*SSsNvE!X83<3LZ5S{`QzN_Va1QJwZuuyRjNu+FEYyb!nG~ z%&ZoUhOvs31C%>V;}zg2-Dh~UUEnByRKk7wA}MtgLB=Rsg@A1D3;Ec$ME99{HL`9x z1JD4D$Pc_hMJB=To76B;_uL~z>^t6TAY^Ws`v@ul{x%})-*T7Ea3$UXV>@YX_27j~0Cs&tTwFo{X zEu|%wH9nNRd6=(V+%$fZQr?bHZ75Ra>-tK)DLW2P0QrWV_7*mskJ;knxsJa;ImyOc z0?u-%q_mLig^iMlZ-N3jVt(k1J1#C?oSfX|0b#-1E4*pS5#zL1Yba?g3Qh=(eBd@D zCm92)+AO)Oe*efEpUe}UoS*@5Qg$Mi0%@H~JYO3Alf_Nd7rOg?((O+Uj4lX6hk>Y7M22xYT8acq-}w#!%;a-}5)0!fBuOyw-o@e9$s5iI4 z=yj_-31Mw0#X7-k;~l%B$8!^s-2Wg1RQ@FkUFj+W+b4l@)=)%RP!0^auj3+faKjt7 zxDVUw6vJUHW9Bc<{er<~=AC>08U0QFBwic;3BC0cqe`;-2cfmV1}@Qo?z$@zx|B2n zX)t@wh*{G|QY>&M2BQc^@FNH{ zD@5?Vf1?5aM9!!;h4=QYQS>f+0R0ux5TEs2>dz7;m7v_b6V0$xm+2#*w02@%vTyxW z=Teb&_p-8ybC@O@tLxQ6A_9tt|FnmlZRv8I0lFfivoRgq{Q8V&-Ikz<043m#^$>26 z$jkR$BKrj*k_gUBvAAoIs?h34qitv@9=R@Udzx{^*9CC-ORBnK#-LKsePHyV!@e=f z))Jilr$kpW1qEGTjIC^<73buM?{f6ic+b{`xPN<91~2ya&%&e+mgBEne4oA$-4!g2 zC#P)Ocyy{2eanl>QypME;Hmy&$@&k%TCfn=!qRL%t!%esZH*cy2IiN0C$Su@CQ&^$ z$#G#x^6DRUd{l|Hj+KR?I%cAp=JmW>GrLHly~V(q?@xMFXN9r6`su1b{QW-m(#OyD z7Nn+8grqx~wQ8T14HJ!x(sQB=K}*KQ#`F1ZveR!!+!5Q7fOQP_jb(9 z+Zox9n*HwZvcG$)4L0X1*MB?=awB;x^nRFis{=B@cBB&gc$he@Bn+kBcHdvfFo1G1 z$WI0A?B07{5y99TGJmO&`BX+?w7f`B5Anyok$>-uC;IhPD@`zbsHq{mIOB`SHxRU% zNO&Gxx44_ey^0;!(EY~I{omM#;ISC=ze1(SWO0ksER$slW={%yO zh{ka3-tJ{>i%#MF%x^Cc-f_2l1TCp)GH7;PmN+i*Ws!KfqAO1YhLG=AbZAaF6^aqSNNk94v5t>-jIGr1*X zM`HCR+siGOS1WzySELN)U1Y(%c#Y&=OD5!S}pW^=#q%x{xn7PorQsR8V 
z0*^ych&I6!Vkyo__`>&!wafeST)iSKnvYT)@yn{t{z4@qS`QARsw9E*8PFC@i zG4UOfCus<_XnxUmHG!qP#oq|~valZy^Nx6Kn0|YO2rLLuNqMO*oIDj zr#*f6Pe-KaX>ad_867|_fk==)==t!g z6J1dEgCA}tl%D?LYp-1s;~TW$I>85TUd}pE`-cV%Q-&eeF-UBD_7t5fS{fUErIeX6 zY9QWrTt){PBB9+xgt+-|8yG}o@T}A~MDbL-*Hx)SkQEB-I#5mz8lcwO*jQVY!fODj z%W(*KwQM4VDg#xEUY^2GbWf)CTeX(X+Ih7>2a;L%#cStf|A;{mA?us6Rx6HV{hFd; zTW{|^+N*k0*q+Y#I@d(*}Pc{6*1upWTZbC$rcb8E%CnH z(0EdceANAbM&LgP9?PQ37qpVL^XP36=?LJZvIDrkBne!f* zSjNr6wS|OS1l^}wTp2~T;g^7jQZdjnc%~E{aawv|bYwJn#yuQUy=-dJ_`|?)@bm4# za$DXgfrSs6a%s=OcU`&ndERfM$f=`73Q~vmaZHc&%=D6YizEWlIL@pUvJNah;^{^c zLgVwQynyoFtS)d}8s)l>%@=%na4<%--cust+El{brA2E#8TaAEe6A~2#}FTTMGd$n zrOiLyPux9k%7Y2ZyI<=TzgE7Hb12FSyvGp4*>Z>%wRsgko)S@^60bO3%)O-I3mzKg z7~U-HbwB7Xx|J!=2E-ri%RYJ3jZuSdq$~}RW<8M?K3n_A`2tN4c^NJ8>=dizM}Ri4 z3ObQkKXh*Rf;^TLzVuZ%KC7fj{>t1a_w;$UFnH+pLN|z|U9(i~N>kD1#<+dXwz1rp zTXaG`b>jqe{<1u3M0C5N@DyctQ1#17yq|lu421P4cq%*HpoxgsbEb&Jtn#!Y@Q>&k zSW7`+(2BwF1NuiKQ=+>E^o5nFroLZS-yXThfNnCNgasyPEQY=AJgd{bxH*F)?RDdf zntt{tZ0*5IK+MyX*wafO8qZGF$%*dgHw!Rh#CpHaXYj`P!E$1?qF8hw6h1Q0dG8L0 zq=Ccc*Nv*dFtLY^al*d(wesh@IMqsmo)yoP0`gCzXE?{J9>Kf6JI@K=GECbv=Drnr z69qO4ysEVFwV^~!KSctYyYBk!(d*~q0Ja-*6X}h{f7d4xthnH@5|6rQ@cXuLVM^}S zmrA0hu1fUYcoZT~QNvZ_NF2-|$u@Yr58#afx}3>Gpp7@X&$m>em>~ojGO=8{D@$(m zQI5{5J>x$ZK;APcB<{!JxJ7b>dbAYyGxBfS)WPF|DG z4eXv}%VwBH6QlO}rX$hDd6S30CSmah%7NmU%h_CF;T}@aW{~#{vg<4@mTq}pkk8!* zmE~{k)RVqCE?(EM{RjJ)5SpX1SAGou{4M3leuyJ6V!z@Pgyh6}t0=M0US)VIPvu$r zPGKN_Io-8(K&-cp1OCqyJn}%2g})CBegiDb+VCCWmh?J&T0p+LP->mGS+8hI%IUlA zUh60^qUoN9g0x7^7*b8tTY8Ghjks_0&WuMa>r|e+Yw6sw zhXd@!lU+l^ceWMZdAF5mO(4D3gT^DR#*#gHbrA3`X_>9rgu%(3N6l4c>3M<|_$_bh zq~hC!+~Y^m>Lqun6^AMzCFs)rGZtBbSV^PQDGoswN?R)tMq+1q2F;OYG7m>@S4V)- z?eOU@14ex^Vw2#v2{xi=C+tYA-a5&?AaTyh*Xj$@ne}jO(YH5=8XQ}JgG_3@-c~v< z_vV;1k_G61uMiNpnh~DmYqoycK_ejaL^c71$V0yWgz=PBzf$O}7spCz_m(+6XLh>W zeZjD0Y|Av0P(g_Ohq|WnLq{ybz#x?m+;O(nnkqZDS2U#pKLU@u$a*s`d9B_ViG~4gqiS6r`MMRE#&aoQvz!#?l+2<9Al_& zS4!gqwuKBo?h1O6LXj=Z!^UvS2|F-i0i)Z*t0Mw+%DR?Bq1=Rm3{x%6JUITiH+Dz4 zU~`%y*1?w99Uq&=FwJ@YTu0KcF%jYenuJrvkkBC(x#)Tbxw63@PD0_W=I(sB`3V@sG!#55; z53G^pd=HP-m_VYmCOSwunD4L&pildjBhc@5O040Ht2gG?zN2^7IPS}JbA~z`B zMKtE9l*^?peArNg?l8GNk_RxG*)3j8i=V4uxp~+8uGm|#v;t1o3|dWi$%EGHe6c3C4DzP?gNe5J z6se&!MNd&I*3DkRqQ!-R3R&CtF`=KY8xh}^s5BO3O=M}dtj&x&;PlO!0%HOSse7{f z@*u-XU{CEdy{|5+zB?z%A*+@MTR=KPhHtB))H#x;j0YhiGi<_KtYO9MEmyk8 zX$TInFMc{u3TG#9DjWI0c<}0lTvXkDRQMVww6rduUDtmvyb%(^_6cHY-m@iqQA0#= z=YJKyh13EQ))2~Zi{&r!S)I!jpZ1VbGjyBg22Lr~7cLpbKp9*Ep6U3MoVfwWt@_m$ z8RIk-mH^WV_2I@74|4UbX97~^;=}Fc&R(k_k+-P&@{hK9cEdkdJS4iuCzUj4Y-cem zDI~m2s@ZSatb%};bKEqR^|Nd1g0$@tCA>rN`dM#1vACCOWtIXIM)7{ zL9F)Ij+fS;ITeY&Zt$k^({Ia~IZG)UHnKZbSJ>;I^!JgGB~Rt2GQu1>_e_tze-9bC zdu?|+)i&zN8Ge+ZB*kW`aId|}*qq%%aS1p6^Y$MEF5!*!3wWCAKM29fVwGb5zU~G9 zH=(Wbxo~7)sN}s;t*p=%KOhO`_D44!`Lz{nXWj_|W=`kyaMR~JzveMu9(e~ir(?-` z(k`o;#Tqy^t{#H_o||LJPCJ^hbJ@y)9JCWZ+FY=x2G_j9vhsobgTMxq+F!)3_)b{g zC--wPZ<{9d=0wtqdBFI`MUy*X*>;Ux%Lc4)>}G(kqX=#-S?FV~I_GJVvX|iqTcfSY zctHiaSVU4kwl!KUKjqOm4xkxhto`?^%ZaEfJeVk0k}-8k9DDixqy|MZmbEYp)IU4jB;$(P9Lee00Vxd`yl_x)q= z_y@FQuMW*Gc+9rc3Nr&q9Eq&A1|xUyr=*ZCq83J3NSke=X?Mp(%{XNUp>ZjSBQT~! 
zVYN%XruNntEzFCdi#iYF(t(fHBKE~3L|k>};UMCLD1D`7TU)R%TyxLfflk-NPII?f=E(H;IW9y* zfMQZ_YsmJuIJX8b5|x>y-4FSXIkK|z@4so;{qCO*;Ib_&NBm*1dV-A5$@m>wf?#ou z?XId#B*64$sPyn#B{>Zi2co1k%OsyEcos^3bztOhnBL?@BE$<>ei4PeIM=$_()Cmm zCt?pN5V`^e3IFq*7{;)43JOq3qcJ@_k2A#;gP^h=Kf{FW6XN6on$B})vtga?pR7-j zWN@%Mfx%$?rGU@LtS#fw63Vxkmyej{wwv^vV*|(w9|uIK4e9S}0x} zB6o9k^S(4&_K2s^UJR`~9}TTG$@Pl&7?b)wgu-VBO7^h8L*R|0d?~Wp`+sg0Fhz$^ z?6okyGOpG9R+|ZSPz&@j|L(=lqwHrJ3BbK?mYGs1C_^Dj(Ohtb3P{KS~NG`tmu$-o|hQvD?bvMNn8%^>n>j2a|75 zWOL_^?>`8x!M7KUehM4opIo0>hAA;UZVvQEfXFDC2t|?;zfp;2L4K6*CwosJ zYKOS)V`yWO>?eiL_PAb%i`ZL(>7iAOuFajy)hP=@Gwk%xR@zXZw zk%Y*hK_i4Fpsq>|@K{hxP~4a;u$u0n zBBG??jpVSS$*_B~wl_T^m)VV6GYyz;h`3{LLkP%hQPIfj5$%62{om>+6UYl1OsQr!G9LK_WLlN;lt&|?R~5^G z9K^}wA=x{gFuXKwMth{U+5_)yqNrF0?L*Ue20#gYQ;<#@*%T0QEo)oqw zGF7IDwf+U=7*B#5TR@Gw9WYVBp_?fZu1l7NeAP{ZtX+-;-kO`1I3Re>??h$gSPMnR zAwm%tbf48W#IN>pHKT-KVVB4^)pTAu-0*=}tVuqDW5}aj_h8 zdb1=tbh(~~Anuga7%$nOx5n#6%8sO<)XxQs#tshOVTpnd4+{kpo}^rN8Gg%4n~|8k zx%XBXISU4xab!%I1Ke3I>I zS8Ix-e}u?T)sLc0Ck}zW)Z^&GVZ1XN{H5{{+;CPvfxm{(*(m4orMIypY`10Hr`0u` za$?O8wjmrWX{hmLOgGyeDi-U(mF^^PS4ucQsz$gvO^AU$c$GfsTZ zFDaBNaFVEKDF@&u2zI>p6rNlER$(=kjUlo<+^dL(V;c|Cd+fW!j!3xj)m5(ZY3%n> z08av?se{$51&wCr`?oX!k)KKokJ4;bwsMqO=v=`R?o5C@9ve&w;db z*=YBXCia?Cli1uXgH1n(U6os*BzJ3oD}R$V_`$dCHOERgDRwY1#LCt}XU6kehv-rz zjaA&zDpy%zKu>$Az(E+uJ0Dohb2Tq2>lYlkf7UNZ?pmq52`0r>vabTYQ|&h@;o`8o zs5}pxzvx9A#C!$4Ap-h8bMn7ou8Qw0=1$KF*W4~glqG)4QHW2fduE~b^e)?~+aQ+$ zZHS1UsU-zpr6bZj`7$}xlor8w-B11IAByW`%u7E!F(2z|>MPJ@9?6o+LC2chNie;; zXe*;^~Xwu&htP5_LyG)kzp&@E&_dlQo^*%L!^0)cRi-{EIf8w~o6fzxHoW~%- z7Oz!olwXKq;g*c&?Vd*uT-6wgkHo-jYW!e;k?9p(ZR@naw`@7rQOkrx8|4+@gfc@9 zx~|mpQ7DDrrtbj_-6e&AoC&{-|>P^ToH@MK(lJ6 z(SO>^-Jv{@OBTE)@MIhH`DP4(h$iz@;aT1!0N9;%c&Y~#l)020WCv-Gr(inj*-K7o zZ@+33CV%k!m5Yg+_&H5rc{dPGS#pKGN#siN)I(rb&+|}ToykW*EA{T9s5u~pR|J7> z2GdJ2_xkuouZ%NAa~YB?F<61JZd!L`e_Gj-=bHjSN~2Iwv|kcoB0Jh(>F~{guKfBG z!11_bl1anTesXy#_F#D%A;$C)sdy&rD5e^`vsOc`{kH0OlU?8}E+>$`l)Pk&Z%4G$ zZ$&;yIMN>TO{y^I_0e=wSPkyy6F4!Z&%jYq^WqNTs-u!SAkofyJqLoQR;i&TRVJZ;V(8nc%hRAtXmi6N~JfP#4s$yQlIPy|5PD zPJiO)cM)#h<^pDZHwV|(Rdxw2Ip26o84u>iBPm*pp0R-4 zJtJX@rb}g%D4Ne;PW+wChPXPKTE+>)wRD!|}_gE2lQv@{|wnjwR^;ZBO!pIc6I;KtWfJb%D z{D0VP7a0)owPt*-U@(r4(~8AT3Q#?^|zU)E$iQq|>f9(Zs z&~c2kvM^%TAZ)&UB&EQg`d(2;k@sOm7NYPqN`@ydm0iJ!DGuke$4Rby1C#na`9tIr zi!mH9)t&5x4^;DH^YPAKd~mp64n&77X1@*P`qF)(C+{t+mHA<@r-rtUTPlVjp+sV> z=UM6FlYP<1!JEpMR?XGB+av{_Jo>lh&LyMS6eUP2qBq6V40_#)ZCR6nkdXmX<|EQO zNjvK^hl$@GPaqk>Z!#H7?dIdT-{b^XGP0%mC!c*~@pay5{4+k4AP)Nt@}wAPIqFfh z<1(I`I<>e`@6}8Wu3npC@`1>N&J(_gPu*Hu(zb(yu(}Tu(a#Wy9PZosg86rel1j*! 
zzRx!7?v!@#T>OJDUy67~z);+KTbF2NUWb&k*psx6*iFA&Ukoa%3p4YFKo6jGsm$N6+Fb8D4mdK+Cyp)y-V!m$!HT1I)0Lg54PyNKE9mv+0; z8Y~D0QCD-dgbh4&E^93=uU536H*TTx{lsVz5kYab+lK!Ry-S)szP-keq37EkQ@qJB zWu=RSrcmn`S|GVlCQTrbDXV54=1FsFMT+PSXzhLyB}tGeMam|FlHOI)c`^DK+@e;K zL1&3pOZ|y7v0dc421K42ZRMveW+VCy$*C-ziXwk&BfN>}Z)+u6WpO8Cg%@<%1Q^v*-uAFwujELRtl3FG#9JA@Ecie5ep!Ju9QIoHG%Wm@ z5dWjRWyrgYC4Z!v$;Q4i>PX6*v9`ym@WQn|q!6ccuRhtBWoqK&9pKg@&~~&fF^t+p zwUz}(-MIvhGeJ@>JAV#arV zk}px*wR}l-KOHVYd7`~&66wBHZ7ZD1l64Yes1hQkM96Hxc{nW#oeuDsq5eoSG09lj zouHFX#I=G@+)ro^q2h3f*7MW--HJGa#?|U`h)wmCPdQhk@|N8$%N(?}wi zF#0-bst4AfFhE0Bb5o@PwY!>@$vLsR&n2md6U%q zgW&wy8gGm(8c2vaHWB-x{_qb%nkTl9TR(znd~%>jxOcz7J-`w=i#jD%G}@9wrb*^M zxV#vIW?nN43~go2_6Za(y+B8%@i>vSun6c17EBySLx#q7%1e;HVkmyqZeYJ4diDYC zIf1h|2+;Bh(t|ta|251j)(FAsq<4efQq1r;z<)u2+XNynWvw=WBq~UUd1%J7IRBzhG8rD+&(ezx#qEY@-*UF zXN2@r)04xd%eYslHJRN({pmII^t7-JkEMc?w)hy~#%&Zb#B%E2lhJZeSJv`3a_bvA5$|GC!e5nNwYW4i~>0ge0yrE2cB3I8d=J+6|C(;#hx{a zD*nwM;fW8yDWSP2+cCr@DVZ$_D!*N061%!w0w`Xb8jAL0f^g7D&zfpA9B>U5bxmB< zMF~^ern52XUu^Wpa;X^EP@j&2YAkHr^pb7E$un%i%K3c^N=fJ<8tR2q_GQNsk|P9)^r$ zpCV8sKgP+5Mf#1VsioF5Pau9slC&AEO=734;zaGm#+lBqFeorTtj&as6UaZp83#4R z!r6k>-nkso1pK%R9g#+0SoJ$4Yt!cI81I19o$T&dYIV>{lzS9h$W%NkKW@xDvoQW~ z?sFNAx{+EI049T7(3$}bJpsAEl0U)VJ%2wYxPjyuz$tmpZv&y%&=L#_#d&&u^UVKb zr=RBiCph7`gW~>z;#crf#+bpAz=G2mti`;9YskAL(-tFDw9qxvVEPo0vs_X%<^lB? z%^fgUNfJCE=~2w@8CbtBXieqOUFT0I9sJ1m!YlswgW6s0a&Y85%Dl}#2oq7yR{-8% z49PS=wR78%^~pt#Kja^TNuQ!2Aj1$&vBMKn^nd4M7%ozp`q?~d1tvf)WqQC(8RlM4 z*Ng_37NX|=z9|=pq74-T-CQq20v-p{N#+S98*sMk4XRnj{5|NuD(lJRS|zzH002>N?m@Mc%9qHNTc7yc}lD8sV% z9?$$}N8KpC2NG}l-IqA>rk1LMVZ8i_x!p5t{p-S8c%gZ^q z5oa$=vB2$MgTH)*mv$4NcHo+Xm>a3NC4O^$1n@9BvIO$8 zl|YLCqp22HMLU@E>N@8_XImB^rk&r12G|$NI9DlnI-tgEefnL&u9=?j@pgzlE~DRj z)81WOmX!9EXbVevdqO)x$fU5PH_tIfCU9_N;}?e^-oZejM7F>otzR;M+6d8Uo;v;s z6os+HJ{K+tW!j<@)C*-nR+%bf=L5nsL!LP`iI-#4M;4J8eC zzRIS-Lonm}N1z3h`QE4T3-!=Q`wJ8!ju}#r`wbq$zE8lkr4RM{wY99X(P2s`#)Oma znAf~>f-1A#63=)gFcTmQ6+enApAMU&3KdJ=|e+VaIks zk^f;OdcrDgY+~Pi)F35&7V=Xw0iC$P zh9`F;X*@^Eprl($P$2ih-tIjfctGGekm>s#rRx>3BuybZj2od0gO}qh<})wL!9&1u z_@y|y6R@;iBeh)Papr(sBDq9zyZz|azqzGpHSH0_ICy-*tY|?lx2_gnX>fgZ^b(rK z-#=k}5=~oBEG@P7zxG+jNbz=Na<+UKKO*fTj0t}?Pv^yTt9O|l-}gMRt?xgG@ynl< z_B)9;6FWZ?JX59oOHeJBJy}4B=`%R-0JU2HZdj@ysbl9ipT#+5jlqFzf&}q_3RKD2 zj5V>dA_VLKlpv(N$fH9+dKe!Q{tDu~TeT0MN-#(Ao z|0jI*JyJ5AeiBFc2=EphMP5hix`}ilXa@?tcQwlgp0*ypuz%IoO*hCi%h6@)vo~qc z*wy8-oqBBFDq;~=|m}LxGcbvF0*ta zbPNrpU8(Y)_Knk~bs`HH?lq-5BOfC_@JJ-Y>l+z^#mxpQt8>{hLz2DQL}UZX?e?ny zALmM_{Pfh@Iga2!&ImX#l8O8-R3Lc1`sk}jYUtiu-~lTO%k1eWFT8O&U7ta8Y_Euqu+-Jg^rhQB zw{1??_L`qRBNDdRx>E6v^>Eg2BlhgSPpJW(X!$&^YleL;`vF;tyz|4HQU73Lp3yUD zRI5)}_?%2=yJ=TMEc4b%RDmNcn$EXLWuN;#yI?GoXONxDJ7#J-&b%m^wtQp5X~om< zni1+e7)?)pA!;&a<3C0L&65xi6x4`?N6AriY>3q$;2qq`>86qDQpAk1mAB;9DZ^u8 zZ*8z3KIBCQZ}3_JA?6udZrQ`Ep&6T8BAmFtwh-~UkVGYFA#ug8@#_EhZ%evSwEp5! 
z3~9OaP30Lb_PO>ArUVi#rZZH%_l}6KsV=PD2)`{Kr5(JcMZO<_`s|+P^@ofOZikaq z4G1}_#_$;84)g9Lg*)b-$T_c(DRMluHS^eE(8PLtiGKal7Q_l7_d~>gA1#&$w+u~J zB}jwd8h8ZPa>EJ2f>3ha*0I!TQi=2_|Q zAI|@@nnn73^2%uoeayXa#A)DPV^3hC3@QXlE!@E`!DN9KY!xa~7iA>d*KaoFwl%wi zo1jV#GzRML3PjCrgiH8Zx6GcZL2w6K?f( z)U5$LA=&%SBTGW=E5GgcIY-uIvZIw){FJ%CGD+`waPXLb0mE{R$$-^K zaiU8UT179&*am;t0-+Rs=X;tHOK zJhNh#jpFExF{e$@=ZBG*?w*|<>O*Hugz%v9DNXyYqEmIS`KfIQTg^0%E>7-|23B*i zUM@ezS|>ECEFb=~oS9@EIj4x=&Y#H7bhGqNFg2D+A|)9B6Ze-oM5r_|JsX-XGT&%Z z_!O_LNBRh>>az`cu>@Y76{u;=8}2am3P>qo?IjmdzuBRNLZI(Byv z%a&3mI~>sD=Y|b0(tDiKC_uyqeDCXc`8YcrEiFFh_KX-qEjvwOB?-;_cMi18KXe#M%F0?)cLo(qN$9XNPg>Qvm+Qkn2}S?cipwW zFBTTL&Ep-Z(?k1Z&IW`XSm(TqxEGNf&9;&QkXRGuj+uis;1Bj$jn9{Y@~*ht7_@9KLZ$XBR?58KsVroK`MdeNuRc;~xXADN`e1CX`^XoYb7v}v}UV@)w! zWsc?Q=4VA_$BwQ!Bc!{+_yc8xHKy)w7iMvr5uO0Se+mapEmD7helBIZ?nl#Z1#*IbN9T^jKm`ld;S!%dzJsa zTRPmddIfc$K{7s-WD#_Dgh4i2`9%#6%X2&6m7!a8L{FrQ(D_wSVX39e@_6tE5US^! z@KntX`>zux&mBtHBUe10EOp;-8DcZ=#6(@oWR|#Y*8B{Ed?xiJgszWJFqE?(CTNc! z(A}j-6z49w!kaRB&kKv+?IEewi(%F&8FeKz)^C=h5&EYs2}%f`y2(V=khghy_G+&^ zBqNNrZ6C2%^Db|1n|wm@Im0IY;Wk2AEDY$8^W)tuv&7@{lzol~J0GzUE#OA$BCiyQ zcTm5JJWyBx?5cw_K{mF$yOi;cs>{a#*u%$=+D=i9PFf?(I~`Qm)Yqa$lM% z*4OKyc#|UT*8cnJBRz}o-WG+{VQ}ZY%x$mZlW}`90;Bz6xkbUnx}o(NE*;lBDTAyl zRiMko!5cxN>QuV0A-X5;sr=AudwPex{msd%9N_tG`tER(BNdox+R(+M57J9GY5Af0 zAAF#C{1OyewsuB3SbcX&blY*sEWQz%AMlj>3~N@VUGAG#Z*)YLJa1>0Q5SV)L+NRG zj~sI+9bwUmT-izo2mRGI7M6r-oBYUC@hWrrCE^EkL05oHQ=#EH@=C*RXG>|WZtXta zq_RFtiBT#9gX)BG7@Zt9q)2SNni| zFkxo}r$UQz??+%~b3FiX+zU*oY}{9jprhOYC_dKT;l>+O=ea#cgtIo;Tkks&yUCCz zp@zAqPj&dKfHS;+Lr3@j@Q`f;gMq$|yT(8;{%iwo{ifmfdrd|S8t-u#@G?ZbDwcQB zN<%6{f5p4_qxxbW`w*Lxl+^-t@$BvoG7-jTU->#WzK4iL?;Ba(K3`Umcq)2s4@1XA ze@qTZ2LVO)c~Z^unBx!l-NZlNk-g-EC?o5bT+$bdeW@$$HKP?i(?>%51A)OA*vqb2 z+tHpN_DC8_!;Sx5$D75-jjnOe_@Y$#1(lV#= zJQNEzZJMM_CVtf;D|T(Butbr@YJj$#WVQm3FamgYiq%X95*ZjR>bK`!if%{Bn+yn= zzVt8w1iQTw=_MR}zGM4;SvCEs3c-@g(K?J5(%g5?(S;-(PsKjjv-bf=$iK%2a?W-a zl|H)0L~?cvqCl;f$0L3Rln8YZ0_5f!aFj;oI!XSSz;Tt>H zkStI{O#Q#}49y}a>+v=}b6_kw9IXe4p70Tuhi+_%95b845sClObCTgKMlN9ES>7L` z>hEDwWI zk9@kB{%x$=TuIwCtvAKv5CAGaLtE-8y8Ql88H0}mK>%?+)uw+xcLAu(4a$^&<>Ng9 z{GEZQs)>`YLk!Ftv!^C?dcP#jP$5yHX~hdi zf;gHb6Dqqk-Yh$FzF@`|p;`VJo_t9IF<3uXdx>ut-AtB<6kev>$p^(8!VS6+n8HQP z=n!!T#I)X*JuE)7K`cXZ4c-bDSx;(km2q^skg6x-U3ydT!e6`8wtZ~Vgq7SLp$`C8 z9cLe#&$d?*6L0cTaNoN@cf>TD9w665_N2Bjz)_)-GX(dy+jg1fH{UKM?x+@o&L0pg z^T{K#oNXH}_XnimyMGE06;SPO=$-_D2$?4j16{=FCglMPxa%GFY;b%zU#u~=cfs=m zTm{iFzqO*)u>o!K#9LkAUlokA`k7~P2?(P02H?>*yPp@yG4eXP>!-;n-u{D%dqD$z zf`t6?>VnP`;+Y3+3iNY*gS(!J5&cn~I3z=ASF2Sjrj>*DsMvNJSV5Cf*@&TJPyZE* zR`cARJy~+a{kJ>cuJ>hL)Oz5ZxGX$&0P9>?_TQOv&5!$kg4(C?uqqJKKh2h3Cbn})oD9ws<_EM zGm7bqBF;f+#?@^LYEEP3vdX&)JPmLH{57RVHb4VSjI`CgwycSM3w%tTJZ|8n9oVC-n!;If+C*R6OK9Ye5i8V5q{e4s zf=1uJo#u0X2_-J)^-{im!aLe@$q|RM{gqTHr5hU>Y?EBw9BVMppE!?qPnX|YmQ)X} zES91^Wyw)++uFb`dQEpf+l8|;|s5vCY>l=Eh(g*dj>kiav;{(v}^Un?zH zCf1*KjJeqv*(c^p!>&fOOB9;$2hS^xe3;<V0NyM8kW;Xs^LKT;o2rY?@JI)`yrYUHX>kQGf{E0i1?Yc}c52P&S24 z;I`DY_*f8SdY1^hH|kM@=-*Xv^7OeyAIzqxa^vpY)9>g%KQ<;r8q-fN-cyCJxteJ( z^~zZQYZ=vMA`J>#=P+#5L;=S&Qts=CXckOwf0N|N6(K^Hx~aeB;6I9@qK2H1JC6BY zhIN!qp2!UVv&DhCic6|DW~)*sjekJyz7Zn80m}LLwfkG(0G-Z@m{}3HyL$L*2^aVP zq7Ujzx7jf~Q5z|ZF5QKa@0?Nq$$_sWGtvHcV7Xm~FnCr+2gY@h*#YSr#A`8(8Ow80 zt@wO2nq;Oq(+aEw5HvF#T9(buP(D$=(}a*6k5z~dP^LS}Tj`LFJ=2DKD`sMg5s73K zvy|}QN|?-k5|{M7(cUrWQJA`v$_n@caw`F#`6(A1anVWR@mg4g0+Sq3XKYV5DVk$D z57t5#nIp8#gQ(cD;yP>B?5pk}S3@0Q|E>0CXL;+#ANdW<7mFh{o6iqhEFf=pXq?SWB>bf3j0AOMHzkt29fs@%)(D1qKWldHYw6pChLC#K;Nb`LqfKHNmuOnjQ%wc9uCmZRQs^I?xvMe0?}FE^3Hrj>uPckvlA zMHu{$oKu_1@%}JIbXhixydvUN3IMhPG)_LE&=0~te-(j%cTAj>^eF1s`U3Vx 
zc{+w?3!is~NHgAgccwjH7niJM2^~zA*@H7wb(>bMW2-L=&Th@_;fgdfN2l5h<)85G zorH#yV-T`ZQ_wqb|{ zi+lh=7A=o~BB+t|{pUBzihD}-jNjIHglh{=TF~WDJGC_zqKSL2;MWtiZ01n?YV8)Av0 zP5@K6qM%ghrg)7dZS5J(?@z2)@~6WFm5yHtg+E=45(Zw8ytfJDK~?*TB)ZG>T|4yZ zDxXvA)&Bm8g@uJDTk=1AumX{IQ;V36jYH?8KK@;dJQMD(Q`!`X8}lNYZl*81hdXQ> za)C7K(YJNN1SPY$4`jN&Lo*$SNLd$Q$`tr`HLx{^{0E2q!d!QriNAI zY5V5?iO7v#W|RzM?zsUu$p{(;*|i`u4Ot4xG)rs0bI=&RmqUKmk5PC_kuw|n)2IJl zvJ~w#hu5zg;Qq2Q;o>FZ%;+NX-e$w9a!`>6G2E1Ps-wevlOM3om0I4+=h@GjM+Vd` z5KOla#3$$md{F*C?sIx2I6gng#q1@rNgL0yHC3_65hU=PC7`UI4<#Q{WVB}Yy^_$u zYTugt^wlGiv^p=iHd&0$i%W~|nnr^YK@H?!E+{+=7Tjgsf12d86T8g39GiP@jz^ll zMz4Fmboc~Rw@2)N&2FENaN8adq}SNBzt1tF{pwwn)&TemH7eAcm7_;EX)v}z z0OJyf;V$@a`L@@ta)4+!QLI}O^9NoavKwP;uAv96cp90@PV=wQ&(QqY&q=G{7eQ2& zCS85#9Fj6)u*EWbQPKelS#5>iY50nP%LJGBoMXPS;#G_`HXJ=+%bDrYgMUB-&Qp8` zI0`V;pfB&AeNkUNgla4Tajd(6)80O{zkA*dW+>BtG2pdlkEhCx3C!qB!+Dn7^+l3~ z$>s>m(CCMB{{!O}F$(c+OojsNLomy<(_ma^@V?Uu_oi z5Fl5ME{}9N`T>qXsf6`wRnGcd5e>iSJKJ3D?q*?6h)Zd1+chxL6mJIPYQ{wfdLI>b zF9qL%wwYm`*RSk;>~h5n;FuZDPGAm_;S0MQNy)qCy{-414Jk|f?K8*`=KX#@H~kc- zmi?SV7cB$D5-z-jdgLX3OAupN_N{EF+V$ahWR^*$=w2Drwn?B~FJ0HIW{4kU*G#i* z*LS?K#a-}PC@(c!q-L)&8AnR^R4@plO)NrBsc|(f+?9W>*mi1~y87UQtLODVl-pX} zIGL9~GiCS84QEwb`8!Dm9o0u!VTMKw4=;ENi z@*Aja3+v+-<}6aK_>zv4)4+^Z zT{&z5FnI1rg?pxrn7Rdl2tkVI!2ySqp|wzl=Pb(-{(F+=YPdhh?mE^LCJ-N)^LKXT z!{~=EKHC#t`1jqoJT{RFaLnd%6QIRChsg<(T!Jef2^CI8!Mysh z7KL&qbDUJw5?t_X*qJFm6^hC!yMXX=AsU>H@Xpl{JUE$jXU*3Fg9nXEeaG&1@>6F2 zj^M~to<{y1!I`Gew(@9ypBS;5!(=JP{h72Z9n`-R?cl`A%UfJn-cdfxao;%Y>beAs zr$VpB4Ruz;R+)65W|cajmvs=@9NenDQ3YO9TfwhKeBX!`g(Eg;he)^^8p`XN=iq}* zmah^P_1cEV4F;NNGGsQHwgK&cN=z4u;1+z@7RuG6tih#Lz|ffAP}h9y3B(WcCv2UK zvTo(mSfOBfFSaAvJfhG;JIIJww06 z8Yh$e2McOg%~FXR=}A;Xrzsi16bX6A2ioS!167RzIu`qhy(?3<2egRHknC2SWbOTp zFgv_KZbV6Xswg>U>qV~N8#4R|r~(1stV4aGb>7went^y)Y2^AZoFT>9fT?cv+n0;I zH=zj=eTRk`tPq7!-`NvhTIqX7|Fky?xjDY#!v%cuHQ^q@c5Ny9lPeEVQAZkY<|j@{ zSw8)y4Yo9v-gr74X_9;m3U*PHa)%mQ?rjf_X9`++Y7Q*9v`i^X80@Zq<2vd+1s48s z_yu`~DMPA>PCO=rc{o;PJd?bql9Hw(VI}GR0S!cmG<8RS5RY>P*i6TNKv^tY-;xW? 
zG%EH+*vOg(XYqpj*50L^&p3{4KPWaRJgkr_?^q#Es%(UlD#bz#AO)I6t9sK6roE6+M{aZLPHT;6AN-1dYsC0Wo&Oo=LQ1?vST@liMCd{ zD+Tch(=u{r;fTrCQY_MiHBc`p^i0Ri@XVtm<;l)Q&2g-C5EAb#RR%-;=GX3L!!mD^ zZN_aBZKo6n zt@Kh^kLzSqZ48aM!#a~GUfdeSSwV=x0(62rIWL2?8gbRZZ3xY40T~nkDD);(dihtQ z3y;{aIBgNJ=>$d-e=&1oBEhTsABehu^0>s*C%l81q0}JW!nJrEfQ=LPB$@u6wD_e@ zU3QODQ~2PwI!CHh*%;#ckGuec$UMXRNg$1=TKl%`{aqGoDJ#uS4yl;!?5B-sDvWC) zuEXC<8UVUj`b8tGNJ;kH<2#+QY3fYc+ODXF^an)HOmH5D09tgc~+<3aJ{+v z^o(Rf8^C|yQI4VS%5ZRANHr2uc!q#F18TNGbXoUp8+NPolJlSk3lXxKPp_8Q{FNWz z-id|>{sB>P158f7{|2xUfNZ?acjK>C?jgW&?rQzb_y&W)y8b>N?1>&&aXE*&(|;A` zv;S9dzF2^dbr9w7LMTZAw%B)4Jz6)m1xyfDhL#ktJB2Q8)(AUdkX(;>?Cpv3w4&Lw ztU=1C{eON^H(*G;)9u+%SeF)Fv2d;PC)KB!c|(b`-%d93wPndxL+&SG{5bJ~kD-s< zljJ*T-kZ`nCbu6_c+|bO4T=4zX_s+(B4uhni1fmOz7sjmHG!uV!?C8OSZq#g*~)_Q zu5U^=l&6k5e?oSdM- z3cUb{ipbJdN@Pk4m_>n4$rwmb>fiA{-#M@zHXpv9W2tuMt|=EC6}D*F1lV180G4`G zfY?@nE5h@2oSQ@yHDTr6Par)=|$|>O~%zkW^9o`PPi1AgVkR`wF)krUh)da@Y*FoEAAiEm1ETTmIi2=TGr^2 zen8L!quUow)9}3yV{Q7}80&iE-gY%zNUjr>VBLbJP=56v>ehsD(EW(;9t2BMDGzEgV9!|Qyu1i7kk^UuAUg&_urd_L`P~lZBvAh; zaUnbwzK`!W&ZOs+|Ijtc4x#MJWJK*x`S#PVoaB6PH)fOrFsGy|5>B@&#r{#XNJ5nH zp|Dqy{Rb&15>6pAwmU~<`2LGcs0oKDym;&%0Zj-b8bsXH=iJ4&6CKVNU1TQE@N3oQ zc5tD;Z=Az15>w;{mljqTQ;WIdhmrUw2Z0a=D4m~!$JhO?-d7}KR+G%JWG^9~Li}(* zB&gAt(4sLR3GC2hShA9=E;H1@%Vr*VokCI?{HtGZGa7q=1Il_AZ`;kWg@pynY?T2= zJBi2|;w|K6m=Ia!D$0Voq)<&z$&lrxyiiU*bgf5Jo{NbaHmVtRhz4MG?L#Z-Ep_qI ziE<+Bh)|5ZQ@&s~zA|a!kfCVxsS~jUek8{skMRoU;up!g?HqL|eNI$VOu4dzu)>c+ zWUenXY0j%dO_FM(gQ7E6(5dVwnp7YUU&0*nZ^bjr z!rEiuukXo3+Z$1Y%@O=eWk!YVnaNOQ7+L!o@c#Yv##-)ISc_ht-x=Km*mdIMy7ajV zo%b=B1!loJ!Z{F&P~Wk!F5j4tlGR#|wBO_3-Z?1oV02QsJy|mtSya^Ar$Bk3A`N$J z7o2}UwdFO4ma|2X-#G`4{Y%=br*4m)>Dq14kqw6S(QRYf3tffLfg<19yPQ-hLCf0)`J^?5q(PiD^SyTbcw zAoiDt7x0LeFC(AXTP_xYuleJc$r`Kl8_K!!guB0MlweWr> zJ6;a9U~?f?k*%4U8*WNj#59t}7#7uk5}1?v?LODk1}42}t820lmo`r8N5u5`=sp-C z0s8+j+HZf0f(n_&4JMujptYY|ck@A;D3uQ^7(KH~kTXqRh*}d!7K=ugCY9N`P!J89 z%nynDY2&&vAXJlq$l2f5I)R!D3L5yD=UX>_f2~3kzs(Ag?^O+(h1hKaUg#~v$C4rc zdZGUaSFs*i5KJafe#dRQevKlck`gw3l0$KN^`*(oMPj;i=H8^CG`mBIR4swDFYMV4 z>+Y^$uZ+F;;dLpZ=n2;F=))!@1weX!%mn%(39;17o?Y{kv7Cnxb$eX9{Enp`9UBOd zvlolQw|IBK?pw)#&|4biH@E*qM2nOm2!v}T6dt?N#6FsX)?c)L_y+{~YUxM;BVR7m z+^&!1q3pcWa(X{K_h=;NVGspJW}_X%gp5^^1qmTNn3p}zOg310v$g=70F$o1Ia1T40&1;c8%FLub-8jIbfmIoY>JI*>N@6a#zUl*;PZ@SXgZ~Lj5-_Iv7#XxS! 
z&lS%`Wak*nZ(VPEfrVkqugV!HxxA0w&3(R4^)YxQ?CB2(QD;#pvI?N$F$iEUo~nQ%K<7o~2u+}pSbMS(&U!lSG&rPt;)naL zzW@T96dr~CfOyu~t`iaIzFTW&*z9IZ`!&meFp1E|E$(UW^75s3OW4Tz z@wD|10GVd}dkMFG(g_9Nx_d$$CZ3@f1NGc1Xq&maFybn-dF6JSL{p z&uMQLjUuhBCBuk$=&5)zF`Z9)Abi(cyiP#(tEWWFfY_<{%O9EH7W947wb97^@g>XR z89D~z3j3jsjJ6}-3%z5QH}d>HI0}~)%QqS>kp_{soTxuz+0xAvHWO1fV!#ajeSs^&dqi@xiL)0*EMG;R^e_=jRg<- z-XkP*_@xzlyV{J#?x_5!v5=?wc|wSn5g*rRfcvc6KY}*Zw8g~BD2IE_qkOzqxp_sz zC5uq4oIBS+`=FysBlCE(4uV3`nCO=!K&_|DK7NvC%{lb5e2gwe>H0OKc@gV-T`(mpeb63c(*HJCZ zw&T!wuQEQjj5I%0c15E()4N?v*WzW{)2~OCE%oDqy3X&%mwG?h>4~XO&(n_SDC%&4 z`vi91r1^FI`BM9d-@4;K!uFSz-e<=+H=Ft(x~4FvY>5Vjrcl@^N)Sf-^|}+i;-{ib zbHaKzpaNfe@KN`OKlz$uTia+y9dIF&LcIGL#PmUqo7;q3>eJR%0Q)i>Vf;{prK}xG ziC&4zHKN(&z4`c@;_TFM6{+eQQw^+`?(J{ocsyvBAs8%iL4qjP#g46YN6yff`;)ii zPe4Xno%iCsm9utRE;oPf&CP{ibQY=BM0^uyJs5PJ_LZd1QEPzaDDr^w=YEl#@8NWw zvX$i?wak0z4N>&Dv6Rcs*ie{KQz{m&1S*2aVR^SlOT~6=YjG}nd@@imQnStzW@4J~ zp}HHd9uh;7jG0vxQ;TjV(sZhtak5A}m@5AgnD)4MvLK$?jSsn9Z#}s9^Wt+AalEEA zMrW4ay#tYuPVsF;6qFy(#A%3S@k!?7@%SDhP~cvXU#bOqjGClrupp*aM0sqH!Ue26 zPm2Y223xL+dd<*+S|8(2Ceersa5D!xq=>h_pYB)r;-OwtSoh#LKO630!W%(?vi_Ld z*iR=C#2^R$Cz;{``9o*MYZhum*5)zm&H`9H)gj|GNz|RXn#lu7FOlV<$ou;ouXclW z5APp<4j7-yn>!<$RUn&kT;u6&9M>PEd{HS{*(40J0HsKAbu)uCd%> zng=TR(3lA4E@y4a+gT^sIGR>>en%2d|iiQb%Mc?Olxmc65iuCs^Y$Z?W|Usd0$!w|~fGI#J-hxZAp8PJ{S(PYsqA z$UZ|nBz-u0C3ODDufe_rx2S=w{FLmBxebQF^g zBY^Yn>R}wsGuYr=4Z|V_CJL&_FLFyR<;rPkh}2P$!~bZLYCg ziYa+M{j~=?tPr&8a79LN`#N!i0h2m@fohm`-~+4(UGh8SYG ztUGqyKFlg7a(_=ZksxH>(Eh?M6@W93%F>+DX|(6*+J!G!MJ^{^f8+{SjR*8^%w!2#`xTr=QK8@Ui-XQ9)&;< zEY5BQ|Cji#tV!K_njz|<{$*}ty*><;NDmm9zEVt_qeWQA{O{a)k2YyPWQQpH| zc02*;n!e#JPF5__#HDK>;sV%)XR|w{pYvTl-?9}sm`77utDw2L$?nEomf1#E)t4if ziR2|-H~dOM5ghx{R9ZAX`3whBxI1Qw3WUxs-zhZD6zuTo)B4vgo2TVyCl4Q0EHQ!+ zueWZoJtzGueeUF!Dd>ELbZ-3`5a$U6yH<0Ig0i>Y9ejAEzkHTDdO;Z*^AR^Cgu<#x zA6D5RW}o2Z88{#{U?nK3?RjvNZjv)^*#3oqT7o@DmWX~qybRG4%sRu^rWu8Rg*B=Nq)*8r$S20il%(RoVaf0 zICu`pHeK$^J~eEJ1xcfk#Y)-bKF_1MA&3akHL6Gv#xY$KST`pZPAaa92>0}S@hD>$ z`=z!`@*UIxFm<3M8J&Y|*>&(x0RdBa5c{x6eD<mcO4_wKq{Mnn}z~9%3G=2d{fnxi6g8>P85n znIJbQVMhKmuV>B2FB!L@C4I|Wo%%NxGE%uqD6-D<4?0A$LlybfDEMf zX8aIS>CNSBXt=5iFfDEUW(r#`h-YM4LLgI(a$9TT$;HTKt?L0D%DG08L|NrflSnSnya@7BIqG1-doO znv8=DM`oUvOIa&v1c3*yXY)2XS`BVa4b1*_DQxo1sZqk1E&>&_mAYcPL%d`94jyiW zs^t|vfE2Oq(}YT9jcS9E=WUpGvwk0QN|J^nv+^^f`#>D$Q`O431CyurxP}^_bHNQh zkxN*zwGl!EFZ! 
zaKlzR{rg@N-+C!HF!6`}0X5+T zFzmCX?1A80Itxrli!_lQ#odMOt1WUe}KR zJTgfm>xEH-gEmt^{=@zZc3mb^5z~DHAAH@8Gh;KMd#{50DE(a^ri&_?FwHTlN8N?7 zrFY5vfwKceE`=!TYieYorFUc7lAP(F!moqmO$-_*--#W{%##O|hsh(vj>3(ZK5&CN zflD4Bvg5+m{%A4s6B5I<4zy=A(FxZRO$d2-}z0%zk_tmSZk$2EGK8kUtZ>NNqu+Mx=R!8l&#iXII~>+P!98zCJr1rwG77^XF-e_k>GQJRb({JaUx0 zh>CWG5obV7(Ynh4M#JS!bRYVtvtUwNE^vE3OoA_H8-G8` z|A3A&+u3;yM!h;d)f%*@O^$I!%ay-(>qlJWVr6jfyNzKqo-oDQD7!Mq*rQG{jiA3l zitqouet`)kTYti}v(EUU6L!IOxr6zF6T4$f(3M1puUxAIpFIx`RF>}N`wV*Sc1?}} zJJ#yGL}EyjEzY5liKj8)yM=S!c6;KELD_h?2VNmUYr>+M|f0h+$?q&FL z9EpsUk*MQr;RWsk^aH#xF%J|2bt{(c`>cTBs(6wwFJg6_YYpK|u(+wIln=}qHZg+X z&JJVLkoeC{E17Y8yC{3oBs)pi(hGF<|BR^s@-88Cc@WiC7?{v69d9^-%j=!4jY8|| zM&bj`n1R!bZZtou9_a`^#m-&B>pcv|$h6z50tc3GF+7Nqp}(INcjYkz-(AS_{9ZLE zSO&{XFFE{}hCDo09+%uFPv0b0mBA8JlTjZ-dA>t`Afd4McN3SxutUeec}#6a$Ygo|ET~ybI#z5b zFaYV^TmBmiz7vk0yzL_rv)0QK^DPS&0ajIlgUXMb*W&I{QvX$nUDN*oHL3mq5sEM* zY2)OW=UL6N6wc>u*INGLLI-Abz|EJ$-LZm^HW_s6{U}|l&a%D#H`}mR9~`KbCXuJp zmXI({%}r%z$PU5jUQ5gTq;lh^LvmSLx4d;-CNCu3!*AFmx~;>^{)s*9Sa8dY7rL!x ztzt=b;WMdCI_JZrO3Pxa1SGt!&ZA8E`(f05WABn_8OVt3mP%AN8`vy~8$C4SBVI~b zF*IbAwJst)7wUFxFuH}bsxuvNz07H=MvoycMQQN6#bLS$Z+BB!L0oop zkniMcKW)8C4trZS^v#7l2J3Fa1@VDAQ~n&E*-05rLw5>V-D%*~>sNy&?Tzu>NP`^d z#Zr~Kk?SeDMQ@W!nJ1!rvxuphYnszpv2(xQQ#nzap_xK%QMZl(SzZ?RxKnO_eyEhk zuW!5-{uNgAV!X8dG>4oMYjK}Vyp^ioJ?X4 zWn`XBIcmSEu-UAcD^u^4tBKVoqKar7Rn6%2q!7grDyI9Q#&udWDbaN0OmUxM{4d9udUz{Eik(K(0%tuGl zk>1&dj?4!?WSrA&|63ye{a%4tY+mQ3xqvAj#rou@@HemA#_v@moGH}Ci*GOvL@KP3 z-@LWIqx%HVMY3c6p^Bv8;HW{l#KOY48^D+i&Ht=L-+AEW+^&E*)cEq%&g(vB_(#g! zl=>9DLapI*CL9;uGv{PUq5I-UXvgbcz$i2Z`xJDaVNDjf9t5U%vfAGpC4=ga+a-IM z-Z2y}W>C!D6Lk>_==tw<=*}ppN6{Tqc>1=k$zhr7e4B^h6o@=|q_EwxUrE_nR7Cz8 z_%fMH@gDC44f(B5473#0XfJBF41lVIl(ErQD){K8F z{x|mv`opWfbC01Wo_eE0$)-2ek8|w)LD4^>>g=3G0Yj%>fro=6aoI>i_?{~2ca6IB z4iF!2owT&X_MhL|4&^Jz3mA{5W&BVVWqR$djo(XAs1KTpin(lB4)o z4!$kFWkju7R5Q~GnL^Z+W3Ky>a(%E+Wk2Gp++H~kNGMnuPM`lY`IH(r-d8)7ViGXF zD`XKb=|OB$uSSudvPT0cf&2lrA*l0Ig%qy6>wfi3n15Rx;q(UKcQ`%VXRHNz^#S9pltc8N;-Y(G+d7MfSs34rw>+D&t`uNipinXa^)PvixdUh zNez2M8J(bc7Y<2sLJAUtNmIDi`L67h?tK|Qu#OwPpl%1DzFjtQKzfLoN_%cdzo-@e z!n9#k9Yh3c$fEknM+~<>GLBGkrB0f0S5uikzd{L+hSG8QrF|M)3U+APcAOH?@A;DM z7eSI1h@V3i@Epr?HwLKTOJ>8LZ*4xy)wOv}Y>M9vK0}E@w7q%}_3Dunml2k8jFRzm zUtFonv-G=^yL_DwOGc!^_bmH&<4#N<2y?#s`7b|ra9Tz|X7^%JXo}cllJKg7gSUsF z+f_b_|KrF%ddpZ9%6wZ}4miHd6NA0O4ix<35hIJM^L^OeA1CEgI|uV;P*K*fr^?Yx z=sPUhDEB;mnj?DuC0~5I^g})tv=#I}vC4kh2P&cnNA4R{dw9iPWybU!r~D) z9tCTxV6HktnV&GyQ1V~}a+%#qqS9ZBw+g~8%JZA10Zf#XG%8?0yaT3-cQm!0cZXw4 zfg{?n+tWj;Ct+jRtGWmOA6H-f*Hr(wJs2gefYbm51QAf0jT$8(3P?BT=7-Y9eAdYv)PI>CiF{e7+_ifzRaa2+shHg1G*);-$AP`#omNpU{o)rR z+b#y+W~4B?)x$(>9#^i_j8wRR(zp*hTP}@2l4B1N)Pugx%;FE11VRnbH_`?RwK>YN^qa>f;1$|34th};G=r?Eyu+tM( zk2}v=$<_w5s6YGGEu$UDgsfe^A!o;@ai*kT))R5jdS+ib_Rz0YLYs}TU1BQs?$lcg6`QuGN8hUY zqgzWjDoG1Md9<$lLYDA#1IBgDFKt?8ob@f+#71nZ`s)x*??$_r#qq=aL1)xGTLUht zfv_jttIpVmE}xhqu9FjIc$p9D=VqfGNkFM#bi|-;tdzY-6kp7YyQ87tca75s(>6q` z>6HY8`-p0HYuv|+W+P>MOgGvpXtM3>HGg(*%v9Fg=Z0FEC-Xx*Kk66lLV3+W$S5@05GJ zHf`2JiTztlxQ*S)gEI!eGOjqdp-4mggEdrCcxaI|?Jh-S{e?dSV>aza1FJTi^^%T! 
z)%qU*l5IbdyNpTl!aa%(rjuxQ6y2KiJ@zZBAs~*@|(Ujrr@n0wRLgf z!g0GduY`8J{z4yuGvEpKz2&%`s=>pH=eD* z^kgS)cA`Rdq^(r7v~d_G|Bat_k#u6>K{sf-qg#~<#Y*)cYEncnt1Tj72IeEBp7LnI zdO0DHwukXkJP2%rY(ZLSeVICBxmgK659=_>T$i^qP<&;gaQ^z(FlO=7-)cJY76H8` zYDZib!!>p)8=$U5h&!Xxa2ZIc+`E(Zap^Y*kO4MjKK_9`q`&?TEmj_^y-gkIRV$LsF9<0Y57&7VtBH zGx{jPpGdHKv<#%f-W%5b{c~iUF8Q(grn?)c6jPri=lS}$DBGGJ_fv*f5cBqX~h zio?+}ieml@2l;US2Vof$yUR+X+VlNcVbeuLtNRJAgL0xcR>;)h-S1!0)I)DRuZO=* z&ke!L7Pd6wi^^;rYs%n>VJu;T`8w6nKU|^vsf*6GA^A~nVsd{hG+P%tr7q}M_Ffqkc zT+#LDsCJR7D`hNe5_Kbzik{V|uIR32JUB}i7&;^dP&A99{n;Bc(^mcLfFNp^-E9-g!_fEPyB8l}HVr7-i`FbcTY*IOx;&>j z&y9qs;IFYjtxGI#xEr|aVkNo51cniR4?6pMz2(MP55$g6Ttdq2jFJ1Bt6%*1 z*n8zL1i8&f08`-MYlYj=%F>C8AcdU7@oti%mGtwkPr;+nq%Ydd4U_3}Tu~yokYTJx zQ;&NAS@{5ib*zSkPgIV!dhGhaisJ^+UfM=N;(7J7@cYo>%zjf)N{D z#y$72WhPCI`ZYvGAJg?|sZbnuoT>Q{!L#IP=~rLU(zB_-Q{Wq`cR72eT5Q48QVb%3 zj*8IBBcyLaoYcdI)B!|6rmeHcJFBcMkY8In&#N)jJy@_kzqU}JMO?wnJhEFVpM&~c z?mkeHHP*PBkr36ZMo26QC%z~v9(y+b< z5evsUiq+HkmE31pI$~%frGkQM=T=2!C$FCrW5Vq5la`*YB85CJP5l|2-m78ZfFVdA zZpzo8nJA2kPtJcQ1~UywTNMdM2ebm9T9qR7^Q_kTq;fEKTy;r3F8%b7-sz3Sf@ol( zXh)oUqvLE%SQF}9=Hu0}W>L?mT*ywjV@HHHjIf_Zx332(oREW?(Y+e(lAWTDGLNne z$bb69;%<~@HBXht2j&FxUt?21kgQkHV8m+V_Jfa@Vc|dO=+h(;d*q}sY>{hDM;>eW z?rz#s&0eW@cqh!aqLvSREp!2>#I5=w0gy}C|0t%;ItF2HK3hzjy1p)mr1)OtuU!YJ zKtyVxr@S0ONB6&oZ1J5wudAx`xSp&ISbrVB^gTD;N3OfDoQ=2F+YiBC_VJPaf=+dq zo5d3lJ);br(=fn1N6u*+ly&Rm4mu{;op#exV;1qfQ(;oZY6wQ!Do4*4Xyg4GmJ&Wi)xk`KgS5 zzV5WIS_!L&QA@{kkfL1XwFDZxjs=_+J}OQ9G-hT?c+_Rb<|479BJ^*#OJ8IOyK%AB z;w?FgLW@_%Y$7#z#aL!p@>g}YINqAxul(jD6#lH2{PB<@=(^xVr!$8{zHVREjJV^M z^+Nq7DVw>0488JGc#!nx$4aU~Uevo&c9N@^F%s<#yJ>zAPQ9#p#$h@FF;0F3(xF%m zfky+3=>@n3pdPql{#XgLt@d8Q=W3xcGIt!TgPX|g&yPCd;{5~!jRtsFn>dO=i1APCVo9WGY{;P!Jiy5d^ zg+Y5<-3HtMo1!|Nf9H~i%k5PC-*Xxwa=+IoRyQh_? zT2!p6(=gXgzbiYq7L~daj{ik3s`_Q3U|4?l^G4qdY5DO;<$%Q&tdVl~$Dbmi5TW4r zx0cr<^Di(UJbUd%45fZL>X~`^u=*7H&C|8rc>&na13yLfxM{7e(>lT9c`*X1k4)+u z;+*3#$oI&FAH^1rHxW-F6+3*n#%Iv#O);Enx&pPKKa6K5w0?w^x3&^BOqPi_m!jmm zC}B-Sj_Xi;2tO+1j?yEOL*@O4dg8cx4t*E_$d0&Z95&r~wx6zv9&iE<;+xPjl`rX? 
zei4HNzyYPn(afZwXQFnlT~u9+PoRMZ3_JUiV#>lqh&MjJSeMxL$Wn=6L>6WGMIb8X zYFR5-`!T$ZueXi$6jz8i_o`q0T6KjEl_;2GCnYX8GIP>WBN%wF`vmhul^7!KUBlJ0G>2pFdLX=5 z6#;U?h#7b8@}rqZwSb(&G_KeYn$J895}nvdUZYauWLb`NifbJLU~>A}R6*BK!#WQN zo;bNn{vFq;Rs$ocy_7)RGMM;YHY8CyC` zNpu#d6jtulFm@k7{rkgm22R#uqd0i-3irfU^YFx`ivmSljTdYIbt-HFkikCKI$VTn zIR6`Xq7Qy^d!u{pcs=d9t9W8VDATWflBA3FI(l-+g1{Da$+uqwH9Bz?>2^$4zpsz) zDhmOk;2f1!N>m1|R4*O`n=A-Hv4}-6Rtkqc-LW7;tke|Sv7|eR{gAWSm?6PPeGsrG z`sJ}q|4f2R%!ZI_zL%X)`B!(8%?DdJhIA%!BQ#s|nN-un39Y7Dx*a2!6Ov_&<>@{D zB{GjD)l6S#m)d}NjHo&qP|eI_%_H&RdYK=xX3aR&g+~(es=omA8u^t+s!qvBE|&PD zhtjVSc96slTRMU+Kw~~Tft~qpP&)w2o;MMwj$6N{bloQGvL_eB@YlN=nYdm)i|wQddKu5-)l2}+y%t85Jn2Rp~=MAYI{(ulb$!Dde+G7HuceyKPRF2 zx0ds=>%X$9L6@VV02&HH@AQ0uN*Klp6Q8$UV}tn8QP@h7GG5A9fx<(yA*@k$-bX2t zjBW7|f$P+*K+)(;0myXoDsmkiesWCS#dUw@h-;{T;o~u|%hBpOQjeyMg0pQpU03?u zsQFWILBfgsNlHC^;3Xg#_a}`WKp2%uto`~V6U)fs!X1~WXHc|d@D)h5I=ltSu&@)e z6)GnK*(Eu4hA_t#-Eg$JYhaY|D}X(V~=e9za|g6!j$yfm=4= zZ{pz*`$;k8Su z+1#sOeN+zx-O8R^vxjcHUT-M{G5Vg$;Kl|f?hC=)L%cD9}w5df^+f=*BlDMr}6xE&sY zcc9@Z99`eLMyetF;Zp&!Lyqojqk7BN;(2$(eww4*e)2X}|p0j{nX3#FF4 zea;K9CLCNvB9_nhW0;++c(*6=)FN?pJ>)RwGLpF0ATh|rwF#c}(=Y&#xj_52|5{%V zZ-u_=Ene_8a;4!AnCHZ;v%tf^;lxc%TapSXn)-gyB6>R6l0!d%!^3!8L>bL3Pi~x^ z-!Dyd9fGnPOLvwfVfU(hR(bv@nRnZPMBOksZuTAM_|`=+KRQhn2B&Q+NGi9Hc^>nL zS*}~T9$$h9JRUbWTItv@qA0L!KNb<$n;n2USB;3^2rX8WgYE2$k2v3#K?i@9cj421 zdVk@OhFi?L=z0hllGqW4h*cp&(gZp-Pt!83LpP;0t3Gnw1?8kG*Pvh81JO8qpNLX@ zRy>v@m49Fw>MgaRYuxbFJa%EIFZJgdyIgOlwmdjU1VWt65Lxu}?mBOL9G8gp=T|ub zb#8egQRa38FH>b6YB*-ocR-wnmStxWEeT=;p4(hEeUaj` zwr)ikc&_Ajl)NI)`y7{Mj5tAGsJAiS*(#j;7K@Ot2{&P^#gQZyNb2*G&`U17LTk`d z239C(T)|KMVrVn(>3wH-3y|$4ib@b~1l-!O$@4Fs>RYU0z|-lokVCR(Y_{ZL zH^6~3B-y#l?pMt|XAL@`mCsHZWX|R?o^gs4+I`#MN^nxVqn6Dko7A$|`~7tx4gdai z*dK_uA@4UVc#e;r%_2yvio;(-`0w@L21GV& zDFPb`?o;x%oX(qjc-DQ)$%ye)j3{&`?fax}&c1jV4&zIrJe2`ckfq!A>YV;RZGxG% z9nvnnCWI>!g9plS&CeKu^uNZ9ANshdC0t7tiP!OX*x7HD!yynt!jq-D`ouNM2ujJ( zjYpNyzd<&|np@D0*B!MZti1yn@vuRGcT%2YJIWbf1tR<8CYV38#yncDMA?y#;iad6 z0J{@sJ$?8cD7XOP$gL&s)ip8R!XKHn;-&C7r!IC(PQl};G*O0t$6b(QFAjyE&#=cs z`BsVqeWEvXNS{&+Ses87_ z7rpWM0^^BM98-#XWw79?2o8pd{EGEW$2L>kO?Z73{HZp`TNbP0IU$RccHP$psw^~O z^*fVI&bj+#vX@knnr;8y#NtneQk5&f1g)(<;iH_Luk8Lz9!w_yRK<@Az=inrvwN8m zhxHre@*hlJtD^=XB4N81!)EvJ${9t7ad3g3 z7u}+vT_AQuGL;(?jTL1`a(9b;(}=DK-?ohK3pbZ zov)h6nRc9b=H{xQLE44X4vJ;;VIkaATPDC)#TQ$GUcz0ipf7VMk|IlV{MYq78 z8at4*FFP67UFg?fu4?89!!(OWrwMuPPu zieRf=#KfP@{fM?K*QoXaV6~vXLBL*S`=4IHwMxrqg%#R97;9a1u<)nF!DzDjL~A1i zFyX4}s1I@Ur2Ei5ug@>)C~akqh@BZ3?zuZx>Amd;Wq6i(#^)tS&U`b9>g;1lZNKmK z;+ur46e-@-PQYbB@yh%_VNI`l-8_^Ea9EIklH2ztDMjVYqagjR@ZJS~PoT|UJSWBi z`Jt?sjD$`0ldY;H*&Nw`Q{V7RT6&bggtwqijtAwajTL*4OXTylHE=ue;zaIdGr+Bv z+X(#!a*U@MpF@?P@rFrn>uvyp!5MC%B%oli~2ue3w!4s?^M9#B>Y9ydvP&aN>`Hn zi%o10#$j$ko9c+tgVp>Af1F3U7_^V%5v^irx7B#IKfy6QT{7-ljlzQrLk=}{qm5gvCKxH!hzkOxkqP^OR@%Gw$$tFshm{!qHw#d`z+V%t8@ z4Syz6VaYw%jQfK(9%zRAb#Xjhp}wF@K=u@J~{4Eo`cE{9EA+hpJuj9MELHlE>llMNHM z-1r@JBG^(MhkTuux#vimKBeM*8JWGMx{JR?$1u1?{h6TqV%*<-Q=YzlOJxI+OOW`n z5f3dJ1G&e8m*sv3bba#4>#doH;>_R#7$p7`j^Gw<+v zXM6(8rpstiC<8ilDV`AZlF;LBqir~vYA0rx9-Rq0D^P5`Hz&k(uf3W3osY!rnW9(5 z3-Z$MtI51J7=47?TS3Ymz~PpmL;f@P(@gdLaF9A|T~QXiShj5V7R$i)ka)vvDPWO) zdVgpvN(U4Xa^bMP3kn=`bVACbUV18H2DO6}>k)OSEG`$*r097@ic$Po#_b;ji$9 zn}sy_{SR*?PL?#jmG9YKbri0MzHr0jq>6dh?qHAmg)GW zNgJ~f6A>~KlDKxfN|EXy#Fl4Y$IQ1sSyqDg<1U9h%W>=ASm6GS6d6s00<*)Eq3BwIqCT%iAQBy; zuW*wpS}hE^QvYW67#8=XUe_)8%O!KhxZp1W0ytF z)7Qv$^%veQ+xsZdPY=9^jR%B3nJFz@(#10~;9XU2;2<)8@a>?rKO>xdgBf17@qNP z=vne8*;v?dc)bbAF=6((*-aF=)uKmMr=GnN*4q3=D==W`0-X5;%ARh}8@82XkbLxWbo9gfJIJK``j6RALBp#!>yGE4m| 
z$M`I?(Agu`nt7sK*__`-LU>O@)C#|&VN^XJM8O(y!c^{F+OG6|$u$FZUi6}!Ykj@N zS(l2;;j#wp?MsO9+P$fvv6Sz23w+^8h;L&y_lM*w5_BZ@>CtX)`=_8FWGi7nBy6K1 z#U?&I$33#{%_))P9_m4{AYp+2QmK3ErKZpb-PRGatlh&p;pkN?=d3*)S&Pi`MD5~; zpf*;!{$f$Dz>9J9AC%ynk>EOh-=O;Ozb?mk{#=4WEgM!`tR*8qRA-Z!=8EB@nwyE=-ifc@rDb7u z0b+N?VDH_prmCw?@btf8@gU^?HULEGx;))Z-9Fl`M976ovP6X&aSh@KgJaIsV!&w< zhvj<`r>Ki9DNZ`O_Ehx@9g8)Hj^t?6S@Y(t@ej`M?Itiv_o*1W1-D$gNliigT0lh6 zA!0VO^rJT`f74j{L2m!Rmzi!cH!LG=Veu0&e@rWF{$7*51`Sb3)`{wFn^ou7u2DJa z#Z4=z%YJ^*SP!Zj}ICe^>WK38NI|E4#!8($pbwR6F&nrJk?cvMxSojXg6CL~qh&6KGa`?5A+%U2Z=)4csy~s%W7F6j4 z#H%J6jjExhC!)B^SjEju?g^15bukX%$8B9~;}eX!^g+LHxZ(aMgin_u$WxoRu2)&h z;V)MG?&i`BVVk#dh@%5Fo!)B6YhaPZ45P{Qa@=U>D;#_7y*lzh;48W_OnjD!89t`n zF+X{Rt*hOw_1tAmW*uroqmC6N)l*Q=^kch)_a#{^qv79z5@f`|j?Cz{$*-J+wZ5v( zf!08?);JOd_b;mG+R@wbKO6@M6crFuBk0b-na3-IOq49WG<5@Jq}aNG8mrshYlVe% z0DZLE_Zt-YA2T%)1np@>uQPaUA-`P$HYkNyM=D1yFC$-Y#zZoHR%lv98EXCz=m~}td179hFsS`ebbG!IM z*9;VW!!=s~*^;E;`gF8o|L#(d zu*{t;lt`L03|c(LIo@pgx-oX5s|ydak9w-VBHn_yRUY~vZEbDo3cpvyJSEl)nkR4f zW?a`hy5DaiJ*y%bMtzkXDMWay{lZKjulUIes^mhGb%kXOB_mHvIHRoPT0ujy);kGi zF{8uNdhZ@uLW&{B{95i=AoyyU@r^}Df5-ih9VhBx>r<@7^jw;w@&c1fCF~BMg%9*5@(Cta`%hc7a33X*7%}lc^4h78?iAZ78>e z$zUkI$EJpuMWz$Wikf4LVtNk`&K>Km5k`%_bJ*Q^iKPW5D%NERZo8ek>_JQHg+ zu`DJ;jl@4?kG47~F=%DRB#wx}wOFqjm{*6SlEJ@#2rtPTKyU zVF&U3wHJjn<lJd(m(J{A$LlEh`v}TjYS{d3_}-13kvcwXYz|1s7BtFEFUKhNJAU?RpPd7^i2)QPac}&|1ku7Fu3xY|I?6!{mZsHP+;oJemjV^*{iOF8Z7!{+wqZRq9u#BshqJ=@_xwpV}tV?3Vvl~La9tbX7BrUIxt z-~W$RWcx*r5=hi+6L;Yxz!=1!O=4|%SjdNvN7_)g$4sqxo)z*74+VS4Wc$8fN(O_| z5F024y_8X5stHf3g$J>Bnjf79Z`g}85(y}0A_+Vs`Kd3PtxOI+p_b|1JggbsO&7nQ zBTsX+i@)_Np?KB{9Lv2ao#{34zBGrDAy0Kq)>dN`mKgG>u8Gw7-79s^w1;-i@W*D@ z4~ERi=wuw*f9<|j6c9%Sxlc7Wu_*Nlu0~0z5}5-?CXe5*4*2IGEdmyXl8Yu0gTBt) zXS>>}vwxh|7E!lP{yDlo@x)bvZ`%MzcgkNGjfCR*pIxu-{{2bb5SF@tjYEg`YMc!r zpvT}su^-h8PtjO4dS9HzyeXD_ROCx)qDUnlgpz%o7ziP0sjB~pt4zYYI+g@mM2H!@ zIuquECXDVaLNTo>6@OxP#JX*1spxnwZn$i~JEQ*5P8X3rUCReKCtHKK&Fr<$q=Kzf ziw?vJS;C5DxA9vVn`AA#vmU_-*F8Gk3$Gt(jCow0xc#^oO%A)G$#|KBF4o}?$UU!< zEpZSmjAMHW+P&IRruZobvzw}1FU{Uf(9U2U?^X@eeilb4V07qi?a82|g9Cb~+zKJ9 z_Q!kRi5M%myi^8j1skTCuDD~xjg^X$OxA!NAOOC&-H4t?R$|V_95mbIhB5TV-g`_3 zH+%uIz?8>9i9BOvg+y{s&M9L(>vz{+N?T*FZRm1Jk)q+5r7ZO{I=b*u!IX*#=N87L zZ&C6Dy{+PA!uHiW>cYEp5GNf})O`bY<52YU5zg2W=aaW<3Jk1%$Ccv{fON?4h2wWw zkAl~$>*Bw>vi|uiswwirP8tbXFM?K^;?c;@#HE(O;$A5QtnfzJ!ZvCVx1I9|?*ir+ zx~>c>*2I|Kpg!Ip{k40zKqWRY6eU*es271xsoW9Tvmrf8Iec;oU846qu%edfQhn~R z7OMa)nGau(5YtP(7)Q`b98(XDZ3<~nXAdAQZS4owskLeBVnCQo|TbCX`cWceN$v9(YIQ@`!#KP!B_-&DrqqVC>ERC<@=f;j^R}4nZzYS5|OwkCz;r?VB5VH5&1E5aT)oD8;fvaMY4d~ zQ+r}ZdlEn^_&^R?7pG9QUG|p)K>~MYvvR;M5y3rTX=sM1qfG3zhh1~ajy$+b`4=ad zpmj}{N^>y@eKp9tQefn)Hf=Q!nZ}^BC5Q7tg-7U^R>`&=w%_#_L$JJiNYqE@))^$Q z2Es zrDw(pBUh>r0B=lexJ~00j)+r=$*vdb>cV%Q(%L85t{-&j!mY8RG@r6zr$-kp>R8rN z`bvjzq=;U~&UnhqAr`3_5=>O?YvP#?J@nF=+3S;P5w~@orngGtHgR^*zU!#OOqVTc zKQK#?qjj0$&^t+)6Bp}>(77B?0EkNDoxO|_c&1T(dPP3QPzSH{_~54+Uk4^BnqGBnRm7}GdhA~z z&9eime~EjRD1}Ji%G}NMyfaKli%S+iYqs&d-^=?VLTl=PbeT@tj0la zw~j;3#@KnJh8nk5WJ&^dY%1a_83Y*OA)!BG9fC1h#H}>T6T7~t#!z)h?yFv+uPyj9 z?_NtcS?ze)VBUG}sLvp_A=Mp80o8L|ZvfwwohkjD{>5&`*0z+Y;leu+LdKO0a|6w3H8@koD+ znR|e=%h#Fq13FxT%;02z@u$1VP4%QFUy@BjPOu?=JHxUs)zKIswv%G7hESofR~}{% z^m1ub@acEZ$D&oHQa_=@P+>o!`7bE)d6C*4`qjvWVulX$laP`A?V0r$6;l4{DHNzq}1!{_cm<0!*bRVb+EG49;F|oFWZQT91M5RHNgj5R9qg z>wy)kEz^RgYi04I;2uQlbro_BzhKn*#}A=hnqS_qBb%i7lIB?hu zdgD?0LH;?jQ?H_9MbEV5Jxfo&@S4=7cN-gcTySd|B` zBsGhJRB{5+zie>zTzaZWt;WckzxvE)))AhBUmQ91Ja{X1n*6n9KEv}1dL(R?;jbZR 
zvORxxep<9Tdd4r?6`x_W?GS5>6nIq^%SM?72nPj5TEOBmJH#Q$xFjJY$)nM~|5)hV%HK*9?s6O8aUXZ^W{3im@d(?n$0qL`DBr`q4W)Kmx+nI8D$s6>|(R*HDlV)Cgs;q~EilW3*I*p%$BFa0;l z1%~pk-3yE}sOEKHrNdIsFjYCAIoK~NowB2jfF_~3fn{ZJn4RI1>`0|pa7D-@wLT=x zrsgK<=u(|0rqMK8psca*iL7U^%G_}}<6CRfpxf;^9WJcP7nw_L{3L&!)xm~Mf1_ti zwQBft6^~?uK~RR#I+0lMdZIbo!8@x|sMP_GsbxnhUhDGnA1y;j+5NbT!OwY%K+9I$ zvs=EFKW-wK5IC1DI*piQEDboMv^)64U0R1!57A;bN`{7>Slsw*TJ85)9~e0yFUUcB zK@Mlyy8d`uL;1eA=fjwZtX?<-OqDs#3gN%b5?t`cn#(;PD;b?|n&l_`$*n3W6TVr(RxFrwp z-0bG3NcIA5eb@rP9jhBA-7c$7WIVRvtp7#i2h6g?h`Dp z$b^LSPDj4)G73+PMei^?HZ;u2+moNlG_iM-!hv^2-r0?8&T$>^$wKs|JtTBg#IJ*x zmDSl9#A7hYG=2(AWbdY0LIg$`Ziy2sgFt>HAVdNcox|*wIFYOR9A;h5Aa1_FV~kYs ztH#uT@JkXs2zYK7^Oyf!8L6AJWee^;EikL=span~^3AaCS1 zAD==HU1R+dMHn_)>lzZ-W<)Q=b$4-#ghYF%>( zXk)qr)Xo<+D4lB5_0_9E?rR6%1d{W=1|Rs@lZ=%Q>s%G{k4|$+YWy1vbZ;AE;0dIf zjzCc3?e5Qle`P})Rc*w-n1{0ok%rW*tKol0qY@Y~v9VJ#_xNsHdld?+;@jlYiuA}k zU5?Y>%vFX6HkJrBVWefdJ8Vo`jkGO<{6ijE#TKZP*HCsH8y>A+TJmCRXd;xZHo?;h{B(nY~v zBr$6;G1%m74>&mjJHN9J2=74ODw9gkcW5|oH2lFiEtI$b&I$O_GxY@M6l_{Zu4U}e z>v?Q4&AMQev%9vM=f|Ob)TS1AbVCibbW9+Wyj=k#t0j+`F2;nIWj028^+|CX8AQY z<8?=S_Xh~-0Y@ixx>x2ZvJP4E1q|$Z&l?XNaW`J6Nk^##o;iq9L5i(n_COSkQ;8OA zV)EdtZ+<6*v*OSe%EecO?ky^@q(47a$VdhT;bl7Qrgv2+yq?OoDteXIX@v#4chJ=T z={Lx0{o0)(e6@d1kA7o@;={_T{kWh=bZ-X3e%GCA$5%GEx#0`>-?nwWZhlqI?JHGF zj=x-`w)R^2JTw16HEg{4%FB;NPxmv^jHN~yhmj+m!ys;{qio)D`|&W zrquX-+~Gamov7a+b>2dr4)JJIE%(;BTCMiPxqhuqWh|FczM>VDRUoZXbPEm$%## zWba*{EsVNmnqXo|eJ?6jom<5Szgx4SO>)ij)%DCInDU(@E&~0cSp1}wg0G{-dR@fE zbb|6X$arv->_uqbwZaCs3ioynO>&D*Buy4y7cjSbd?j%s!7z$sIClWDXj`TPH(%?{ zb1$mO6vRh*I22FM-~vzK0bHnHpq$a?asEMz=0%mB*Ue$e;a^jYq~Y!u`;(h+0Y8t& z)W9in&Heh-)##synr%`=w%hMaAucthTBDqw6XWS~RQLTvPwyc-z%FUP0L<2hm;_RKhDiw<2- z+9$vYRn%8#OHad2wU9$Nk_6d_n&0Dl9lz`M!InV_*EZ?=QCgr zLs%VugYFBjintDXDL4d zP7EK~>I4Z}(ey#)Eq758twEHo#z;KC%+C zDL8Bx>#5m~nTpm8hBm9qF-)Y+|Qu zkkr#@U+0tSP0Luw_#{U+1x`C4VJH!(rWkZwvXo%Czx+t~R+W$QEJS3)XJFd3sA zyJEsYLZ5pzakncLH1PP@?l4jY+I<~l!W{nS4N{dOe7iweUCA&WErKq~V-L-Z8x9V` z1P0zKcrb%%zu4oa5kihdFK&3Xdm8aOa9DzO+%WZsvf5fjFVGDF)f-(mDz`063wF9Br<<``79 zKa3ioOJ}$T-9zx(uOR)L&*gNy!Gj*?lN^ksm!wOiuq7kUTTxkX6NrJWYodql1IR}S zeVzhBfI@0y71coU)|cqI(PVsuwR}nPdtA`lc+mW`ek<=&d)FxIqg~YaC;>63SDpai zFs32)K8-2_c-uTUI9$ZTinwFV7XJ2Ab9Z0MEIDKx^>hN%o?aqcU=&8hDuOYY{{r+y z#P1ip<|28wIv$;evspD(xOQX7@Q2DK-uB!3>tZ(1Rd`hYbvy>>?GE!Np6>49mIY2! zWc<=RP%i%P?d0BnIs}725Q6dE*5jdh7T34+VvA*p?1~l;4VWSImTRq$%lpIyQ|9yl zQ^&U8VfP_L^ulKq{=CR1iWul$LptwmLwSf@mz*wsa5weoy#JtdFgFqtVjC>KM@|a5 zbm5cGk9)l-*lxAnu@k9|Yxm}sfBY0cJ7Zg#EU(Nol;zc%X0nGrPD=nCaL-u}B6T5& zSvJ=E10I5PiFMLlxE%XPa?e;4g~Ov`0dM|7C6kh^#P1$-)phRi@aGo^9}VBOrn#ag zQgO{FP`MU+<&cDC|41tkbCuUX@i-KUI_OR?nCWd6Qs zkgEyev@BpJZe%vFAXj_m^e@GeAOa^GY8K`Ne>q@^F4T)M$P9dEZ_n=4ePo^V+V&c5 z9cwW%z+ASWN8!ZLFM%m2sx<7RJ9*BsN#mrg^*kXU;)emBWsgRKi! z?s&c!5u0Uyf@*lSj5d~V^!EZ)%Q;scU0zRUe!HOZ&>y&7<39h z<4+^o_>&2n(`#}0x00Eh)Ptbd$Y2)Fzmy<~1vJf~I4bC^8k*rxw<95QYW6>qtN&FA zrDSuxRoJc&LE$;3{96lJEQg*Q6jSJE|C*|KjI#|UHa99)y9@MSw-yE658B#N8Ez`+ z<`>py^kdmJ3-Im4y47P8EuxOchzUrez#TS?efi9(?WgP07+yJ?5f7Wjtq;sJJE6Qb z$h18gmAN+!kY7=#gVNw1;U`aJ*$!W)QgzMU@|6;8I$m0Tex;z20ry?eVEDi=tij7? 
zr-E#-v9WRU{PpDySzm4V@v<^+G-y-1<)PjU(Oa!alcR4EbiJ4~JMwk$Ag`#TsNR~- zC%grb`>((K;3vVFwKyw|op1BwER50Oawg?no`+|rC2Pl$*vM?6pF zs}&6HeX9wTC-+yXT?c3AeE&M~vMDa4b-R~n7Rf^wzuJ7K)5ux;9lgIV3so}@g`<@jlPw$CHMMZch_s!$%Uu7AaR+=}1e>d@x42<~M$qvJV7@k2H z%rN$PjGdlkOA1$S2&HsRpF0De; zCvDwGch&=N;fiSD5fdsXuZ{`F~aYcg+t-o!P3<<-F zA3u$U1io=_W@4pB1nV6SF{k+&CK&UYF1HOA%WppFzFY+&(<-NL7(UMYlImX|3K%6@ zW7A@eRm6D1B=nw67nF8N2ixVVW0(=5?SledA^LS-4Se?rOvLzm)G!hQ;JH_-Vypij zS#KE^W!rXr58Wv!-Aak1bO}l~h?I0n4jt0cFmy-^Al)D>jl$3^APqBgcXK`mp4WBX z&-1Q z6a%zWk1L#x|FI>zQjm7mdLrHdlW7nMxL26qXXIAn53hYA&GKXP{Auy0ZBzCSM~%P> zt?(MNj@p^0!@YSw*=9>`;yG`uU8*5_8ESXm4LN&J#?CM$Ui}{PZ+Y}B|A}@CSG=tT zrc4I+c5)=nf>`Iw6$@EfCyK^W(VcWZ(#Nb7)lZa0mlj#YOVGsUqr?I#is`K zDYT1->ETsIy|wY`vj=r4i@rrfQwCB_Vxm3UH`^Nl_2(85Rkd)Y(H)7PZ@K;t^~AOGX*e;?nL zfCF^2X$gwn3+}^qwVh&CCq$XLLOO_BoGw{sdvFFFnC5H_8d6PSJcVZp~NgiBgIw~k4j7S z(zeS=%Ajzf?B272dPNdhe;81t)qrcgPnJfdrD*|oyfJqB_dH&E2(8Hs%D z|4R;|qwQ^E1U(K5XAKoqCQ{(o3GbC3X}^%%ZyilRqJ%hy59)C;=;$KHy-uFPvLdhg z%BX7cn}}YznA^Yi1MRp#{kscrYg}GxAAxq5h)PUuhd?4B+fZMhv%=YO(~qO7+B;${V$Gw1u^zOGFA4Xekt7D1$K`~b^Qzl6i zoJ&ArFs9+|gEa!~j>->j7xZ;0pD`bfHnO)mbruTkclLMf@GfXiT@Woi!#Jo!VlF}4 zoKDUBe)?jK(zz6IJav9?m=2PXH+7G;-())5e(6@FlR|gGj*X4 z)Mem=G-@}c6%u3Jx4Man>q-ZY?4f?AhSc;XaFC!v2tyspmO6AR)+EZ71D{Ipx@T8l z8K`}=g{tcRVkuU*O03<2cA(LR>ifdv(U(x$EIp;{rG*G#*99gCC-E!TMo^LZyMkTklqZWNNoU)X&2vVqQX*1YwuxAPl&IMa9=YS20iR zrGHX%$Nx~o+RtZfkwG%J-Sh9$fCSkzJ#$r-&<*kt85H9ii8 zT^2(ZwuA=Rrb-XKt|A^3+dEZC#i9I&MTpr+`8F_Ao}7w6?RRVDFCOo)6J z@>_JP;E6bg_~uA6Kx2OKr+U4RO=nMY>XuLfpIuPdMR5k8)4h^t&ixmw7^H6n;QA{J z$4_6#T$483#q^6FchQD_b2b3NU}Pr5T7YO42waM2u;Wc6Cqs_ny_jg~hY@i_9;M&8(Xv5PQgIcJQgw9#^LRL|K>nEFa-q z$QsKOL6ULffZ!&zt=|VT3?3cVVLhy895ydMJ1{yK^G*>M^(KcRq9q7uxqKA%~T&(rav%F+H;gIalGQVXo$0V)o%LzFzz(vo6zS8u!(-hQU<1^XUC2I8iD&uSsjpn_U164#{*Ui3PH zPicBmCn_0sQodf3Z&+mUZ9t`-X#YYNCJ- zd8sCycFv1nn4{sj&5qN;>9D(w<&T#1d6(`)VQ$co)@tk|-Hf?AeQkRR_0n0jh&n~9l#5?u>-jd^bY1DA*Q_lfH-PyC^U9h z4qPuUZG2ao`I^F0^aWp47HY;35=Mp8_go+%JZVSB39lfyo5I!JC zfPm5XhIifdSbTt6EV871h?hDnG;m80<7XjgP=Q|~{IrguF2QHAdFc`t1CQMCk*F^I z4^RB_nUvSr2P3#of=|uT=b}>?0lkW6Xy(xDfy{rKNJw$w^KIyeY8r3wL-$5);r;c^ zLwLE~{cnOJiThU;F_fB+UigUHf;y%3ep58E%l335qzJlLCxSulQGx#)Sx@GcciyK!|R`Iv)7f` zmnM&K_xYpS(od%WAi$1M+3BJWGaIftHt6}COfjft6xTOQj(Wdm3>7QO+E$xLZ`GZ= zVms|W_6u;9{88sAk5Q>b<#I;)Tix1)$26cQ( z5npcu-~JWO;f=Q4xShujWJf4JjN6Og>MNYeub%T!otg=~`tOj_&0ox>vPaAx=j=Hu z_G5Z8`ZYX~tLwKABX+*aLXgkRku`E?VbLCDM(U7lnNw$`*=5Op);len55OhHeY}r7 zGbhuCsH3t76+hZ!q$!H5{o?+%JU_5L$;#FVbmtum^%+BvE^A!ESQnGQyzwyuGSPkL z+)1Fk`=u`rUOgZ-9{DD7V(b9^2T!qDNst;(yBRXR6O3^(Ko-@CJO{mn)QYFZ4C>r! 
zeo3~w-Gf5-uwAW{FiXvYK|3h64MNs^gHsoH=?&iDVL^U|SwlLXIKy$*=3BEA7!5Qu z^*h2%2~4hW!I7ZP3h%$pzwC5}xhw<3$7gKClzwMlbT-XyA88FZOx(Hp>r!0=7&-6?mRf#v3) zMbmpwHG3%2Ep$1q$Ao2eTrR7y{v?5Q#Y;Cs@D#JZe@7(yXnWXnOo%)+#*JnhtxONr zS{wn9KH+j!B?n!u9N8uSrOE@=CB8QJ>ulsQYr+jqLU5+j(JB?ea?850?(RAoi&9>t zn&WzTTAziLBMGBV8_D7y-!QPkF;q@;zK7D2&tzIu%b$@k`K)LZa4w;|RsX)Y^wGfU zdyhTNUPEkc9QX>WL%7{7ao`CcjH zI1|C^?ItK@6R2^}&5yz$DK8{$3%Ge ziN`5VMYT69&~m>+s$h*5G+KIoR35O9hyQHkr36UzN_+AO|3h=$iVf(Q`em|u*Ts7$ z|1&n`8Xgu;sN`q$iezUszL)blh*d3~%_D|NT-8YN77F6YN1hQHxFLXmd;@ zuwUS;3Rj2&o3+k2uRgw8gnrY;C4jt5Tkv+e<}qgeCG`M!^kV}Chi_1iBAdQU zt7{DRU5xHeaCa97je+~_Pk(%4;ACJX0gUeeO`lyS{Z##fjR8&psP*gUL0;#Uv;XL) zUI`N50PA7e5&WOcOnOK(!=(ydR_|)?=ZE+~NC4IG-xnl1s=5S>!jDY7r>2t?eimE< zWUsiGO`t$T(a-iRgfi;YSEQFn>2Dcvp*~RVPAUY-7K^0CuJ?-!n&4u&3w5z>U8Sx+ zU(>jx-XJo~f_o?~*`)_t=|+HJa{5{UBt7YVV95b;=|qvx+s}u1huKql8T;)h5n?R% z!&a|mbo=C=1FdLG0RiebgtB5hyy+TOvef2aaYFlH2ZCc|GKr|*d-ljenBKz3jJB=v zrw_>^7he&YLFj+P!B^4o=jDebzKY7(d;3I3n2?h-#xu|H0N9Fk;r)ie1@k1|pfC2u zh+&W$J(CP?4F{i8#&HTesU$(g%-9&qx?b+`FMI{GRN*)zTSp2HIURhZ&n^tQq~v zOJzA6d5${vVG0SCfD(ltIeH)YcOBDKiTg<#n+pR3;%&TVSUxTLYq{ZM{|DspFLp=q zM>>&9p>a&%YEAuj|Ed0^N}osE&eJFV#qE%Oe!qtGnRm6n{_R_@-{V9Ep95pb8H1dV z&Ndp0Y;R#WA7lB(&vnmnY-6MjHdtsbj=!2=qp!&tdvDTCH$bBnd2O3zb)Y)aM%mnC znW^t~dqIMusL(o1pTO+3@Gk<|)r45ePw_z`-OF4zG;c14iHz0K)qaqsbv`e|88w-S ztwhs&Q!}CCn=)pb%KKF*RnJE%1AC9-1md!XFScV-@dYu`>npW3x(b*R>scstjP{Ns zho6w1J~6Wr&#{N8PtJvTXs=lsK$h@6eMqA{4^$zYHJf1+J3TAk_Qw{F zIuUyb`!^U>J1NRs_Gi}9Z3Su-IJ;0MZs()*aD0V9?OUrm)|xu}V8{FX7JwZ+Dn_d8 zib53$it^^XnE72_W#7+ni9xvU*;ekiIVHx?Uq5X8jYIG-1^I48${iGYZ`g<9PN9_XB)Bfa`B%uu zcwet~diO`XlJ8ChcHsP$=s4pSU6pqXFXO7-@HGB8K2lOH! z90v|Q>;+Mc(PwbeI5~j^tpq`4v4+cVNf_ZXdU@>48K~fhUt8EGkaW^~&^$?ilZ1g~ z0B*5kYi?~2=55F3L|kuHunTe~E+TewL_z_?fUV}h<_v43jRA3|ZqUb~RLaBj(0TaY zx%Vz?q6J%F-1%->Q^ceaFtRzAj7W+RI6O&`6pH z4ID7-|9s%lYB$zL#$dlbB1?CHLVmHI`GME|JC)pouEWb&xdb|`<*!t=R+&CG=P(7$ zm_8@E2tBV63j&-X?9r@pbeF(M!8_hVos+L8kf0sT?^PMZ&30^I{R_|N&bN2>BL}{H zM`udeE`A$Jv5(;Sg@yA9-v`<;dQ$1&A-{#Z!1GeXgz?fi0Xzn%cRw)}C_19lUZiX+ z30}T*DIW=})}d}iYDy3k%KPJU`NT@4BKoOkI9|*ws+FkO)XWTMP0PZgR}mNP2s&eo zJ)I7qB0T}u#s>#ZxviK1wp&T;ND{KkSIbg4n@ovQz{K;pJzS>&s~0@A*)GHf!XzP{ z&u|w`ymA#o12KA8OH2p$s%|d09ZKS1He(m)oJeA;QlGe7{R4`Z#6zd9sE4R{ZK4#> zA?-`zybKR>Q6jY35Zn^%-0V;R!c^|+aoM6~kQi8%;CY@J3gSB3*RvnH99_0u{(P7s zU~xRmXsHS>`+ULpCisC5NkQ)S>kQQ;W8d`8S+}7cFlc?o0|QVjH?Mj>@b@;XkHK)N z;k2FTj6%aDn@p=(@s})5SmWRIpF@93n>45b>F=uT-bnV>R zrv{Gu8NWZj4J*JFFc~!)eN|`6WNF0FUN8^wtM91(L0ivVS+QUK08EQz?!;i-NZ*?+21fCM5#8d>HKynGFjZoS3WDX_*p!lwTPhE@uOGy6CVv|6NJcr<&qts{on?jQ9)SH zy)QE0Vc*#j_4MNehx}O|>J@(JW+`XK@C|-J>u_HD87HClIqw6*)azNRaZXM{b-bou zEPKK!bWdW3wB$)CGqnpUwhxV~(55uMtKxVWuVl*QZ$_?{Q3H{2+1%8|{elOq$ErA1 z|8A#HCE20`B%v3oKdErFcQCTO@SBcb5-)6@C~`Cr4d|xu#}CAEboS3lv_)eNJho_6&xbKAt#pY$Ad{d~o{Y%c4aU^q zee0t7Cn_yy@RwTs@chfIcDwnXUMR7PIm{^_*YaF~M+dsdvGe`#-=i)N?N~tf;eTes z)5D$kf<8{Parg9ew8L97tPPMsLWzx<|I?n~+YwPwlszb5XQ<&PZ6BTOn33SMyL+g; zooUcB>GuFQ1eEH|=1k(C=pZu2BGn4TM$_o3C%Xa{57cc@d)31v7blF0#0seNfoS0t zz3478RSqo5?zF5!I(qYZ!1>qV7hv<#KNKjr6!U}ro>!)tw9KDuwYyDX2w4yiSERLb z-K5@iA&G8#f0xDYA&5KHAV=*i@{s(*Z@YkHG0l&NiPB_wL}d{sG0|*slu>XlJ5`(%5^WhWg412;+vmG{oIDF2p07BzXpEbrLCtn z|6}R-st5cX5t^Op0+HBXTR!>MyDWS%`q#TGpw(7f$ayvyUuRJLJhdZ{okJcFft?iW!=Zp~>KUj9i@5WW!O z{A=!atPJawRW7CT+uk-Fgu2@q+FZ{0ho%OxYM5j+$zv03LkFL1WJZ>5{+PrUq-&VH z`stA}q(}rCLo#svy#)lZQq2DW&6oK@j50)`GITeO7I-+8Hi8a3*Q5Nz&niv>=iAJ} zN(L%`5r(IhcCG4PUJmS&p8wpz0Yie%7q?WWlUOw^m%~_vIa-qJ?5*$~TZOPI+J{sg z5e_=NVTWzeNj~H6jv^jbu^BTo=ijU-#1s}OmB-xWkP>io1$e-T_2HtoiMq-V*l5l3 zuC`l?YHeFNv`L49(o%w%Lv_U~e;|I|!8HxowH}qXl(URP{sIr~_Zru}IY&H?s@nV> 
z)V0q1xVf&QC7Fh5DQ7YS;wcstud^6iF8?wHea2E8}uGR7%gU+*U-i@H(! z)yIQpFDPx*ZC8C*j+`Ah)-AR#3zYoQz8U?+4pO$+_&;KyAz>gw@eA%uvkuYQQ+*oi zi6)a%2V+$>jMF>7a|S9XrPDVF!r#rXBZA%4D_uS$|9@4mCgffNR?}a&F7YrSnx;{^ z_n8bsBtdTsP)1R3xtGs?Z&?5tC6U#qpeDSqP6^=Gatv8Ca)oq&C$J6?A8W{+37_1R zGwzZd8o_R4rse!>dxEn(R$?b^F&iBu9&>g5TKjs^k2+}^}N&b4Z)A|4aX|QOLkX}tv z&|SOKZ8BClvm%jOT67mLc9Byd9X|Ms@o)`S$36$4Cbmd$(WHsGT$@&B4VxJgDKn57 zahO_?0eQrB+0P$D>}I5%P~|eAucLY~)k?e>3z8AK0qfS}u}D8}O;EIFiP5LAEvw;7 zjE{EN;)Z6an#o7kQpi(je22xrpNDsS_C}dx6uTyUNv=Nv^_pIHm|n!T<7PuS0jSTx z{bzlnhRNJ=+Vt^^oa;2>w!Wmna19?kfgci->NhE!bT%8??wXLELRc;lo}n2 zCTbG6<5dJL(mCnF4hKt!&65Em?Pv3|UFiAnb9<#fdA7WTSkd-$UT-g`&@Ok7Mhi^W zO0D~uPN^Nb*hHK2*Zu#XFdvDUHE>57wEl3%Np^(4brdH4oDwIa#Vt5GQ*t3DSgO&w zrzk@EHN!=j1|6RioS^Cic(%^Vb|RUK4#}|$B|AIC7YGVKLN7G(?`Fw+*I}T0@gm5w zp6Rn;8g?_vVNAcc` zTkt8)0=fo1k7%8dN+w0A(dd$c`JMCg*MRV2blcI=!P zRl9j9IcP5SRk`0VR#A5e!gt;&E|BZ%jo)P%ZYsG@|F}noPW>^oo_YLo;iWR`(rou; zru*SX8=b`VnY#JdE2#>=PO$IHWLm1T!U~wTGF{8r6g6@n$ZMB|Iw`FAEx#NoO}@cI zdhx+K7@7w4nFfBq=;dHUu&j9ZTH-8%NVW7+B7C%^$sn zf8I?$$A1^;HF%Q%wDD=HIi~9Y$hXBPqaV3Ne*oiXKPMj!_w%^-p176o7cIl9Fky7& z@SGiQpUKQ@K0vV{Gc+>$k8g-Gv^?lKSfA)L#B6;9yu7h;dbU=#8UNE7>ps`FKwoK^ zVr>b{v>PoXt-%))7wNnora;n?>s(Yre@s(Sy4}bxAdL3QX_N@!lllD{yAweF@IKr9 zK-N{A%BQZaPO;4LF2;fm-B@UT!N)8Ry1D&vEC%uvA?ryjDxvvoh{QO>tXD`3-P!ke zTkDVdhR`VSQXlocT-9z6;D&ZCQr;mSxcw<@V_Mop$Mkl|SRJr2+by>HyI2`*J@%1`dU_ zn}0xBG@pA+>Z7HxU<)|2NHgTj3()g%j_7X*798%=ww#2oTRSnH9-Wbm9!zagm!Zgb z``#Q(x;6(=T6~WVJ~N<&F??frr8Swyu$8SQF~V-b@9sxMd;Gd_|F-8fcs8}BcFl(J zf%ulmXtgGlnH)AsE-}%x!?^^&4QXOHFR>rRHr}*pgzCP?2P^dieDZ34_!_?053AF1 zwxXeq%Ho58P(Dh zf`wlA3oq}W=wNary6V^LTsAIzVXN~>{hj`e+L}3QZN1O_{{K>1U)Kts&hb~UqS5YY2sk*WG13HNeh|@1e$su$+|bZC_56p_ z`~Cfj+CKt3Da=zQ+$!mpevYSB?#BOs{5l>nKralTqHTI7TyNf6s;Pp;vJglPvB*^lfinyY4ep12+Ru2O0nM6ApX)aevjGpC4(?~b%gY0R! 
zOz{bFU`qdhVo)f98=tG3IBB`U8;gp76hD{6{VI}*2VP!-_Zg58PZat72b2gaYNXyG z!r_-x8zKN@o{{HskGu&FP&p7ugsn~%`PIK#e=EfGFW0U zsS7o?M#TKG_S<#Nzd;!V^rDa`hLpK!kvG_Ly;sI0G;(L1{i94ro_ED$We!xzlzH|m zzzz_=Td8;f7QS*898~w|NdwglJ>H-&!UQ7OW=2$K*0sLmUN?&I=60FZN z?iCNZC#pNOE#Darv#XMndut@K5m#`~mmnvxADL$xwG(Tm4U0n8CiUe z2UZABqJ7(8kQoRJ%I}v{K4!tmL<79Ks8$egvdu_i(VDJ>nC1+@E$*3U8d*aqbPBX> z%s~;8Q*CEOcO)Z3eeW|yAK^$l8p0$%S15xoe#l%*8v>GLQUEkRqCvI<%uTAz-Q73R z$!ZF}N|SDv1;jHmH>tM>m8tBTkZABJnPQX9gy3XnYjIy}zZk^6c&!;tIN)_bcYc0R zij2`zuQ`ZvYGu8NL0xlpxRezT1wh6HPm)&fvEV3${`amr=>3}m1yD_jXm~V@$nDDc zw-v2hTr0gg9>*(|Cy%z$}|Ib{nW`}A^DC= zAGPpIuBsXP4*2G$u^SwG$675E|7qe%@8K3F_;Xa>W`H+snTyUjYhU8e{Fl#$ym52| z&->MNM~yjc8r9~ut`aFa2XMVy& zgqjX_j;NAv36@5EufBHhW{x?W;Kchg`+eiX6n!F7oKy(VW`wnX&Z-gC2HvywGzE6G z`@gskG3Oz&WHfDM4f^Cg(2S~}mv)zxV+JJ`a8a`@+n^G#6?3VGD0aaS+G6|qX7*B{ zSQ+FSAOiA^4=x`X{p*74AV?Mcl7jq>F_gUXaMbQNQ#%ZA=w8`OzOJ*&tND3WYB8WE z`j);(h65?*gveorX{GD3r2VFWjI=we2AOEH1+tUW zu|^}QY&h3n-QD>s+)6}Kj+M5ZK)NJmj>+VQyPBv~06LDk>3^N99QSQCKdP-7LQo89 zjB5hp7!qfyREwz8*-wF-oJol7EUBa(@ zt&u!?>v#4R91FP&(R^XiXYggQvijPTzEtfABbAeMtA>ORltD!=^D0$oH`5c6V}*~E z?5bXGEabkIdUN2+o?xpvo@bPj&8pZvv6sO4-jXg>?aMzPE|&DMt{uCbC5BXO$g(2P z8HUW4oc{9UnUfGI)Jgt)ttbi1Wp_X=N?TSM7z)8#IJ=%Y-knHpI}=H%$0A*2X^{qv zNCiVByO^6tm0Xd3aO8f`Y}W7jp!xACl>O@%Q^NdPr?LPo7B%0~0GH+0!Fm~a-|MC< zeoWZyFOg~mkrMSGU3XChsbbLlX2H=WKMmZUM6P=xYqkmiaUzmvT7a4CRJ-B%p4HXO zn!s_=`1PcNPJ@XBHqtMvco><`c@1X~;^Y=(Fau(gbccVW9uA|jc zq7Gna#(AWITLKJAzTK#(3`_QBcr+mUjo1ZLAO>=+idLnzM?-gp|&)3Znpk>9sI6_l*tFJ0;m&xBa;?UW;3YpNzwEldObHC z!--#g>i?avlqzJpd0h_*qL^}}%r_6JGws{cU-)hlC_bCA4A#uf@PSH z^7?aZL~OeBz{ zEqW2RhCNR-U17ViV&>brlK_@oRXl)k_}_jvG1mG0?~Zh}oJ`s$vnxIsdde-f!snH7 ze+ZZ4u@!c_aH6X(R%i>%n(JOdiaIY++-Y9T)Gc;#s2>t!uAHMS?!@x1&Zi^Oy!gUN^@BzmzsSLZy|Z(ORkPwAZ> z&GGwvTIr&G5P{JBj;k1OXILpe0D#r9+DLk}s@H!bDPs#mTthkL@bwW4R43tRzZb=; z6bed19u`x{^UyomPM|@H_gsiVUkypzrnzxF;~|4t>Hk+xdvi!bTJi?l_!y)cNx}{Nf@A~+Jd4dAe1VzeMy3al2bcDRJ5+X z5fM0m2b#{oyzAYa-ExY~ zJhAJ57b1K8I^!MP7kw>teq97r4SnaNQ#inwuaE*bH{M?V%(okH`d zZvnIpZzg&2KvhVSYrLNpeVUtbUS=$~lgZaxtNW5E5LGwgSD1OFhurcnE~!9(BrYCQ z|Az#!q7dH`vzsKeP?SqWUfwn|Wfdw;#3eS^x#)7M)1CS=3cb z?)>KbP!fN>S$-~`3SBxnaYr?0MGa&cU0Q-yuWBj5HL5bpq*-m?Bm0aaH~}0D#SqPe z`aJOMJtI?Yeba8SCAA6VzPS*+Q6Es{WmELGKSekG7%IGf(Z8ZTKs97Cg*t2N+Jyl= znGrn_6FP@4eL-1+$Om<#t2?RowRq4Lit%`&6yTcPr!%8Ven*ue|9(X?UyZK{I$sM+ z`6ka`Tt@RV8WC-kt%k3v!5gWKWUgD#GmLT5?oI6Cv7WW3l;igV(zYpqL2^9jvbYuFYjs4dp?@-=V)`RE$>``m znLEDcDLsCOo#7M=9)W7==5u(3jahiOfqrRem<&hEh?%M2C{;8r_o=6dwox@sz4j-VH`{GaZ7!}%m7kYOGaRto$e#5If znO|pyt1ffbX=CMos`R`cpgUVX@sBHK{_$E$&p@i-M!Bx7sui|L7g*<~gPtvEK}nLR z^{h+bx{(+e$BLXIam?qp7-2;zoiEz~p6Ft$JpxNK20O~kC-Ljqe9=@iD-(!YD^STqDrA%+`t)#LCYN=7QM6@keV#e?pfcTQWr&Eu;I3V- zh*=cq6+9Cv{Zg)|(9`inZ==KqZfnb0RY~(qp2-#Or#&#{jVf#<8@c~t1B+#7wzV~% zp0#AyPtrY+JkD-yheNHMS*Hkp-Lm3v#KbpkD5}xNcVQ zkAHTTvlT{c{(=YGpFDCHAMI7oME@)?7%8T){KQM3-yOm1>lOA%pXgh~dhtWsz-D4K z0)(Y5cX`-}pn@kJDoXP1MIU+;4>csPd+nYa(z2f9?doo{boI`G%+$C}4VMJiIA0{&fBL>}Om4eW$^Edj{!3zxqJ0~727TzEgPl?Tw()T)T zX@GlW-%xPyJ|esDtuw25=7~^+S>#(`G@&QjHrQD&4u$4;nnpLk%4=gORGXjei$2bK z`UzF$>9h`hQRVb}DX#4Ro-A@y2%p-BS9r=LN(Xh%MP6n1s&$3N_`z7#x>Bcl1{H5Uv9o+Y2osnO4r`+^b#S!e|B8sQItx z&N9z;JwzA%d0tBWmX50 zGM}xMDy&1shL1x-{EI#3&f!|@EFQ7=NNho`#eDlJg;aij$h#+yNe2kbB@ZlqnnLqg zDDP`e+9xeu4G#pHM~r*~PZs(^+1XBizplCi!~(`zmJ5e0J&gqA!t?flthX1c4(9j# zEn_VlAw6xsYlL5K!09zN6snqA8@8iesm}Km-1NcYhwFc?qj0|DmuM^0xkYY^iCy`- ztQR44JzuT4Cx|`W!^_pl%7<=Ba0AqjD$aHPuZ4(Z zn#|C~eQMMNN+RzhhO$d0jZPkDUH<`HBeU#r*87$@+R`_~1T`kP$L2Vo`(?2i(pP;4 zp5v5FkV5gn2zHSD4~WkB<{;25e^K!wJY||P^aVYgS?7Knz6yAlW!iDC!h}(C%yB!eINm5$BDX&goh7T{$FXe@_>5x)k}ag8^@%~=4t!`cK|8G 
z3+YG*l?o2+sE{iA`xat%l7MFte|#p?bjLuq7(<~~d{^2!$&s~G#TY@dLt?C=Z#$A7 zXg7?etbrZe)%_$u=EM(m4D)WUazR6A8|&3s_k8IiD?!9z9YO)LV6uBI>EPrb5PkD| zn%MK(q3-=U2Z#7!ot^#MecsQbxjALrpt%nTWQumUAN3u0@G%yJ?%55*Y@Fwl4{klq zxreG<3*HOpu`h`?;6xYul zvb;i~i}u)zG?=#DDUuhqsKVOew5s?xZ>VGyt3Ii}Y1ZOmoXjkWAoM<~peYh|_omQh zmTFDTb^CZsT}du@6CE5cuc5R!U4oXo0FGH^0g{>N~~`Xj^_ZY14kI*P=GoW0Bj zkB+|CER#*O&8jVoM zJo3j+CVln@?-V(?Xak(Nf9?KH_d7gMqjsd_2t?Wl{0-~f1uMf>@tg(jC*!b5Zd(y9 zqeTu9dM`U&tIPN=J#8p0Z&E^6%)XRp`1ueM{)h2FmqeTxohgYLqeEM3Km#Clg5f(`SFBv@uln{0N!=) zJO4KHn(3qyH#~3Srx{IflXx3TEuP#Qzec`QY+IZ>s<*d48&sQ{Y(GRjkB3#rvUhSu z6kbwZq@q;MVBV6z)BGj8OOP_u_q!lt2esU=SwEwSb%edk>y78K2AtB&2UiGO^An7Q zPZX)9RmAcJ0lRFu_C0b+RGTvx0Y2W^_6qQ~JcCpgC?=6q)Ce@j?M#63rnzp`teY=S zXwOx=*sSj~;#=*75j#>)hL8Bb+vWJApA6mP&KOWpkRb|j-Pyv1x%T<+-fTl;n|drXikPkqd+n1QpMw!7 z$!{Kt6HiIu^c|vpRBokUd9;Y!xSxo+H(KV!xH%z0K-(5Zg_0Z5B691IKmh2u*FDh2 zan?g57daqJcOaELNP?mN5 zW~>ewJyc3~0#%BZ6TG<>6Hnr&_k&^5;oc3o@AYq1QNcwu$f8V;l&hX3Esk6`+##Y* z1P>>&I#U$MP{X)>#&=on$T)fSzCj%MsG*VXCr~^55p7EL8{{R3BUp<%xhP-$f3Onf z7lsN3J2B0F-u3ljQjwX#DhMCfjr6Affd1oB|7uS`7f>t0FTiu%CbWf#IEEx!5w?mP z%>H_k1q(St$v(XvcM|F#*3WWQ3$%IId^V5Pv!+mE z9{sY_Pf|icz~P4jCb1+YtnOx(^=9~#25n7fbILV~SEtF{>tOT8DRoCgV;OMBrX>|q zOn5C$rIPZ%RNth#5#-8*#b_+ZNRt`dM|U$FVUUp@>w27$_^PBxe{t5Z`Nvo7+0VCL zW`SqxvEt{DQf9# zZLD&Es-~-7OcDg&`UD|PKA!h2NWzc3NZH{cILIG8FHXPA1NN0ISa%fv_N(APA|nlu zF=1)65LLQOF48$a9X)m zuGJy;O`~w#u1BKK|4z52MtK~)NqwaKO?CkBwo>DGN36KyF~TSMKYlJwr4xoy6PEz0 zIF$vc=G)PWmamyG@*piTS?|1`0G>p62301&QLV zY&P%X8RP;dAl?3C%(k0e6VrQrQpFUG5`e(HF^^r>h@*}Ted1o?xCe)v`=VZOd`gf7 zY(HABc2o}zz_A~X{{h{Uji%r40G8Qp4gWn-i5`Lh3wtBSKPy^uT8nQw$$KrNy!I6y z_uJg-hnN2;ER!!TeLa$bY@G28QxxU)E4RX9a=|3zKU8zHW=w7M@ubhCZTi%zRMD2@ zYf~dY5wDL-OOwsUgxyB+%=2`#=MlXs++CNsh2oY!o>lCB3nB?xVuE1yBGX|T5phq%8{DWHSC^ z+|wEBvvQUyus8LeME{Q!53GK6eeDw?d8=Qw1@nBOe99usw5iM$;3q{21|2?ku-Ff! 
ze+5)#0SnYBM?MdSc0Z|>x66B+bWs2P%1D(1;ONfXE25G6fbkEgV95Csv9)H8Sb61l zDSVhq6Jk?n)UZuhyUHvI-kiEd{aiLMEc}pkS);9Kbur^!F%d#QF5=<$wZ|Mh~e}0v4iM3`uTilg#I|3Uz7Ch2G3z<@T=o z*yp)Hf+-rMU7i4&_pH>8Y28KS1B9tILXRx*85fmZANxk3|EOQ`=9C_QGxtuz_%GA% zdCV+=ufw!rfbQUUfZTQnT@mjZQ$N9KQ+a4v8?Cq` ztQsAKkLW_9VW~U(mk&rj^?5oQ2B^`5NFmdMV~}~s+Vc&HsMwsgqG3mFdfO3F9xiaX zD)dT}0CGs1^Td`NI;go|nTVN|N+F2OG*Ovs%;VOJeV`>XtJBy-)%FJz7n(2!1sa{H z{2_+*sF^2q_H1)b9!WvUT`oRro;YuNx!$x^=hEl`QMMqO#HAZKQQ1I#nKg>%4*8q} znf71pUGfV!T+i;o@}5Ne0a-Z)aQ_N9!4n8f&HW&0-aKpZ-j#Ouz*&zsp7;nx*0^X| z4ZS3oG~wIwl#7Y|QCQA9QC#fle@}-kWE@k)apUooK4XyJ)j^>0X}RrqqJSpE9;)R^ zRw>%Pb{@1GsmzfS!W)CEBT%3mlqApZ?@Aat@lfj`k0l&kcq6zPvp>uUv(p|uo!F%u zOf+=S;ls>Ph`@-#Vg~A2INNDs7KTaCxMjXh-U9a*noOYcB(8D zM~M=%Ma1R^7WGeIR}^jOha8v0#*6{_SGHLRi@s)^JlH;_!P}_wm9ih*#4N2Jwo3Lx z)*f>6@DK0!3(^ORXip@Ec{nkZT4!T1(D8{%9#<6I&rf##?wE@OMy}SMUyNcZn!JW$ zmh9nSM6_*(PSB1lOLWa=;}{FD&wpCG4|Xg}-)|dP<5iZq`%8AAXBntA)QYg!CPYz= zXN6|u&=6ZoF*8A|*b-mQW0aZ7>eHOq9_^19)e671(*rE;C6H7ap_vcUN%(h3Ts znlIHQrHyMrB@!sjM6k-Dqy)Y?r=zgdLWs*i`~t>R9DSr6H^oEck?U5QYU_{QHKVH= zF#xIE^ObkgEcvE8pzX zzmN9Td7h%)f*dpX~=m+GhY7Yd}WGNhAw+m`pm+# z1n{k?2wQ#iRU^|Y0kN<(!S4Z#PZY8i)$KRjY;g4}`37n<6va{^@#1`dO z7#M?v_1!H?uW_&K~y<7Fjab1_Fy@njq4BzBp5 zB^8{%VKAdLi2Ksvu#X0{;+WN?$ESvS1?wu31hwb93CFY?jLrXONdZ;rUomMK!8zu( zYE<@%_m+-eTWDYWO>NTJOymAie2MClt4n^Zfv;sy3L}6d74aHyIRL zU0wA9)0B4&jeQ{a69XLs-t1Aa$e1?v8=8YW6CiyVwh&41`Zl%m0VYz)kN?XySjHDRi&2Y`YEFr)`tnl5DWfr zgjHX41r;Y{g_&Mm@hGIEUyELiDi}~K$|QOU6XD75B@J3!0;RE zs-hV?mt&mB8a=CO$D3Q1)3`IofcsC;hFt~C|Mkh;&E9Tvi&)>v6o+dqxyqesan7b4 zMVnW2Cmw5JgDuhaAm=&{(Z4k#UcK7%Zl|%4(G2j;f+G~D0b^wJ{`=z?VP)_7j3eilwYTHVjO z_hsVEqa02;UU$c^ilSt1tO}q>`=54{^WT+7rRzNy_|of;blyHeG+@OFtxo1Vbq0w) zI|Tn80!3iUcM|l+2&I#z`#*fS&S=cU1jCQ?b%bCl?8pml!ODFQ-8oejv`Ge?&}VSpBRnk}U(dg+=74^cHbYfP&|jjk#213W?Aar+wEv+)gQe*~%KMkb-JOPW zxm>oS)gLL&kDTxMtsT^p zNzs10i!zmmScLE=jaQ`<^($#mL?}h&_V3eI{9Yp@av>CJ4R@WgmF?!iN#BwvjJ*&U?}m_tibQQyUgveL}R+edndijM#q)`ZVmpWE;R zys6A~gXfrnm`N;z^@97T9Mg`#zb^uw;KrusNh-HGTiaq2Ir9h~Z%>u9yYg)uu)F)( zQm>jEc~@J)CxHX*;<}Wz)eB(jcxVRhJQYU~^;-%U<9oqD*bMAg`{;Ug znL6^p!22)XC~~7>m=2gRfI&#nN%<^qHGh(zsGX$^7God2q={tow@T%;y1RQ~?OgJK zV-J?8dxQivWQC7Lo`wyu)WRvlX1-Llf_y9)7%~eK%02hV+ywXx+Lyh`%Bj2)hhMU^ zuN7;2Z`--c8c~N^(-7hArX#0GjS*^T;be3T%PUt>w?BAc*0*0b!IyB8ZX#OMd_RzQ z*lx_IXs`Jm1?RP^h9kwCbVIy=_?w0vp9h|&igKKuf~w&cj|wItTN=}mp-^*#f3Stz ztZA>`!%{uHm%A-;xPwvUc|?yN4G<%o`QSh5)Oiy{qlu_Ds|Q|ZZ(gJ|8#X4Ai>XUT zIG*puq@QHBL0#Hf98_jcHKI0VxAhdD_N%8#LQIwPyY|?+6pWj|(I%?=RiNY#sQor1 zV&=_ScxZJ2jAN+p!n%-_&=w*;^407lgZo*pxD6Ya3U6IrEIIu&uHxtL{Qqi{Cniua z4NlV~{1}Im&F@FQ3?NWTU51CI4~-b;^yu_Q*ydlL5OYb}4( zdc}X@%V@v)7#7Hysz^i{Kw!;l|fy`Hw^yVe?S}V z@m!{XwQWfw_4JWsJKAvyXSQXwfZf33fCZ+!byu9#5sod7xR<&W=C=KiwZnd{lg9+z z`Oile=5h1Bbv?vS;zOJENI~7&#WH+PI!=rV5=&X%8uXN#TK?!;zbln`c-kY+a6y7= zc9a8rAA^SvqX$08cb1C3JS&kOYnVmBm#I|KoLA3G1<{Q;zIeq_db5wrN8;Oe*_Mb- za@e8||ACc>6zTV#-a=Fs?pl99&ts}M#HJl}okY%ydyk%I0ruTbxc*oCBp%{<=07yf zGWu*-MW@_A_D`aXp2`1lcxPt7>m7lD!QyKc+d6Q<1ee^(m(gy2>eyo+WmOwOrX$8x~W_=#OcEe6fK{snSyLP{C$$JiAMp!CU<@m^R-wy(LiQ@5dTxg&pos;-8bKGg+IpoP4;-LxgKw-Yu*}( zEd<^CSL;wrFfL?Ejy^PnPoys$v55l=w(x{C17gim;N;vB{Ul;_AM5Y6` zvx1&&N=~B8Q1ym_jzkt4HTsvS>6)8pR6%Ucfa6QyVuwAvuAxXb6@Z9DBj*AGzEVi? 
z4qKyjuQ?OKwr7WA5O>K9WDTZjyAm9-;4nJi=yIKT^yP~z%0kdq_;Xup$K5Rz)rL~7 z?ta}5^pE{Zi~Ap%9h&;Mp5N5*g-1C*&6{>JBYxGQkAb0BWG*4H8v>s$YQMH?Ny6(La3Y&E8Az805?c06t-oWTT+cL5B!`%ue>-Yp z_SA=P>xB|Z(iLV7J}^RBY$TXZ8;oY({X!RD%6lISrv?V3E^q~nLqgB*_DJ@L>JH8T ziF?_(tAgn4)qU#Y!kzD~UptASxxR&VMbJ-E&u6KDLKg+5fQi5l_iBb+sP#C9#1uSYJ^g8FhFOn^-o4xE5 zA(0Kpu!9+Jrt40M)Xm#F`DC;1{PhGXKF_Ps3lxv2*UmSu&A z#bxeXl3xGaZ~rPvZ$slD;vC@vfh4sW^RUmcF>wt6zkf>wpCQ-_Reu1s{wjXqK_ zf<0ZV#G{~_ZNPdVM3otn|KdZ$kh;r|r5;* z6ylu&tO3nY^%!ke{o_rV$j%{|K9+(EGssapFp956a8wjMqUQdJ)D7FCD$)g}dn17< z0&Kt6wES~&OP-1FJCAuiGqJ@I;oFRLBqNQv8cIN%Gu$MX_r|Uv0#$yaqU$OMZ2R!> zSW{V&ml68ve04#~?heAV<9CF!P;Uql_{es8^ZF#D;G(oW+ zZ90F|?cV?Mc>en8MRcU2c7p_OQxXE+_iLn1zr*@Re*s!aYkg`dYZZp`Q1lD_hvEOw zDD8jAa2QZLI!S6~AgUckm%)4EY5-x9VRfc`)80<-%?IHH5HSmvCCJN&1Ez~O9ow3u zbOH8z3h#mHOBcKU7miAun@9N;?6P+mr`(}oyVgqEetui(b>6+=_&b^;fW52j6;KEl z!HV$^g`<`@hDpd1@`W=NU3iV#V(0+f5dBRFNSCX0y|@iV^X6CY5}~LK1_eP`CrAYb zZIzBwv!UjrU?1KMqKP5FOWNW^ZAu_WdwyUhbqRgk+PAyk6+gRsKnFeC!S@oe5WBjz z?eVbMwCFqH%FviMwF-t@>^>cPXk)rs{GhHU-*Ia9;j6Hk6jA;dqTYws9F2Wym9}5B zrac@sIGF}G9Fjm+jT|T-$l3SNb$Y($UODO9;M`u4tg>M`wxRXV8tY~DM8 zX(=dcSr-~20To;Q{mJ089hzBneSCI;eS`UhWK#L)7p@FdO+vlP*$DYl%oEtACu)U8 z`;PwjtUJueg6RI1P^Qn)``fWTk>ba%IRDt{JT-msfiTZ#E}gJ-JE^cZ|MpxQwE~o^ z|4d^T-6<9V^COJ{^LWhufb6aQ!6nW9OFh2Rk*{~vX%`cMF+i^>spA~VsG(Sm*_jZ0 zyt?HoLCD8@CwRFko_EAaImR=KIDC~_&+D=N0M<=Zdn5Slops!{GV$sSL6B;Z5$nSD zw+t4Irm!RI^nm2i&-_WFlc@}6b)Uc;Y_sJpAZ(mA`Y2EU87u6u^9g%`#T!NqtWWZh zBKQvUnQ%QNcw2Ff-v_$%VL*T*6Gz?FJ2N*`U3evU0UaG|T`UcZHu*@OD=-HWs4gO% zw*XMRf{`0nmCVk=z>oNyXQhAL!1z_wSvHE}(jM#v*#7}dun*fr9Nfl({U79%yUOCu zO&Z+FvnBfr9S~njZq-NCMk~Lxn*B@L;*FzAVVl9zpvjF(g4%?0nlfojgGW^k_Mz=u zot{;!ZJbl+bv8D#Lo{2*f;t0qTuO;uodaJ1%sL^#WC$bnBMA*l+lC-g34FgU5ph4e zTF(^Uw1O5z3)`GHAH|d-Eg0`MA|4)<1S6W$GpB+ml*271;8EDwFt8<~pp; zH89RIZK%Nqz1WsOMi0)=KysVTu;Sr#5#7xZzO#1hTkOY+_|0*st5vd~r~>o({2*GQ z?2&Ua3_;_Mx~>EbSdY$ri~huXsGB&{a7lLu3+LX?xp$@*NLlzOU;>LHpKYe4(}#y> zPMSE_jh0aFD3`EOqlBc#9PM~-D%@#1$SNPFvQAk#-JM_?kLc*)vlAF2Q*gv){CcE< z>*BLyQFP%O08_N#?%#Gxsu2 zyl*WYj$9?n%^{0EY>FD_S3tUcCn$0qMb2APEcs5U&qsu@7yOW6TuX+@fiWuK!}4HS z7oThSLS}RcZmx81gF?B0ki3@DoBg99De1D02>wDBz+33+&6BW&WqPt)y0L%O>swNUuVgY9yApF>mZ zW7bxG@_1e7K7aH&XC;Zry1}A`YQkok3pU&_qGtHkpmEE{fvA}z?_dZ%aVM_1<88_YVNAUG5k+oAQs%?~5 zOYtEy#d`}@L(fp3`c!S7Cl1M1&V;h?^KY(ThHkwS@(eisiDWN|%&PfTS`;ytf-T4s?~oi)&hR08Qzpwqq{)#GBcm5otk=e>&I z0k5tkKwftPT zCpJc+(aIqw9-i2*G((o#t0K+oEDZk%6TS1=v4H5~$K*81s(1AXV?X#5=0bc3!$K2A z@}H;WsSnyk$!6NE8O8igyfVCi~Zc}M7Y|b zvO)<&gj*h{icMk|49cDw6khbOt_L`L$KpXv1NzW1b%OPOLY}WU=$8Qtj~+V>`KO%s zgLW@9W8fiv#bERh>5tpUpZ_tno+#a!kI<5S5YmQbMrUo`_?HZm@LXzRGq2LKiyUvJ z@9!cf@!eFks7}kBK{Q{SU6F;rQ&Wm?hv}*`$a3w#tDTzGYbBU`phhT>Q1|Send_+a zVi^|1QlBB6S3}u_Q$TQC&tsU#=wkGgQWO0GcX#U3;RIlQDh0%U3z0rC8Gl5kP$Vml zgD1^ty1HS%f95PMbl2yD54IBPdu*;4rP+bn$Mk&tVf2M2`NAQi$8R~OW4Nc-w|Jz| zU3Yi2@&@NTtI#Htb=2j@4o_eCZ#}vTLS_o%Uwe_w1L&pVl)w|uk2mCZG84(pHJ!|9 z(!mTd7V|ofE|&~O3v?RrI~K#S&l61p0+ZcXtyz6MGT{&dJ7dE+*{;VUC$`VrM(ook zE9++7cni&Y<;YVPUHG@$LZ&{rHRIs@P=DnOb+c-!-{EUa!|i_Y z0>Cq*2mXMf|2lq3J4u!8y{{MFKUm~}WX|_p6(RR;6AQ%uv%SyZufgs2i2LXyNH+s{ zP!lF`FHu9FO%O+q9-WQ zelgm|A6KyMgZZ9F`~fXaZX0u2JS@9goA^GHGVX&X^X#YSxu%do*vM2t_jTxOiqQZk zCqAWc!`@vg%&L@?;LO~fT&W4;oP@*IbRM4t2y!zIxAiq|ApXQZUzz;+b_;VwASCJ& zbvO6|W18O8)+4x#e97lV~A< z3|lh@#WRKbDY$(yuY{XI;OT_O431@&FA(zXhT3WCU>6F`0-3&HkRSfV(e{tJ*g>C< zKC+PxpS(hrxs^j-7fVGZB_tr5;a8ig<~eTk#3Ik~ZC;WE*EZ?I(2VX3&t+7HL6UkQ zKp=fhXw#Fo0>|Dgjr-gmS(B{6 zi$NZ|Fo!In_iWJ6=MyA0TW+yJkGQ+$x&SKxM#L!>pR?eIMex}eW!8Er%!$(QI(bT@ zGPc|_0sIobFP{I`wdm^)Ujb`aWQ6gR?q59<6N!koS}uHzv@Nz0tbE+Vk2sdQAB@!b 
zPFCA++2%&;eTtr5YyoM0qYh}lzl&~<=L=k~4lptJ|5snDcb8&;y+pnpK7`bZgw6YE|v{2-Z&w}+57GoYmilZQm7 zuV)n8ueXN$1Ioanc{tnO-saAX1*id!x4?mAm?Rp{_<|2f;}?MHobX!IhAKF^3ZoO> zZwBDT76P$)5XT)t3~&-Ml9vB$UxK8@ksw9TEFJ{$=wZjiS+aNyZ|f1J1Lzkx0x}DS z7TCwVD6z~8kQ@`I|8GJfI&>jg{$4beFh|6!N`TcpfNRrDK^uOU8h89K`Q|s3c;Wd< ze)!;!u6l^Jzu+7-D~LZ##^jMI8$oF8$@ba7>%=?*NFGAymuApRSxcyDmGd~INQm%5 z+N#hkXBnjenUAyP6@TMTIUrijswS);*svH4E(TASpV+&5fO5 z>eHfoE?Z9-#;cEb5J#x?0+SdQN-`%l5B2u%Eu9 zs4Bsgn&ccMD@|sKlN(1%{diJp?DWSj5-_$9{!v#=6&1okg<7@BjPk-3g^~13_857X zim)emb1H>NNB4Q(XM3Z5DMwD`V1Q6`3_|+cXdg>end2ILa$tAfhmeW>zDsp+lX0On z&t>=~t*b<5F_N>MS|b<9i-ae{5KqOEh${(!jCS3cM{ zt|#Q{rG|V|vcgLS^|Z3Hu|YghlbWNt=H;;k409)Veo26Mvke7DoaM9+_KDYgvS7O= z0ePT>eUEpir;yo0#AUX&iN*Nv-+Vjw_9qyu5t&nO0rjou%Ea>2u98C3P;={(_sphy zF|H#J>xrYH*QfuaNk6;k*1S@?Dv-nfVAXsP7U%k)WS$^*myj=q{V;^c(N^*wWx7|> z<=^AA&(xp9{P3%Cem!W2D>jjL7+fj#t&mgD35sPL-JSo;#QMdXgL1O;(8-I;?eQ<0 zQYeuUh17!_quD%9T+m6m3%yNgudAD^o-)~t-@s%<}Idz6%tU<`xMt>QN&9fl83 zF7FY!0tVvYZUy@4SrfCmBi%WZX8N9S zr&Ee2{D|4QXoPHKS{QFGn?BYY5eT-q2$dHboz|>jbyn;$To_MbHB2nE9tWb!WvimPlK8}hf=6lHuYhtw9XV8p zRwsb4U8*G=$rgE*wpBR|U!&FxCY`WCVnp8uZ( zkp6iPNv(ByvBfl2)s8TdcgP0*u40zgJKtMbmd#<1lFErgRdLu!5Adno?Et2{f34C~ z6h8MG|E7Y3wm}gtL=*L8wyt3j_Ep!D;TiXW`aMY}*L>h8Wpxix{4^qRnX-R17ez93{wk zZChXKywsCNGH;45@Ir&>^B>F`&US2$0V^pZRK2*dokhRRB;H3(4 zU5aT1bDA|4Eyf6KQA)e{bXV%Qv8ud03yJW|71|@fWe}cl?yKWGKgR!BcIr{ynda*P z&equn8i>@MvF6W`$8A{qCC-yb2bG$(S-LbAzdkO8cig-i+&h_m=%+OiVZJYj)R45| zisM!dN_yD{=K1IHl;`F(@JX_H1k|J{n$}Rci)*#39mW1sc(K)+j$ z4z0yYf?FnJ+Q%Zh>>UfOhHA^p=LBI5)Ktl;B7~WUhuU$sLNWZWKW6@o>-1k=A28ZH zLwt*y)ellCo4KBd`7OAV+5hf_9k_RRQcj*8UjitGQgA-)?y9vwWmQK6lSAJAB7^qk z_vJ5pVy{02Q-Y+Kp+5V_%!%7X0bV~#Veec-7#?wLX#g6a!AG>=K!rQbHrF_EWbOcYA0?5C# z#B7n}Cjh_DH_u3)_iNC~=hug}Ekl)w#Z|)7~AAR*?fHwuBy8)iUK9lAKy0qG+8m zwvd@q3LCTnt$w363=4Ds%-#&DBfGRH@+cae?w-B8MDyXfglE*GLoS>Fi`D%y?#5bH zr5`>S!*MHbRu0r1a^xp5>g1G(e#Ese9`3we&X^ypLvjo3jAJC z=fiz$-OoP^LvSRM`9miIFG^Fs{1YJpvv>)VHn)7w7^xLV_+@Y5+ESuFY}oa9dsBW5 zg6Mb#S_)q~dNc|NaaOJO0gZf{F__|`=YQHo-RUmoa&SlHwkweHF!`_AdrUs?>y@>$ z0;B&$9DGze9jZTLd7h>sOJ4o#;VuFOz1j#yxU3b(4Fx9?P5L5puLwXiq1=1iD5k#syNvA;!$AFF1bQZ3Ds2~c~#zsvWm{|v6 zze5T)2ZE`qxwf-CW41CXT@i(x~m%{~9K<06L-`S6%Tc>(Y} z+*LLxvax-Y?wZJ;Z_=v7W+?EU{vOW)1~g)QN_s>pztY+HE-g$fd2pTSoq7g*Y1>l` zY7TY8>iJIVj4`a@IuGxD3M21V`2jd=7(6Q9lkld&73>wHHH0FCgO$>f%-8pl?T8b} zUwwXcZ@?vgeiacKL%0~uZT?9>$%~~X8vN$b(&rbI=E`KWKFGUlY)J21|DZB&JP|sT zG^Ym&xF(A!eN=1gSXtC;f_<nv|G4XJV}Z<4IkmzF);wM4JR%iH9{Mr*0=x2UX>Xb&@h=>EO99?dx=s&4veg>w z9bM#D&Z~n6R1!vp+}-b%r0q+Db?@rzmX5g#8qFLyDEQ=O@al2J)JMws(^tin;U*?K zgD_M@)F<3D>JdanF0#gGbZ|C62^{bH9H# z8n?viq2zOvceo#uP=~?BHa&;(^#g;@4W`J!nW=wlx{YfS7?*Nd@EW+#99+Mn%h_8>xY#App*_GuqHf7=({ z8sly}_hJ0u(me$+by`6vGvH+Ud5 zIbOd{TdRSB(`DY*TlLC|3^yXl_vY|E59HTVd6)h!7k%M_jCfE-Zzl9K@7uhAEn9Ws zHalQdb?HR*R;LlqZu~A{^(IkHn9M_c-q#(~VsS^z)~5c~S~Fr^*%{d%yQQ@3aY}hO za(NLTo)fJ~YBbMWg+&b%j< z6p0!-*l$gg69bGS?j;MoX`x*F0|Kd}+^JYm0{$?$N|WxM8ZD@xdBIJ+o5NymyzJcM ztN=yw2csHu_=~0negRjhThk6~DT&D$CLrw}3dCoQ%WABXO*Z3Cv3IbVCs$}2k^H6f z!E@{}@Zl1~MgXi&92iIiJ-$(%Z5iWor}34q<4iR_$T~=5^s->KE*`XQ7{RGlFa4r) zySgoTu!&>f-?huY%9W$5!1-t-fTj@5j3%f3JL$ zWh(o`X`$P)uXgFWHoohC=;EvVM&W5`&Lyy8VAi{TKm&b&w_BG)!?KK;c`!0J?<%D&3~U^z(OLZP;O~d)5ACwK%lhI;A@<`r$much1^Q~GOsdf8_MIuY z!8ZZ_Zdyk1S(2dMN_Z8Q(`8y7!>+qMeh=6#I=xl8EN0U zbj(4$T`P1UJ@2OK%!@wbMC5@Ky$PQM?ko4fwKN zr!W}9YOCVh4AQZsH*lGIa&@i_aNh|#2sSAe1$;Rt&+psmM0U-FT$fEbS8zr;EfA`V z^7e`bU6pdcjsB`aTdV$6(K$@Nd8%+TcwqF<&4MW`R+exa15oOC6smLE+}2cYE&)O6 zlF5Y;9#+?vPF|hNQ)GFG;XXjsovj7T^TM3SwtnI9FW`m}eW@xzHY^3l%N;z?$1QhX zFbXQTkpfUOj+Au;Q3QK6p@!9${X2t-Oq2&9cvfohU-L{|HND8<|A1<}V&d_?MDfu@ 
[The bodies of the intermediate patches (apparently 006 through 130 of the series) are
not recoverable from this source: what stood here is base85-encoded git binary-patch
payload with no readable text. Only the tail of the last of those diffs survives below —
the end of a masked-fill helper hunk (its file header and function signature are lost)
that casts the fill value to the dtype of the input tensor:]
z;Mt!1a`0W`ygn_#vt_sX#eKWVN*1$nbgJ3}UTZU-PR~Oh7|dH|eE0#{9J(tgC;>u0 z%^d^2Y41HN?{L%LTaCiGW~m?#;O5UT$_?(@zb|yYbP8?VJLGp{cqPaXkLP&HzVh8T zXkT4=>sLnmorn9}RvVr0d4$~wOFY!-+LKv^RsNK4x4N}P>ZGeA=AWm5E4ipYLe&9` zBBxiwR5H6~@Wa0E){ZtNjKolLv~q-?T$3((b$nNfy`pA>S{i+JI={VQH*q23IT!rBulQnxB0+hv>YaF z%!A!~h&1wqZT*v=vh!sq1fCsFBcg&LcMPPj(&8VW7zenOh@*z>a#Qil(XpZjTHk}q zmry$`H(wwgf`}(gGV}^92p<((_BHyND$SZ(+WqFKcO0u1_*7PiGkWxf3_x7&;Y;<3ex11hzNd*)$pV|?8o$7yl-rlt)=tykXdqC)BrRN>qPkb^Mm3cwXz-F= zC3L+G$WwMeazM-dH^3i1d~cIEk)vg+m!1l>jf*-ea4%3US2fh5t8c!1Doo(;m!7$t(DQ}Nq3_5h6$kch{XY-7X!v(B(b zNUP^Js;nhBmh36|;jlQ3?-L_EI-YjXdeeiP-l@m5F3Jt;#)`0ioS|=Bj+epb7Ye=g zrcr=mSuH94wTL*T4!I2Q0kQ@iiIoi?eK#2uaqO|5^~4Ti(Q{b zs>Rfk)U+KvSX8D(Cq%>JTnU*w&W?p%rz#fX!-x_gj9%K`8;5Lc9w=RiY5i?h{`@l` z^}xR|`Co1jXiLg4uk;-QF@y5UFEG9t1n-De1L+67F6m~vm*3YLr%IT0Tb&I8R%pKH z@%}g;+}DBk4Rvpu=NAmaeeK`jsI^oNvq>sj>>Dv5fu|wx? zf0INC@CyzJTHUvA>sVmpPe_@xvHs4C4$Rs^`r|ZYkw+Vx}h^`UAV*W zv|)VKN#mfHlQ95bU7GCb(UYij33*v3kn;f9>P1g@EG@~CJYqqVT~dq_i(vaU z$Z6Sk-gVuz5@xs#Xm}|YUx%=n7mC-EWihYR02i_B8@{9alFB`q!_;$rzTyM`T;BZR zu8_3GmOLnaq6etl;XOtE&y}L$T2r;)k~qSlY3draNU_LDNxE@ zo$8hc0Z1hKDncG759^N*qe`8##&e|YG3#uwYwV2ki3ILEd4V9`CuRAp$EXHj3PXO$ zh1MP$s|GSR8`swix~wl}TZviiUwFSD0-II0Xo48gbOCG89?^g{vAaD?r8<^l?Z5=1g;zp@W6DAB!!NfT`s|k zo|KBG2+kkmN0%0WIgtdnyggjFja%A_suHN`#rOW>;wg$!W#2w7Cv8476qFA(`K}vS zNA#EID63mgN=DN!7&w$ynvB%%Ts(A7!tPuW(&unX5*Kti9qs&-hLZL>zKdvclbxzG zaG2x*z1kp`$x>OD2*Hu4x2tm^UikLobn&9pluoaHU_w3~^Pb Result { let shape = mask.shape(); - let on_true = Tensor::new(on_true, on_false.device())?.broadcast_as(shape.dims())?; + let on_true = Tensor::new(on_true, on_false.device())? + .to_dtype(on_false.dtype())? + .broadcast_as(shape.dims())?; let m = mask.where_cond(&on_true, on_false)?; Ok(m) } diff --git a/candle-transformers/src/models/mamba.rs b/candle-transformers/src/models/mamba.rs index 836327ee..a75ee87a 100644 --- a/candle-transformers/src/models/mamba.rs +++ b/candle-transformers/src/models/mamba.rs @@ -1,4 +1,3 @@ -#![allow(unused)] /// A fast implementation of mamba for inference only. /// This is based on: https://github.com/LaurentMazare/mamba.rs use crate::models::with_tracing::{linear, linear_no_bias, Linear}; @@ -38,12 +37,12 @@ pub struct State { } impl State { - pub fn new(batch_size: usize, cfg: &Config, device: &Device) -> Result { + pub fn new(batch_size: usize, cfg: &Config, dtype: DType, device: &Device) -> Result { let mut hs = Vec::with_capacity(cfg.n_layer); let mut prev_xs = Vec::with_capacity(cfg.n_layer); for _i in 0..cfg.n_layer { - let h = Tensor::zeros((batch_size, cfg.d_inner(), D_STATE), DType::F32, device)?; - let x = Tensor::zeros((batch_size, cfg.d_inner()), DType::F32, device)?; + let h = Tensor::zeros((batch_size, cfg.d_inner(), D_STATE), dtype, device)?; + let x = Tensor::zeros((batch_size, cfg.d_inner()), dtype, device)?; hs.push(h); prev_xs.push([x.clone(), x.clone(), x.clone(), x.clone()]); } @@ -128,8 +127,8 @@ impl MambaBlock { let delta = delta.apply(&self.dt_proj)?; // softplus let delta = (delta.exp()? 
+ 1.)?.log()?; - let a = self.a_log.to_dtype(candle::DType::F32)?.exp()?.neg()?; - let d = self.d.to_dtype(candle::DType::F32)?; + let a = self.a_log.to_dtype(delta.dtype())?.exp()?.neg()?; + let d = self.d.to_dtype(delta.dtype())?; // Selective scan part // Eqn (2a), page 3, h_t = Ab h_{t-1} + Bb x_t @@ -178,6 +177,7 @@ pub struct Model { layers: Vec, norm_f: RmsNorm, lm_head: Linear, + dtype: DType, } impl Model { @@ -196,6 +196,7 @@ impl Model { layers, norm_f, lm_head, + dtype: vb.dtype(), }) } @@ -208,4 +209,8 @@ impl Model { state.pos += 1; xs.apply(&self.norm_f)?.apply(&self.lm_head) } + + pub fn dtype(&self) -> DType { + self.dtype + } } diff --git a/candle-transformers/src/utils.rs b/candle-transformers/src/utils.rs index 3cbcac5c..d29995ed 100644 --- a/candle-transformers/src/utils.rs +++ b/candle-transformers/src/utils.rs @@ -2,7 +2,7 @@ use candle::{Result, Tensor}; pub fn apply_repeat_penalty(logits: &Tensor, penalty: f32, context: &[u32]) -> Result { let device = logits.device(); - let mut logits = logits.to_vec1::()?; + let mut logits = logits.to_dtype(candle::DType::F32)?.to_vec1::()?; let mut already_seen = std::collections::HashSet::new(); for token_id in context { if already_seen.contains(token_id) { From a0460cd2b13a396ff8545dc1bbffa741f2ec3d79 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 10 Apr 2024 21:19:21 +0200 Subject: [PATCH 131/131] Add the code-gemma models. (#2038) * Add the code-gemma models. * Tweak to the gemma config. --- candle-examples/examples/gemma/main.rs | 12 ++++++++++++ candle-transformers/src/models/gemma.rs | 19 +++++++++++++++---- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/candle-examples/examples/gemma/main.rs b/candle-examples/examples/gemma/main.rs index 0e37f5cd..a5f7d591 100644 --- a/candle-examples/examples/gemma/main.rs +++ b/candle-examples/examples/gemma/main.rs @@ -30,6 +30,14 @@ enum Which { InstructV1_1_2B, #[value(name = "1.1-7b-it")] InstructV1_1_7B, + #[value(name = "code-2b")] + CodeBase2B, + #[value(name = "code-7b")] + CodeBase7B, + #[value(name = "code-2b-it")] + CodeInstruct2B, + #[value(name = "code-7b-it")] + CodeInstruct7B, } struct TextGeneration { @@ -224,6 +232,10 @@ fn main() -> Result<()> { Which::Base7B => "google/gemma-7b".to_string(), Which::Instruct2B => "google/gemma-2b-it".to_string(), Which::Instruct7B => "google/gemma-7b-it".to_string(), + Which::CodeBase2B => "google/codegemma-2b".to_string(), + Which::CodeBase7B => "google/codegemma-7b".to_string(), + Which::CodeInstruct2B => "google/codegemma-2b-it".to_string(), + Which::CodeInstruct7B => "google/codegemma-7b-it".to_string(), }, }; let repo = api.repo(Repo::with_revision( diff --git a/candle-transformers/src/models/gemma.rs b/candle-transformers/src/models/gemma.rs index ab2a9582..15e4dccb 100644 --- a/candle-transformers/src/models/gemma.rs +++ b/candle-transformers/src/models/gemma.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use candle::{DType, Device, Module, Result, Tensor, D}; -use candle_nn::{linear_b as linear, Linear, VarBuilder}; +use candle_nn::{linear_b as linear, Activation, Linear, VarBuilder}; fn default_max_position_embeddings() -> usize { 4096 @@ -11,8 +11,9 @@ fn default_max_position_embeddings() -> usize { pub struct Config { pub attention_bias: bool, pub head_dim: usize, - #[serde(alias = "hidden_activation")] - pub hidden_act: candle_nn::Activation, + // The code gemma configs include both hidden_act and hidden_activation. 
+ pub hidden_act: Option, + pub hidden_activation: Option, pub hidden_size: usize, pub intermediate_size: usize, pub num_attention_heads: usize, @@ -26,6 +27,16 @@ pub struct Config { pub max_position_embeddings: usize, } +impl Config { + fn hidden_act(&self) -> Result { + match (self.hidden_act, self.hidden_activation) { + (None, Some(act)) | (Some(act), None) => Ok(act), + (Some(_), Some(_)) => candle::bail!("both hidden_act and hidden_activation are set"), + (None, None) => candle::bail!("none of hidden_act and hidden_activation are set"), + } + } +} + #[derive(Debug, Clone)] struct RmsNorm { weight: Tensor, @@ -127,7 +138,7 @@ impl MLP { gate_proj, up_proj, down_proj, - act_fn: cfg.hidden_act, + act_fn: cfg.hidden_act()?, }) } }
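[Reviewer note, not part of the patch: the clap value names above are what the
example's --which flag accepts. Assuming the gemma example's existing --which
and --prompt flags are otherwise unchanged, running one of the new code models
would look roughly like:

    cargo run --example gemma --release -- \
        --which code-2b-it \
        --prompt "fn fibonacci(n: u64) -> u64 {"

The selected variant is then resolved to the matching google/codegemma-*
repository on the Hugging Face hub, as shown in the main() hunk above.]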

[A later patch in the series adds the "Candle Moondream 2" Rust/WASM demo page
(an index.html). Its HTML markup was destroyed during extraction and is not
reconstructed here; the recoverable text content of the page is:

  - Title: 🕯️ Candle Moondream 2 — Rust/WASM Demo
  - Credits: "Moondream 2 by Vik and model implementation on Candle by
    Santiago Medina"
  - Note: "When first run, the app will download and cache the model, which
    could take a few minutes. Then, the embeddings and generation will take a
    few minutes to start 😔."
  - An "Advanced Options" panel with sliders whose visible values are 500,
    0.00, 1.00 and 1.10 — by position these look like the defaults for maximum
    sequence length, temperature, top-p and repeat penalty.
  - A "Generation:" output area (initially "No output yet") and an "Examples:"
    section.]
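[Reviewer note on the earlier mamba/utils hunks, not part of the series: the
new Model::dtype accessor exists so that callers can thread the model's dtype
into State::new, and apply_repeat_penalty now upcasts internally because the
logits may be f16/bf16 rather than f32. A rough sketch of the consuming side —
model loading and token handling are assumed to come from the mamba example:

    use candle::{Device, Result, Tensor};
    use candle_transformers::models::mamba::{Config, Model, State};
    use candle_transformers::utils::apply_repeat_penalty;

    fn run(model: &Model, cfg: &Config, device: &Device, prompt: &[u32]) -> Result<Tensor> {
        // State buffers are created with the model dtype instead of the
        // previously hard-coded DType::F32.
        let mut state = State::new(1, cfg, model.dtype(), device)?;
        let mut logits = None;
        for &t in prompt {
            let input = Tensor::new(&[t], device)?;
            logits = Some(model.forward(&input, &mut state)?);
        }
        let logits = logits.expect("prompt must not be empty");
        // apply_repeat_penalty converts to f32 internally, so this also
        // works when the logits are f16 or bf16.
        apply_repeat_penalty(&logits, 1.1, prompt)
    }
]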