From 9a27f11c3f79687e6e9e06264d4448d809c5c01a Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 2 Nov 2023 17:48:07 +0100
Subject: [PATCH] Adding tons of profiling and removing the metal allocation
 (still slow).

---
 candle-core/Cargo.toml                     |   1 +
 candle-core/src/metal_backend.rs           | 117 ++++++++++--------
 candle-examples/examples/quantized/main.rs |  25 ++--
 .../src/models/quantized_llama.rs          |  18 +++
 4 files changed, 100 insertions(+), 61 deletions(-)

diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml
index 69bf47cf..029f626f 100644
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@@ -30,6 +30,7 @@ safetensors = { workspace = true }
 thiserror = { workspace = true }
 yoke = { workspace = true }
 zip = { workspace = true }
+tracing = { workspace = true }
 
 [dev-dependencies]
 anyhow = { workspace = true }
diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs
index 982e5ee1..0246e9ce 100644
--- a/candle-core/src/metal_backend.rs
+++ b/candle-core/src/metal_backend.rs
@@ -73,7 +73,11 @@ impl BackendStorage for MetalStorage {
 
     fn to_cpu_storage(&self) -> Result<CpuStorage> {
         match self.dtype{
-            DType::F32 => Ok(CpuStorage::F32(self.buffer.read_to_vec(self.buffer.length() as usize / 4))),
+            DType::F32 => {
+                // self.buffer.read_to_vec(self.buffer.length() as usize / 4);
+                let mut buffer = vec![0.0; 32000];
+                buffer[0] = 1.0;
+                Ok(CpuStorage::F32(buffer))},
             dtype => todo!("Unsupported dtype {dtype:?}")
         }
     }
@@ -271,13 +275,16 @@ impl MetalStorage {
         let elem_count = b * m * n;
         match (self.dtype, rhs.dtype) {
             (DType::F32, DType::F32) => {
+                let span = tracing::span!(tracing::Level::TRACE, "metal alloc matmul");
+                let _enter = span.enter();
+
+                let out_buffer = self.device.new_buffer(
+                    (elem_count * mem::size_of::<f32>()) as u64,
+                    MTLResourceOptions::empty(),
+                );
                 if b != 1 {
                     println!("TODO implement batched matmul for B={b}");
                     // bail!("Didn't implemented strided matmul yet");
-                    let out_buffer = self.device.new_buffer(
-                        (elem_count * mem::size_of::<f32>()) as u64,
-                        MTLResourceOptions::empty(),
-                    );
                     return Ok(Self {
                         buffer: out_buffer,
                         device: self.device.clone(),
@@ -286,20 +293,17 @@
                 }
                 if !lhs_l.is_contiguous() || !rhs_l.is_contiguous() {
                     println!("Didn't implemented non contiguous matmul yet {:?} {:?}", lhs_l.is_contiguous(), rhs_l.is_contiguous());
-                    let out_buffer = self.device.new_buffer(
-                        (elem_count * mem::size_of::<f32>()) as u64,
-                        MTLResourceOptions::empty(),
-                    );
                     return Ok(Self {
                         buffer: out_buffer,
                         device: self.device.clone(),
                         dtype: self.dtype(),
                     });
                 }
-                let out_buffer = self.device.new_buffer(
-                    (elem_count * mem::size_of::<f32>()) as u64,
-                    MTLResourceOptions::empty(),
-                );
+                return Ok(Self {
+                    buffer: out_buffer,
+                    device: self.device.clone(),
+                    dtype: self.dtype(),
+                });
                 let m: u64 = m.try_into().expect("usize should fit u64");
                 let n: u64 = n.try_into().expect("usize should fit u64");
                 let k: u64 = k.try_into().expect("usize should fit u64");
@@ -359,6 +363,15 @@ impl MetalStorage {
     }
 }
 
+impl MetalDevice {
+    pub fn flush(&mut self) {
+        self.command_buffer.commit();
+        self.command_buffer.wait_until_completed();
+        self.command_buffer = self._command_queue.new_owned_command_buffer();
+    }
+
+}
+
 impl BackendDevice for MetalDevice {
     type Storage = MetalStorage;
 
@@ -399,43 +412,47 @@
 
     fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<Self::Storage> {
         let option = metal::MTLResourceOptions::CPUCacheModeDefaultCache;
-        let buffer = match storage {
-            CpuStorage::U8(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<u8>()) as u64,
-                option,
-            ),
-            CpuStorage::U32(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<u32>()) as u64,
-                option,
-            ),
-            CpuStorage::I64(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<i64>()) as u64,
-                option,
-            ),
-            CpuStorage::BF16(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<bf16>()) as u64,
-                option,
-            ),
-            CpuStorage::F16(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<f16>()) as u64,
-                option,
-            ),
-            CpuStorage::F32(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<f32>()) as u64,
-                option,
-            ),
-            CpuStorage::F64(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<f64>()) as u64,
-                option,
-            ),
-        };
+        let span = tracing::span!(tracing::Level::TRACE, "metal alloc");
+        let _enter = span.enter();
+
+        let buffer = self.device.new_buffer(4, option);
+        // let buffer = match storage {
+        //     CpuStorage::U8(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<u8>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::U32(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<u32>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::I64(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<i64>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::BF16(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<bf16>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::F16(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<f16>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::F32(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<f32>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::F64(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<f64>()) as u64,
+        //         option,
+        //     ),
+        // };
         Ok(Self::Storage {
             buffer,
             device: self.clone(),
diff --git a/candle-examples/examples/quantized/main.rs b/candle-examples/examples/quantized/main.rs
index 347e87a8..7ce7a531 100644
--- a/candle-examples/examples/quantized/main.rs
+++ b/candle-examples/examples/quantized/main.rs
@@ -232,7 +232,7 @@ fn main() -> anyhow::Result<()> {
     use tracing_subscriber::prelude::*;
 
     let args = Args::parse();
-    let device = candle_examples::device(false)?;
+    let mut device = candle_examples::device(false)?;
     let temperature = if args.temperature == 0. {
         None
     } else {
@@ -384,17 +384,20 @@
     for index in 0..to_sample {
         let input = Tensor::new(&[next_token], &device)?.unsqueeze(0)?;
         let logits = model.forward(&input, prompt_tokens.len() + index)?;
+        if let candle::Device::Metal(device) = &mut device {
+            device.flush()
+        }
         let logits = logits.squeeze(0)?;
-        let logits = if args.repeat_penalty == 1. {
-            logits
-        } else {
-            let start_at = all_tokens.len().saturating_sub(args.repeat_last_n);
-            candle_transformers::utils::apply_repeat_penalty(
-                &logits,
-                args.repeat_penalty,
-                &all_tokens[start_at..],
-            )?
-        };
+        // let logits = if args.repeat_penalty == 1. {
+        //     logits
+        // } else {
+        //     let start_at = all_tokens.len().saturating_sub(args.repeat_last_n);
+        //     candle_transformers::utils::apply_repeat_penalty(
+        //         &logits,
+        //         args.repeat_penalty,
+        //         &all_tokens[start_at..],
+        //     )?
+        // };
         // TODO Remove this once implementation is finished.
         let logits = logits.ones_like()?;
         next_token = logits_processor.sample(&logits)?;
diff --git a/candle-transformers/src/models/quantized_llama.rs b/candle-transformers/src/models/quantized_llama.rs
index 3685d3de..fdf12fc6 100644
--- a/candle-transformers/src/models/quantized_llama.rs
+++ b/candle-transformers/src/models/quantized_llama.rs
@@ -79,6 +79,8 @@ fn masked_fill(on_false: &Tensor, mask: &Tensor, on_true: f32) -> Result<Tensor>
 impl LayerWeights {
     fn apply_rotary_emb(&self, x: &Tensor, index_pos: usize) -> Result<Tensor> {
         let _enter = self.span_rot.enter();
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-cos");
+        let _enter = span.enter();
         let (b_sz, n_head, seq_len, n_embd) = x.dims4()?;
         let cos = self
             .cos
@@ -88,21 +90,37 @@ impl LayerWeights {
             .sin
             .narrow(0, index_pos, seq_len)?
             .reshape((seq_len, n_embd / 2, 1))?;
+        drop(_enter);
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-broad");
+        let _enter = span.enter();
         let cos = cos.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?;
         let sin = sin.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?;
+        drop(_enter);
         // This mimics the llama.cpp behavior.
         // https://github.com/ggerganov/llama.cpp/blob/1f0bccb27929e261744c979bc75114955da49e98/ggml.c#L12104-L12105
         // The x0 and x1 value are interleaved on the n_embd (= head_dim) dimension.
         // The resulting y0 and y1 are also interleaved with:
         //   y0 = x0*cos - x1*sin
        //   y1 = x0*sin + x1*cos
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-reshape");
+        let _enter = span.enter();
         let x = x.reshape((b_sz, n_head, seq_len, n_embd / 2, 2))?;
         let x0 = x.narrow(D::Minus1, 0, 1)?;
         let x1 = x.narrow(D::Minus1, 1, 1)?;
+        drop(_enter);
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-broad-mul");
+        let _enter = span.enter();
         let y0 = (x0.broadcast_mul(&cos)? - x1.broadcast_mul(&sin)?)?;
         let y1 = (x0.broadcast_mul(&sin)? + x1.broadcast_mul(&cos)?)?;
+        drop(_enter);
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-cat");
+        let _enter = span.enter();
         let rope = Tensor::cat(&[y0, y1], D::Minus1)?;
+        drop(_enter);
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-flatten");
+        let _enter = span.enter();
         let rope = rope.flatten_from(D::Minus2)?;
+        drop(_enter);
         Ok(rope)
     }
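
A side note on the profiling pattern this patch repeats in `apply_rotary_emb`: each
`span!` / `enter()` / `drop(_enter)` triple can also be written with
`tracing::Span::in_scope`, which enters the span for exactly the duration of a
closure, so a scope cannot be left open by accident. A minimal standalone sketch of
that alternative (illustrative only, not part of this commit; it assumes the
`tracing` and `tracing-subscriber` crates as dependencies):

    use tracing::{span, Level};

    fn main() {
        // Install a subscriber so the spans are recorded.
        tracing_subscriber::fmt()
            .with_max_level(Level::TRACE)
            .init();

        // One scoped span per stage, mirroring the span names used in the patch;
        // the toy vector math stands in for the tensor ops being profiled.
        let x = span!(Level::TRACE, "attn-rot-reshape").in_scope(|| vec![1.0f32; 8]);
        let y = span!(Level::TRACE, "attn-rot-broad-mul")
            .in_scope(|| x.iter().map(|v| v * 2.0).collect::<Vec<f32>>());
        println!("{y:?}");
    }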