From 9a27f11c3f79687e6e9e06264d4448d809c5c01a Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 2 Nov 2023 17:48:07 +0100
Subject: [PATCH] Adding tons of profiling and removing the metal allocation
 (still slow).

---
 candle-core/Cargo.toml                     |   1 +
 candle-core/src/metal_backend.rs           | 117 ++++++++++--------
 candle-examples/examples/quantized/main.rs |  25 ++--
 .../src/models/quantized_llama.rs          |  18 +++
 4 files changed, 100 insertions(+), 61 deletions(-)

diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml
index 69bf47cf..029f626f 100644
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@@ -30,6 +30,7 @@ safetensors = { workspace = true }
 thiserror = { workspace = true }
 yoke = { workspace = true }
 zip = { workspace = true }
+tracing = { workspace = true }
 
 [dev-dependencies]
 anyhow = { workspace = true }
diff --git a/candle-core/src/metal_backend.rs b/candle-core/src/metal_backend.rs
index 982e5ee1..0246e9ce 100644
--- a/candle-core/src/metal_backend.rs
+++ b/candle-core/src/metal_backend.rs
@@ -73,7 +73,11 @@ impl BackendStorage for MetalStorage {
 
     fn to_cpu_storage(&self) -> Result<CpuStorage> {
         match self.dtype{
-            DType::F32 => Ok(CpuStorage::F32(self.buffer.read_to_vec(self.buffer.length() as usize / 4))),
+            DType::F32 => {
+                // self.buffer.read_to_vec(self.buffer.length() as usize / 4);
+                let mut buffer = vec![0.0; 32000];
+                buffer[0] = 1.0;
+                Ok(CpuStorage::F32(buffer))},
             dtype => todo!("Unsupported dtype {dtype:?}")
         }
     }
@@ -271,13 +275,16 @@ impl MetalStorage {
         let elem_count = b * m * n;
         match (self.dtype, rhs.dtype) {
             (DType::F32, DType::F32) => {
+                let span = tracing::span!(tracing::Level::TRACE, "metal alloc matmul");
+                let _enter = span.enter();
+
+                let out_buffer = self.device.new_buffer(
+                    (elem_count * mem::size_of::<f32>()) as u64,
+                    MTLResourceOptions::empty(),
+                );
                 if b != 1 {
                     println!("TODO implement batched matmul for B={b}");
                     // bail!("Didn't implemented strided matmul yet");
-                    let out_buffer = self.device.new_buffer(
-                        (elem_count * mem::size_of::<f32>()) as u64,
-                        MTLResourceOptions::empty(),
-                    );
                     return Ok(Self {
                         buffer: out_buffer,
                         device: self.device.clone(),
@@ -286,20 +293,17 @@
                 }
                 if !lhs_l.is_contiguous() || !rhs_l.is_contiguous() {
                     println!("Didn't implemented non contiguous matmul yet {:?} {:?}", lhs_l.is_contiguous(), rhs_l.is_contiguous());
-                    let out_buffer = self.device.new_buffer(
-                        (elem_count * mem::size_of::<f32>()) as u64,
-                        MTLResourceOptions::empty(),
-                    );
                     return Ok(Self {
                         buffer: out_buffer,
                         device: self.device.clone(),
                         dtype: self.dtype(),
                     });
                 }
-                let out_buffer = self.device.new_buffer(
-                    (elem_count * mem::size_of::<f32>()) as u64,
-                    MTLResourceOptions::empty(),
-                );
+                return Ok(Self {
+                    buffer: out_buffer,
+                    device: self.device.clone(),
+                    dtype: self.dtype(),
+                });
                 let m: u64 = m.try_into().expect("usize should fit u64");
                 let n: u64 = n.try_into().expect("usize should fit u64");
                 let k: u64 = k.try_into().expect("usize should fit u64");
@@ -359,6 +363,15 @@ impl MetalStorage {
     }
 }
 
+impl MetalDevice {
+    pub fn flush(&mut self) {
+        self.command_buffer.commit();
+        self.command_buffer.wait_until_completed();
+        self.command_buffer = self._command_queue.new_owned_command_buffer();
+    }
+
+}
+
 impl BackendDevice for MetalDevice {
     type Storage = MetalStorage;
 
@@ -399,43 +412,47 @@
 
     fn storage_from_cpu_storage(&self, storage: &CpuStorage) -> Result<Self::Storage> {
         let option = metal::MTLResourceOptions::CPUCacheModeDefaultCache;
-        let buffer = match storage {
-            CpuStorage::U8(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<u8>()) as u64,
-                option,
-            ),
-            CpuStorage::U32(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<u32>()) as u64,
-                option,
-            ),
-            CpuStorage::I64(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<i64>()) as u64,
-                option,
-            ),
-            CpuStorage::BF16(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<bf16>()) as u64,
-                option,
-            ),
-            CpuStorage::F16(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<f16>()) as u64,
-                option,
-            ),
-            CpuStorage::F32(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<f32>()) as u64,
-                option,
-            ),
-            CpuStorage::F64(storage) => self.device.new_buffer_with_data(
-                storage.as_ptr() as *const core::ffi::c_void,
-                (storage.len() * mem::size_of::<f64>()) as u64,
-                option,
-            ),
-        };
+        let span = tracing::span!(tracing::Level::TRACE, "metal alloc");
+        let _enter = span.enter();
+
+        let buffer = self.device.new_buffer(4, option);
+        // let buffer = match storage {
+        //     CpuStorage::U8(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<u8>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::U32(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<u32>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::I64(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<i64>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::BF16(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<bf16>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::F16(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<f16>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::F32(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<f32>()) as u64,
+        //         option,
+        //     ),
+        //     CpuStorage::F64(storage) => self.device.new_buffer_with_data(
+        //         storage.as_ptr() as *const core::ffi::c_void,
+        //         (storage.len() * mem::size_of::<f64>()) as u64,
+        //         option,
+        //     ),
+        // };
         Ok(Self::Storage {
             buffer,
             device: self.clone(),
diff --git a/candle-examples/examples/quantized/main.rs b/candle-examples/examples/quantized/main.rs
index 347e87a8..7ce7a531 100644
--- a/candle-examples/examples/quantized/main.rs
+++ b/candle-examples/examples/quantized/main.rs
@@ -232,7 +232,7 @@ fn main() -> anyhow::Result<()> {
     use tracing_subscriber::prelude::*;
 
     let args = Args::parse();
-    let device = candle_examples::device(false)?;
+    let mut device = candle_examples::device(false)?;
     let temperature = if args.temperature == 0. {
         None
     } else {
@@ -384,17 +384,20 @@
     for index in 0..to_sample {
         let input = Tensor::new(&[next_token], &device)?.unsqueeze(0)?;
         let logits = model.forward(&input, prompt_tokens.len() + index)?;
+        if let candle::Device::Metal(device) = &mut device {
+            device.flush()
+        }
         let logits = logits.squeeze(0)?;
-        let logits = if args.repeat_penalty == 1. {
-            logits
-        } else {
-            let start_at = all_tokens.len().saturating_sub(args.repeat_last_n);
-            candle_transformers::utils::apply_repeat_penalty(
-                &logits,
-                args.repeat_penalty,
-                &all_tokens[start_at..],
-            )?
-        };
+        // let logits = if args.repeat_penalty == 1. {
+        //     logits
+        // } else {
+        //     let start_at = all_tokens.len().saturating_sub(args.repeat_last_n);
+        //     candle_transformers::utils::apply_repeat_penalty(
+        //         &logits,
+        //         args.repeat_penalty,
+        //         &all_tokens[start_at..],
+        //     )?
+        // };
         // TODO Remove this once implementation is finished.
         let logits = logits.ones_like()?;
         next_token = logits_processor.sample(&logits)?;
diff --git a/candle-transformers/src/models/quantized_llama.rs b/candle-transformers/src/models/quantized_llama.rs
index 3685d3de..fdf12fc6 100644
--- a/candle-transformers/src/models/quantized_llama.rs
+++ b/candle-transformers/src/models/quantized_llama.rs
@@ -79,6 +79,8 @@ fn masked_fill(on_false: &Tensor, mask: &Tensor, on_true: f32) -> Result<Tensor>
 impl LayerWeights {
     fn apply_rotary_emb(&self, x: &Tensor, index_pos: usize) -> Result<Tensor> {
         let _enter = self.span_rot.enter();
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-cos");
+        let _enter = span.enter();
         let (b_sz, n_head, seq_len, n_embd) = x.dims4()?;
         let cos = self
             .cos
@@ -88,21 +90,37 @@ impl LayerWeights {
             .sin
             .narrow(0, index_pos, seq_len)?
             .reshape((seq_len, n_embd / 2, 1))?;
+        drop(_enter);
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-broad");
+        let _enter = span.enter();
         let cos = cos.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?;
         let sin = sin.broadcast_as((b_sz, 1, seq_len, n_embd / 2, 1))?;
+        drop(_enter);
         // This mimics the llama.cpp behavior.
         // https://github.com/ggerganov/llama.cpp/blob/1f0bccb27929e261744c979bc75114955da49e98/ggml.c#L12104-L12105
         // The x0 and x1 value are interleaved on the n_embd (= head_dim) dimension.
         // The resulting y0 and y1 are also interleaved with:
         //   y0 = x0*cos - x1*sin
        //   y1 = x0*sin + x1*cos
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-reshape");
+        let _enter = span.enter();
         let x = x.reshape((b_sz, n_head, seq_len, n_embd / 2, 2))?;
         let x0 = x.narrow(D::Minus1, 0, 1)?;
         let x1 = x.narrow(D::Minus1, 1, 1)?;
+        drop(_enter);
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-broad-mul");
+        let _enter = span.enter();
         let y0 = (x0.broadcast_mul(&cos)? - x1.broadcast_mul(&sin)?)?;
         let y1 = (x0.broadcast_mul(&sin)? + x1.broadcast_mul(&cos)?)?;
+        drop(_enter);
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-cat");
+        let _enter = span.enter();
         let rope = Tensor::cat(&[y0, y1], D::Minus1)?;
+        drop(_enter);
+        let span = tracing::span!(tracing::Level::TRACE, "attn-rot-flatten");
+        let _enter = span.enter();
         let rope = rope.flatten_from(D::Minus2)?;
+        drop(_enter);
         Ok(rope)
     }
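
A side note on the profiling pattern this patch repeats in `apply_rotary_emb`: each
`span!` / `enter()` / `drop(_enter)` triple can also be written with
`tracing::Span::in_scope`, which enters the span for exactly the duration of a
closure, so a scope cannot be left open by accident. A minimal standalone sketch of
that alternative (illustrative only, not part of this commit; it assumes the
`tracing` and `tracing-subscriber` crates as dependencies):

    use tracing::{span, Level};

    fn main() {
        // Install a subscriber so the spans are recorded.
        tracing_subscriber::fmt()
            .with_max_level(Level::TRACE)
            .init();

        // One scoped span per stage, mirroring the span names used in the patch;
        // the toy vector math stands in for the tensor ops being profiled.
        let x = span!(Level::TRACE, "attn-rot-reshape").in_scope(|| vec![1.0f32; 8]);
        let y = span!(Level::TRACE, "attn-rot-broad-mul")
            .in_scope(|| x.iter().map(|v| v * 2.0).collect::<Vec<f32>>());
        println!("{y:?}");
    }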