Tmp state.

This commit is contained in:
Nicolas Patry
2023-11-10 15:35:46 +01:00
committed by Nicolas Patry
parent f710fab02e
commit d46670f7c0
14 changed files with 699 additions and 63 deletions

View File

@ -8,7 +8,7 @@ use crate::{CpuStorage, DType, Result, Shape, Storage, WithDType};
pub enum DeviceLocation { pub enum DeviceLocation {
Cpu, Cpu,
Cuda { gpu_id: usize }, Cuda { gpu_id: usize },
Metal, Metal { gpu_id: usize },
} }
#[derive(Debug, Clone)] #[derive(Debug, Clone)]

View File

@ -14,7 +14,9 @@ impl Tensor {
crate::DeviceLocation::Cuda { gpu_id } => { crate::DeviceLocation::Cuda { gpu_id } => {
format!(", cuda:{}", gpu_id) format!(", cuda:{}", gpu_id)
} }
_ => todo!(), crate::DeviceLocation::Metal { gpu_id } => {
format!(", metal:{}", gpu_id)
}
}; };
write!(f, "Tensor[")?; write!(f, "Tensor[")?;
@ -477,7 +479,9 @@ impl std::fmt::Display for Tensor {
crate::DeviceLocation::Cuda { gpu_id } => { crate::DeviceLocation::Cuda { gpu_id } => {
format!(", cuda:{}", gpu_id) format!(", cuda:{}", gpu_id)
} }
crate::DeviceLocation::Metal => todo!(), crate::DeviceLocation::Metal { gpu_id } => {
format!(", metal:{}", gpu_id)
}
}; };
write!( write!(

View File

@ -100,11 +100,30 @@ impl BackendStorage for MetalStorage {
} }
fn to_cpu_storage(&self) -> Result<CpuStorage> { fn to_cpu_storage(&self) -> Result<CpuStorage> {
// TODO Is this necessary
// self.buffer.synchronize();
match self.dtype { match self.dtype {
DType::U8 => Ok(CpuStorage::U8(
self.buffer.read_to_vec(self.buffer.length() as usize / 1),
)),
DType::U32 => Ok(CpuStorage::U32(
self.buffer.read_to_vec(self.buffer.length() as usize / 4),
)),
DType::I64 => Ok(CpuStorage::I64(
self.buffer.read_to_vec(self.buffer.length() as usize / 8),
)),
DType::F16 => Ok(CpuStorage::F16(
self.buffer.read_to_vec(self.buffer.length() as usize / 2),
)),
DType::BF16 => Ok(CpuStorage::BF16(
self.buffer.read_to_vec(self.buffer.length() as usize / 2),
)),
DType::F32 => Ok(CpuStorage::F32( DType::F32 => Ok(CpuStorage::F32(
self.buffer.read_to_vec(self.buffer.length() as usize / 4), self.buffer.read_to_vec(self.buffer.length() as usize / 4),
)), )),
dtype => todo!("Unsupported dtype {dtype:?}"), DType::F64 => Ok(CpuStorage::F64(
self.buffer.read_to_vec(self.buffer.length() as usize / 8),
)),
} }
} }
@ -132,6 +151,7 @@ impl BackendStorage for MetalStorage {
) )
.unwrap(); .unwrap();
command_buffer.commit(); command_buffer.commit();
command_buffer.wait_until_completed();
return Ok(Self { return Ok(Self {
buffer, buffer,
device: device.clone(), device: device.clone(),
@ -200,6 +220,7 @@ impl BackendStorage for MetalStorage {
) )
.map_err(MetalError::from)?; .map_err(MetalError::from)?;
command_buffer.commit(); command_buffer.commit();
command_buffer.wait_until_completed();
Ok(Self { Ok(Self {
buffer, buffer,
@ -242,6 +263,7 @@ impl BackendStorage for MetalStorage {
} }
command_buffer.commit(); command_buffer.commit();
command_buffer.wait_until_completed();
// command_buffer.wait_until_scheduled(); // command_buffer.wait_until_scheduled();
// debug!( // debug!(
// "cast {:?} - {:?} - {:?}", // "cast {:?} - {:?} - {:?}",
@ -289,6 +311,7 @@ impl BackendStorage for MetalStorage {
todo!("TODO Implement the kernel calling {}", B::KERNEL); todo!("TODO Implement the kernel calling {}", B::KERNEL);
} }
command_buffer.commit(); command_buffer.commit();
command_buffer.wait_until_completed();
Ok(Self { Ok(Self {
buffer, buffer,
@ -361,6 +384,7 @@ impl BackendStorage for MetalStorage {
.map_err(MetalError::from)?; .map_err(MetalError::from)?;
} }
command_buffer.commit(); command_buffer.commit();
command_buffer.wait_until_completed();
Ok(Self { Ok(Self {
buffer, buffer,
@ -400,6 +424,7 @@ impl BackendStorage for MetalStorage {
) )
.map_err(MetalError::from)?; .map_err(MetalError::from)?;
command_buffer.commit(); command_buffer.commit();
command_buffer.wait_until_completed();
Ok(Self { Ok(Self {
buffer, buffer,
device, device,
@ -489,6 +514,7 @@ impl BackendStorage for MetalStorage {
let dtype = self.dtype; let dtype = self.dtype;
let device = self.device(); let device = self.device();
let mut buffer = device.new_buffer(dst_el, dtype); let mut buffer = device.new_buffer(dst_el, dtype);
let out = self.to_cpu_storage().unwrap();
let name = match (ids.dtype, self.dtype) { let name = match (ids.dtype, self.dtype) {
(DType::U32, DType::F32) => "is_u32_f32", (DType::U32, DType::F32) => "is_u32_f32",
(left, right) => todo!("index select metal {left:?} {right:?}"), (left, right) => todo!("index select metal {left:?} {right:?}"),
@ -508,6 +534,7 @@ impl BackendStorage for MetalStorage {
) )
.map_err(MetalError::from)?; .map_err(MetalError::from)?;
command_buffer.commit(); command_buffer.commit();
command_buffer.wait_until_completed();
Ok(Self { Ok(Self {
buffer, buffer,
device: device.clone(), device: device.clone(),
@ -556,39 +583,42 @@ impl BackendStorage for MetalStorage {
if el_count == 0 { if el_count == 0 {
return Ok(()); return Ok(());
} }
if src_l.is_contiguous() { // todo!("Copy strided {:?}", src_l.is_contiguous());
let command_buffer = self.device.command_queue.new_command_buffer(); // if src_l.is_contiguous() {
let blip = command_buffer.new_blit_command_encoder(); // let command_buffer = self.device.command_queue.new_command_buffer();
blip.copy_from_buffer( // let blip = command_buffer.new_blit_command_encoder();
&self.buffer, // blip.copy_from_buffer(
src_l.start_offset() as u64, // &self.buffer,
&dst.buffer, // src_l.start_offset() as u64,
dst_offset as u64, // &dst.buffer,
self.buffer.length(), // dst_offset as u64,
); // self.buffer.length(),
} else { // );
let command_buffer = self.device.command_queue.new_command_buffer(); // } else {
let kernel_name = match self.dtype { let command_buffer = self.device.command_queue.new_command_buffer();
DType::F32 => candle_metal_kernels::unary::strided::copy::FLOAT, let kernel_name = match self.dtype {
DType::F16 => candle_metal_kernels::unary::strided::copy::HALF, DType::F32 => candle_metal_kernels::unary::strided::copy::FLOAT,
DType::BF16 => candle_metal_kernels::unary::strided::copy::BFLOAT, DType::F16 => candle_metal_kernels::unary::strided::copy::HALF,
dtype => todo!("copy_strided not implemented for {dtype:?}"), DType::BF16 => candle_metal_kernels::unary::strided::copy::BFLOAT,
}; dtype => todo!("copy_strided not implemented for {dtype:?}"),
candle_metal_kernels::call_unary_strided( };
&self.device.device, candle_metal_kernels::call_unary_strided(
&command_buffer, &self.device.device,
&self.device.kernels, &command_buffer,
kernel_name, &self.device.kernels,
src_l.dims(), kernel_name,
&self.buffer, src_l.dims(),
&src_l.stride(), &self.buffer,
src_l.start_offset(), &src_l.stride(),
&mut dst.buffer, src_l.start_offset(),
dst_offset, &mut dst.buffer,
) dst_offset,
.map_err(MetalError::from)?; )
command_buffer.commit(); .map_err(MetalError::from)?;
} command_buffer.commit();
command_buffer.wait_until_completed();
// todo!("Output {:?}", dst.buffer.read_to_vec::<f32>(10));
// }
Ok(()) Ok(())
} }
} }
@ -616,28 +646,29 @@ impl MetalStorage {
match (self.dtype, rhs.dtype) { match (self.dtype, rhs.dtype) {
(DType::F32, DType::F32) => { (DType::F32, DType::F32) => {
let mut out_buffer = self.device.new_buffer(elem_count, self.dtype); let mut out_buffer = self.device.new_buffer(elem_count, self.dtype);
if b != 1 { // if b != 1 {
// debug!("TODO implement batched matmul for B={b}"); // // debug!("TODO implement batched matmul for B={b}");
// bail!("Didn't implemented strided matmul yet"); // crate::bail!("Didn't implemented strided matmul yet");
return Ok(Self { // return Ok(Self {
buffer: out_buffer, // buffer: out_buffer,
device: self.device.clone(), // device: self.device.clone(),
dtype: self.dtype(), // dtype: self.dtype(),
}); // });
} //}
if !lhs_l.is_contiguous() || !rhs_l.is_contiguous() { // if !lhs_l.is_contiguous() || !rhs_l.is_contiguous() {
// debug!( // // debug!(
// "TODO non contiguous matmul yet {:?} {:?} - {:?} - {transpose_right}", // // "TODO non contiguous matmul yet {:?} {:?} - {:?} - {transpose_right}",
// lhs_l.is_contiguous(), // // lhs_l.is_contiguous(),
// rhs_l.is_contiguous(), // // rhs_l.is_contiguous(),
// rhs_l // // rhs_l
// ); // // );
return Ok(Self { // crate::bail!("No not contiguous matmul");
buffer: out_buffer, // return Ok(Self {
device: self.device.clone(), // buffer: out_buffer,
dtype: self.dtype(), // device: self.device.clone(),
}); // dtype: self.dtype(),
} // });
// }
// debug!("TODO GEMM"); // debug!("TODO GEMM");
let command_buffer = self.device.command_queue.new_command_buffer(); let command_buffer = self.device.command_queue.new_command_buffer();
@ -659,7 +690,15 @@ impl MetalStorage {
.map_err(MetalError::from)?; .map_err(MetalError::from)?;
command_buffer.commit(); command_buffer.commit();
command_buffer.wait_until_completed();
// command_buffer.wait_until_scheduled(); // command_buffer.wait_until_scheduled();
//
let left = self.buffer.read_to_vec::<f32>(10);
let right = rhs.buffer.read_to_vec::<f32>(10);
let out = out_buffer.read_to_vec::<f32>(10);
println!("{b} {m} {n} {k} ");
println!("{left:?} {right:?} {out:?}");
Ok(Self { Ok(Self {
buffer: out_buffer, buffer: out_buffer,
@ -709,7 +748,9 @@ impl BackendDevice for MetalDevice {
} }
fn location(&self) -> crate::DeviceLocation { fn location(&self) -> crate::DeviceLocation {
crate::DeviceLocation::Metal crate::DeviceLocation::Metal {
gpu_id: self.registry_id() as usize,
}
} }
fn same_device(&self, rhs: &Self) -> bool { fn same_device(&self, rhs: &Self) -> bool {
@ -767,6 +808,8 @@ impl BackendDevice for MetalDevice {
option, option,
), ),
}; };
// TODO is that necessary ?
// buffer.did_modify_range(metal::NSRange::new(0, buffer.length()));
// debug!("Allocate 2 - buffer size {}", buffer.length()); // debug!("Allocate 2 - buffer size {}", buffer.length());
Ok(Self::Storage { Ok(Self::Storage {
buffer, buffer,

View File

@ -157,6 +157,8 @@ pub(crate) fn from_storage<S: Into<Shape>>(
) -> Tensor { ) -> Tensor {
let dtype = storage.dtype(); let dtype = storage.dtype();
let device = storage.device(); let device = storage.device();
let shape = shape.into();
// println!("{:?} {storage:?}", shape);
let tensor_ = Tensor_ { let tensor_ = Tensor_ {
id: TensorId::new(), id: TensorId::new(),
storage: Arc::new(RwLock::new(storage)), storage: Arc::new(RwLock::new(storage)),
@ -166,7 +168,11 @@ pub(crate) fn from_storage<S: Into<Shape>>(
dtype, dtype,
device, device,
}; };
Tensor(Arc::new(tensor_)) let result = Tensor(Arc::new(tensor_));
// todo!(" from_storage");
// let result = result.to_device(&Device::Cpu).unwrap();
// todo!(" {result}");
result
} }
impl Tensor { impl Tensor {

View File

@ -329,14 +329,18 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {
.get_ids() .get_ids()
.to_vec(); .to_vec();
println!("{tokens:?}");
let start_gen = std::time::Instant::now(); let start_gen = std::time::Instant::now();
for index in 0.. { for index in 0..1 {
if tokens.len() >= config.seq_len { if tokens.len() >= config.seq_len {
break; break;
} }
let context_size = if index > 0 { 1 } else { tokens.len() }; let context_size = if index > 0 { 1 } else { tokens.len() };
let ctxt = &tokens[tokens.len().saturating_sub(context_size)..]; let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
let input = Tensor::new(ctxt, &device)?.unsqueeze(0)?; let input = Tensor::new(ctxt, &device)?.unsqueeze(0)?;
// println!("Input {}", input);
// println!("Input {}", input.to_device(&candle::Device::Cpu)?);
let logits = model.forward(&input, index_pos)?; let logits = model.forward(&input, index_pos)?;
let logits = logits.i((0, logits.dim(1)? - 1))?; let logits = logits.i((0, logits.dim(1)? - 1))?;
let logits = if common_args.repeat_penalty == 1. || tokens.is_empty() { let logits = if common_args.repeat_penalty == 1. || tokens.is_empty() {

View File

@ -17,3 +17,4 @@ tracing = "0.1.37"
[dev-dependencies] [dev-dependencies]
half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] } half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
rand = "0.8.5"

View File

@ -0,0 +1,75 @@
use candle_metal_kernels::{call_affine, Kernels};
use metal::objc::rc::autoreleasepool;
use metal::{Device, MTLResourceOptions};
use rand;
use std::any::type_name;
use std::time::Instant;
/// Entry point of the affine benchmark.
///
/// Generates random `f32` inputs of 1k, 10k and 100k elements, prints the table
/// header, then runs the affine benchmark once per input size.
///
/// Panics if no default Metal device is available.
fn main() {
    let device = Device::system_default().unwrap();
    let kernels = Kernels::new();

    // All inputs are generated up front, smallest first.
    let random_f32 = |len: usize| (0..len).map(|_| rand::random::<f32>()).collect::<Vec<f32>>();
    let inputs = [random_f32(1000), random_f32(10000), random_f32(100000)];

    println!(
        "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11} | {5: <11}",
        "dtype", "kernel", "size", "runs", "total time", "avg time"
    );

    // f32
    for input in &inputs {
        run_affine_bench(&device, &kernels, input);
    }
}
/// Times the affine kernel over `v` and prints one result row.
///
/// `v` is uploaded into a managed buffer, then `iterations` affine dispatches are
/// encoded into a single command buffer. The measured time spans encoding, commit
/// and waiting for the command buffer to complete.
///
/// Panics if encoding a kernel call fails.
fn run_affine_bench<T: Clone>(device: &Device, kernels: &Kernels, v: &[T]) {
    let iterations = 10000;
    let mul: f32 = 1.2345;
    let add: f32 = 2.3456;

    let queue = device.new_command_queue();
    let storage = MTLResourceOptions::StorageModeManaged;
    // Input and output buffers share the same byte length.
    let byte_len = core::mem::size_of_val(v) as u64;
    let src = device.new_buffer_with_data(
        v.as_ptr() as *const core::ffi::c_void,
        byte_len,
        storage,
    );
    let mut dst = device.new_buffer(byte_len, storage);

    let elapsed = autoreleasepool(|| {
        let cmd = queue.new_command_buffer();
        let timer = Instant::now();
        for _ in 0..iterations {
            call_affine(device, cmd, kernels, v.len(), &src, &mut dst, mul, add).unwrap();
        }
        cmd.commit();
        cmd.wait_until_completed();
        timer.elapsed()
    });

    // Only the last path segment of the type name is shown (e.g. `f32`).
    let dtype = type_name::<T>().split("::").last().unwrap();
    let average = elapsed / iterations;
    println!(
        "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
        dtype,
        "affine",
        v.len(),
        iterations,
        elapsed,
        average
    );
}

View File

@ -0,0 +1,182 @@
use candle_metal_kernels::{binary, call_binary_contiguous, call_binary_strided, Kernels};
use half::{bf16, f16};
use metal::objc::rc::autoreleasepool;
use metal::{Device, MTLResourceOptions};
use rand;
use std::any::type_name;
use std::time::Instant;
fn main() {
let device = Device::system_default().unwrap();
let kernels = Kernels::new();
let f32_1k = (0..1000).map(|_| rand::random::<f32>()).collect::<Vec<_>>();
let f32_10k = (0..10000)
.map(|_| rand::random::<f32>())
.collect::<Vec<_>>();
let f32_100k = (0..100000)
.map(|_| rand::random::<f32>())
.collect::<Vec<_>>();
let f16_map = |v: &[f32]| v.iter().map(|v| f16::from_f32(*v)).collect::<Vec<_>>();
let f16_1k = f16_map(&f32_1k);
let f16_10k = f16_map(&f32_10k);
let f16_100k = f16_map(&f32_100k);
let bf16_map = |v: &[f32]| v.iter().map(|v| bf16::from_f32(*v)).collect::<Vec<_>>();
let bf16_1k = bf16_map(&f32_1k);
let bf16_10k = bf16_map(&f32_10k);
let bf16_100k = bf16_map(&f32_100k);
let f32_ckernels = [
binary::contiguous::add::FLOAT,
binary::contiguous::sub::FLOAT,
binary::contiguous::mul::FLOAT,
binary::contiguous::div::FLOAT,
];
let f32_skernels = [
binary::strided::add::FLOAT,
binary::strided::sub::FLOAT,
binary::strided::mul::FLOAT,
binary::strided::div::FLOAT,
];
let f16_ckernels = [
binary::contiguous::add::HALF,
binary::contiguous::sub::HALF,
binary::contiguous::mul::HALF,
binary::contiguous::div::HALF,
];
let f16_skernels = [
binary::strided::add::HALF,
binary::strided::sub::HALF,
binary::strided::mul::HALF,
binary::strided::div::HALF,
];
let bf16_ckernels = [
binary::contiguous::add::BFLOAT,
binary::contiguous::sub::BFLOAT,
binary::contiguous::mul::BFLOAT,
binary::contiguous::div::BFLOAT,
];
let bf16_skernels = [
binary::strided::add::BFLOAT,
binary::strided::sub::BFLOAT,
binary::strided::mul::BFLOAT,
binary::strided::div::BFLOAT,
];
println!(
"{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11} | {5: <11}",
"dtype", "kernel", "size", "runs", "total time", "avg time"
);
// f32
run_binary_bench(&device, &kernels, &f32_1k, f32_ckernels, f32_skernels);
run_binary_bench(&device, &kernels, &f32_10k, f32_ckernels, f32_skernels);
run_binary_bench(&device, &kernels, &f32_100k, f32_ckernels, f32_skernels);
// f16
run_binary_bench(&device, &kernels, &f16_1k, f16_ckernels, f16_skernels);
run_binary_bench(&device, &kernels, &f16_10k, f16_ckernels, f16_skernels);
run_binary_bench(&device, &kernels, &f16_100k, f16_ckernels, f16_skernels);
// bf16
run_binary_bench(&device, &kernels, &bf16_1k, bf16_ckernels, bf16_skernels);
run_binary_bench(&device, &kernels, &bf16_10k, bf16_ckernels, bf16_skernels);
run_binary_bench(&device, &kernels, &bf16_100k, bf16_ckernels, bf16_skernels);
}
/// Benchmarks each contiguous and strided binary kernel on `v`, printing one row per kernel.
///
/// `v` is uploaded once into a managed buffer and used as both the left and the right
/// operand. For every kernel, `iterations` dispatches are encoded into a single command
/// buffer; the reported time spans encoding, commit and completion of that buffer.
///
/// The strided pass views `v` as a transposed `[v.len() / 2, 2]` matrix (strides
/// `[1, v.len() / 2]`). The shape is derived from the input length so that the amount of
/// work follows the reported size and the largest addressed element index is
/// `2 * (v.len() / 2) - 1`, which is inside the buffers allocated from `v`.
/// NOTE(review): this assumes the strided kernel processes one element per entry of
/// `shape` — confirm against `call_binary_strided`.
///
/// Panics if encoding a kernel call fails.
fn run_binary_bench<T: Clone>(
    device: &Device,
    kernels: &Kernels,
    v: &[T],
    contiguous: [binary::contiguous::Kernel; 4],
    strided: [binary::strided::Kernel; 4],
) {
    let command_queue = device.new_command_queue();
    let options = MTLResourceOptions::StorageModeManaged;
    let iterations = 1000;

    // Input and output buffers share the same byte length.
    let byte_len = core::mem::size_of_val(v) as u64;
    let input = device.new_buffer_with_data(
        v.as_ptr() as *const core::ffi::c_void,
        byte_len,
        options,
    );
    let mut output = device.new_buffer(byte_len, options);

    // Only the last path segment of the type name is shown (e.g. `f32`).
    let dtype = type_name::<T>().split("::").last().unwrap();

    // Contiguous
    for kernel_name in contiguous {
        let total_time = autoreleasepool(|| {
            let command_buffer = command_queue.new_command_buffer();
            let start = Instant::now();
            for _ in 0..iterations {
                call_binary_contiguous(
                    device,
                    command_buffer,
                    kernels,
                    kernel_name,
                    v.len(),
                    &input,
                    &input,
                    &mut output,
                )
                .unwrap();
            }
            command_buffer.commit();
            command_buffer.wait_until_completed();

            start.elapsed()
        });
        println!(
            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
            dtype,
            kernel_name.to_string(),
            v.len(),
            iterations,
            total_time,
            total_time / iterations
        );
    }

    // Strided: a transposed view sized from the input instead of a fixed 2 x 5_000 shape,
    // which overran the 1k buffers and under-used the 100k ones.
    let half = v.len() / 2;
    let shape = vec![half, 2];
    let strides = vec![1, half];
    let offset = 0;
    for kernel_name in strided {
        let total_time = autoreleasepool(|| {
            let command_buffer = command_queue.new_command_buffer();
            let start = Instant::now();
            for _ in 0..iterations {
                call_binary_strided(
                    device,
                    command_buffer,
                    kernels,
                    kernel_name,
                    &shape,
                    &input,
                    &strides,
                    offset,
                    &input,
                    &strides,
                    offset,
                    &mut output,
                )
                .unwrap();
            }
            command_buffer.commit();
            command_buffer.wait_until_completed();

            start.elapsed()
        });

        println!(
            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
            dtype,
            kernel_name.to_string(),
            v.len(),
            iterations,
            total_time,
            total_time / iterations
        );
    }
}

View File

@ -0,0 +1,84 @@
use candle_metal_kernels::{call_cast_contiguous, Kernels};
use metal::objc::rc::autoreleasepool;
use metal::{Device, MTLResourceOptions};
use rand;
use std::any::type_name;
use std::time::Instant;
/// Entry point of the cast benchmark.
///
/// Generates random `f32` inputs of 1k, 10k and 100k elements, prints the table
/// header, then runs the listed contiguous cast kernels once per input size.
///
/// Panics if no default Metal device is available.
fn main() {
    let device = Device::system_default().unwrap();
    let kernels = Kernels::new();

    // All inputs are generated up front, smallest first.
    let random_f32 = |len: usize| (0..len).map(|_| rand::random::<f32>()).collect::<Vec<f32>>();
    let inputs = [random_f32(1000), random_f32(10000), random_f32(100000)];

    let contiguous_kernels = ["cast_u32_f32"];

    println!(
        "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11} | {5: <11}",
        "dtype", "kernel", "size", "runs", "total time", "avg time"
    );

    // f32
    for input in &inputs {
        run_cast_bench(&device, &kernels, input, &contiguous_kernels);
    }
}
/// Times each contiguous cast kernel in `contiguous` over `v`, printing one row per kernel.
///
/// `v` is uploaded into a managed buffer; for every kernel, `iterations` dispatches are
/// encoded into a single command buffer, and the measured time spans encoding, commit
/// and waiting for completion.
///
/// Panics if encoding a kernel call fails.
fn run_cast_bench<T: Clone>(
    device: &Device,
    kernels: &Kernels,
    v: &[T],
    contiguous: &[&'static str],
) {
    let iterations = 1000;
    let queue = device.new_command_queue();
    let storage = MTLResourceOptions::StorageModeManaged;
    // Input and output buffers share the same byte length.
    let byte_len = core::mem::size_of_val(v) as u64;
    let src = device.new_buffer_with_data(
        v.as_ptr() as *const core::ffi::c_void,
        byte_len,
        storage,
    );
    let mut dst = device.new_buffer(byte_len, storage);

    // Only the last path segment of the type name is shown (e.g. `f32`).
    let dtype = type_name::<T>().split("::").last().unwrap();

    // Contiguous
    for &kernel_name in contiguous {
        let elapsed = autoreleasepool(|| {
            let cmd = queue.new_command_buffer();
            let timer = Instant::now();
            for _ in 0..iterations {
                call_cast_contiguous(device, cmd, kernels, kernel_name, v.len(), &src, &mut dst)
                    .unwrap();
            }
            cmd.commit();
            cmd.wait_until_completed();
            timer.elapsed()
        });
        let average = elapsed / iterations;
        println!(
            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
            dtype,
            kernel_name.to_string(),
            v.len(),
            iterations,
            elapsed,
            average
        );
    }

    // Strided?
}

View File

@ -0,0 +1,197 @@
use candle_metal_kernels::{call_unary_contiguous, call_unary_strided, unary, Kernels};
use half::{bf16, f16};
use metal::objc::rc::autoreleasepool;
use metal::{Device, MTLResourceOptions};
use rand;
use std::any::type_name;
use std::time::Instant;
fn main() {
let device = Device::system_default().unwrap();
let kernels = Kernels::new();
let f32_1k = (0..1000).map(|_| rand::random::<f32>()).collect::<Vec<_>>();
let f32_10k = (0..10000)
.map(|_| rand::random::<f32>())
.collect::<Vec<_>>();
let f32_100k = (0..100000)
.map(|_| rand::random::<f32>())
.collect::<Vec<_>>();
let f16_map = |v: &[f32]| v.iter().map(|v| f16::from_f32(*v)).collect::<Vec<_>>();
let f16_1k = f16_map(&f32_1k);
let f16_10k = f16_map(&f32_10k);
let f16_100k = f16_map(&f32_100k);
let bf16_map = |v: &[f32]| v.iter().map(|v| bf16::from_f32(*v)).collect::<Vec<_>>();
let bf16_1k = bf16_map(&f32_1k);
let bf16_10k = bf16_map(&f32_10k);
let bf16_100k = bf16_map(&f32_100k);
let f32_ckernels = [
unary::contiguous::sin::FLOAT,
unary::contiguous::cos::FLOAT,
unary::contiguous::exp::FLOAT,
unary::contiguous::sqr::FLOAT,
unary::contiguous::sqrt::FLOAT,
unary::contiguous::neg::FLOAT,
unary::contiguous::copy::FLOAT,
];
let f32_skernels = [
unary::strided::sin::FLOAT,
unary::strided::cos::FLOAT,
unary::strided::exp::FLOAT,
unary::strided::sqr::FLOAT,
unary::strided::sqrt::FLOAT,
unary::strided::neg::FLOAT,
unary::strided::copy::FLOAT,
];
let f16_ckernels = [
unary::contiguous::sin::HALF,
unary::contiguous::cos::HALF,
unary::contiguous::exp::HALF,
unary::contiguous::sqr::HALF,
unary::contiguous::sqrt::HALF,
unary::contiguous::neg::HALF,
unary::contiguous::copy::HALF,
];
let f16_skernels = [
unary::strided::sin::HALF,
unary::strided::cos::HALF,
unary::strided::exp::HALF,
unary::strided::sqr::HALF,
unary::strided::sqrt::HALF,
unary::strided::neg::HALF,
unary::strided::copy::HALF,
];
let bf16_ckernels = [
unary::contiguous::sin::BFLOAT,
unary::contiguous::cos::BFLOAT,
unary::contiguous::exp::BFLOAT,
unary::contiguous::sqr::BFLOAT,
unary::contiguous::sqrt::BFLOAT,
unary::contiguous::neg::BFLOAT,
unary::contiguous::copy::BFLOAT,
];
let bf16_skernels = [
unary::strided::sin::BFLOAT,
unary::strided::cos::BFLOAT,
unary::strided::exp::BFLOAT,
unary::strided::sqr::BFLOAT,
unary::strided::sqrt::BFLOAT,
unary::strided::neg::BFLOAT,
unary::strided::copy::BFLOAT,
];
println!(
"{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11} | {5: <11}",
"dtype", "kernel", "size", "runs", "total time", "avg time"
);
// f32
run_unary_bench(&device, &kernels, &f32_1k, f32_ckernels, f32_skernels);
run_unary_bench(&device, &kernels, &f32_10k, f32_ckernels, f32_skernels);
run_unary_bench(&device, &kernels, &f32_100k, f32_ckernels, f32_skernels);
// f16
run_unary_bench(&device, &kernels, &f16_1k, f16_ckernels, f16_skernels);
run_unary_bench(&device, &kernels, &f16_10k, f16_ckernels, f16_skernels);
run_unary_bench(&device, &kernels, &f16_100k, f16_ckernels, f16_skernels);
// bf16
run_unary_bench(&device, &kernels, &bf16_1k, bf16_ckernels, bf16_skernels);
run_unary_bench(&device, &kernels, &bf16_10k, bf16_ckernels, bf16_skernels);
run_unary_bench(&device, &kernels, &bf16_100k, bf16_ckernels, bf16_skernels);
}
/// Benchmarks each contiguous and strided unary kernel on `v`, printing one row per kernel.
///
/// `v` is uploaded once into a managed buffer. For every kernel, `iterations` dispatches
/// are encoded into a single command buffer; the reported time spans encoding, commit and
/// completion of that buffer.
///
/// The strided pass views `v` as a transposed `[v.len() / 2, 2]` matrix (strides
/// `[1, v.len() / 2]`). The shape is derived from the input length so that the amount of
/// work follows the reported size and the largest addressed element index is
/// `2 * (v.len() / 2) - 1`, which is inside the buffers allocated from `v`.
/// NOTE(review): this assumes the strided kernel processes one element per entry of
/// `shape` — confirm against `call_unary_strided`.
///
/// Panics if encoding a kernel call fails.
fn run_unary_bench<T: Clone>(
    device: &Device,
    kernels: &Kernels,
    v: &[T],
    contiguous: [unary::contiguous::Kernel; 7],
    strided: [unary::strided::Kernel; 7],
) {
    let command_queue = device.new_command_queue();
    let options = MTLResourceOptions::StorageModeManaged;
    let iterations = 10000;

    // Input and output buffers share the same byte length.
    let byte_len = core::mem::size_of_val(v) as u64;
    let input = device.new_buffer_with_data(
        v.as_ptr() as *const core::ffi::c_void,
        byte_len,
        options,
    );
    let mut output = device.new_buffer(byte_len, options);

    // Only the last path segment of the type name is shown (e.g. `f32`).
    let dtype = type_name::<T>().split("::").last().unwrap();

    // Contiguous
    for kernel_name in contiguous {
        let total_time = autoreleasepool(|| {
            let command_buffer = command_queue.new_command_buffer();
            let start = Instant::now();
            for _ in 0..iterations {
                call_unary_contiguous(
                    device,
                    command_buffer,
                    kernels,
                    kernel_name,
                    v.len(),
                    &input,
                    &mut output,
                )
                .unwrap();
            }
            command_buffer.commit();
            command_buffer.wait_until_completed();

            start.elapsed()
        });
        println!(
            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
            dtype,
            kernel_name.to_string(),
            v.len(),
            iterations,
            total_time,
            total_time / iterations
        );
    }

    // Strided: a transposed view sized from the input instead of a fixed 2 x 5_000 shape,
    // which overran the 1k buffers and under-used the 100k ones.
    let half = v.len() / 2;
    let shape = vec![half, 2];
    let strides = vec![1, half];
    let offset = 0;
    for kernel_name in strided {
        let total_time = autoreleasepool(|| {
            let command_buffer = command_queue.new_command_buffer();
            let start = Instant::now();
            for _ in 0..iterations {
                call_unary_strided(
                    device,
                    command_buffer,
                    kernels,
                    kernel_name,
                    &shape,
                    &input,
                    &strides,
                    offset,
                    &mut output,
                    0,
                )
                .unwrap();
            }
            command_buffer.commit();
            command_buffer.wait_until_completed();

            start.elapsed()
        });

        println!(
            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
            dtype,
            kernel_name.to_string(),
            v.len(),
            iterations,
            total_time,
            total_time / iterations
        );
    }
}

View File

@ -47,7 +47,7 @@ kernel void FN_NAME_STRIDED( \
return; \ return; \
} \ } \
TYPENAME x = left[get_strided_index(thread_position_in_grid, num_dims, dims, left_strides)]; \ TYPENAME x = left[get_strided_index(thread_position_in_grid, num_dims, dims, left_strides)]; \
TYPENAME y = right[get_strided_index(thread_position_in_grid, num_dims, dims, left_strides)]; \ TYPENAME y = right[get_strided_index(thread_position_in_grid, num_dims, dims, right_strides)]; \
output[thread_position_in_grid] = OUT_TYPENAME(FN); \ output[thread_position_in_grid] = OUT_TYPENAME(FN); \
} }

View File

@ -112,7 +112,13 @@ macro_rules! ops{
($($name:ident),+) => { ($($name:ident),+) => {
pub mod contiguous { pub mod contiguous {
#[derive(Clone, Copy)]
pub struct Kernel(pub(crate) &'static str); pub struct Kernel(pub(crate) &'static str);
// Displays a kernel as its wrapped name string.
impl std::fmt::Display for Kernel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The name is written verbatim; formatter width/fill options are not applied.
        f.write_str(self.0)
    }
}
$( $(
pub mod $name { pub mod $name {
use super::Kernel; use super::Kernel;
@ -124,7 +130,13 @@ macro_rules! ops{
} }
pub mod strided { pub mod strided {
#[derive(Clone, Copy)]
pub struct Kernel(pub(crate) &'static str); pub struct Kernel(pub(crate) &'static str);
// Displays a kernel as its wrapped name string.
impl std::fmt::Display for Kernel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The name is written verbatim; formatter width/fill options are not applied.
        f.write_str(self.0)
    }
}
$( $(
pub mod $name { pub mod $name {
use super::Kernel; use super::Kernel;
@ -859,6 +871,30 @@ mod tests {
assert_eq!(approx(expected, 4), vec![0.5403; 10_000]); assert_eq!(approx(expected, 4), vec![0.5403; 10_000]);
} }
/// Checks the strided cosine kernel on random data viewed through a transposed layout.
///
/// The input holds 10_000 values read with shape `[5_000, 2]` and strides `[1, 5_000]`,
/// so output element `j` is expected to be the cosine of input element
/// `j / 2 + (j % 2) * 5_000`. A handful of positions are spot-checked.
#[test]
fn cos_strided_random() {
    let v: Vec<_> = (0..10_000).map(|_| rand::random::<f32>()).collect();
    let shape = vec![5_000, 2];
    let strides = vec![1, 5_000];
    let offset = 0;
    let results = run_strided(&v, unary::strided::cos::FLOAT, &shape, &strides, offset);
    let expected: Vec<_> = v.iter().map(|v| v.cos()).collect();

    // (output index, input index) pairs implied by the strides above.
    let checks = [
        (0usize, 0usize),
        (1, 5_000),
        (2, 1),
        (3, 5_001),
        (5_000, 2_500),
    ];
    for (out_idx, in_idx) in checks {
        assert_eq!(
            approx(vec![results[out_idx]], 4),
            approx(vec![expected[in_idx]], 4),
            "output[{out_idx}] should equal cos(input[{in_idx}])"
        );
    }
}
#[test] #[test]
fn binary_add_f32() { fn binary_add_f32() {
let left = vec![1.0f32, 2.0, 3.0]; let left = vec![1.0f32, 2.0, 3.0];

View File

@ -9,6 +9,7 @@ pub struct Embedding {
impl Embedding { impl Embedding {
pub fn new(embeddings: Tensor, hidden_size: usize) -> Self { pub fn new(embeddings: Tensor, hidden_size: usize) -> Self {
// todo!("Embedding {embeddings}");
Self { Self {
embeddings, embeddings,
hidden_size, hidden_size,

View File

@ -165,6 +165,7 @@ impl CausalSelfAttention {
fn forward(&self, x: &Tensor, index_pos: usize, block_idx: usize) -> Result<Tensor> { fn forward(&self, x: &Tensor, index_pos: usize, block_idx: usize) -> Result<Tensor> {
let (b_sz, seq_len, n_embd) = x.dims3()?; let (b_sz, seq_len, n_embd) = x.dims3()?;
let q = self.q_proj.forward(x)?; let q = self.q_proj.forward(x)?;
todo!("X {q}");
let k = self.k_proj.forward(x)?; let k = self.k_proj.forward(x)?;
let v = self.v_proj.forward(x)?; let v = self.v_proj.forward(x)?;
@ -295,6 +296,7 @@ impl Block {
let residual = x; let residual = x;
let x = self.rms_1.forward(x)?; let x = self.rms_1.forward(x)?;
let x = (self.attn.forward(&x, index_pos, block_idx)? + residual)?; let x = (self.attn.forward(&x, index_pos, block_idx)? + residual)?;
todo!("---X {}", x);
let residual = &x; let residual = &x;
let x = (self.mlp.forward(&self.rms_2.forward(&x)?)? + residual)?; let x = (self.mlp.forward(&self.rms_2.forward(&x)?)? + residual)?;
Ok(x) Ok(x)
@ -327,6 +329,7 @@ impl Llama {
pub fn forward(&self, x: &Tensor, index_pos: usize) -> Result<Tensor> { pub fn forward(&self, x: &Tensor, index_pos: usize) -> Result<Tensor> {
let (_b_sz, _seq_len) = x.dims2()?; let (_b_sz, _seq_len) = x.dims2()?;
let mut x = self.wte.forward(x)?; let mut x = self.wte.forward(x)?;
//println!("Embeddings {}", self.wte.embeddings());
for (block_idx, block) in self.blocks.iter().enumerate() { for (block_idx, block) in self.blocks.iter().enumerate() {
x = block.forward(&x, index_pos, block_idx)?; x = block.forward(&x, index_pos, block_idx)?;
} }