mirror of https://github.com/huggingface/candle.git
synced 2025-06-16 02:38:10 +00:00

commit d46670f7c0
parent f710fab02e
committed by Nicolas Patry

    Tmp state.
@@ -8,7 +8,7 @@ use crate::{CpuStorage, DType, Result, Shape, Storage, WithDType};
 pub enum DeviceLocation {
     Cpu,
     Cuda { gpu_id: usize },
-    Metal,
+    Metal { gpu_id: usize },
 }
 
 #[derive(Debug, Clone)]
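The `Metal` variant now carries a `gpu_id`, mirroring `Cuda`. A minimal sketch (not from the commit; the repo's crates alias candle-core as `candle`) of exhaustively matching the extended enum:

    use candle::DeviceLocation;

    // Render a device location the way the display code below does.
    fn describe(loc: DeviceLocation) -> String {
        match loc {
            DeviceLocation::Cpu => "cpu".to_string(),
            DeviceLocation::Cuda { gpu_id } => format!("cuda:{gpu_id}"),
            DeviceLocation::Metal { gpu_id } => format!("metal:{gpu_id}"),
        }
    }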
@@ -14,7 +14,9 @@ impl Tensor {
             crate::DeviceLocation::Cuda { gpu_id } => {
                 format!(", cuda:{}", gpu_id)
             }
-            _ => todo!(),
+            crate::DeviceLocation::Metal { gpu_id } => {
+                format!(", metal:{}", gpu_id)
+            }
         };
 
         write!(f, "Tensor[")?;
@@ -477,7 +479,9 @@ impl std::fmt::Display for Tensor {
             crate::DeviceLocation::Cuda { gpu_id } => {
                 format!(", cuda:{}", gpu_id)
             }
-            crate::DeviceLocation::Metal => todo!(),
+            crate::DeviceLocation::Metal { gpu_id } => {
+                format!(", metal:{}", gpu_id)
+            }
         };
 
         write!(
@@ -100,11 +100,30 @@ impl BackendStorage for MetalStorage {
     }
 
     fn to_cpu_storage(&self) -> Result<CpuStorage> {
+        // TODO Is this necessary
+        // self.buffer.synchronize();
         match self.dtype {
+            DType::U8 => Ok(CpuStorage::U8(
+                self.buffer.read_to_vec(self.buffer.length() as usize / 1),
+            )),
+            DType::U32 => Ok(CpuStorage::U32(
+                self.buffer.read_to_vec(self.buffer.length() as usize / 4),
+            )),
+            DType::I64 => Ok(CpuStorage::I64(
+                self.buffer.read_to_vec(self.buffer.length() as usize / 8),
+            )),
+            DType::F16 => Ok(CpuStorage::F16(
+                self.buffer.read_to_vec(self.buffer.length() as usize / 2),
+            )),
+            DType::BF16 => Ok(CpuStorage::BF16(
+                self.buffer.read_to_vec(self.buffer.length() as usize / 2),
+            )),
             DType::F32 => Ok(CpuStorage::F32(
                 self.buffer.read_to_vec(self.buffer.length() as usize / 4),
             )),
-            dtype => todo!("Unsupported dtype {dtype:?}"),
+            DType::F64 => Ok(CpuStorage::F64(
+                self.buffer.read_to_vec(self.buffer.length() as usize / 8),
+            )),
         }
     }
 
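The hard-coded divisors above (1, 4, 8, 2, 2, 4, 8) are just the per-dtype element sizes in bytes. A hedged sketch of the same element-count computation factored through a size helper (assuming a `DType::size_in_bytes` accessor like the one candle exposes; the commit itself keeps the literals):

    // Element count = buffer length in bytes / bytes per element.
    let el_count = self.buffer.length() as usize / self.dtype.size_in_bytes();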
@@ -132,6 +151,7 @@ impl BackendStorage for MetalStorage {
         )
         .unwrap();
         command_buffer.commit();
+        command_buffer.wait_until_completed();
         return Ok(Self {
             buffer,
             device: device.clone(),
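This commit inserts a blocking `wait_until_completed()` after nearly every `commit()` in the Metal backend. In metal-rs, `commit()` only enqueues the command buffer; waiting makes each op fully synchronous so that subsequent `read_to_vec` calls observe finished results, at an obvious throughput cost. The recurring pattern, sketched:

    // Encode, enqueue, then block until the GPU has finished.
    let command_buffer = command_queue.new_command_buffer();
    // ... encode kernel dispatches ...
    command_buffer.commit();               // enqueue for execution
    command_buffer.wait_until_completed(); // CPU blocks here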
@@ -200,6 +220,7 @@ impl BackendStorage for MetalStorage {
         )
         .map_err(MetalError::from)?;
         command_buffer.commit();
+        command_buffer.wait_until_completed();
 
         Ok(Self {
             buffer,
@@ -242,6 +263,7 @@ impl BackendStorage for MetalStorage {
         }
 
         command_buffer.commit();
+        command_buffer.wait_until_completed();
         // command_buffer.wait_until_scheduled();
         // debug!(
         //     "cast {:?} - {:?} - {:?}",
@@ -289,6 +311,7 @@ impl BackendStorage for MetalStorage {
             todo!("TODO Implement the kernel calling {}", B::KERNEL);
         }
         command_buffer.commit();
+        command_buffer.wait_until_completed();
 
         Ok(Self {
             buffer,
@@ -361,6 +384,7 @@ impl BackendStorage for MetalStorage {
             .map_err(MetalError::from)?;
         }
         command_buffer.commit();
+        command_buffer.wait_until_completed();
 
         Ok(Self {
             buffer,
@@ -400,6 +424,7 @@ impl BackendStorage for MetalStorage {
         )
         .map_err(MetalError::from)?;
         command_buffer.commit();
+        command_buffer.wait_until_completed();
         Ok(Self {
             buffer,
             device,
@@ -489,6 +514,7 @@ impl BackendStorage for MetalStorage {
         let dtype = self.dtype;
         let device = self.device();
         let mut buffer = device.new_buffer(dst_el, dtype);
+        let out = self.to_cpu_storage().unwrap();
         let name = match (ids.dtype, self.dtype) {
            (DType::U32, DType::F32) => "is_u32_f32",
            (left, right) => todo!("index select metal {left:?} {right:?}"),
@@ -508,6 +534,7 @@ impl BackendStorage for MetalStorage {
        )
        .map_err(MetalError::from)?;
        command_buffer.commit();
+       command_buffer.wait_until_completed();
        Ok(Self {
            buffer,
            device: device.clone(),
@@ -556,39 +583,42 @@ impl BackendStorage for MetalStorage {
         if el_count == 0 {
             return Ok(());
         }
-        if src_l.is_contiguous() {
-            let command_buffer = self.device.command_queue.new_command_buffer();
-            let blip = command_buffer.new_blit_command_encoder();
-            blip.copy_from_buffer(
-                &self.buffer,
-                src_l.start_offset() as u64,
-                &dst.buffer,
-                dst_offset as u64,
-                self.buffer.length(),
-            );
-        } else {
-            let command_buffer = self.device.command_queue.new_command_buffer();
-            let kernel_name = match self.dtype {
-                DType::F32 => candle_metal_kernels::unary::strided::copy::FLOAT,
-                DType::F16 => candle_metal_kernels::unary::strided::copy::HALF,
-                DType::BF16 => candle_metal_kernels::unary::strided::copy::BFLOAT,
-                dtype => todo!("copy_strided not implemented for {dtype:?}"),
-            };
-            candle_metal_kernels::call_unary_strided(
-                &self.device.device,
-                &command_buffer,
-                &self.device.kernels,
-                kernel_name,
-                src_l.dims(),
-                &self.buffer,
-                &src_l.stride(),
-                src_l.start_offset(),
-                &mut dst.buffer,
-                dst_offset,
-            )
-            .map_err(MetalError::from)?;
-            command_buffer.commit();
-        }
+        // todo!("Copy strided {:?}", src_l.is_contiguous());
+        // if src_l.is_contiguous() {
+        //     let command_buffer = self.device.command_queue.new_command_buffer();
+        //     let blip = command_buffer.new_blit_command_encoder();
+        //     blip.copy_from_buffer(
+        //         &self.buffer,
+        //         src_l.start_offset() as u64,
+        //         &dst.buffer,
+        //         dst_offset as u64,
+        //         self.buffer.length(),
+        //     );
+        // } else {
+        let command_buffer = self.device.command_queue.new_command_buffer();
+        let kernel_name = match self.dtype {
+            DType::F32 => candle_metal_kernels::unary::strided::copy::FLOAT,
+            DType::F16 => candle_metal_kernels::unary::strided::copy::HALF,
+            DType::BF16 => candle_metal_kernels::unary::strided::copy::BFLOAT,
+            dtype => todo!("copy_strided not implemented for {dtype:?}"),
+        };
+        candle_metal_kernels::call_unary_strided(
+            &self.device.device,
+            &command_buffer,
+            &self.device.kernels,
+            kernel_name,
+            src_l.dims(),
+            &self.buffer,
+            &src_l.stride(),
+            src_l.start_offset(),
+            &mut dst.buffer,
+            dst_offset,
+        )
+        .map_err(MetalError::from)?;
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
+        // todo!("Output {:?}", dst.buffer.read_to_vec::<f32>(10));
+        // }
         Ok(())
     }
 }
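The contiguous fast path through a blit encoder is commented out above, so every copy now goes through the strided unary copy kernel plus a blocking wait. Two things stand out in the disabled path: it never called `end_encoding()` on the blit encoder, and Metal's `copy_from_buffer` expects byte offsets while `src_l.start_offset()` and `dst_offset` look like element counts. A cleaned-up sketch of what that path would need (an assumption-laden reconstruction, not the commit's code):

    let command_buffer = self.device.command_queue.new_command_buffer();
    let blit = command_buffer.new_blit_command_encoder();
    let el_size = self.dtype.size_in_bytes() as u64; // hypothetical helper
    blit.copy_from_buffer(
        &self.buffer,
        src_l.start_offset() as u64 * el_size, // offsets are in bytes
        &dst.buffer,
        dst_offset as u64 * el_size,
        self.buffer.length(),
    );
    blit.end_encoding(); // missing in the disabled code
    command_buffer.commit();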
@@ -616,28 +646,29 @@ impl MetalStorage {
         match (self.dtype, rhs.dtype) {
             (DType::F32, DType::F32) => {
                 let mut out_buffer = self.device.new_buffer(elem_count, self.dtype);
-                if b != 1 {
-                    // debug!("TODO implement batched matmul for B={b}");
-                    // bail!("Didn't implemented strided matmul yet");
-                    return Ok(Self {
-                        buffer: out_buffer,
-                        device: self.device.clone(),
-                        dtype: self.dtype(),
-                    });
-                }
-                if !lhs_l.is_contiguous() || !rhs_l.is_contiguous() {
-                    // debug!(
-                    //     "TODO non contiguous matmul yet {:?} {:?} - {:?} - {transpose_right}",
-                    //     lhs_l.is_contiguous(),
-                    //     rhs_l.is_contiguous(),
-                    //     rhs_l
-                    // );
-                    return Ok(Self {
-                        buffer: out_buffer,
-                        device: self.device.clone(),
-                        dtype: self.dtype(),
-                    });
-                }
+                // if b != 1 {
+                //     // debug!("TODO implement batched matmul for B={b}");
+                //     crate::bail!("Didn't implemented strided matmul yet");
+                //     return Ok(Self {
+                //         buffer: out_buffer,
+                //         device: self.device.clone(),
+                //         dtype: self.dtype(),
+                //     });
+                // }
+                // if !lhs_l.is_contiguous() || !rhs_l.is_contiguous() {
+                //     // debug!(
+                //     //     "TODO non contiguous matmul yet {:?} {:?} - {:?} - {transpose_right}",
+                //     //     lhs_l.is_contiguous(),
+                //     //     rhs_l.is_contiguous(),
+                //     //     rhs_l
+                //     // );
+                //     crate::bail!("No not contiguous matmul");
+                //     return Ok(Self {
+                //         buffer: out_buffer,
+                //         device: self.device.clone(),
+                //         dtype: self.dtype(),
+                //     });
+                // }
 
                 // debug!("TODO GEMM");
                 let command_buffer = self.device.command_queue.new_command_buffer();
@@ -659,7 +690,15 @@ impl MetalStorage {
                 .map_err(MetalError::from)?;
 
                 command_buffer.commit();
+                command_buffer.wait_until_completed();
                 // command_buffer.wait_until_scheduled();
+                //
+                let left = self.buffer.read_to_vec::<f32>(10);
+                let right = rhs.buffer.read_to_vec::<f32>(10);
+                let out = out_buffer.read_to_vec::<f32>(10);
+
+                println!("{b} {m} {n} {k} ");
+                println!("{left:?} {right:?} {out:?}");
 
                 Ok(Self {
                     buffer: out_buffer,
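The `read_to_vec::<f32>(10)` debug reads above are only sound because of the `wait_until_completed()` call before them; reading a Metal buffer while its command buffer is still in flight yields stale data. The two `println!`s dump the GEMM dimensions and the first ten elements of each operand and of the output, which reads as temporary scaffolding for this "Tmp state." commit.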
@@ -709,7 +748,9 @@ impl BackendDevice for MetalDevice {
     }
 
     fn location(&self) -> crate::DeviceLocation {
-        crate::DeviceLocation::Metal
+        crate::DeviceLocation::Metal {
+            gpu_id: self.registry_id() as usize,
+        }
     }
 
     fn same_device(&self, rhs: &Self) -> bool {
@@ -767,6 +808,8 @@ impl BackendDevice for MetalDevice {
                 option,
             ),
         };
+        // TODO is that necessary ?
+        // buffer.did_modify_range(metal::NSRange::new(0, buffer.length()));
         // debug!("Allocate 2 - buffer size {}", buffer.length());
         Ok(Self::Storage {
             buffer,
@@ -157,6 +157,8 @@ pub(crate) fn from_storage<S: Into<Shape>>(
 ) -> Tensor {
     let dtype = storage.dtype();
     let device = storage.device();
+    let shape = shape.into();
+    // println!("{:?} {storage:?}", shape);
     let tensor_ = Tensor_ {
         id: TensorId::new(),
         storage: Arc::new(RwLock::new(storage)),
@@ -166,7 +168,11 @@ pub(crate) fn from_storage<S: Into<Shape>>(
         dtype,
         device,
     };
-    Tensor(Arc::new(tensor_))
+    let result = Tensor(Arc::new(tensor_));
+    // todo!(" from_storage");
+    // let result = result.to_device(&Device::Cpu).unwrap();
+    // todo!(" {result}");
+    result
 }
 
 impl Tensor {
@@ -329,14 +329,18 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {
         .get_ids()
         .to_vec();
 
+    println!("{tokens:?}");
+
     let start_gen = std::time::Instant::now();
-    for index in 0.. {
+    for index in 0..1 {
         if tokens.len() >= config.seq_len {
             break;
         }
         let context_size = if index > 0 { 1 } else { tokens.len() };
         let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
         let input = Tensor::new(ctxt, &device)?.unsqueeze(0)?;
+        // println!("Input {}", input);
+        // println!("Input {}", input.to_device(&candle::Device::Cpu)?);
         let logits = model.forward(&input, index_pos)?;
         let logits = logits.i((0, logits.dim(1)? - 1))?;
         let logits = if common_args.repeat_penalty == 1. || tokens.is_empty() {
@@ -17,3 +17,4 @@ tracing = "0.1.37"
 
 [dev-dependencies]
 half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
+rand = "0.8.5"
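The four new files below are standalone benchmark binaries under candle-metal-kernels/examples/. Assuming the standard Cargo example layout, each can be run with something like `cargo run --release --example affine` from the candle-metal-kernels directory (and likewise for binary, cast, and unary). They all print the same table: dtype, kernel name, input size, run count, total time, and average time per run.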
candle-metal-kernels/examples/affine.rs (new file, 75 lines)
@@ -0,0 +1,75 @@
+use candle_metal_kernels::{call_affine, Kernels};
+use metal::objc::rc::autoreleasepool;
+use metal::{Device, MTLResourceOptions};
+use rand;
+use std::any::type_name;
+use std::time::Instant;
+
+fn main() {
+    let device = Device::system_default().unwrap();
+    let kernels = Kernels::new();
+
+    let f32_1k = (0..1000).map(|_| rand::random::<f32>()).collect::<Vec<_>>();
+    let f32_10k = (0..10000)
+        .map(|_| rand::random::<f32>())
+        .collect::<Vec<_>>();
+    let f32_100k = (0..100000)
+        .map(|_| rand::random::<f32>())
+        .collect::<Vec<_>>();
+
+    println!(
+        "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11} | {5: <11}",
+        "dtype", "kernel", "size", "runs", "total time", "avg time"
+    );
+
+    // f32
+    run_affine_bench(&device, &kernels, &f32_1k);
+    run_affine_bench(&device, &kernels, &f32_10k);
+    run_affine_bench(&device, &kernels, &f32_100k);
+}
+
+fn run_affine_bench<T: Clone>(device: &Device, kernels: &Kernels, v: &[T]) {
+    let command_queue = device.new_command_queue();
+    let options = MTLResourceOptions::StorageModeManaged;
+
+    let iterations = 10000;
+    let input = device.new_buffer_with_data(
+        v.as_ptr() as *const core::ffi::c_void,
+        core::mem::size_of_val(v) as u64,
+        options,
+    );
+    let mut output = device.new_buffer(core::mem::size_of_val(v) as u64, options);
+
+    let mul: f32 = 1.2345;
+    let add: f32 = 2.3456;
+    let total_time = autoreleasepool(|| {
+        let command_buffer = command_queue.new_command_buffer();
+        let start = Instant::now();
+        for _ in 0..iterations {
+            call_affine(
+                &device,
+                command_buffer,
+                &kernels,
+                v.len(),
+                &input,
+                &mut output,
+                mul,
+                add,
+            )
+            .unwrap();
+        }
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
+
+        start.elapsed()
+    });
+    println!(
+        "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
+        type_name::<T>().split("::").last().unwrap(),
+        "affine",
+        v.len(),
+        iterations,
+        total_time,
+        total_time / iterations
+    );
+}

candle-metal-kernels/examples/binary.rs (new file, 182 lines)
@@ -0,0 +1,182 @@
+use candle_metal_kernels::{binary, call_binary_contiguous, call_binary_strided, Kernels};
+use half::{bf16, f16};
+use metal::objc::rc::autoreleasepool;
+use metal::{Device, MTLResourceOptions};
+use rand;
+use std::any::type_name;
+use std::time::Instant;
+
+fn main() {
+    let device = Device::system_default().unwrap();
+    let kernels = Kernels::new();
+
+    let f32_1k = (0..1000).map(|_| rand::random::<f32>()).collect::<Vec<_>>();
+    let f32_10k = (0..10000)
+        .map(|_| rand::random::<f32>())
+        .collect::<Vec<_>>();
+    let f32_100k = (0..100000)
+        .map(|_| rand::random::<f32>())
+        .collect::<Vec<_>>();
+
+    let f16_map = |v: &[f32]| v.iter().map(|v| f16::from_f32(*v)).collect::<Vec<_>>();
+    let f16_1k = f16_map(&f32_1k);
+    let f16_10k = f16_map(&f32_10k);
+    let f16_100k = f16_map(&f32_100k);
+
+    let bf16_map = |v: &[f32]| v.iter().map(|v| bf16::from_f32(*v)).collect::<Vec<_>>();
+    let bf16_1k = bf16_map(&f32_1k);
+    let bf16_10k = bf16_map(&f32_10k);
+    let bf16_100k = bf16_map(&f32_100k);
+
+    let f32_ckernels = [
+        binary::contiguous::add::FLOAT,
+        binary::contiguous::sub::FLOAT,
+        binary::contiguous::mul::FLOAT,
+        binary::contiguous::div::FLOAT,
+    ];
+    let f32_skernels = [
+        binary::strided::add::FLOAT,
+        binary::strided::sub::FLOAT,
+        binary::strided::mul::FLOAT,
+        binary::strided::div::FLOAT,
+    ];
+    let f16_ckernels = [
+        binary::contiguous::add::HALF,
+        binary::contiguous::sub::HALF,
+        binary::contiguous::mul::HALF,
+        binary::contiguous::div::HALF,
+    ];
+    let f16_skernels = [
+        binary::strided::add::HALF,
+        binary::strided::sub::HALF,
+        binary::strided::mul::HALF,
+        binary::strided::div::HALF,
+    ];
+    let bf16_ckernels = [
+        binary::contiguous::add::BFLOAT,
+        binary::contiguous::sub::BFLOAT,
+        binary::contiguous::mul::BFLOAT,
+        binary::contiguous::div::BFLOAT,
+    ];
+    let bf16_skernels = [
+        binary::strided::add::BFLOAT,
+        binary::strided::sub::BFLOAT,
+        binary::strided::mul::BFLOAT,
+        binary::strided::div::BFLOAT,
+    ];
+
+    println!(
+        "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11} | {5: <11}",
+        "dtype", "kernel", "size", "runs", "total time", "avg time"
+    );
+
+    // f32
+    run_binary_bench(&device, &kernels, &f32_1k, f32_ckernels, f32_skernels);
+    run_binary_bench(&device, &kernels, &f32_10k, f32_ckernels, f32_skernels);
+    run_binary_bench(&device, &kernels, &f32_100k, f32_ckernels, f32_skernels);
+
+    // f16
+    run_binary_bench(&device, &kernels, &f16_1k, f16_ckernels, f16_skernels);
+    run_binary_bench(&device, &kernels, &f16_10k, f16_ckernels, f16_skernels);
+    run_binary_bench(&device, &kernels, &f16_100k, f16_ckernels, f16_skernels);
+
+    // bf16
+    run_binary_bench(&device, &kernels, &bf16_1k, bf16_ckernels, bf16_skernels);
+    run_binary_bench(&device, &kernels, &bf16_10k, bf16_ckernels, bf16_skernels);
+    run_binary_bench(&device, &kernels, &bf16_100k, bf16_ckernels, bf16_skernels);
+}
+
+fn run_binary_bench<T: Clone>(
+    device: &Device,
+    kernels: &Kernels,
+    v: &[T],
+    contiguous: [binary::contiguous::Kernel; 4],
+    strided: [binary::strided::Kernel; 4],
+) {
+    let command_queue = device.new_command_queue();
+    let options = MTLResourceOptions::StorageModeManaged;
+
+    let iterations = 1000;
+    let input = device.new_buffer_with_data(
+        v.as_ptr() as *const core::ffi::c_void,
+        core::mem::size_of_val(v) as u64,
+        options,
+    );
+    let mut output = device.new_buffer(core::mem::size_of_val(v) as u64, options);
+
+    // Contiguous
+    for kernel_name in contiguous {
+        let total_time = autoreleasepool(|| {
+            let command_buffer = command_queue.new_command_buffer();
+            let start = Instant::now();
+            for _ in 0..iterations {
+                call_binary_contiguous(
+                    device,
+                    &command_buffer,
+                    kernels,
+                    kernel_name,
+                    v.len(),
+                    &input,
+                    &input,
+                    &mut output,
+                )
+                .unwrap();
+            }
+            command_buffer.commit();
+            command_buffer.wait_until_completed();
+
+            start.elapsed()
+        });
+        println!(
+            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
+            type_name::<T>().split("::").last().unwrap(),
+            kernel_name.to_string(),
+            v.len(),
+            iterations,
+            total_time,
+            total_time / iterations
+        );
+    }
+
+    // Strided
+    let shape = vec![2, 5_000];
+    let strides = vec![2, 1];
+    let offset = 0;
+    for kernel_name in strided {
+        let total_time = autoreleasepool(|| {
+            let command_buffer = command_queue.new_command_buffer();
+            let start = Instant::now();
+            for _ in 0..iterations {
+                call_binary_strided(
+                    device,
+                    command_buffer,
+                    &kernels,
+                    kernel_name,
+                    &shape,
+                    &input,
+                    &strides,
+                    offset,
+                    &input,
+                    &strides,
+                    offset,
+                    &mut output,
+                )
+                .unwrap();
+            }
+            command_buffer.commit();
+            command_buffer.wait_until_completed();
+
+            start.elapsed()
+        });
+
+        println!(
+            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
+            type_name::<T>().split("::").last().unwrap(),
+            kernel_name.to_string(),
+            v.len(),
+            iterations,
+            total_time,
+            total_time / iterations
+        );
+    }
+}

candle-metal-kernels/examples/cast.rs (new file, 84 lines)
@@ -0,0 +1,84 @@
+use candle_metal_kernels::{call_cast_contiguous, Kernels};
+use metal::objc::rc::autoreleasepool;
+use metal::{Device, MTLResourceOptions};
+use rand;
+use std::any::type_name;
+use std::time::Instant;
+
+fn main() {
+    let device = Device::system_default().unwrap();
+    let kernels = Kernels::new();
+
+    let f32_1k = (0..1000).map(|_| rand::random::<f32>()).collect::<Vec<_>>();
+    let f32_10k = (0..10000)
+        .map(|_| rand::random::<f32>())
+        .collect::<Vec<_>>();
+    let f32_100k = (0..100000)
+        .map(|_| rand::random::<f32>())
+        .collect::<Vec<_>>();
+
+    let contiguous_kernels = ["cast_u32_f32"];
+
+    println!(
+        "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11} | {5: <11}",
+        "dtype", "kernel", "size", "runs", "total time", "avg time"
+    );
+
+    // f32
+    run_cast_bench(&device, &kernels, &f32_1k, &contiguous_kernels);
+    run_cast_bench(&device, &kernels, &f32_10k, &contiguous_kernels);
+    run_cast_bench(&device, &kernels, &f32_100k, &contiguous_kernels);
+}
+
+fn run_cast_bench<T: Clone>(
+    device: &Device,
+    kernels: &Kernels,
+    v: &[T],
+    contiguous: &[&'static str],
+) {
+    let command_queue = device.new_command_queue();
+    let options = MTLResourceOptions::StorageModeManaged;
+
+    let iterations = 1000;
+    let input = device.new_buffer_with_data(
+        v.as_ptr() as *const core::ffi::c_void,
+        core::mem::size_of_val(v) as u64,
+        options,
+    );
+    let mut output = device.new_buffer(core::mem::size_of_val(v) as u64, options);
+
+    // Contiguous
+    for kernel_name in contiguous {
+        let total_time = autoreleasepool(|| {
+            let command_buffer = command_queue.new_command_buffer();
+            let start = Instant::now();
+            for _ in 0..iterations {
+                call_cast_contiguous(
+                    device,
+                    &command_buffer,
+                    kernels,
+                    kernel_name,
+                    v.len(),
+                    &input,
+                    &mut output,
+                )
+                .unwrap();
+            }
+            command_buffer.commit();
+            command_buffer.wait_until_completed();
+
+            start.elapsed()
+        });
+        println!(
+            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
+            type_name::<T>().split("::").last().unwrap(),
+            kernel_name.to_string(),
+            v.len(),
+            iterations,
+            total_time,
+            total_time / iterations
+        );
+    }
+
+    // Strided?
+}

candle-metal-kernels/examples/unary.rs (new file, 197 lines)
@@ -0,0 +1,197 @@
+use candle_metal_kernels::{call_unary_contiguous, call_unary_strided, unary, Kernels};
+use half::{bf16, f16};
+use metal::objc::rc::autoreleasepool;
+use metal::{Device, MTLResourceOptions};
+use rand;
+use std::any::type_name;
+use std::time::Instant;
+
+fn main() {
+    let device = Device::system_default().unwrap();
+    let kernels = Kernels::new();
+
+    let f32_1k = (0..1000).map(|_| rand::random::<f32>()).collect::<Vec<_>>();
+    let f32_10k = (0..10000)
+        .map(|_| rand::random::<f32>())
+        .collect::<Vec<_>>();
+    let f32_100k = (0..100000)
+        .map(|_| rand::random::<f32>())
+        .collect::<Vec<_>>();
+
+    let f16_map = |v: &[f32]| v.iter().map(|v| f16::from_f32(*v)).collect::<Vec<_>>();
+    let f16_1k = f16_map(&f32_1k);
+    let f16_10k = f16_map(&f32_10k);
+    let f16_100k = f16_map(&f32_100k);
+
+    let bf16_map = |v: &[f32]| v.iter().map(|v| bf16::from_f32(*v)).collect::<Vec<_>>();
+    let bf16_1k = bf16_map(&f32_1k);
+    let bf16_10k = bf16_map(&f32_10k);
+    let bf16_100k = bf16_map(&f32_100k);
+
+    let f32_ckernels = [
+        unary::contiguous::sin::FLOAT,
+        unary::contiguous::cos::FLOAT,
+        unary::contiguous::exp::FLOAT,
+        unary::contiguous::sqr::FLOAT,
+        unary::contiguous::sqrt::FLOAT,
+        unary::contiguous::neg::FLOAT,
+        unary::contiguous::copy::FLOAT,
+    ];
+    let f32_skernels = [
+        unary::strided::sin::FLOAT,
+        unary::strided::cos::FLOAT,
+        unary::strided::exp::FLOAT,
+        unary::strided::sqr::FLOAT,
+        unary::strided::sqrt::FLOAT,
+        unary::strided::neg::FLOAT,
+        unary::strided::copy::FLOAT,
+    ];
+    let f16_ckernels = [
+        unary::contiguous::sin::HALF,
+        unary::contiguous::cos::HALF,
+        unary::contiguous::exp::HALF,
+        unary::contiguous::sqr::HALF,
+        unary::contiguous::sqrt::HALF,
+        unary::contiguous::neg::HALF,
+        unary::contiguous::copy::HALF,
+    ];
+    let f16_skernels = [
+        unary::strided::sin::HALF,
+        unary::strided::cos::HALF,
+        unary::strided::exp::HALF,
+        unary::strided::sqr::HALF,
+        unary::strided::sqrt::HALF,
+        unary::strided::neg::HALF,
+        unary::strided::copy::HALF,
+    ];
+    let bf16_ckernels = [
+        unary::contiguous::sin::BFLOAT,
+        unary::contiguous::cos::BFLOAT,
+        unary::contiguous::exp::BFLOAT,
+        unary::contiguous::sqr::BFLOAT,
+        unary::contiguous::sqrt::BFLOAT,
+        unary::contiguous::neg::BFLOAT,
+        unary::contiguous::copy::BFLOAT,
+    ];
+    let bf16_skernels = [
+        unary::strided::sin::BFLOAT,
+        unary::strided::cos::BFLOAT,
+        unary::strided::exp::BFLOAT,
+        unary::strided::sqr::BFLOAT,
+        unary::strided::sqrt::BFLOAT,
+        unary::strided::neg::BFLOAT,
+        unary::strided::copy::BFLOAT,
+    ];
+
+    println!(
+        "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11} | {5: <11}",
+        "dtype", "kernel", "size", "runs", "total time", "avg time"
+    );
+
+    // f32
+    run_unary_bench(&device, &kernels, &f32_1k, f32_ckernels, f32_skernels);
+    run_unary_bench(&device, &kernels, &f32_10k, f32_ckernels, f32_skernels);
+    run_unary_bench(&device, &kernels, &f32_100k, f32_ckernels, f32_skernels);
+
+    // f16
+    run_unary_bench(&device, &kernels, &f16_1k, f16_ckernels, f16_skernels);
+    run_unary_bench(&device, &kernels, &f16_10k, f16_ckernels, f16_skernels);
+    run_unary_bench(&device, &kernels, &f16_100k, f16_ckernels, f16_skernels);
+
+    // bf16
+    run_unary_bench(&device, &kernels, &bf16_1k, bf16_ckernels, bf16_skernels);
+    run_unary_bench(&device, &kernels, &bf16_10k, bf16_ckernels, bf16_skernels);
+    run_unary_bench(&device, &kernels, &bf16_100k, bf16_ckernels, bf16_skernels);
+}
+
+fn run_unary_bench<T: Clone>(
+    device: &Device,
+    kernels: &Kernels,
+    v: &[T],
+    contiguous: [unary::contiguous::Kernel; 7],
+    strided: [unary::strided::Kernel; 7],
+) {
+    let command_queue = device.new_command_queue();
+    let options = MTLResourceOptions::StorageModeManaged;
+
+    let iterations = 10000;
+    let input = device.new_buffer_with_data(
+        v.as_ptr() as *const core::ffi::c_void,
+        core::mem::size_of_val(v) as u64,
+        options,
+    );
+    let mut output = device.new_buffer(core::mem::size_of_val(v) as u64, options);
+
+    // Contiguous
+    for kernel_name in contiguous {
+        let total_time = autoreleasepool(|| {
+            let command_buffer = command_queue.new_command_buffer();
+            let start = Instant::now();
+            for _ in 0..iterations {
+                call_unary_contiguous(
+                    device,
+                    &command_buffer,
+                    kernels,
+                    kernel_name,
+                    v.len(),
+                    &input,
+                    &mut output,
+                )
+                .unwrap();
+            }
+            command_buffer.commit();
+            command_buffer.wait_until_completed();
+
+            start.elapsed()
+        });
+        println!(
+            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
+            type_name::<T>().split("::").last().unwrap(),
+            kernel_name.to_string(),
+            v.len(),
+            iterations,
+            total_time,
+            total_time / iterations
+        );
+    }
+
+    // Strided
+    let shape = vec![2, 5_000];
+    let strides = vec![2, 1];
+    let offset = 0;
+    for kernel_name in strided {
+        let total_time = autoreleasepool(|| {
+            let command_buffer = command_queue.new_command_buffer();
+            let start = Instant::now();
+            for _ in 0..iterations {
+                call_unary_strided(
+                    device,
+                    command_buffer,
+                    &kernels,
+                    kernel_name,
+                    &shape,
+                    &input,
+                    &strides,
+                    offset,
+                    &mut output,
+                    0,
+                )
+                .unwrap();
+            }
+            command_buffer.commit();
+            command_buffer.wait_until_completed();
+
+            start.elapsed()
+        });
+
+        println!(
+            "{0: <5} | {1: <19} | {2: <6} | {3: <5} | {4: <11?} | {5: <11?}",
+            type_name::<T>().split("::").last().unwrap(),
+            kernel_name.to_string(),
+            v.len(),
+            iterations,
+            total_time,
+            total_time / iterations
+        );
+    }
+}
@@ -47,7 +47,7 @@ kernel void FN_NAME_STRIDED( \
         return; \
     } \
     TYPENAME x = left[get_strided_index(thread_position_in_grid, num_dims, dims, left_strides)]; \
-    TYPENAME y = right[get_strided_index(thread_position_in_grid, num_dims, dims, left_strides)]; \
+    TYPENAME y = right[get_strided_index(thread_position_in_grid, num_dims, dims, right_strides)]; \
     output[thread_position_in_grid] = OUT_TYPENAME(FN); \
 }
 
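This one-word fix matters for correctness: `y` was being gathered with `left_strides`, so any binary op whose right-hand operand had a different layout (broadcast, transposed, offset) read the wrong elements. For intuition, a Rust rendering of what `get_strided_index` computes (a sketch mirroring the shader helper, not code from this commit):

    /// Map a flat element index to a storage offset through dims/strides,
    /// consuming dimensions from innermost to outermost.
    fn get_strided_index(mut idx: usize, dims: &[usize], strides: &[usize]) -> usize {
        let mut offset = 0;
        for d in (0..dims.len()).rev() {
            offset += (idx % dims[d]) * strides[d];
            idx /= dims[d];
        }
        offset
    }

With distinct left and right strides, the same thread index can resolve to different offsets in each operand, which is exactly what the fix restores.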
@@ -112,7 +112,13 @@ macro_rules! ops{
 ($($name:ident),+) => {
 
     pub mod contiguous {
+    #[derive(Clone, Copy)]
     pub struct Kernel(pub(crate) &'static str);
+    impl std::fmt::Display for Kernel {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            write!(f, "{}", self.0)
+        }
+    }
     $(
     pub mod $name {
     use super::Kernel;
@@ -124,7 +130,13 @@ macro_rules! ops{
     }
 
     pub mod strided {
+    #[derive(Clone, Copy)]
     pub struct Kernel(pub(crate) &'static str);
+    impl std::fmt::Display for Kernel {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            write!(f, "{}", self.0)
+        }
+    }
     $(
     pub mod $name {
     use super::Kernel;
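`Copy` lets the new examples pass `Kernel` handles around by value in arrays, and `Display` is what makes `kernel_name.to_string()` work in their result tables. A one-line illustration (hypothetical usage; `Display` simply writes out the wrapped &'static str kernel name):

    println!("{}", candle_metal_kernels::unary::contiguous::cos::FLOAT);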
@@ -859,6 +871,30 @@ mod tests {
         assert_eq!(approx(expected, 4), vec![0.5403; 10_000]);
     }
 
+    #[test]
+    fn cos_strided_random() {
+        let v: Vec<_> = (0..10_000).map(|i| rand::random::<f32>()).collect();
+        let shape = vec![5_000, 2];
+        let strides = vec![1, 5_000];
+        let offset = 0;
+        let results = run_strided(&v, unary::strided::cos::FLOAT, &shape, &strides, offset);
+        let expected: Vec<_> = v.iter().map(|v| v.cos()).collect();
+        assert_eq!(approx(vec![results[0]], 4), approx(vec![expected[0]], 4));
+        assert_eq!(
+            approx(vec![results[1]], 4),
+            approx(vec![expected[5_000]], 4)
+        );
+        assert_eq!(approx(vec![results[2]], 4), approx(vec![expected[1]], 4));
+        assert_eq!(
+            approx(vec![results[3]], 4),
+            approx(vec![expected[5_001]], 4)
+        );
+        assert_eq!(
+            approx(vec![results[5_000]], 4),
+            approx(vec![expected[2_500]], 4)
+        );
+    }
+
     #[test]
     fn binary_add_f32() {
         let left = vec![1.0f32, 2.0, 3.0];
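The assertions in `cos_strided_random` follow directly from the index arithmetic sketched earlier: with shape [5_000, 2] and strides [1, 5_000], flat output index k maps to input offset (k % 2) * 5_000 + (k / 2). So output 0 reads input 0, output 1 reads input 5_000, output 2 reads input 1, output 3 reads input 5_001, and output 5_000 reads input 2_500, exactly the pairs the test checks. In effect the test verifies a transposed traversal of the contiguous input.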
@@ -9,6 +9,7 @@ pub struct Embedding {
 
 impl Embedding {
     pub fn new(embeddings: Tensor, hidden_size: usize) -> Self {
+        // todo!("Embedding {embeddings}");
         Self {
             embeddings,
             hidden_size,
@@ -165,6 +165,7 @@ impl CausalSelfAttention {
     fn forward(&self, x: &Tensor, index_pos: usize, block_idx: usize) -> Result<Tensor> {
         let (b_sz, seq_len, n_embd) = x.dims3()?;
         let q = self.q_proj.forward(x)?;
+        todo!("X {q}");
         let k = self.k_proj.forward(x)?;
         let v = self.v_proj.forward(x)?;
 
@@ -295,6 +296,7 @@ impl Block {
         let residual = x;
         let x = self.rms_1.forward(x)?;
         let x = (self.attn.forward(&x, index_pos, block_idx)? + residual)?;
+        todo!("---X {}", x);
         let residual = &x;
         let x = (self.mlp.forward(&self.rms_2.forward(&x)?)? + residual)?;
         Ok(x)
@@ -327,6 +329,7 @@ impl Llama {
     pub fn forward(&self, x: &Tensor, index_pos: usize) -> Result<Tensor> {
         let (_b_sz, _seq_len) = x.dims2()?;
         let mut x = self.wte.forward(x)?;
+        //println!("Embeddings {}", self.wte.embeddings());
         for (block_idx, block) in self.blocks.iter().enumerate() {
             x = block.forward(&x, index_pos, block_idx)?;
         }