Follow crate conventions

This commit is contained in:
Ivar Flakstad
2024-01-01 20:37:56 +01:00
parent 6eb44d1bce
commit e8e24f1284
5 changed files with 75 additions and 77 deletions

View File

@ -22,7 +22,11 @@ fn criterion_benchmark(c: &mut Criterion) {
bencher.iter_custom(|iters| { bencher.iter_custom(|iters| {
let start = Instant::now(); let start = Instant::now();
for _i in 0..iters { for _i in 0..iters {
run(black_box((b, rows, columns)), black_box(DType::U8), black_box(&device1)); run(
black_box((b, rows, columns)),
black_box(DType::U8),
black_box(&device1),
);
} }
if let Device::Metal(device) = &device1 { if let Device::Metal(device) = &device1 {
device.wait_until_completed().unwrap(); device.wait_until_completed().unwrap();
@ -35,12 +39,18 @@ fn criterion_benchmark(c: &mut Criterion) {
group.finish(); group.finish();
let mut group = c.benchmark_group("fill_metal_f32"); let mut group = c.benchmark_group("fill_metal_f32");
group.throughput(Throughput::Bytes((flops * DType::F32.size_in_bytes()) as u64)); group.throughput(Throughput::Bytes(
(flops * DType::F32.size_in_bytes()) as u64,
));
group.bench_function("iter", move |bencher| { group.bench_function("iter", move |bencher| {
bencher.iter_custom(|iters| { bencher.iter_custom(|iters| {
let start = Instant::now(); let start = Instant::now();
for _i in 0..iters { for _i in 0..iters {
run(black_box((b, rows, columns)), black_box(DType::F32), black_box(&device2)); run(
black_box((b, rows, columns)),
black_box(DType::F32),
black_box(&device2),
);
} }
if let Device::Metal(device) = &device2 { if let Device::Metal(device) = &device2 {
device.wait_until_completed().unwrap(); device.wait_until_completed().unwrap();

View File

@ -3,7 +3,7 @@ use crate::conv::{ParamsConv1D, ParamsConv2D, ParamsConvTranspose1D, ParamsConvT
use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT}; use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
use crate::{CpuStorage, DType, Layout, Result, Shape}; use crate::{CpuStorage, DType, Layout, Result, Shape};
use candle_metal_kernels; use candle_metal_kernels;
use candle_metal_kernels::{FillOp, Unary, Kernels}; use candle_metal_kernels::Kernels;
use half::{bf16, f16}; use half::{bf16, f16};
use metal; use metal;
use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger}; use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger};
@ -1405,15 +1405,14 @@ impl BackendDevice for MetalDevice {
let command_buffer = self.command_buffer()?; let command_buffer = self.command_buffer()?;
command_buffer.set_label("zeros"); command_buffer.set_label("zeros");
// This assumes the zero value of this DType is equal to 0x00u8 // This kernel assumes the zero value of this DType is equal to 0x00u8
// (which is true for all current types) // (which is true for all current types)
Unary::fill( candle_metal_kernels::call_fill_u8(
&self.device,
&command_buffer, &command_buffer,
&self.kernels, &self.kernels,
shape.elem_count(), shape.elem_count(),
&buffer, &buffer,
0u8, 0,
) )
.map_err(MetalError::from)?; .map_err(MetalError::from)?;
@ -1427,7 +1426,7 @@ impl BackendDevice for MetalDevice {
macro_rules! fill { macro_rules! fill {
($value:expr) => { ($value:expr) => {
Unary::fill( candle_metal_kernels::call_fill(
&self.device, &self.device,
&command_buffer, &command_buffer,
&self.kernels, &self.kernels,
@ -1439,7 +1438,14 @@ impl BackendDevice for MetalDevice {
}; };
} }
match dtype { match dtype {
DType::U8 => fill!(1u8), DType::U8 => candle_metal_kernels::call_fill_u8(
&command_buffer,
&self.kernels,
shape.elem_count(),
&buffer,
1u8,
)
.map_err(MetalError::from)?,
DType::U32 => fill!(1u32), DType::U32 => fill!(1u32),
DType::I64 => fill!(1i64), DType::I64 => fill!(1i64),
DType::BF16 => fill!(bf16::ONE), DType::BF16 => fill!(bf16::ONE),

View File

@ -20,7 +20,3 @@ num-traits = "0.2.17"
[dev-dependencies] [dev-dependencies]
rand = "0.8.5" rand = "0.8.5"
criterion = "0.5.1" criterion = "0.5.1"
[[bench]]
name = "fill"
harness = false

View File

@ -5,7 +5,6 @@ use metal::{
}; };
use std::collections::HashMap; use std::collections::HashMap;
use std::ffi::c_void; use std::ffi::c_void;
use std::marker::PhantomData;
use std::sync::RwLock; use std::sync::RwLock;
const AFFINE: &str = include_str!("affine.metal"); const AFFINE: &str = include_str!("affine.metal");
@ -1578,32 +1577,15 @@ fn divide(m: usize, b: usize) -> NSUInteger {
((m + b - 1) / b) as NSUInteger ((m + b - 1) / b) as NSUInteger
} }
pub struct Unary<T> { pub fn call_fill<T: FillOp>(
_marker: PhantomData<T>,
}
pub trait FillOp<T> {
const FILL_KERNEL: &'static str;
fn fill(
device: &Device, device: &Device,
command_buffer: &CommandBufferRef, command_buffer: &CommandBufferRef,
kernels: &Kernels, kernels: &Kernels,
elem_count: usize, elem_count: usize,
buffer: &Buffer, buffer: &Buffer,
value: T, value: T,
) -> Result<(), MetalKernelError>; ) -> Result<(), MetalKernelError> {
} let pipeline = kernels.load_pipeline(device, Source::Fill, T::FILL_KERNEL)?;
macro_rules ! impl_call_fill {
($($t:ty),*) => {
$(
impl FillOp<$t> for Unary<$t> {
const FILL_KERNEL: &'static str = concat!("fill_", stringify!($t));
#[inline(always)]
fn fill(device: &Device, command_buffer: &CommandBufferRef, kernels: &Kernels, elem_count: usize, buffer: &Buffer, value: $t) -> Result<(), MetalKernelError> {
let pipeline = kernels.load_pipeline(device, Source::Fill, Self::FILL_KERNEL)?;
let encoder = command_buffer.new_compute_command_encoder(); let encoder = command_buffer.new_compute_command_encoder();
encoder.wait_for_fence(&kernels.fence); encoder.wait_for_fence(&kernels.fence);
encoder.set_compute_pipeline_state(&pipeline); encoder.set_compute_pipeline_state(&pipeline);
@ -1618,29 +1600,19 @@ macro_rules ! impl_call_fill {
encoder.end_encoding(); encoder.end_encoding();
Ok(()) Ok(())
}
}
)*
};
} }
impl_call_fill!(u32, i64, f16, bf16, f32);
impl FillOp<u8> for Unary<u8> { pub fn call_fill_u8(
const FILL_KERNEL: &'static str = "";
#[inline(always)]
fn fill(
_: &Device,
command_buffer: &CommandBufferRef, command_buffer: &CommandBufferRef,
kernels: &Kernels, kernels: &Kernels,
elem_count: usize, elem_count: usize,
buffer: &Buffer, buffer: &Buffer,
value: u8, value: u8,
) -> Result<(), MetalKernelError> { ) -> Result<(), MetalKernelError> {
let blit = command_buffer.new_blit_command_encoder(); let blit = command_buffer.new_blit_command_encoder();
blit.wait_for_fence(&kernels.fence); blit.wait_for_fence(&kernels.fence);
blit.fill_buffer( blit.fill_buffer(
&buffer, buffer,
metal::NSRange { metal::NSRange {
location: 0, location: 0,
length: elem_count as NSUInteger, length: elem_count as NSUInteger,
@ -1651,8 +1623,22 @@ impl FillOp<u8> for Unary<u8> {
blit.end_encoding(); blit.end_encoding();
Ok(()) Ok(())
}
} }
pub trait FillOp: EncoderParam {
const FILL_KERNEL: &'static str;
}
macro_rules ! impl_call_fill {
($($t:ty),*) => {
$(
impl FillOp for $t {
const FILL_KERNEL: &'static str = concat!("fill_", stringify!($t));
}
)*
};
}
impl_call_fill!(u32, i64, f16, bf16, f32);
#[cfg(test)] #[cfg(test)]
mod tests; mod tests;