Follow crate conventions

This commit is contained in:
Ivar Flakstad
2024-01-01 20:37:56 +01:00
parent 6eb44d1bce
commit e8e24f1284
5 changed files with 75 additions and 77 deletions

View File

@ -22,7 +22,11 @@ fn criterion_benchmark(c: &mut Criterion) {
bencher.iter_custom(|iters| {
let start = Instant::now();
for _i in 0..iters {
run(black_box((b, rows, columns)), black_box(DType::U8), black_box(&device1));
run(
black_box((b, rows, columns)),
black_box(DType::U8),
black_box(&device1),
);
}
if let Device::Metal(device) = &device1 {
device.wait_until_completed().unwrap();
@ -35,12 +39,18 @@ fn criterion_benchmark(c: &mut Criterion) {
group.finish();
let mut group = c.benchmark_group("fill_metal_f32");
group.throughput(Throughput::Bytes((flops * DType::F32.size_in_bytes()) as u64));
group.throughput(Throughput::Bytes(
(flops * DType::F32.size_in_bytes()) as u64,
));
group.bench_function("iter", move |bencher| {
bencher.iter_custom(|iters| {
let start = Instant::now();
for _i in 0..iters {
run(black_box((b, rows, columns)), black_box(DType::F32), black_box(&device2));
run(
black_box((b, rows, columns)),
black_box(DType::F32),
black_box(&device2),
);
}
if let Device::Metal(device) = &device2 {
device.wait_until_completed().unwrap();

View File

@ -3,7 +3,7 @@ use crate::conv::{ParamsConv1D, ParamsConv2D, ParamsConvTranspose1D, ParamsConvT
use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT};
use crate::{CpuStorage, DType, Layout, Result, Shape};
use candle_metal_kernels;
use candle_metal_kernels::{FillOp, Unary, Kernels};
use candle_metal_kernels::Kernels;
use half::{bf16, f16};
use metal;
use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger};
@ -1405,15 +1405,14 @@ impl BackendDevice for MetalDevice {
let command_buffer = self.command_buffer()?;
command_buffer.set_label("zeros");
// This assumes the zero value of this DType is equal to 0x00u8
// This kernel assumes the zero value of this DType is equal to 0x00u8
// (which is true for all current types)
Unary::fill(
&self.device,
candle_metal_kernels::call_fill_u8(
&command_buffer,
&self.kernels,
shape.elem_count(),
&buffer,
0u8,
0,
)
.map_err(MetalError::from)?;
@ -1427,7 +1426,7 @@ impl BackendDevice for MetalDevice {
macro_rules! fill {
($value:expr) => {
Unary::fill(
candle_metal_kernels::call_fill(
&self.device,
&command_buffer,
&self.kernels,
@ -1439,7 +1438,14 @@ impl BackendDevice for MetalDevice {
};
}
match dtype {
DType::U8 => fill!(1u8),
DType::U8 => candle_metal_kernels::call_fill_u8(
&command_buffer,
&self.kernels,
shape.elem_count(),
&buffer,
1u8,
)
.map_err(MetalError::from)?,
DType::U32 => fill!(1u32),
DType::I64 => fill!(1i64),
DType::BF16 => fill!(bf16::ONE),

View File

@ -20,7 +20,3 @@ num-traits = "0.2.17"
[dev-dependencies]
rand = "0.8.5"
criterion = "0.5.1"
[[bench]]
name = "fill"
harness = false

View File

@ -5,7 +5,6 @@ use metal::{
};
use std::collections::HashMap;
use std::ffi::c_void;
use std::marker::PhantomData;
use std::sync::RwLock;
const AFFINE: &str = include_str!("affine.metal");
@ -1578,32 +1577,15 @@ fn divide(m: usize, b: usize) -> NSUInteger {
((m + b - 1) / b) as NSUInteger
}
pub struct Unary<T> {
_marker: PhantomData<T>,
}
pub trait FillOp<T> {
const FILL_KERNEL: &'static str;
fn fill(
pub fn call_fill<T: FillOp>(
device: &Device,
command_buffer: &CommandBufferRef,
kernels: &Kernels,
elem_count: usize,
buffer: &Buffer,
value: T,
) -> Result<(), MetalKernelError>;
}
macro_rules ! impl_call_fill {
($($t:ty),*) => {
$(
impl FillOp<$t> for Unary<$t> {
const FILL_KERNEL: &'static str = concat!("fill_", stringify!($t));
#[inline(always)]
fn fill(device: &Device, command_buffer: &CommandBufferRef, kernels: &Kernels, elem_count: usize, buffer: &Buffer, value: $t) -> Result<(), MetalKernelError> {
let pipeline = kernels.load_pipeline(device, Source::Fill, Self::FILL_KERNEL)?;
) -> Result<(), MetalKernelError> {
let pipeline = kernels.load_pipeline(device, Source::Fill, T::FILL_KERNEL)?;
let encoder = command_buffer.new_compute_command_encoder();
encoder.wait_for_fence(&kernels.fence);
encoder.set_compute_pipeline_state(&pipeline);
@ -1619,18 +1601,8 @@ macro_rules ! impl_call_fill {
Ok(())
}
}
)*
};
}
impl_call_fill!(u32, i64, f16, bf16, f32);
impl FillOp<u8> for Unary<u8> {
const FILL_KERNEL: &'static str = "";
#[inline(always)]
fn fill(
_: &Device,
pub fn call_fill_u8(
command_buffer: &CommandBufferRef,
kernels: &Kernels,
elem_count: usize,
@ -1640,7 +1612,7 @@ impl FillOp<u8> for Unary<u8> {
let blit = command_buffer.new_blit_command_encoder();
blit.wait_for_fence(&kernels.fence);
blit.fill_buffer(
&buffer,
buffer,
metal::NSRange {
location: 0,
length: elem_count as NSUInteger,
@ -1652,7 +1624,21 @@ impl FillOp<u8> for Unary<u8> {
Ok(())
}
/// Element types that have a named kernel in the fill source.
///
/// `call_fill` hands `FILL_KERNEL` to `kernels.load_pipeline` together with
/// `Source::Fill` to obtain the pipeline for the element type `T`.
pub trait FillOp: EncoderParam {
/// Name of the kernel function to load for this element type.
const FILL_KERNEL: &'static str;
}
/// Implements `FillOp` for each listed element type.
///
/// The kernel name is built at compile time by prefixing the type's own
/// spelling with `fill_`, so `u32` maps to `"fill_u32"`.
macro_rules! impl_call_fill {
    ($($elem:ty),*) => {
        $(
            impl FillOp for $elem {
                const FILL_KERNEL: &'static str = concat!("fill_", stringify!($elem));
            }
        )*
    };
}
// `u8` is not listed here: byte fills go through `call_fill_u8`, which uses a
// blit encoder's `fill_buffer` rather than a named fill kernel.
impl_call_fill!(u32, i64, f16, bf16, f32);
#[cfg(test)]
mod tests;