Adding the actual backend

2025-06-22 04:22:50 +00:00 · 2023-11-09 19:53:14 +01:00
parent 976ad9f9c2
commit 39406a6721
13 changed files with 2612 additions and 1 deletions
--- a/candle-metal-kernels/src/affine.metal
+++ b/candle-metal-kernels/src/affine.metal
@ -0,0 +1,46 @@
+#include <metal_stdlib>
+
+// Converts a linear element index into an offset into a strided buffer.
+//
+// `idx` is decomposed one dimension at a time, starting with the last entry of
+// `dims`: the remainder modulo that dimension's extent is the coordinate along
+// it, which is multiplied by the matching entry of `strides`. The sum of those
+// products is returned.
+METAL_FUNC uint get_strided_index(
+    uint idx,
+    constant size_t &num_dims,
+    constant size_t *dims,
+    constant size_t *strides
+) {
+    uint offset = 0;
+    uint remaining = idx;
+    for (uint step = 0; step < num_dims; ++step) {
+        const uint axis = num_dims - 1 - step;
+        const uint coord = remaining % dims[axis];
+        offset += coord * strides[axis];
+        remaining /= dims[axis];
+    }
+    return offset;
+}
+
+using namespace metal;
+
+// AFFINE(FN_NAME, TYPENAME) emits a kernel named FN_NAME that writes
+// `input[i] * mul + add` to `output[i]` for every i in [0, dim).
+// `mul` and `add` arrive as float and are converted to TYPENAME once.
+//
+// The range [0, dim) is cut into ceil(dim / threadgroup_size) sized slices and
+// each thread handles the slice selected by its index inside the threadgroup.
+// NOTE(review): only the in-threadgroup index is used, so this presumably
+// expects a single threadgroup per dispatch - confirm against the host code.
+#define AFFINE(FN_NAME, TYPENAME) \
+kernel void FN_NAME( \
+    constant size_t &dim, \
+    constant float &mul, \
+    constant float &add, \
+    device const TYPENAME *input, \
+    device TYPENAME *output, \
+    uint threadgroup_size [[threads_per_threadgroup]], \
+    uint thread_index [[thread_index_in_threadgroup]] \
+) { \
+    const TYPENAME scale = TYPENAME(mul); \
+    const TYPENAME offset = TYPENAME(add); \
+    const size_t chunk = (dim + threadgroup_size - 1) / threadgroup_size; \
+    const size_t first = thread_index * chunk; \
+    const size_t last = min(first + chunk, dim); \
+    size_t i = first; \
+    while (i < last) { \
+        output[i] = input[i] * scale + offset; \
+        i++; \
+    } \
+}
+
+// Kernel instantiations of AFFINE. The first macro argument is the exported
+// kernel name, the second the element type.
+AFFINE(affine_float, float)
+AFFINE(affine_half, half)
+
+
+// The bfloat variant is only emitted when __METAL_VERSION__ is at least 310.
+// No trailing semicolon: the macro already expands to a complete definition.
+#if __METAL_VERSION__ >= 310
+AFFINE(affine_bfloat, bfloat)
+#endif
--- a/candle-metal-kernels/src/binary.metal
+++ b/candle-metal-kernels/src/binary.metal
@ -0,0 +1,78 @@
+#include <metal_stdlib>
+
+// Translates the linear index `idx` into an offset for a buffer laid out with
+// the given `strides`. Dimensions are consumed from the last entry of `dims`
+// to the first: each step takes `idx` modulo the extent as the coordinate,
+// adds coordinate * stride to the result, then divides `idx` by the extent.
+METAL_FUNC uint get_strided_index(
+    uint idx,
+    constant size_t &num_dims,
+    constant size_t *dims,
+    constant size_t *strides
+) {
+    uint result = 0;
+    uint d = 0;
+    while (d < num_dims) {
+        const uint k = num_dims - 1 - d;
+        const size_t extent = dims[k];
+        result += (idx % extent) * strides[k];
+        idx /= extent;
+        d++;
+    }
+    return result;
+}
+
+using namespace metal;
+
+// BINARY(FN, TYPENAME, OUT_TYPENAME, FN_NAME, FN_NAME_STRIDED) emits two
+// kernels that combine `left` and `right` element-wise.
+//
+// FN is an expression written in terms of `x` (the left operand) and `y` (the
+// right operand); its value is converted to OUT_TYPENAME before it is stored
+// in `output`, which is therefore typed as OUT_TYPENAME.
+//
+//   FN_NAME          reads both inputs at the same linear index i.
+//   FN_NAME_STRIDED  reads `left` through `left_strides` and `right` through
+//                    `right_strides` (both resolved with get_strided_index
+//                    over the shared `dims`); `output` is written at i.
+//
+// In both kernels [0, dim) is cut into ceil(dim / threadgroup_size) sized
+// slices and each thread handles the slice selected by its index inside the
+// threadgroup.
+#define BINARY(FN, TYPENAME, OUT_TYPENAME, FN_NAME, FN_NAME_STRIDED) \
+kernel void FN_NAME( \
+    constant size_t &dim, \
+    device const TYPENAME *left,  \
+    device const TYPENAME *right,  \
+    device OUT_TYPENAME *output, \
+    uint threadgroup_size [[threads_per_threadgroup]], \
+    uint thread_index [[thread_index_in_threadgroup]] \
+) { \
+    const size_t length = (dim  + threadgroup_size - 1) / threadgroup_size; \
+    const size_t start = thread_index * length; \
+    const size_t stop = min(start + length, dim); \
+    for (size_t i = start; i < stop; i++){ \
+        TYPENAME x = left[i]; \
+        TYPENAME y = right[i]; \
+        output[i] = OUT_TYPENAME(FN); \
+    } \
+}\
+kernel void FN_NAME_STRIDED( \
+    constant size_t &dim, \
+    constant size_t &num_dims, \
+    constant size_t *dims, \
+    constant size_t *left_strides, \
+    constant size_t *right_strides, \
+    device const TYPENAME *left,  \
+    device const TYPENAME *right,  \
+    device OUT_TYPENAME *output, \
+    uint threadgroup_size [[threads_per_threadgroup]], \
+    uint thread_index [[thread_index_in_threadgroup]] \
+) { \
+    const size_t length = (dim  + threadgroup_size - 1) / threadgroup_size; \
+    const size_t start = thread_index * length; \
+    const size_t stop = min(start + length, dim); \
+    for (size_t i = start; i < stop; i++){ \
+        TYPENAME x = left[get_strided_index(i, num_dims, dims, left_strides)]; \
+        /* The right operand must come from `right`, not `left`. */ \
+        TYPENAME y = right[get_strided_index(i, num_dims, dims, right_strides)]; \
+        output[i] = OUT_TYPENAME(FN); \
+    } \
+}
+
+// BINARY_OP(FN, NAME) instantiates BINARY for float and half, producing the
+// kernels NAME_float, NAME_float_strided, NAME_half and NAME_half_strided.
+// Input and output element types are the same in every instantiation.
+#define BINARY_OP(FN, NAME) \
+BINARY(FN, float, float, NAME##_float, NAME##_float_strided); \
+BINARY(FN, half, half, NAME##_half, NAME##_half_strided);
+
+// BFLOAT_BINARY_OP(FN, NAME) does the same for bfloat, producing NAME_bfloat
+// and NAME_bfloat_strided.
+#define BFLOAT_BINARY_OP(FN, NAME) \
+BINARY(FN, bfloat, bfloat, NAME##_bfloat, NAME##_bfloat_strided);
+
+
+// Float and half kernels. `x` is the left operand and `y` the right operand,
+// as named inside BINARY.
+BINARY_OP(x + y, add)
+BINARY_OP(x - y, sub)
+BINARY_OP(x * y, mul)
+BINARY_OP(x / y, div)
+
+// bfloat kernels are only emitted when __METAL_VERSION__ is at least 310.
+#if __METAL_VERSION__ >= 310
+BFLOAT_BINARY_OP(x + y, add)
+BFLOAT_BINARY_OP(x - y, sub)
+BFLOAT_BINARY_OP(x * y, mul)
+BFLOAT_BINARY_OP(x / y, div)
+#endif
--- a/candle-metal-kernels/src/cast.metal
+++ b/candle-metal-kernels/src/cast.metal
@ -0,0 +1,58 @@
+#include <metal_stdlib>
+
+// Computes the offset into a strided buffer for the linear index `idx`.
+// Walks `dims` from its last entry to its first; at each dimension the
+// coordinate is `idx` modulo the extent, the offset grows by
+// coordinate * stride, and `idx` is divided by the extent.
+METAL_FUNC uint get_strided_index(
+    uint idx,
+    constant size_t &num_dims,
+    constant size_t *dims,
+    constant size_t *strides
+) {
+    uint acc = 0;
+    for (uint n = 0; n < num_dims; n++) {
+        const uint pos = num_dims - 1 - n;
+        const size_t size = dims[pos];
+        const size_t stride = strides[pos];
+        acc += (idx % size) * stride;
+        idx /= size;
+    }
+    return acc;
+}
+
+
+using namespace metal;
+
+// CAST(FN_NAME, FN_NAME_STRIDED, LEFT_TYPENAME, RIGHT_TYPENAME) emits two
+// kernels that convert LEFT_TYPENAME elements of `input` into RIGHT_TYPENAME
+// elements of `output`.
+//
+//   FN_NAME          reads input[i] and writes output[i].
+//   FN_NAME_STRIDED  reads the input at get_strided_index(i, ...) and writes
+//                    output[i].
+//
+// In both kernels [0, dim) is cut into ceil(dim / threadgroup_size) sized
+// slices and each thread handles the slice selected by its index inside the
+// threadgroup.
+#define CAST(FN_NAME, FN_NAME_STRIDED, LEFT_TYPENAME, RIGHT_TYPENAME) \
+kernel void FN_NAME( \
+    constant size_t &dim, \
+    device const LEFT_TYPENAME *input, \
+    device RIGHT_TYPENAME *output, \
+    uint threadgroup_size [[threads_per_threadgroup]], \
+    uint thread_index [[thread_index_in_threadgroup]] \
+) { \
+    const size_t per_thread = (dim + threadgroup_size - 1) / threadgroup_size; \
+    const size_t begin = thread_index * per_thread; \
+    const size_t end = min(begin + per_thread, dim); \
+    size_t i = begin; \
+    while (i < end) { \
+        const LEFT_TYPENAME value = input[i]; \
+        output[i] = RIGHT_TYPENAME(value); \
+        ++i; \
+    } \
+} \
+kernel void FN_NAME_STRIDED( \
+    constant size_t &dim, \
+    constant size_t &num_dims, \
+    constant size_t *dims, \
+    constant size_t *strides, \
+    device const LEFT_TYPENAME *input, \
+    device RIGHT_TYPENAME *output, \
+    uint threadgroup_size [[threads_per_threadgroup]], \
+    uint thread_index [[thread_index_in_threadgroup]] \
+) { \
+    const size_t per_thread = (dim + threadgroup_size - 1) / threadgroup_size; \
+    const size_t begin = thread_index * per_thread; \
+    const size_t end = min(begin + per_thread, dim); \
+    size_t i = begin; \
+    while (i < end) { \
+        const uint src = get_strided_index(i, num_dims, dims, strides); \
+        output[i] = RIGHT_TYPENAME(input[src]); \
+        ++i; \
+    } \
+}
+
+
+// u32 -> f32 conversion kernels. The source element type is uint32_t to match
+// the `u32` in the kernel names; reading the buffer as a signed 32-bit type
+// would turn values of 2^31 and above into negative floats.
+CAST(cast_u32_f32, cast_u32_f32_strided, uint32_t, float)
+
+#if __METAL_VERSION__ >= 310
+#endif
--- a/candle-metal-kernels/src/indexing.metal
+++ b/candle-metal-kernels/src/indexing.metal
@ -0,0 +1,75 @@
+#include <metal_stdlib>
+using namespace metal;
+
+// Accumulates slices of `inp` into `out` at the positions listed in `ids`.
+//
+// The global thread id (threadgroup position * threadgroup size + index in
+// the threadgroup) is split into `pre = gid / right_size` and
+// `post = gid % right_size`; threads with gid >= left_size * right_size do
+// nothing. For each j in [0, ids_dim_size) the thread adds
+//   inp[(pre * ids_dim_size + j) * right_size + post]
+// to
+//   out[(pre * dst_dim_size + ids[j]) * right_size + post].
+template <typename T, typename I>
+void index_add(
+    device I *ids [[buffer(0)]],
+    device T *inp [[buffer(1)]],
+    device T *out [[buffer(2)]],
+
+    constant uint &ids_dim_size,
+    constant uint &left_size,
+    constant uint &dst_dim_size,
+    constant uint &right_size,
+
+    uint threadgroup_size [[threads_per_threadgroup]],
+    uint threadgroup_position_in_grid [[threadgroup_position_in_grid]],
+    uint thread_index [[thread_index_in_threadgroup]]
+) {
+    const uint gid = threadgroup_position_in_grid * threadgroup_size + thread_index;
+    if (gid < left_size * right_size) {
+        const uint pre = gid / right_size;
+        const uint post = gid % right_size;
+        const uint src_base = pre * ids_dim_size;
+        const uint dst_base = pre * dst_dim_size;
+
+        for (uint j = 0; j < ids_dim_size; j++) {
+            const uint idx = ids[j];
+            out[(dst_base + idx) * right_size + post] += inp[(src_base + j) * right_size + post];
+        }
+    }
+}
+
+// IA_OP(TYPENAME, INDEX_TYPENAME, FN_NAME) emits a kernel named FN_NAME that
+// forwards all of its arguments, unchanged and in order, to
+// index_add<TYPENAME, INDEX_TYPENAME>.
+#define IA_OP(TYPENAME, INDEX_TYPENAME, FN_NAME) \
+kernel void FN_NAME( \
+    device INDEX_TYPENAME *ids [[buffer(0)]], \
+    device TYPENAME *inp [[buffer(1)]], \
+    device TYPENAME *out [[buffer(2)]], \
+    constant uint &ids_dim_size, \
+    constant uint &left_size, \
+    constant uint &dst_dim_size, \
+    constant uint &right_size, \
+    uint threadgroup_size [[threads_per_threadgroup]], \
+    uint threadgroup_position_in_grid [[threadgroup_position_in_grid]], \
+    uint thread_index [[thread_index_in_threadgroup]] \
+) { \
+    index_add<TYPENAME, INDEX_TYPENAME>( \
+        ids, inp, out, \
+        ids_dim_size, left_size, dst_dim_size, right_size, \
+        threadgroup_size, threadgroup_position_in_grid, thread_index \
+    ); \
+}
+
+
+
+// index_add kernel instantiations. Macro arguments are (value type, index
+// type, kernel name); kernel names follow the pattern ia_<index>_<value>.
+
+// bfloat value kernels are only emitted when __METAL_VERSION__ is at least 310.
+#if __METAL_VERSION__ >= 310
+IA_OP(bfloat, int64_t, ia_i64_bf16)
+IA_OP(bfloat, uint32_t, ia_u32_bf16)
+IA_OP(bfloat, uint8_t, ia_u8_bf16)
+#endif
+
+// half values.
+// NOTE(review): there is no half/int64_t variant here, unlike the other value
+// types - confirm whether that omission is intended.
+IA_OP(half, uint32_t, ia_u32_f16)
+IA_OP(half, uint8_t, ia_u8_f16)
+
+// int64_t indices.
+IA_OP(float, int64_t, ia_i64_f32)
+IA_OP(uint8_t, int64_t, ia_i64_u8)
+IA_OP(int64_t, int64_t, ia_i64_i64)
+IA_OP(uint32_t, int64_t, ia_i64_u32)
+
+// uint32_t indices.
+IA_OP(float, uint32_t, ia_u32_f32)
+IA_OP(uint8_t, uint32_t, ia_u32_u8)
+IA_OP(int64_t, uint32_t, ia_u32_i64)
+IA_OP(uint32_t, uint32_t, ia_u32_u32)
+
+// uint8_t indices.
+IA_OP(float, uint8_t, ia_u8_f32)
+IA_OP(uint8_t, uint8_t, ia_u8_u8)
+IA_OP(uint32_t, uint8_t, ia_u8_u32)
+IA_OP(int64_t, uint8_t, ia_u8_i64)
--- a/candle-metal-kernels/src/lib.rs
+++ b/candle-metal-kernels/src/lib.rs
--- a/candle-metal-kernels/src/reduce.metal
+++ b/candle-metal-kernels/src/reduce.metal
@ -0,0 +1,124 @@
+#include <metal_stdlib>
+using namespace metal;
+
+// Returns the offset into a strided buffer that corresponds to the linear
+// index `idx`. The index is unravelled starting from the last entry of `dims`:
+// the remainder by each extent is that dimension's coordinate, and the
+// coordinates are combined with `strides` into a single offset.
+METAL_FUNC uint get_strided_index(
+    uint idx,
+    constant size_t &num_dims,
+    constant size_t *dims,
+    constant size_t *strides
+) {
+    uint strided = 0;
+    uint rest = idx;
+    for (uint visited = 0; visited < num_dims; visited++) {
+        const uint dim = num_dims - 1 - visited;
+        strided += (rest % dims[dim]) * strides[dim];
+        rest /= dims[dim];
+    }
+    return strided;
+}
+
+// Number of float slots in the threadgroup scratch array declared by each
+// reduction kernel in this file. The kernels index that array with the
+// thread's index inside its threadgroup.
+constant int THREADGROUP_SIZE = 256;
+
+// Sums contiguous runs of `src`: threadgroup `dst_id` adds up the elements in
+// [dst_id * el_to_sum_per_block, (dst_id + 1) * el_to_sum_per_block), clamped
+// to `src_numel`, and stores the total in dst[dst_id].
+//
+// Each thread first accumulates a strided partial sum into its own slot of
+// threadgroup memory; the slots are then folded pairwise into slot 0.
+//
+// Preconditions visible from the code:
+//   - `blockDim` must not exceed THREADGROUP_SIZE (the scratch array is
+//     indexed by `tid`).
+//   - the pairwise fold halves `blockDim` at every step, so every slot is
+//     only folded in when `blockDim` is a power of two.
+//
+// `id` is accepted but not used.
+kernel void fast_sum_float(
+    constant size_t &src_numel,
+    constant size_t &el_to_sum_per_block,
+    device const float *src,
+    device float *dst,
+    uint id [[ thread_position_in_grid ]],
+    uint tid [[ thread_index_in_threadgroup ]],
+    uint dst_id [[ threadgroup_position_in_grid ]],
+    uint blockDim [[ threads_per_threadgroup ]]
+) {
+
+   threadgroup float shared_memory[THREADGROUP_SIZE];
+
+   shared_memory[tid] = 0;
+   // Elements summed in this block range from dst_id * el_to_sum_per_block
+   // to (dst_id + 1) * el_to_sum_per_block.
+   size_t start_idx = dst_id * el_to_sum_per_block;
+   size_t stop_idx = min(start_idx + el_to_sum_per_block, src_numel);
+   size_t idx = start_idx + tid;
+
+   while (idx < stop_idx) {
+     // TODO: Fast version for the contiguous case.
+     // size_t strided_i = get_strided_index(idx, num_dims, dims, strides);
+     shared_memory[tid] += src[idx];
+     idx += blockDim;
+   }
+
+   // The partial sums live in threadgroup memory and are read by other
+   // threads below, so the barrier must also order threadgroup-memory
+   // accesses; an execution-only barrier (mem_none) does not do that.
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   // reduction in shared memory
+   for (uint s = blockDim / 2; s > 0; s >>= 1) {
+       if (tid < s) {
+           shared_memory[tid] += shared_memory[tid + s];
+       }
+       threadgroup_barrier(mem_flags::mem_threadgroup);
+   }
+
+   dst[dst_id] = shared_memory[0];
+}
+
+// Softmax over contiguous runs of `src`: threadgroup `dst_id` processes the
+// elements in [dst_id * el_to_sum_per_block, (dst_id + 1) * el_to_sum_per_block),
+// clamped to `src_numel`, and writes exp(x - max) / sum(exp(x - max)) to the
+// same positions of `dst`.
+//
+// Three passes over the run, each strided by `blockDim` per thread:
+//   1. maximum of the run (threadgroup-memory reduction),
+//   2. dst = exp(src - max) and the sum of those values (second reduction),
+//   3. dst *= 1 / sum.
+//
+// Preconditions visible from the code:
+//   - `blockDim` must not exceed THREADGROUP_SIZE (the scratch array is
+//     indexed by `tid`).
+//   - the pairwise folds halve `blockDim` at every step, so every slot is
+//     only folded in when `blockDim` is a power of two.
+//
+// `id` is accepted but not used.
+kernel void softmax_float(
+    constant size_t &src_numel,
+    constant size_t &el_to_sum_per_block,
+    device const float *src,
+    device float *dst,
+    uint id [[ thread_position_in_grid ]],
+    uint tid [[ thread_index_in_threadgroup ]],
+    uint dst_id [[ threadgroup_position_in_grid ]],
+    uint blockDim [[ threads_per_threadgroup ]]
+) {
+
+   threadgroup float shared_memory[THREADGROUP_SIZE];
+
+   shared_memory[tid] = -INFINITY;
+   // Elements handled by this block range from dst_id * el_to_sum_per_block
+   // to (dst_id + 1) * el_to_sum_per_block.
+   const size_t start_idx = dst_id * el_to_sum_per_block;
+   const size_t stop_idx = min(start_idx + el_to_sum_per_block, src_numel);
+   size_t idx = start_idx + tid;
+
+   // Pass 1: per-thread running maximum.
+   while (idx < stop_idx) {
+     // TODO: Fast version for the contiguous case.
+     shared_memory[tid] = max(shared_memory[tid], src[idx]);
+     idx += blockDim;
+   }
+
+   // Barriers around threadgroup-memory exchanges use mem_threadgroup so the
+   // writes of one thread are ordered before the reads of another.
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   // max reduction in shared memory
+   for (uint s = blockDim / 2; s > 0; s >>= 1) {
+       if (tid < s) {
+           shared_memory[tid] = max(shared_memory[tid], shared_memory[tid + s]);
+       }
+       threadgroup_barrier(mem_flags::mem_threadgroup);
+   }
+
+   // Named row_max rather than max so metal::max is not shadowed.
+   const float row_max = shared_memory[0];
+
+   // Every thread must have read slot 0 before thread 0 resets it below.
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   shared_memory[tid] = 0;
+
+   // Pass 2: exponentials and per-thread partial sums.
+   idx = start_idx + tid;
+   while (idx < stop_idx) {
+     // TODO: Fast version for the contiguous case.
+     const float val = exp(src[idx] - row_max);
+     dst[idx] = val;
+     shared_memory[tid] += val;
+     idx += blockDim;
+   }
+
+   // All partial sums must be complete before other threads fold them.
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   // sum reduction in shared memory
+   for (uint s = blockDim / 2; s > 0; s >>= 1) {
+       if (tid < s) {
+           shared_memory[tid] += shared_memory[tid + s];
+       }
+       threadgroup_barrier(mem_flags::mem_threadgroup);
+   }
+
+   // Pass 3: normalise. Each thread rescales exactly the elements it wrote
+   // in pass 2, so no further synchronisation on `dst` is needed.
+   const float inv_acc = 1/shared_memory[0];
+   idx = start_idx + tid;
+   while (idx < stop_idx) {
+     dst[idx] *= inv_acc;
+     idx += blockDim;
+   }
+}
--- a/candle-metal-kernels/src/ternary.metal
+++ b/candle-metal-kernels/src/ternary.metal
@ -0,0 +1,57 @@
+#include <metal_stdlib>
+#
+using namespace metal;
+
+// Maps the linear index `idx` to an offset into a buffer described by `dims`
+// and `strides`. Starting from the last dimension, the coordinate along each
+// dimension is `idx` modulo its extent; the offset is the sum of every
+// coordinate times the corresponding stride.
+METAL_FUNC uint get_strided_index(
+    uint idx,
+    constant size_t &num_dims,
+    constant size_t *dims,
+    constant size_t *strides
+) {
+    uint total = 0;
+    uint level = 0;
+    while (level < num_dims) {
+        const uint which = num_dims - 1 - level;
+        const uint coord = idx % dims[which];
+        total += coord * strides[which];
+        idx /= dims[which];
+        ++level;
+    }
+    return total;
+}
+
+
+// WHERE_OP(TYPENAME, ID_TYPENAME, FN_NAME) emits a kernel named FN_NAME that
+// performs an element-wise select: for each output position i in [0, numel),
+//   out[i] = ids[...] ? t[...] : f[...]
+// where `ids`, `t` and `f` are each read through their own strides (`strides`,
+// `strides_t`, `strides_f`) over the shared `dims`, and `out` is written at
+// the linear index i.
+//
+// One thread handles one output element (i is the thread position in the
+// grid). Threads with i >= numel return immediately, so the grid may be
+// rounded up past `numel` without touching memory outside the buffers.
+#define WHERE_OP(TYPENAME, ID_TYPENAME, FN_NAME) \
+kernel void FN_NAME(  \
+    constant size_t &numel,  \
+    constant size_t &num_dims, \
+    constant size_t *dims, \
+    constant size_t *strides, \
+    constant size_t *strides_t, \
+    constant size_t *strides_f, \
+    device const ID_TYPENAME *ids, \
+    device const TYPENAME *t, \
+    device const TYPENAME *f, \
+    device TYPENAME *out ,\
+    uint i [[ thread_position_in_grid ]] \
+) {  \
+   if (i >= numel) { \
+       return; \
+   } \
+   uint strided_i = get_strided_index(i, num_dims, dims, strides); \
+   uint strided_i_t = get_strided_index(i, num_dims, dims, strides_t); \
+   uint strided_i_f = get_strided_index(i, num_dims, dims, strides_f); \
+   out[i] = ids[strided_i] ? t[strided_i_t] : f[strided_i_f]; \
+}
+
+// WHERE_OP instantiations. Macro arguments are (value type, condition type,
+// kernel name). Only where_u8_f32 is currently enabled; every other
+// combination below is left commented out.
+// WHERE_OP(float, int64_t, where_i64_f32)
+// WHERE_OP(double, int64_t, where_i64_f64)
+// WHERE_OP(uint8_t, int64_t, where_i64_u8)
+// WHERE_OP(uint32_t, int64_t, where_i64_u32)
+// WHERE_OP(int64_t, int64_t, where_i64_i64)
+//
+// WHERE_OP(float, uint32_t, where_u32_f32)
+// WHERE_OP(double, uint32_t, where_u32_f64)
+// WHERE_OP(uint8_t, uint32_t, where_u32_u8)
+// WHERE_OP(uint32_t, uint32_t, where_u32_u32)
+// WHERE_OP(int64_t, uint32_t, where_u32_i64)
+
+WHERE_OP(float, uint8_t, where_u8_f32)
+// WHERE_OP(double, uint8_t, where_u8_f64)
+// WHERE_OP(uint8_t, uint8_t, where_u8_u8)
+// WHERE_OP(uint32_t, uint8_t, where_u8_u32)
+// WHERE_OP(int64_t, uint8_t, where_u8_i64)
--- a/candle-metal-kernels/src/unary.metal
+++ b/candle-metal-kernels/src/unary.metal
@ -0,0 +1,82 @@
+#include <metal_stdlib>
+
+// Resolves the linear index `idx` to an offset into a strided buffer.
+// `dims` is traversed from its last entry to its first; for each dimension
+// the coordinate (`idx` modulo the extent) is multiplied by that dimension's
+// stride and added to the offset, then `idx` is divided by the extent.
+METAL_FUNC uint get_strided_index(
+    uint idx,
+    constant size_t &num_dims,
+    constant size_t *dims,
+    constant size_t *strides
+) {
+    uint out_index = 0;
+    uint linear = idx;
+    for (uint n = 0; n < num_dims; n++) {
+        const uint axis = num_dims - 1 - n;
+        const size_t extent = dims[axis];
+        out_index += (linear % extent) * strides[axis];
+        linear /= extent;
+    }
+    return out_index;
+}
+
+// Small element-wise helpers that are passed by name to the UNARY macro.
+
+// Returns `in` multiplied by itself.
+template <typename T>
+METAL_FUNC T sqr(T in) {
+    return in * in;
+}
+
+// Returns the negation of `in`.
+template <typename T>
+METAL_FUNC T neg(T in) {
+    return -in;
+}
+
+// Returns `in` unchanged.
+template <typename T>
+METAL_FUNC T id(T in) {
+    return in;
+}
+
+
+using namespace metal;
+
+// UNARY(FN, TYPENAME, FN_NAME, FN_NAME_STRIDED) emits two kernels that apply
+// the function FN to every element of `input` and store the result, converted
+// to TYPENAME, in `output`.
+//
+//   FN_NAME          reads input[i] and writes output[i].
+//   FN_NAME_STRIDED  reads the input at get_strided_index(i, ...) and writes
+//                    output[i].
+//
+// In both kernels [0, dim) is cut into ceil(dim / threadgroup_size) sized
+// slices and each thread handles the slice selected by its index inside the
+// threadgroup.
+#define UNARY(FN, TYPENAME, FN_NAME, FN_NAME_STRIDED) \
+kernel void FN_NAME( \
+    constant size_t &dim, \
+    device const TYPENAME *input, \
+    device TYPENAME *output, \
+    uint threadgroup_size [[threads_per_threadgroup]], \
+    uint thread_index [[thread_index_in_threadgroup]] \
+) { \
+    const size_t span = (dim + threadgroup_size - 1) / threadgroup_size; \
+    const size_t lo = thread_index * span; \
+    const size_t hi = min(lo + span, dim); \
+    size_t i = lo; \
+    while (i < hi) { \
+        output[i] = TYPENAME(FN(input[i])); \
+        ++i; \
+    } \
+} \
+kernel void FN_NAME_STRIDED( \
+    constant size_t &dim, \
+    constant size_t &num_dims, \
+    constant size_t *dims, \
+    constant size_t *strides, \
+    device const TYPENAME *input, \
+    device TYPENAME *output, \
+    uint threadgroup_size [[threads_per_threadgroup]], \
+    uint thread_index [[thread_index_in_threadgroup]] \
+) { \
+    const size_t span = (dim + threadgroup_size - 1) / threadgroup_size; \
+    const size_t lo = thread_index * span; \
+    const size_t hi = min(lo + span, dim); \
+    size_t i = lo; \
+    while (i < hi) { \
+        const uint src = get_strided_index(i, num_dims, dims, strides); \
+        output[i] = TYPENAME(FN(input[src])); \
+        ++i; \
+    } \
+}
+
+// UNARY_OP(NAME) instantiates UNARY with the function NAME for float and
+// half, producing NAME_float, NAME_float_strided, NAME_half and
+// NAME_half_strided.
+#define UNARY_OP(NAME) \
+UNARY(NAME, float, NAME##_float, NAME##_float_strided); \
+UNARY(NAME, half, NAME##_half, NAME##_half_strided);
+
+// BFLOAT_UNARY_OP(NAME) does the same for bfloat, producing NAME_bfloat and
+// NAME_bfloat_strided.
+#define BFLOAT_UNARY_OP(NAME) \
+UNARY(NAME, bfloat, NAME##_bfloat, NAME##_bfloat_strided);
+
+
+// Float and half kernels.
+UNARY_OP(cos)
+UNARY_OP(sin)
+UNARY_OP(sqr)
+UNARY_OP(sqrt)
+UNARY_OP(neg)
+UNARY_OP(exp)
+// Copy kernels: the identity helper `id` under explicit copy_* names.
+UNARY(id, float, copy_float, copy_float_strided)
+UNARY(id, half, copy_half, copy_half_strided)
+
+// bfloat kernels are only emitted when __METAL_VERSION__ is at least 310.
+// NOTE(review): no bfloat copy kernel is instantiated here - confirm whether
+// that is intended.
+#if __METAL_VERSION__ >= 310
+BFLOAT_UNARY_OP(cos)
+BFLOAT_UNARY_OP(sin)
+BFLOAT_UNARY_OP(sqr)
+BFLOAT_UNARY_OP(sqrt)
+BFLOAT_UNARY_OP(neg)
+BFLOAT_UNARY_OP(exp)
+#endif