Add the const-set op. (#2910)

* Add the const-set op. * Cuda implementation. * Bugfix. * Metal cleanup. * Add the metal kernels. * Add some testing. * Finish the metal implementation. * Bump the version.
2025-06-16 10:38:54 +00:00 · 2025-04-19 10:07:02 +02:00
parent b2904a830b
commit a4c56a958e
20 changed files with 414 additions and 209 deletions
--- a/candle-kernels/Cargo.toml
+++ b/candle-kernels/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "candle-kernels"
-version = "0.9.0-alpha.4"
+version = "0.9.0-alpha.5"
 edition = "2021"

 description = "CUDA kernels for Candle"
--- a/candle-kernels/src/fill.cu
+++ b/candle-kernels/src/fill.cu
@ -1,5 +1,6 @@
 #include<stdint.h>
 #include "cuda_fp16.h"
+#include "cuda_utils.cuh"

 template<typename T>
 __device__ void fill_with(T *buf, T value, const size_t numel) {
@ -36,13 +37,45 @@ COPY2D_OP(uint8_t, copy2d_u8)
 COPY2D_OP(uint32_t, copy2d_u32)
 COPY2D_OP(int64_t, copy2d_i64)

+#define CONST_SET_OP(TYPENAME, FN_NAME) \
+extern "C" __global__ void FN_NAME( \
+    const size_t numel, \
+    const size_t num_dims, \
+    const size_t *info, \
+    const TYPENAME inp, \
+    TYPENAME *out \
+) { \
+    const size_t *dims = info; \
+    const size_t *strides = info + num_dims; \
+    if (info == nullptr || is_contiguous(num_dims, dims, strides)) { \
+        for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
+            out[i] = inp; \
+        } \
+    } \
+    else { \
+        for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
+            unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \
+            out[strided_i] = inp; \
+        } \
+    } \
+} \
+
+CONST_SET_OP(float, const_set_f32)
+CONST_SET_OP(double, const_set_f64)
+CONST_SET_OP(uint8_t, const_set_u8)
+CONST_SET_OP(uint32_t, const_set_u32)
+CONST_SET_OP(int64_t, const_set_i64)
+
+
 #if __CUDA_ARCH__ >= 530
 extern "C" __global__ void fill_f16(__half *buf, __half value, const size_t numel) { fill_with(buf, value, numel); }
 COPY2D_OP(__half, copy2d_f16)
+CONST_SET_OP(__half, const_set_f16)
 #endif

 #if __CUDA_ARCH__ >= 800
 #include <cuda_bf16.h>
 extern "C" __global__ void fill_bf16(__nv_bfloat16 *buf, __nv_bfloat16 value, const size_t numel) { fill_with(buf, value, numel); }
 COPY2D_OP(__nv_bfloat16, copy2d_bf16)
+CONST_SET_OP(__nv_bfloat16, const_set_bf16)
 #endif