Cuda kernels for fast min/max reductions (#203)

* Add the min/max CUDA kernels.
* Better integration of the CUDA kernels.
2025-06-18 11:37:11 +00:00 · 2023-07-19 19:12:27 +02:00
parent 001f9a59ce
commit 536c5e702e
3 changed files with 130 additions and 22 deletions
--- a/candle-kernels/src/cuda_utils.cuh
+++ b/candle-kernels/src/cuda_utils.cuh
@ -1,4 +1,6 @@
 #include "compatibility.cuh"
+#include<stdint.h>
+#include<cmath>

 // TODO: This is often used to check that the data is contiguous so that
 // kernels can be easily mapped. However this only returns true for row
@ -140,6 +142,9 @@ __device__ __forceinline__ double absg(double a) { return fabs(a); }
 // Single-precision overload of the generic copysign helper; forwards both
 // arguments, in order, to copysignf.
 __device__ __forceinline__ float copysigng(float a, float b) {
   const float signed_value = copysignf(a, b);
   return signed_value;
 }
 // Double-precision overload of the generic copysign helper; forwards both
 // arguments, in order, to copysign.
 __device__ __forceinline__ double copysigng(double a, double b) {
   const double signed_value = copysign(a, b);
   return signed_value;
 }

+// Generic-name minimum for uint32_t operands; forwards to min(a, b).
+__device__ __forceinline__ uint32_t ming(uint32_t a, uint32_t b) {
+  const uint32_t smaller = min(a, b);
+  return smaller;
+}
+// Generic-name maximum for uint32_t operands; forwards to max(a, b).
+__device__ __forceinline__ uint32_t maxg(uint32_t a, uint32_t b) {
+  const uint32_t larger = max(a, b);
+  return larger;
+}
+
 #if __CUDA_ARCH__ >= 530
 // Half-precision power: both operands are widened to float, powf is applied,
 // and the result is narrowed back to __half.
 __device__ __forceinline__ __half powg(__half a, __half b) {
   const float base = __half2float(a);
   const float exponent = __half2float(b);
   return __float2half(powf(base, exponent));
 }
 // Half-precision NaN test; forwards to the __hisnan intrinsic.
 __device__ __forceinline__ bool isnang(__half a) {
   const bool is_nan = __hisnan(a);
   return is_nan;
 }