Refactor the hierarchy.
candle-kernels/src/compatibility.cuh (new file, 171 lines)
@@ -0,0 +1,171 @@
#include "cuda_fp16.h"

// Table showing which features are supported on which compute capability
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications

// FIXME: the minimum compute capabilities are just guesses since the table is not specific enough

// #if __CUDA_ARCH__ < 600
// __device__ __forceinline__ __half __hmax(__half a, __half b) {
//     return __float2half(fmaxf(__half2float(a), __half2float(b)));
// }
// __device__ __forceinline__ __half __hmin(__half a, __half b) {
//     return __float2half(fminf(__half2float(a), __half2float(b)));
// }
// #endif

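// Fallback NaN-propagating half min/max for architectures where cuda_fp16.h does not
// provide __hmax_nan/__hmin_nan natively (the guard assumes compute capability 8.0+ has them).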
#if __CUDA_ARCH__ < 800
__device__ __forceinline__ __half __hmax_nan(__half a, __half b) {
    return __hisnan(a) ? a : (__hisnan(b) ? b : __hmax(a, b));
}
__device__ __forceinline__ __half __hmin_nan(__half a, __half b) {
    return __hisnan(a) ? a : (__hisnan(b) ? b : __hmin(a, b));
}
#endif

#if __CUDA_ARCH__ < 600
// Copied from https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
__device__ double atomicAdd(double* address, double val) {
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;

    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));

        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);

    return __longlong_as_double(old);
}
#endif

#if __CUDA_ARCH__ < 700
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd
// The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
// Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119
__device__ __half atomicAdd(__half *address, __half val) {
    // atomicCAS operates on aligned 32-bit words, so locate the containing word and
    // splice the updated half into its lower or upper 16 bits.
    unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;
    bool unaligned = (size_t) address & 2;
    do {
        assumed = old;
        unsigned int hsum;
        hsum = unaligned ? (old >> 16) : (old & 0xffff);
        hsum = __half_as_ushort(__ushort_as_half(hsum) + val);
        old = atomicCAS(address_as_ui, assumed,
            unaligned ? (old & 0xffff) | (hsum << 16) : (old & 0xffff0000) | hsum
        );
    } while (assumed != old);
    return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff));
}
#endif

__device__ __forceinline__ __half atomicMaxf(__half* address, __half val) {
#if __CUDA_ARCH__ < 700
    // On older GPUs we do not have access to atomicCAS for shorts, so we have to do some trickery.
    // Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119
    unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;
    bool unaligned = (size_t) address & 2;
    do {
        assumed = old;
        unsigned int hmax;
        hmax = unaligned ? (old >> 16) : (old & 0xffff);
        hmax = __half_as_ushort(__hmax_nan(val, __ushort_as_half(hmax)));
        old = atomicCAS(address_as_ui, assumed,
            unaligned ? (old & 0xffff) | (hmax << 16) : (old & 0xffff0000) | hmax
        );
    } while (assumed != old);
    return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff));
#else
    // Based on https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
    unsigned short int* casted_address = (unsigned short int*)address;
    unsigned short int old = *casted_address;
    unsigned short int assumed;
    do {
        assumed = old;
        old = atomicCAS(casted_address, assumed, __half_as_ushort(__hmax_nan(val, __ushort_as_half(assumed))));
        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);
    return __ushort_as_half(old);
#endif
}

// atomicMax is not implemented for floats,
// solution copied from https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
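// Why this works: non-negative floats order the same way as their bit patterns read
// as signed ints, so atomicMax on int suffices; negative floats order in reverse of
// their unsigned bit patterns, so the float maximum is reached via atomicMin on the
// unsigned pattern (and symmetrically for atomicMinf below).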
__device__ __forceinline__ float atomicMaxf(float * addr, float value) {
    if (signbit(value)) {
        return __uint_as_float(atomicMin((unsigned int *)addr, __float_as_uint(value)));
    } else {
        return __int_as_float(atomicMax((int *)addr, __float_as_int(value)));
    }
}

__device__ __forceinline__ double atomicMaxf(double * addr, double value) {
    if (signbit(value)) {
        return __longlong_as_double(atomicMin((unsigned long long int *)addr, __double_as_longlong(value)));
    } else {
        return __longlong_as_double(atomicMax((long long int *)addr, __double_as_longlong(value)));
    }
}

__device__ __forceinline__ __half atomicMinf(__half* address, __half val) {
#if __CUDA_ARCH__ < 700
    // On older GPUs we do not have access to atomicCAS for shorts, so we have to do some trickery.
    // Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119
    unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;
    bool unaligned = (size_t) address & 2;
    do {
        assumed = old;
        unsigned int hmin;
        hmin = unaligned ? (old >> 16) : (old & 0xffff);
        hmin = __half_as_ushort(__hmin_nan(val, __ushort_as_half(hmin)));
        old = atomicCAS(address_as_ui, assumed,
            unaligned ? (old & 0xffff) | (hmin << 16) : (old & 0xffff0000) | hmin
        );
    } while (assumed != old);
    return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff));
#else
    // Based on https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
    unsigned short int* casted_address = (unsigned short int*)address;
    unsigned short int old = *casted_address;
    unsigned short int assumed;
    do {
        assumed = old;
        old = atomicCAS(casted_address, assumed, __half_as_ushort(__hmin_nan(val, __ushort_as_half(assumed))));
        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);
    return __ushort_as_half(old);
#endif
}

// atomicMin is not implemented for floats,
// solution copied from https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
__device__ __forceinline__ float atomicMinf(float * addr, float value) {
    if (signbit(value)) {
        return __uint_as_float(atomicMax((unsigned int *)addr, __float_as_uint(value)));
    } else {
        return __int_as_float(atomicMin((int *)addr, __float_as_int(value)));
    }
}

__device__ __forceinline__ double atomicMinf(double * addr, double value) {
    if (signbit(value)) {
        return __longlong_as_double(atomicMax((unsigned long long int *)addr, __double_as_longlong(value)));
    } else {
        return __longlong_as_double(atomicMin((long long int *)addr, __double_as_longlong(value)));
    }
}
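
For context, a minimal sketch of how a kernel might use these helpers to reduce a buffer into a running maximum and sum; the kernel name, buffer names, and launch shape below are illustrative and not part of the commit:

// max_and_sum.cu (illustrative only)
#include "compatibility.cuh"

// Each thread folds one element into two global accumulators. max_out should be
// pre-initialized to -INFINITY and sum_out to 0.0f before the kernel is launched.
__global__ void max_and_sum(const float *xs, int n, float *max_out, float *sum_out) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        atomicMaxf(max_out, xs[i]);   // float overload provided by compatibility.cuh
        atomicAdd(sum_out, xs[i]);    // native float atomicAdd
    }
}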