From bc131b402b09887748141da99c942b547f02a4fc Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Fri, 25 Aug 2023 10:38:19 +0000
Subject: [PATCH 1/5] Repairing cast bf16/f16

---
 candle-kernels/src/cast.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/candle-kernels/src/cast.cu b/candle-kernels/src/cast.cu
index ea611eba..0c4ddbc6 100644
--- a/candle-kernels/src/cast.cu
+++ b/candle-kernels/src/cast.cu
@@ -13,13 +13,13 @@ extern "C" __global__ void FN_NAME( \
     const size_t *strides = info + num_dims; \
     if (is_contiguous(num_dims, dims, strides)) { \
         for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
-            out[i] = inp[i]; \
+            out[i] = (DST_TYPENAME) inp[i]; \
         } \
     } \
     else { \
         for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
             unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \
-            out[i] = inp[strided_i]; \
+            out[i] = (DST_TYPENAME) inp[strided_i]; \
         } \
     } \
 } \
@@ -29,12 +29,12 @@ CAST_OP(__nv_bfloat16, __nv_bfloat16, cast_bf16_bf16)
 
 // CAST_OP(__nv_bfloat16, uint8_t, cast_bf16_u8)
 CAST_OP(__nv_bfloat16, uint32_t, cast_bf16_u32)
-// CAST_OP(__nv_bfloat16, __half,   cast_bf16_f16)
+CAST_OP(__nv_bfloat16, __half,   cast_bf16_f16)
 CAST_OP(__nv_bfloat16, float,    cast_bf16_f32)
 CAST_OP(__nv_bfloat16, double,   cast_bf16_f64)
 CAST_OP(uint8_t, __nv_bfloat16, cast_u8_bf16)
 CAST_OP(uint32_t, __nv_bfloat16, cast_u32_bf16)
-// CAST_OP(__half,   __nv_bfloat16, cast_f16_bf16)
+CAST_OP(__half,   __nv_bfloat16, cast_f16_bf16)
 CAST_OP(float,    __nv_bfloat16, cast_f32_bf16)
 CAST_OP(double,   __nv_bfloat16, cast_f64_bf16)
 #endif

From db8bab8b7a10e161339a1e9c51c503028a66cb0b Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Fri, 25 Aug 2023 10:49:22 +0000
Subject: [PATCH 2/5] Try a functional-style cast, DST_TYPENAME(x), instead of the C-style cast

---
 candle-kernels/src/cast.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/candle-kernels/src/cast.cu b/candle-kernels/src/cast.cu
index 0c4ddbc6..3e1f5414 100644
--- a/candle-kernels/src/cast.cu
+++ b/candle-kernels/src/cast.cu
@@ -13,13 +13,13 @@ extern "C" __global__ void FN_NAME( \
     const size_t *strides = info + num_dims; \
     if (is_contiguous(num_dims, dims, strides)) { \
         for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
-            out[i] = (DST_TYPENAME) inp[i]; \
+            out[i] = DST_TYPENAME(inp[i]); \
         } \
     } \
     else { \
         for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
             unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \
-            out[i] = (DST_TYPENAME) inp[strided_i]; \
+            out[i] = DST_TYPENAME(inp[strided_i]); \
         } \
     } \
 } \

From 1c1e34735e2d20f20ad17c03f01697ecaae5a8d1 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Fri, 25 Aug 2023 11:40:36 +0000
Subject: [PATCH 3/5] Try `static_cast<DST_TYPENAME>` instead of the functional-style cast

---
 candle-kernels/src/cast.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/candle-kernels/src/cast.cu b/candle-kernels/src/cast.cu
index 3e1f5414..0a2282fc 100644
--- a/candle-kernels/src/cast.cu
+++ b/candle-kernels/src/cast.cu
@@ -13,13 +13,13 @@ extern "C" __global__ void FN_NAME( \
     const size_t *strides = info + num_dims; \
     if (is_contiguous(num_dims, dims, strides)) { \
         for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
-            out[i] = DST_TYPENAME(inp[i]); \
+            out[i] = static_cast<DST_TYPENAME>(inp[i]); \
         } \
     } \
     else { \
         for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
             unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \
-            out[i] = DST_TYPENAME(inp[strided_i]); \
+            out[i] = static_cast<DST_TYPENAME>(inp[strided_i]); \
         } \
     } \
 } \

From be371e827c141e9452b0dd8790209e0b3642648c Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Fri, 25 Aug 2023 11:54:30 +0000
Subject: [PATCH 4/5] An intermediate float cast is necessary for CUDA 11.8

---
 candle-kernels/src/cast.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/candle-kernels/src/cast.cu b/candle-kernels/src/cast.cu
index 0a2282fc..03ca1ec7 100644
--- a/candle-kernels/src/cast.cu
+++ b/candle-kernels/src/cast.cu
@@ -13,13 +13,13 @@ extern "C" __global__ void FN_NAME( \
     const size_t *strides = info + num_dims; \
     if (is_contiguous(num_dims, dims, strides)) { \
         for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
-            out[i] = static_cast<DST_TYPENAME>(inp[i]); \
+            out[i] = (DST_TYPENAME) (float) inp[i]; \
         } \
     } \
     else { \
         for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
             unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \
-            out[i] = static_cast<DST_TYPENAME>(inp[strided_i]); \
+            out[i] = (DST_TYPENAME) (float) inp[strided_i]; \
         } \
     } \
 } \

From d4e75d582506520ba6a76330bba4c14dcdcc19d8 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Fri, 25 Aug 2023 12:01:12 +0000
Subject: [PATCH 5/5] Keep the float-intermediate cast in its own CAST_BF_OP macro

---
 candle-kernels/src/cast.cu | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/candle-kernels/src/cast.cu b/candle-kernels/src/cast.cu
index 03ca1ec7..ab2045a3 100644
--- a/candle-kernels/src/cast.cu
+++ b/candle-kernels/src/cast.cu
@@ -2,6 +2,29 @@
 #include<stdint.h>
 
 #define CAST_OP(SRC_TYPENAME, DST_TYPENAME, FN_NAME) \
+extern "C" __global__ void FN_NAME( /* kernel FN_NAME: copies inp into out, converting SRC_TYPENAME to DST_TYPENAME by plain assignment */ \
+    const size_t numel, /* number of elements written to out */ \
+    const size_t num_dims, /* offset of the strides inside info */ \
+    const size_t *info, /* dims start at info[0], strides at info[num_dims] */ \
+    const SRC_TYPENAME *inp, /* source buffer */ \
+    DST_TYPENAME *out /* destination buffer, always indexed densely by i */ \
+) { \
+    const size_t *dims = info; \
+    const size_t *strides = info + num_dims; /* strides follow the dims in the same array */ \
+    if (is_contiguous(num_dims, dims, strides)) { /* contiguous case: source index equals output index */ \
+        for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { /* grid-stride loop; NOTE(review): i is 32-bit while numel is size_t - confirm numel stays below 2^32 */ \
+            out[i] = inp[i]; /* implicit SRC_TYPENAME -> DST_TYPENAME conversion, no explicit cast */ \
+        } \
+    } \
+    else { \
+        for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
+            unsigned strided_i = get_strided_index(i, num_dims, dims, strides); /* map output index i to the source offset used to read inp */ \
+            out[i] = inp[strided_i]; \
+        } \
+    } \
+} \
+
+#define CAST_BF_OP(SRC_TYPENAME, DST_TYPENAME, FN_NAME) \
 extern "C" __global__ void FN_NAME( \
     const size_t numel, \
     const size_t num_dims, \
@@ -29,14 +52,14 @@ CAST_OP(__nv_bfloat16, __nv_bfloat16, cast_bf16_bf16)
 
 // CAST_OP(__nv_bfloat16, uint8_t, cast_bf16_u8)
 CAST_OP(__nv_bfloat16, uint32_t, cast_bf16_u32)
-CAST_OP(__nv_bfloat16, __half,   cast_bf16_f16)
 CAST_OP(__nv_bfloat16, float,    cast_bf16_f32)
 CAST_OP(__nv_bfloat16, double,   cast_bf16_f64)
 CAST_OP(uint8_t, __nv_bfloat16, cast_u8_bf16)
 CAST_OP(uint32_t, __nv_bfloat16, cast_u32_bf16)
-CAST_OP(__half,   __nv_bfloat16, cast_f16_bf16)
 CAST_OP(float,    __nv_bfloat16, cast_f32_bf16)
 CAST_OP(double,   __nv_bfloat16, cast_f64_bf16)
+CAST_BF_OP(__nv_bfloat16, __half,   cast_bf16_f16)
+CAST_BF_OP(__half,   __nv_bfloat16, cast_f16_bf16)
 #endif
 
 #if __CUDA_ARCH__ >= 530