Add cast_bf16_x/cast_x_bf16 when CUDA_ARCH<800 but CUDA_VERSION >= 11000 (#1919)

- It makes it possible to load bf16 models on T4 (sm75).
2025-06-15 10:26:33 +00:00 · 2024-03-23 20:44:10 +08:00
parent 6f877592a7
commit 790037390c
1 changed files with 12 additions and 0 deletions
--- a/candle-kernels/src/cast.cu
+++ b/candle-kernels/src/cast.cu
@@ -83,6 +83,18 @@ CAST_OP(double,   __nv_bfloat16, cast_f64_bf16)
 CAST_THROUGH_OP(__nv_bfloat16, uint8_t, float, cast_bf16_u8)
 CAST_THROUGH_OP(__nv_bfloat16, __half,   float, cast_bf16_f16)
 CAST_THROUGH_OP(__half,   __nv_bfloat16, float, cast_f16_bf16)
+#else
+#include <cuda.h>
+#if CUDA_VERSION >= 11000
+CAST_OP(__nv_bfloat16, float,    cast_bf16_f32)
+CAST_OP(float,    __nv_bfloat16, cast_f32_bf16)
+CAST_THROUGH_OP(__nv_bfloat16, uint8_t, float, cast_bf16_u8)
+CAST_THROUGH_OP(__nv_bfloat16, __half,  float, cast_bf16_f16)
+CAST_THROUGH_OP(__nv_bfloat16, double,  float, cast_bf16_f64)
+CAST_THROUGH_OP(__half,   __nv_bfloat16, float, cast_f16_bf16)
+CAST_THROUGH_OP(double,   __nv_bfloat16, float, cast_f64_bf16)
+CAST_THROUGH_OP(uint8_t,   __nv_bfloat16, float, cast_u8_bf16)
+#endif
 #endif

 #if __CUDA_ARCH__ >= 530