Refactor the hierarchy.

Nicolas Patry
2023-06-27 11:57:27 +02:00
parent 6c4a960b15
commit d7f729fb8f
41 changed files with 35 additions and 33 deletions


@@ -0,0 +1,37 @@
#include "cuda_utils.cuh"
#include <stdint.h>
#define AFFINE_OP(TYPENAME, FN_NAME) \
extern "C" __global__ void FN_NAME( \
const size_t numel, \
const size_t num_dims, \
const size_t *info, \
const TYPENAME *inp, \
TYPENAME *out, \
const TYPENAME mul, \
const TYPENAME add \
) { \
const size_t *dims = info; \
const size_t *strides = info + num_dims; \
if (is_contiguous(num_dims, dims, strides)) { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
TYPENAME x = inp ? inp[i] : out[i]; \
out[i] = x * mul + add; \
} \
} \
else { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \
TYPENAME x = inp ? inp[strided_i] : out[i]; \
out[i] = x * mul + add; \
} \
} \
}
#if __CUDA_ARCH__ >= 530
AFFINE_OP(__half, affine_f16)
#endif
AFFINE_OP(float, affine_f32)
AFFINE_OP(double, affine_f64)
AFFINE_OP(uint32_t, affine_u32)
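For context on how these kernels are driven, here is a minimal, hypothetical host-side sketch (not part of this commit) that launches affine_f32 on a contiguous 1-D buffer. It assumes the snippet is appended to this file and built with nvcc; the sizes and constants are made up. The info buffer packs the dims followed by the strides, as the macro expects.

// Hypothetical standalone driver for affine_f32; everything except the kernel
// signature is illustrative.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const size_t numel = 8, num_dims = 1;
    std::vector<float> h_inp(numel, 1.0f);
    std::vector<size_t> h_info = {numel, 1}; // dims = {8}, strides = {1}: contiguous

    float *d_inp, *d_out;
    size_t *d_info;
    cudaMalloc(&d_inp, numel * sizeof(float));
    cudaMalloc(&d_out, numel * sizeof(float));
    cudaMalloc(&d_info, h_info.size() * sizeof(size_t));
    cudaMemcpy(d_inp, h_inp.data(), numel * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_info, h_info.data(), h_info.size() * sizeof(size_t), cudaMemcpyHostToDevice);

    // out[i] = inp[i] * 2.0 + 3.0
    affine_f32<<<1, 256>>>(numel, num_dims, d_info, d_inp, d_out, 2.0f, 3.0f);

    std::vector<float> h_out(numel);
    cudaMemcpy(h_out.data(), d_out, numel * sizeof(float), cudaMemcpyDeviceToHost);
    printf("out[0] = %f\n", h_out[0]); // expected: 5.000000
    cudaFree(d_inp); cudaFree(d_out); cudaFree(d_info);
    return 0;
}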


@@ -0,0 +1,22 @@
#include "binary_op_macros.cuh"
#include <stdint.h>
#if __CUDA_ARCH__ >= 530
BINARY_OP(__half, badd_f16, x + y)
BINARY_OP(__half, bdiv_f16, x / y)
BINARY_OP(__half, bmul_f16, x * y)
BINARY_OP(__half, bsub_f16, x - y)
#endif
BINARY_OP(float, badd_f32, x + y)
BINARY_OP(double, badd_f64, x + y)
BINARY_OP(uint32_t, badd_u32, x + y)
BINARY_OP(float, bdiv_f32, x / y)
BINARY_OP(double, bdiv_f64, x / y)
BINARY_OP(uint32_t, bdiv_u32, x / y)
BINARY_OP(float, bmul_f32, x * y)
BINARY_OP(double, bmul_f64, x * y)
BINARY_OP(uint32_t, bmul_u32, x * y)
BINARY_OP(float, bsub_f32, x - y)
BINARY_OP(double, bsub_f64, x - y)
BINARY_OP(uint32_t, bsub_u32, x - y)


@@ -0,0 +1,65 @@
#include "cuda_utils.cuh"
#define BINARY_OP(TYPENAME, FN_NAME, FUNC) \
extern "C" __global__ void FN_NAME( \
const size_t numel, \
const size_t num_dims, \
const size_t *dims_and_strides, \
const TYPENAME *lhs, \
const TYPENAME *rhs, \
TYPENAME *out \
) { \
const size_t *dims = dims_and_strides; \
const size_t *lhs_strides = dims_and_strides + 1 * num_dims; \
const size_t *rhs_strides = dims_and_strides + 2 * num_dims; \
bool lhs_cont = is_contiguous(num_dims, dims, lhs_strides); \
bool rhs_cont = is_contiguous(num_dims, dims, rhs_strides); \
if (lhs_cont && rhs_cont) { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
TYPENAME x = lhs ? lhs[i] : out[i]; \
TYPENAME y = rhs ? rhs[i] : out[i]; \
out[i] = FUNC; \
} \
} else if (lhs_cont) { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
unsigned int tmp_i = i; \
unsigned int rhs_i = 0; \
for (int d = num_dims - 1; d >= 0; d--) { \
unsigned int i_dim = tmp_i % dims[d]; \
rhs_i += i_dim * rhs_strides[d]; \
tmp_i /= dims[d]; \
} \
TYPENAME x = lhs ? lhs[i] : out[i]; \
TYPENAME y = rhs ? rhs[rhs_i] : out[i]; \
out[i] = FUNC; \
} \
} else if (rhs_cont) { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
unsigned int tmp_i = i; \
unsigned int lhs_i = 0; \
for (int d = num_dims - 1; d >= 0; d--) { \
unsigned int i_dim = tmp_i % dims[d]; \
lhs_i += i_dim * lhs_strides[d]; \
tmp_i /= dims[d]; \
} \
TYPENAME x = lhs ? lhs[lhs_i] : out[i]; \
TYPENAME y = rhs ? rhs[i] : out[i]; \
out[i] = FUNC; \
} \
} else { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
unsigned int tmp_i = i; \
unsigned int lhs_i = 0; \
unsigned int rhs_i = 0; \
for (int d = num_dims - 1; d >= 0; d--) { \
unsigned int i_dim = tmp_i % dims[d]; \
lhs_i += i_dim * lhs_strides[d]; \
rhs_i += i_dim * rhs_strides[d]; \
tmp_i /= dims[d]; \
} \
TYPENAME x = lhs ? lhs[lhs_i] : out[i]; \
TYPENAME y = rhs ? rhs[rhs_i] : out[i]; \
out[i] = FUNC; \
} \
} \
}
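To make the strided branches concrete, the hypothetical host-only snippet below (not part of this commit) replays the digit-by-digit index decomposition used for the non-contiguous operand, for a {2, 3} output broadcast against a length-3 row vector (stride 0 along dim 0).

// Same arithmetic as the lhs_cont branch above: peel off one coordinate per
// dimension, innermost first, and accumulate it against the rhs strides.
#include <cstddef>
#include <cstdio>

static unsigned int rhs_offset(unsigned int i, int num_dims,
                               const size_t *dims, const size_t *rhs_strides) {
    unsigned int tmp_i = i, rhs_i = 0;
    for (int d = num_dims - 1; d >= 0; d--) {
        unsigned int i_dim = tmp_i % dims[d];
        rhs_i += i_dim * rhs_strides[d];
        tmp_i /= dims[d];
    }
    return rhs_i;
}

int main() {
    const size_t dims[2] = {2, 3};        // output shape
    const size_t rhs_strides[2] = {0, 1}; // rhs broadcast over dim 0
    for (unsigned int i = 0; i < 6; i++) {
        printf("out %u <- rhs %u\n", i, rhs_offset(i, 2, dims, rhs_strides));
    }
    // Prints rhs offsets 0 1 2 0 1 2: both output rows read the same vector.
    return 0;
}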


@@ -0,0 +1,48 @@
#include "cuda_utils.cuh"
#include <stdint.h>
#define CAST_OP(SRC_TYPENAME, DST_TYPENAME, FN_NAME) \
extern "C" __global__ void FN_NAME( \
const size_t numel, \
const size_t num_dims, \
const size_t *info, \
const SRC_TYPENAME *inp, \
DST_TYPENAME *out \
) { \
const size_t *dims = info; \
const size_t *strides = info + num_dims; \
if (is_contiguous(num_dims, dims, strides)) { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
out[i] = inp[i]; \
} \
} \
else { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \
out[i] = inp[strided_i]; \
} \
} \
}
#if __CUDA_ARCH__ >= 530
CAST_OP(__half, __half, cast_f16_f16)
CAST_OP(__half, uint32_t, cast_f16_u32)
CAST_OP(__half, float, cast_f16_f32)
CAST_OP(__half, double, cast_f16_f64)
CAST_OP(uint32_t, __half, cast_u32_f16)
CAST_OP(float, __half, cast_f32_f16)
CAST_OP(double, __half, cast_f64_f16)
#endif
CAST_OP(uint32_t, uint32_t, cast_u32_u32)
CAST_OP(uint32_t, float, cast_u32_f32)
CAST_OP(uint32_t, double, cast_u32_f64)
CAST_OP(float, uint32_t, cast_f32_u32)
CAST_OP(float, float, cast_f32_f32)
CAST_OP(float, double, cast_f32_f64)
CAST_OP(double, uint32_t, cast_f64_u32)
CAST_OP(double, float, cast_f64_f32)
CAST_OP(double, double, cast_f64_f64)


@@ -0,0 +1,171 @@
#include "cuda_fp16.h"
// Table showing which features are supported on which compute capability
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
// FIXME: the minimum compute capabilities are just guesses since the table is not specific enough
// #if __CUDA_ARCH__ < 600
// __device__ __forceinline__ __half __hmax(__half a, __half b) {
// return __float2half(fmaxf(__half2float(a), __half2float(b)));
// }
// __device__ __forceinline__ __half __hmin(__half a, __half b) {
// return __float2half(fminf(__half2float(a), __half2float(b)));
// }
// #endif
#if __CUDA_ARCH__ < 800
__device__ __forceinline__ __half __hmax_nan(__half a, __half b) {
    return __hisnan(a) ? a : (__hisnan(b) ? b : __hmax(a, b));
}
__device__ __forceinline__ __half __hmin_nan(__half a, __half b) {
    return __hisnan(a) ? a : (__hisnan(b) ? b : __hmin(a, b));
}
#endif
#if __CUDA_ARCH__ < 600
// Copied from https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
__device__ double atomicAdd(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
#if __CUDA_ARCH__ < 700
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd
// The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
// Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119
__device__ __half atomicAdd(__half *address, __half val) {
    unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;
    bool unaligned = (size_t) address & 2;
    do {
        assumed = old;
        unsigned int hsum;
        hsum = unaligned ? (old >> 16) : (old & 0xffff);
        hsum = __half_as_ushort(__ushort_as_half(hsum) + val);
        old = atomicCAS(address_as_ui, assumed,
            unaligned ? (old & 0xffff) | (hsum << 16) : (old & 0xffff0000) | hsum
        );
    } while (assumed != old);
    return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff));
}
#endif
__device__ __forceinline__ __half atomicMaxf(__half* address, __half val) {
#if __CUDA_ARCH__ < 700
// On older GPUs we do not have access to atomicCAS for shorts, so we have to do some trickery.
// Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119
unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
unsigned int old = *address_as_ui;
unsigned int assumed;
bool unaligned = (size_t) address & 2;
do {
assumed = old;
unsigned int hmax;
hmax = unaligned ? (old >> 16) : (old & 0xffff);
hmax = __half_as_ushort(__hmax_nan(val, __ushort_as_half(hmax)));
old = atomicCAS(address_as_ui, assumed,
unaligned ? (old & 0xffff) | (hmax << 16) : (old & 0xffff0000) | hmax
);
} while (assumed != old);
return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff));
#else
// Based on https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
unsigned short int* casted_address = (unsigned short int*)address;
unsigned short int old = *casted_address;
unsigned short int assumed;
do {
assumed = old;
old = atomicCAS(casted_address, assumed, __half_as_ushort(__hmax_nan(val, __ushort_as_half(assumed))));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __ushort_as_half(old);
#endif
}
// atomicMax is not implemented for floats,
// solution copied https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
__device__ __forceinline__ float atomicMaxf(float * addr, float value) {
if (signbit(value)) {
return __uint_as_float(atomicMin((unsigned int *)addr, __float_as_uint(value)));
} else {
return __int_as_float(atomicMax((int *)addr, __float_as_int(value)));
}
}
__device__ __forceinline__ double atomicMaxf(double * addr, double value) {
if (signbit(value)) {
return __longlong_as_double(atomicMin((unsigned long long int *)addr, __double_as_longlong(value)));
} else {
return __longlong_as_double(atomicMax((long long int *)addr, __double_as_longlong(value)));
}
}
__device__ __forceinline__ __half atomicMinf(__half* address, __half val) {
#if __CUDA_ARCH__ < 700
// On older GPUs we do not have access to atomicCAS for shorts, so we have to do some trickery.
// Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119
unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
unsigned int old = *address_as_ui;
unsigned int assumed;
bool unaligned = (size_t) address & 2;
do {
assumed = old;
unsigned int hmin;
hmin = unaligned ? (old >> 16) : (old & 0xffff);
hmin = __half_as_ushort(__hmin_nan(val, __ushort_as_half(hmin)));
old = atomicCAS(address_as_ui, assumed,
unaligned ? (old & 0xffff) | (hmin << 16) : (old & 0xffff0000) | hmin
);
} while (assumed != old);
return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff));
#else
// Based on https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
unsigned short int* casted_address = (unsigned short int*)address;
unsigned short int old = *casted_address;
unsigned short int assumed;
do {
assumed = old;
old = atomicCAS(casted_address, assumed, __half_as_ushort(__hmin_nan(val, __ushort_as_half(assumed))));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __ushort_as_half(old);
#endif
}
// atomicMin is not implemented for floats,
// solution copied https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
__device__ __forceinline__ float atomicMinf(float * addr, float value) {
if (signbit(value)) {
return __uint_as_float(atomicMax((unsigned int *)addr, __float_as_uint(value)));
} else {
return __int_as_float(atomicMin((int *)addr, __float_as_int(value)));
}
}
__device__ __forceinline__ double atomicMinf(double * addr, double value) {
if (signbit(value)) {
return __longlong_as_double(atomicMax((unsigned long long int *)addr, __double_as_longlong(value)));
} else {
return __longlong_as_double(atomicMin((long long int *)addr, __double_as_longlong(value)));
}
}


@@ -0,0 +1,158 @@
#include "cuda_fp16.h"
#include "compatibility.cuh"
// TODO: This is often used to check that the data is contiguous so that
// kernels can be easily mapped. However this only returns true for row
// major, if all the inputs are column major, we could apply the fast path
// too (but we wouldn't if some of them are row major and some column major).
__device__ bool is_contiguous(
const size_t num_dims,
const size_t *dims,
const size_t *strides
) {
size_t acc = 1;
for (unsigned int d = 0; d < num_dims; d++) {
unsigned int dim_idx = num_dims - 1 - d;
if (acc != strides[dim_idx]) {
return false;
}
acc *= dims[dim_idx];
}
return true;
}
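// In other words, acc walks the expected row-major strides from the innermost
// dimension outwards. A hypothetical device-side check (not part of this commit,
// assuming it is compiled together with this header):
__global__ void contiguity_demo(bool *res) {
    const size_t dims[2] = {2, 3};
    const size_t row_major[2]  = {3, 1}; // natural row-major layout
    const size_t transposed[2] = {1, 2}; // same buffer viewed through a transpose
    res[0] = is_contiguous(2, dims, row_major);  // true: strides match the expected {3, 1}
    res[1] = is_contiguous(2, dims, transposed); // false: innermost stride is 2, not 1
}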
__device__ unsigned int get_strided_index(
unsigned int idx,
const size_t num_dims,
const size_t *dims,
const size_t *strides
) {
unsigned int strided_i = 0;
for (unsigned int d = 0; d < num_dims; d++) {
unsigned int dim_idx = num_dims - 1 - d;
strided_i += (idx % dims[dim_idx]) * strides[dim_idx];
idx /= dims[dim_idx];
}
return strided_i;
}
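// And a hypothetical companion kernel (not part of this commit) showing where
// each linear index of the transposed 2x3 view above lands in the underlying storage:
__global__ void strided_index_demo(unsigned int *offsets) {
    const size_t dims[2] = {2, 3};
    const size_t transposed[2] = {1, 2};
    for (unsigned int i = 0; i < 6; i++) {
        offsets[i] = get_strided_index(i, 2, dims, transposed);
    }
    // offsets = {0, 2, 4, 1, 3, 5}: the underlying 3x2 buffer is read column by column.
}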
__device__ unsigned int restrided(
const unsigned int strided_i,
const size_t num_dims,
const size_t *dims,
const size_t *strides,
const size_t *new_strides
) {
unsigned int idx = 0;
for (unsigned int d = 0; d < num_dims; d++) {
idx += (strides[d] == 0 ? 0 : (strided_i / strides[d]) % dims[d]) * new_strides[d];
}
return idx;
}
// Sourced from https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
// Input must be less than or equal to 2 ^ 16
// used in reductions
__device__ __forceinline__ unsigned int next_power_of_two(unsigned int v) {
v--;
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v++;
return v;
}
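// A few sanity values, shown as a hypothetical device-side check (not part of
// this commit) since the helper is __device__-only:
__global__ void next_pow2_demo(unsigned int *out) {
    out[0] = next_power_of_two(6);    // 8
    out[1] = next_power_of_two(8);    // 8: powers of two map to themselves
    out[2] = next_power_of_two(1000); // 1024
}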
// Efficiently computes the sum of each chunk in "data" of size chunk_len, and
// stores the sums in out[i / chunk_len]
template<typename T>
__device__ void chunk_sum(
const size_t chunk_len,
const T data,
T* out
) {
__shared__ T buf[1024];
// assumes that threads where i >= numel have already exited
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int block_i = threadIdx.x;
// Fall back to atomicAdd if chunk_len is small to reduce overhead
if (chunk_len <= 2) {
atomicAdd(out + i / chunk_len, data);
return;
}
buf[block_i] = data;
unsigned int chunk_i = i % chunk_len;
unsigned int chunk_start = max((int)(block_i - chunk_i), 0);
unsigned int chunk_end = min((unsigned int)(block_i + chunk_len - chunk_i), blockDim.x);
chunk_i = block_i - chunk_start;
size_t max_chunk_len = min(chunk_end - chunk_start, blockDim.x);
size_t incr = next_power_of_two(max_chunk_len) >> 1;
__syncthreads();
// Uses sequential addressing as discussed in
// https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
for (; incr > 0; incr >>= 1) {
unsigned int block_i_2 = block_i + incr;
if (block_i_2 < chunk_end && chunk_i < incr) {
// This is sound because __syncthreads and the conditions above
// ensure that no data races occur
buf[block_i] += buf[block_i_2];
}
__syncthreads();
}
if (block_i == chunk_start) {
atomicAdd(out + i / chunk_len, buf[block_i]);
}
}
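// Usage sketch (hypothetical wrapper, not part of this commit): each thread
// contributes one value, values that share a chunk and a block are reduced in
// shared memory, and one atomicAdd per (chunk, block) pair folds the partial
// sums into out, which therefore has to be zero-initialized.
// Sums `inp` in groups of `chunk_len` consecutive elements into out[i / chunk_len].
template<typename T>
__global__ void chunk_sum_demo(const size_t chunk_len, const size_t numel,
                               const T *inp, T *out) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numel) {
        return; // chunk_sum assumes out-of-range threads have already exited
    }
    chunk_sum(chunk_len, inp[i], out);
}
// e.g. chunk_sum_demo<<<(numel + 255) / 256, 256>>>(4, numel, d_inp, d_out)
// reduces every 4 consecutive floats into one element of d_out.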
__device__ __forceinline__ bool isnang(float a) { return isnan(a); }
__device__ __forceinline__ bool isnang(double a) { return isnan(a); }
__device__ __forceinline__ float recipg(float a) { return 1.0 / a; }
__device__ __forceinline__ double recipg(double a) { return 1.0 / a; }
__device__ __forceinline__ float cosg(float a) { return cosf(a); }
__device__ __forceinline__ double cosg(double a) { return cos(a); }
__device__ __forceinline__ float sing(float a) { return sinf(a); }
__device__ __forceinline__ double sing(double a) { return sin(a); }
__device__ __forceinline__ float sqrtg(float a) { return sqrtf(a); }
__device__ __forceinline__ double sqrtg(double a) { return sqrt(a); }
__device__ __forceinline__ float powg(float a, float b) { return powf(a, b); }
__device__ __forceinline__ double powg(double a, double b) { return pow(a, b); }
__device__ __forceinline__ float tanhg(float a) { return tanhf(a); }
__device__ __forceinline__ double tanhg(double a) { return tanh(a); }
__device__ __forceinline__ float maxg(float a, float b) { return fmaxf(a, b); }
__device__ __forceinline__ double maxg(double a, double b) { return fmax(a, b); }
__device__ __forceinline__ float ming(float a, float b) { return fminf(a, b); }
__device__ __forceinline__ double ming(double a, double b) { return fmin(a, b); }
__device__ __forceinline__ float logg(float a) { return logf(a); }
__device__ __forceinline__ double logg(double a) { return log(a); }
__device__ __forceinline__ float expg(float a) { return expf(a); }
__device__ __forceinline__ double expg(double a) { return exp(a); }
__device__ __forceinline__ float absg(float a) { return fabsf(a); }
__device__ __forceinline__ double absg(double a) { return fabs(a); }
__device__ __forceinline__ float copysigng(float a, float b) { return copysignf(a, b); }
__device__ __forceinline__ double copysigng(double a, double b) { return copysign(a, b); }
#if __CUDA_ARCH__ >= 530
__device__ __forceinline__ __half powg(__half a, __half b) { return __float2half(powf(__half2float(a), __half2float(b))); }
__device__ __forceinline__ bool isnang(__half a) { return __hisnan(a); }
__device__ __forceinline__ __half sqrtg(__half a) { return hsqrt(a); }
__device__ __forceinline__ __half cosg(__half a) { return hcos(a); }
__device__ __forceinline__ __half sing(__half a) { return hsin(a); }
__device__ __forceinline__ __half recipg(__half a) { __half one = 1.0; return one / a; }
__device__ __forceinline__ __half maxg(__half a, __half b) { return __hmax_nan(a, b); }
__device__ __forceinline__ __half tanhg(__half a) { return __float2half(tanhf(__half2float(a))); }
__device__ __forceinline__ __half ming(__half a, __half b) { return __hmin_nan(a, b); }
__device__ __forceinline__ __half logg(__half a) { return hlog(a); }
__device__ __forceinline__ __half expg(__half a) { return hexp(a); }
__device__ __forceinline__ __half absg(__half a) { return __habs(a); }
__device__ __forceinline__ __half copysigng(__half a, __half b) { return __float2half(copysignf(__half2float(a), __half2float(b))); }
#endif


@@ -0,0 +1,38 @@
// WARNING: THIS IS ONLY VALID ASSUMING THAT inp IS CONTIGUOUS!
// TODO: proper error reporting when ids are larger than v_size.
#include "cuda_utils.cuh"
#include <stdint.h>
#define EMB_OP(TYPENAME, FN_NAME) \
extern "C" __global__ void FN_NAME( \
const size_t numel, \
const size_t num_dims, \
const size_t *info, \
const uint32_t *ids, \
const TYPENAME *inp, \
TYPENAME *out, \
const size_t h_size, \
const size_t v_size \
) { \
const size_t *dims = info; \
const size_t *strides = info + num_dims; \
if (is_contiguous(num_dims, dims, strides)) { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
memcpy(&out[i * h_size], &inp[ids[i] * h_size], h_size * sizeof(TYPENAME)); \
} \
} \
else { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \
memcpy(&out[i * h_size], &inp[ids[strided_i] * h_size], h_size * sizeof(TYPENAME)); \
} \
} \
}
#if __CUDA_ARCH__ >= 530
EMB_OP(__half, emb_f16)
#endif
EMB_OP(float, emb_f32)
EMB_OP(double, emb_f64)
EMB_OP(uint32_t, emb_u32)
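A minimal host-side sketch of the calling convention (hypothetical helper name and sizes, not part of this commit; assumes it is appended to this file): ids is treated as a contiguous 1-D tensor, and each thread copies one h_size-long row of the (v_size x h_size) table.

#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

void gather_rows_f32(const uint32_t *d_ids, const float *d_table, float *d_out,
                     size_t n_ids, size_t h_size, size_t v_size) {
    // ids layout: dims = {n_ids}, strides = {1}.
    std::vector<size_t> h_info = {n_ids, 1};
    size_t *d_info;
    cudaMalloc(&d_info, h_info.size() * sizeof(size_t));
    cudaMemcpy(d_info, h_info.data(), h_info.size() * sizeof(size_t), cudaMemcpyHostToDevice);
    unsigned int blocks = (unsigned int)((n_ids + 255) / 256);
    // Each thread i copies row ids[i] of the table into row i of out.
    emb_f32<<<blocks, 256>>>(n_ids, 1, d_info, d_ids, d_table, d_out, h_size, v_size);
    cudaDeviceSynchronize();
    cudaFree(d_info);
}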


@@ -0,0 +1,11 @@
#include "cuda_fp16.h"
template<typename T>
__device__ void fill_with(T *buf, T value, const size_t numel) {
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) {
buf[i] = value;
}
}
extern "C" __global__ void fill_f16(__half *buf, __half value, const size_t numel) { fill_with(buf, value, numel); }
extern "C" __global__ void fill_f32(float *buf, float value, const size_t numel) { fill_with(buf, value, numel); }
extern "C" __global__ void fill_f64(double *buf, double value, const size_t numel) { fill_with(buf, value, numel); }


@@ -0,0 +1,8 @@
pub const AFFINE: &str = include_str!(concat!(env!("OUT_DIR"), "/affine.ptx"));
pub const BINARY: &str = include_str!(concat!(env!("OUT_DIR"), "/binary.ptx"));
pub const CAST: &str = include_str!(concat!(env!("OUT_DIR"), "/cast.ptx"));
pub const EMBEDDINGS: &str = include_str!(concat!(env!("OUT_DIR"), "/embeddings.ptx"));
pub const FILL: &str = include_str!(concat!(env!("OUT_DIR"), "/fill.ptx"));
pub const REDUCE: &str = include_str!(concat!(env!("OUT_DIR"), "/reduce.ptx"));
pub const TERNARY: &str = include_str!(concat!(env!("OUT_DIR"), "/ternary.ptx"));
pub const UNARY: &str = include_str!(concat!(env!("OUT_DIR"), "/unary.ptx"));


@@ -0,0 +1,52 @@
// TODO: Use a proper distributed reduction rather than atomicAdd.
// https://people.maths.ox.ac.uk/gilesm/cuda/prac4/reduction.pdf
#include "cuda_utils.cuh"
#include <stdint.h>
#define SUM_OP(TYPENAME, FN_NAME) \
extern "C" __global__ void FN_NAME( \
const size_t numel, \
const size_t num_dims, \
const size_t num_sum_dims, \
const size_t *info, \
const TYPENAME *inp, \
TYPENAME *out \
) { \
const size_t *dims = info; \
const size_t *strides = info + num_dims; \
const size_t *sum_dims_l = info + 2*num_dims; \
const size_t *sum_dims_s = info + 2*num_dims + num_sum_dims; \
if (is_contiguous(num_dims, dims, strides)) { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
size_t dst_index = i; \
for (unsigned int nd = 0; nd < num_sum_dims; ++nd) { \
size_t stride = sum_dims_s[nd]; \
size_t pre = dst_index / stride; \
size_t post = dst_index % stride; \
dst_index = (pre / sum_dims_l[nd]) * stride + post; \
} \
atomicAdd(out + dst_index, inp[i]); \
} \
} \
else { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \
size_t dst_index = i; \
for (unsigned int nd = 0; nd < num_sum_dims; ++nd) { \
size_t stride = sum_dims_s[nd]; \
size_t pre = dst_index / stride; \
size_t post = dst_index % stride; \
dst_index = (pre / sum_dims_l[nd]) * stride + post; \
} \
atomicAdd(out + dst_index, inp[strided_i]); \
} \
} \
}
#if __CUDA_ARCH__ >= 530
SUM_OP(__half, sum_f16)
#endif
SUM_OP(float, sum_f32)
SUM_OP(double, sum_f64)
SUM_OP(uint32_t, sum_u32)
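The dst_index loop collapses each summed dimension out of the source index: sum_dims_l[nd] is the length of that dimension and sum_dims_s[nd] its stride. Below is a hypothetical host-side replay (not part of this commit) for a contiguous 2x3 input summed over its last dimension (length 3, stride 1).

#include <cstdio>

int main() {
    const size_t sum_dims_l[1] = {3}; // length of the summed dimension
    const size_t sum_dims_s[1] = {1}; // its row-major stride
    for (size_t i = 0; i < 6; i++) {
        size_t dst_index = i;
        for (unsigned int nd = 0; nd < 1; ++nd) {
            size_t stride = sum_dims_s[nd];
            size_t pre = dst_index / stride;
            size_t post = dst_index % stride;
            dst_index = (pre / sum_dims_l[nd]) * stride + post;
        }
        printf("inp %zu -> out %zu\n", i, dst_index);
    }
    // Elements 0..2 accumulate into out[0] and 3..5 into out[1]:
    // one output element per row of the 2x3 input.
    return 0;
}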


@@ -0,0 +1,41 @@
#include "cuda_utils.cuh"
#include <stdint.h>
#define WHERE_OP(TYPENAME, FN_NAME) \
extern "C" __global__ void FN_NAME( \
const size_t numel, \
const size_t num_dims, \
const size_t *info, \
const uint32_t *ids, \
const TYPENAME *t, \
const TYPENAME *f, \
TYPENAME *out \
) { \
const size_t *dims = info; \
const size_t *strides = info + num_dims; \
const size_t *strides_t = info + 2*num_dims; \
const size_t *strides_f = info + 3*num_dims; \
if (is_contiguous(num_dims, dims, strides) \
&& is_contiguous(num_dims, dims, strides_f) \
&& is_contiguous(num_dims, dims, strides_t)) { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
out[i] = ids[i] ? t[i] : f[i]; \
} \
} \
else { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \
unsigned strided_i_t = get_strided_index(i, num_dims, dims, strides_t); \
unsigned strided_i_f = get_strided_index(i, num_dims, dims, strides_f); \
out[i] = ids[strided_i] ? t[strided_i_t] : f[strided_i_f]; \
} \
} \
}
#if __CUDA_ARCH__ >= 530
WHERE_OP(__half, where_f16)
#endif
WHERE_OP(float, where_f32)
WHERE_OP(double, where_f64)
WHERE_OP(uint32_t, where_u32)


@@ -0,0 +1,69 @@
#include "cuda_utils.cuh"
#define UNARY_OP(TYPENAME, FN_NAME, FUNC) \
extern "C" __global__ void FN_NAME( \
const size_t numel, \
const size_t num_dims, \
const size_t *info, \
const TYPENAME *inp, \
TYPENAME *out \
) { \
const size_t *dims = info; \
const size_t *strides = info + num_dims; \
if (is_contiguous(num_dims, dims, strides)) { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
TYPENAME x = inp ? inp[i] : out[i]; \
out[i] = FUNC; \
} \
} \
else { \
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \
unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \
TYPENAME x = inp ? inp[strided_i] : out[i]; \
out[i] = FUNC; \
} \
} \
}
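// Note on gelu_fwd below: M_2_SQRTPI * M_SQRT1_2 == (2 / sqrt(pi)) * (1 / sqrt(2))
// == sqrt(2 / pi), the coefficient of the usual tanh approximation of GELU:
// gelu(x) ~ 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))).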
template<typename T>
__device__ T gelu_fwd(T x) {
T x_sq = x * x;
T x_cube = x_sq * x;
T alpha = x + static_cast<T>(0.044715) * x_cube;
return static_cast<T>(0.5) * x * (static_cast<T>(1.0) + tanhg(static_cast<T>(M_2_SQRTPI * M_SQRT1_2) * alpha));
}
#if __CUDA_ARCH__ >= 530
UNARY_OP(__half, ucopy_f16, x)
UNARY_OP(__half, uneg_f16, -x)
UNARY_OP(__half, uexp_f16, expg(x))
UNARY_OP(__half, ulog_f16, logg(x))
UNARY_OP(__half, usin_f16, sing(x))
UNARY_OP(__half, ucos_f16, cosg(x))
UNARY_OP(__half, uabs_f16, absg(x))
UNARY_OP(__half, usqr_f16, x*x)
UNARY_OP(__half, usqrt_f16, sqrtg(x))
UNARY_OP(__half, gelu_f16, gelu_fwd(x))
#endif
UNARY_OP(float, ucopy_f32, x)
UNARY_OP(double, ucopy_f64, x)
UNARY_OP(float, uneg_f32, -x)
UNARY_OP(double, uneg_f64, -x)
UNARY_OP(float, uexp_f32, expg(x))
UNARY_OP(double, uexp_f64, expg(x))
UNARY_OP(float, ulog_f32, logg(x))
UNARY_OP(double, ulog_f64, logg(x))
UNARY_OP(float, usin_f32, sing(x))
UNARY_OP(double, usin_f64, sing(x))
UNARY_OP(float, ucos_f32, cosg(x))
UNARY_OP(double, ucos_f64, cosg(x))
UNARY_OP(float, uabsg_f32, absg(x))
UNARY_OP(double, uabsg_f64, absg(x))
UNARY_OP(float, usqr_f32, x*x)
UNARY_OP(double, usqr_f64, x*x)
UNARY_OP(float, usqrt_f32, sqrtg(x))
UNARY_OP(double, usqrt_f64, sqrtg(x))
UNARY_OP(float, gelu_f32, gelu_fwd(x))
UNARY_OP(double, gelu_f64, gelu_fwd(x))