// Mirror of https://github.com/huggingface/candle.git (synced 2025-06-19).
// From "Kernel build example (#224)": sample custom CUDA kernel used to
// demonstrate building example kernels for custom ops.
#include "reduction_utils.cuh"
// Root-mean-square normalization of one token (row) per thread block.
//
// Launch layout implied by the indexing: blockIdx.x selects the token row,
// and the block's threads cooperatively stride over that row's hidden_size
// elements. blockReduceSum (from reduction_utils.cuh) combines the
// per-thread sum-of-squares partials; thread 0 then publishes the
// normalization scale to the whole block through shared memory.
//
// out     [num_tokens, hidden_size] - normalized, weighted output
// input   [num_tokens, hidden_size] - values to normalize (read twice)
// weight  [hidden_size]             - per-channel scale applied after normalization
// epsilon - added to the mean square for numerical stability
// num_tokens  - row count (unused in the body; kept for the launch signature)
// hidden_size - elements per row
template <typename scalar_t>
__device__ void
rms_norm_kernel(scalar_t *__restrict__ out, // [num_tokens, hidden_size]
                const scalar_t *__restrict__ input, // [num_tokens, hidden_size]
                const scalar_t *__restrict__ weight, // [hidden_size]
                const float epsilon, const int num_tokens,
                const int hidden_size) {
  // Broadcast slot: thread 0 writes the reciprocal RMS, everyone reads it.
  __shared__ float s_inv_rms;

  // First element of the row this block is responsible for.
  const int row_offset = blockIdx.x * hidden_size;

  // Pass 1: per-thread partial sum of squares, accumulated in float
  // regardless of scalar_t to avoid low-precision accumulation.
  float sq_sum = 0.0f;
  for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    const float v = (float)input[row_offset + i];
    sq_sum += v * v;
  }

  // Combine the partials across the block; only thread 0 uses the result.
  sq_sum = blockReduceSum<float>(sq_sum);
  if (threadIdx.x == 0) {
    s_inv_rms = rsqrtf(sq_sum / hidden_size + epsilon);
  }
  // Barrier sits outside the divergent branch; all threads must observe
  // s_inv_rms before the write-back pass.
  __syncthreads();

  // Pass 2: normalize, cast back to scalar_t, then apply the channel weight.
  for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    const float v = (float)input[row_offset + i];
    out[row_offset + i] = ((scalar_t)(v * s_inv_rms)) * weight[i];
  }
}
// Float32 kernel entry point. `extern "C"` keeps the symbol unmangled so it
// can be located by name (e.g. when loaded as PTX/cubin by a host runtime).
// Forwards directly to the templated device implementation.
extern "C" __global__ void rms_norm_kernel_f32(
    float *__restrict__ out,          // [num_tokens, hidden_size]
    const float *__restrict__ input,  // [num_tokens, hidden_size]
    const float *__restrict__ weight, // [hidden_size]
    const float epsilon, const int num_tokens,
    const int hidden_size) {
  rms_norm_kernel<float>(out, input, weight, epsilon, num_tokens, hidden_size);
}