mirror of
https://github.com/huggingface/candle.git
synced 2025-06-19 19:58:35 +00:00
Integrate the kernels bits.
This commit is contained in:
27
kernels/src/affine.cu
Normal file
27
kernels/src/affine.cu
Normal file
@ -0,0 +1,27 @@
|
||||
// Element-wise affine map over an f32 buffer: y[i] = x[i] * mul + add.
//
// Parameters:
//   numel - number of elements to process.
//   x     - input buffer, read at indices [0, numel).
//   y     - output buffer, written at indices [0, numel).
//   mul   - multiplicative coefficient.
//   add   - additive coefficient.
//
// Launch layout: 1D grid / 1D blocks. Each thread handles at most one element
// (the one at its global index); threads whose index is >= numel return
// without touching memory. Elements beyond gridDim.x * blockDim.x are not
// processed, so the launch must cover numel threads.
extern "C" __global__ void affine_f32(
    const size_t numel,
    const float *x,
    float *y,
    const float mul,
    const float add
) {
    // blockIdx.x and blockDim.x are 32-bit unsigned values, so their product
    // must be widened before multiplying; otherwise it wraps for large grids
    // and the wrapped index would slip past the bounds check below.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= numel) {
        return;
    }
    y[idx] = x[idx] * mul + add;
}
|
||||
|
||||
// Element-wise affine map over an f64 buffer: y[i] = x[i] * mul + add.
//
// Parameters:
//   numel - number of elements to process.
//   x     - input buffer, read at indices [0, numel).
//   y     - output buffer, written at indices [0, numel).
//   mul   - multiplicative coefficient.
//   add   - additive coefficient.
//
// Launch layout: 1D grid / 1D blocks. Each thread handles at most one element
// (the one at its global index); threads whose index is >= numel return
// without touching memory. Elements beyond gridDim.x * blockDim.x are not
// processed, so the launch must cover numel threads.
extern "C" __global__ void affine_f64(
    const size_t numel,
    const double *x,
    double *y,
    const double mul,
    const double add
) {
    // blockIdx.x and blockDim.x are 32-bit unsigned values, so their product
    // must be widened before multiplying; otherwise it wraps for large grids
    // and the wrapped index would slip past the bounds check below.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= numel) {
        return;
    }
    y[idx] = x[idx] * mul + add;
}
|
11
kernels/src/fill.cu
Normal file
11
kernels/src/fill.cu
Normal file
@ -0,0 +1,11 @@
|
||||
#include "cuda_fp16.h"
|
||||
|
||||
// Stores `value` into every element of buf[0, numel).
//
// Uses a grid-stride loop: each thread starts at its global 1D index and
// advances by the total number of threads in the grid, so any 1D launch
// configuration covers the whole buffer.
//
// The index and stride are kept in size_t. With a 32-bit index the comparison
// against a size_t `numel` of 2^32 or more could never become false (the index
// wraps first), leaving the loop spinning and the tail of the buffer unwritten.
template<typename T>
__device__ void fill_with(T *buf, T value, const size_t numel) {
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    for (size_t i = first; i < numel; i += stride) {
        buf[i] = value;
    }
}

// Kernel entry points with unmangled (extern "C") names, one per element type.
// Each forwards its arguments unchanged to fill_with.
extern "C" __global__ void fill_f16(__half *buf, __half value, const size_t numel) { fill_with(buf, value, numel); }

extern "C" __global__ void fill_f32(float *buf, float value, const size_t numel) { fill_with(buf, value, numel); }

extern "C" __global__ void fill_f64(double *buf, double value, const size_t numel) { fill_with(buf, value, numel); }
|
2
kernels/src/lib.rs
Normal file
2
kernels/src/lib.rs
Normal file
@ -0,0 +1,2 @@
|
||||
/// Contents of `$OUT_DIR/affine.ptx`, embedded into the crate as a string at
/// compile time.
///
/// NOTE(review): presumably the PTX generated from `affine.cu` by the build
/// script — confirm against `build.rs`.
pub const AFFINE: &str = include_str!(concat!(env!("OUT_DIR"), "/affine.ptx"));
|
||||
/// Contents of `$OUT_DIR/fill.ptx`, embedded into the crate as a string at
/// compile time.
///
/// NOTE(review): presumably the PTX generated from `fill.cu` by the build
/// script — confirm against `build.rs`.
pub const FILL: &str = include_str!(concat!(env!("OUT_DIR"), "/fill.ptx"));
|
Reference in New Issue
Block a user