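//! Quantized tensor storage and kernels for the CUDA backend.
//!
//! Tensors are kept in GGML block formats (`Q4_0`, `Q6K`, ...) and are either
//! dequantized into f32/f16 buffers or multiplied directly through quantized
//! matmul kernels that mirror the ggml/llama.cpp implementation.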

use super::{GgmlDType, QStorage};
use crate::quantized::k_quants::GgmlType;
use crate::{backend::BackendDevice, cuda_backend::WrapErr};
use crate::{builder_arg as barg, CudaDevice, CudaStorage, Result};
use half::f16;

use cudarc::driver::{CudaSlice, CudaView, PushKernelArg};
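
/// A device buffer with extra bytes allocated past the logical end (see
/// `MATRIX_ROW_PADDING`) so the matmul kernels can safely read whole blocks;
/// `len` is the size in bytes of the unpadded data.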
#[derive(Clone, Debug)]
struct PaddedCudaSlice {
    inner: CudaSlice<u8>,
    len: usize,
}
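
/// Quantized storage living on a CUDA device, together with its GGML dtype.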
#[derive(Clone, Debug)]
pub struct QCudaStorage {
    data: PaddedCudaSlice,
    dtype: GgmlDType,
    device: CudaDevice,
}
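
/// When set, fall back to the kernels that dequantize the weights on the fly
/// rather than quantizing the activations to Q8_1 (see `QCudaStorage::fwd`).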
static FORCE_DMMV: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
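
/// Selects the fallback "dmmv" (dequantize-mul-mat-vec) path; mostly useful
/// for debugging or benchmarking against the Q8_1 kernels.
///
/// A minimal sketch, assuming this module is reachable as
/// `candle_core::quantized::cuda` (check the re-exports of your crate
/// version):
///
/// ```ignore
/// candle_core::quantized::cuda::set_force_dmmv(true);
/// ```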
pub fn set_force_dmmv(f: bool) {
    FORCE_DMMV.store(f, std::sync::atomic::Ordering::Relaxed)
}

pub const WARP_SIZE: usize = 32;
pub const MMQ_X_Q4_0_AMPERE: usize = 4;
pub const MMQ_Y_Q4_0_AMPERE: usize = 32;
pub const NWARPS_Q4_0_AMPERE: usize = 4;
pub const GGML_CUDA_MMV_X: usize = 32;
pub const GGML_CUDA_MMV_Y: usize = 1;
pub const CUDA_QUANTIZE_BLOCK_SIZE: usize = 256;
pub const CUDA_DEQUANTIZE_BLOCK_SIZE: usize = 256;
pub const MATRIX_ROW_PADDING: usize = 512;

fn ceil_div(p: usize, q: usize) -> usize {
    p.div_ceil(q)
}
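
/// Rounds `p` up to the next multiple of `q`.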
fn pad(p: usize, q: usize) -> usize {
    ceil_div(p, q) * q
}
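
/// Quantizes `ky` rows of `elem_count` f32 values each into Q8_1 blocks on
/// the device, with each row padded to a multiple of `MATRIX_ROW_PADDING`.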
fn quantize_q8_1(
    src: &CudaView<f32>,
    dst: &mut CudaSlice<u8>,
    elem_count: usize,
    ky: usize,
    dev: &CudaDevice,
) -> Result<()> {
    let kx = elem_count;
    let kx_padded = pad(kx, MATRIX_ROW_PADDING);
    let num_blocks = ceil_div(kx_padded, CUDA_QUANTIZE_BLOCK_SIZE);
    let func = dev.get_or_load_func("quantize_q8_1", &candle_kernels::QUANTIZED)?;
    let cfg = cudarc::driver::LaunchConfig {
        grid_dim: (num_blocks as u32, ky as u32, 1),
        block_dim: (CUDA_QUANTIZE_BLOCK_SIZE as u32, 1, 1),
        shared_mem_bytes: 0,
    };
    let mut builder = func.builder();
    builder.arg(src);
    builder.arg(dst);
    barg!(builder, kx as i32, kx_padded as i32);
    unsafe { builder.launch(cfg) }.w()?;
    Ok(())
}
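
/// Dequantizes `elem_count` elements of `data` into a new f32 device buffer,
/// dispatching on `dtype` to the matching CUDA kernel.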
fn dequantize_f32(
    data: &PaddedCudaSlice,
    dtype: GgmlDType,
    elem_count: usize,
    dev: &CudaDevice,
) -> Result<CudaStorage> {
    let nb = elem_count.div_ceil(256);
    let (kernel_name, is_k, block_dim, num_blocks) = match dtype {
        GgmlDType::Q4_0 => ("dequantize_block_q4_0_f32", false, 32, nb),
        GgmlDType::Q4_1 => ("dequantize_block_q4_1_f32", false, 32, nb),
        GgmlDType::Q5_0 => (
            "dequantize_block_q5_0_f32",
            false,
            CUDA_DEQUANTIZE_BLOCK_SIZE,
            ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
        ),
        GgmlDType::Q5_1 => (
            "dequantize_block_q5_1_f32",
            false,
            CUDA_DEQUANTIZE_BLOCK_SIZE,
            ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
        ),
        GgmlDType::Q8_0 => ("dequantize_block_q8_0_f32", false, 32, nb),
        GgmlDType::Q2K => ("dequantize_block_q2_K_f32", true, 64, nb),
        GgmlDType::Q3K => ("dequantize_block_q3_K_f32", true, 64, nb),
        GgmlDType::Q4K => ("dequantize_block_q4_K_f32", true, 32, nb),
        GgmlDType::Q5K => ("dequantize_block_q5_K_f32", true, 64, nb),
        GgmlDType::Q6K => ("dequantize_block_q6_K_f32", true, 64, nb),
        GgmlDType::Q8K => ("dequantize_block_q8_K_f32", true, 32, nb),
        _ => crate::bail!("unsupported dtype for dequantize {dtype:?}"),
    };
    let func = dev.get_or_load_func(kernel_name, &candle_kernels::QUANTIZED)?;
    let dst = unsafe { dev.alloc::<f32>(elem_count)? };
    // See e.g.
    // https://github.com/ggerganov/llama.cpp/blob/cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b/ggml-cuda.cu#L7270
    let cfg = cudarc::driver::LaunchConfig {
        grid_dim: (num_blocks as u32, 1, 1),
        block_dim: (block_dim as u32, 1, 1),
        shared_mem_bytes: 0,
    };

    if is_k {
        let mut builder = func.builder();
        builder.arg(&data.inner);
        builder.arg(&dst);
        unsafe { builder.launch(cfg) }.w()?;
    } else {
        let nb32 = match dtype {
            GgmlDType::Q5_0 | GgmlDType::Q5_1 => elem_count,
            _ => elem_count / 32,
        };
        let mut builder = func.builder();
        builder.arg(&data.inner);
        builder.arg(&dst);
        barg!(builder, nb32 as i32);
        unsafe { builder.launch(cfg) }.w()?;
    }
    Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
}
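
/// Same as `dequantize_f32` but producing an f16 output buffer.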
fn dequantize_f16(
    data: &PaddedCudaSlice,
    dtype: GgmlDType,
    elem_count: usize,
    dev: &CudaDevice,
) -> Result<CudaStorage> {
    let nb = elem_count.div_ceil(256);
    let (kernel_name, is_k, block_dim, num_blocks) = match dtype {
        GgmlDType::Q4_0 => ("dequantize_block_q4_0_f16", false, 32, nb),
        GgmlDType::Q4_1 => ("dequantize_block_q4_1_f16", false, 32, nb),
        GgmlDType::Q5_0 => (
            "dequantize_block_q5_0_f16",
            false,
            CUDA_DEQUANTIZE_BLOCK_SIZE,
            ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
        ),
        GgmlDType::Q5_1 => (
            "dequantize_block_q5_1_f16",
            false,
            CUDA_DEQUANTIZE_BLOCK_SIZE,
            ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
        ),
        GgmlDType::Q8_0 => ("dequantize_block_q8_0_f16", false, 32, nb),
        GgmlDType::Q2K => ("dequantize_block_q2_K_f16", true, 64, nb),
        GgmlDType::Q3K => ("dequantize_block_q3_K_f16", true, 64, nb),
        GgmlDType::Q4K => ("dequantize_block_q4_K_f16", true, 32, nb),
        GgmlDType::Q5K => ("dequantize_block_q5_K_f16", true, 64, nb),
        GgmlDType::Q6K => ("dequantize_block_q6_K_f16", true, 64, nb),
        GgmlDType::Q8K => ("dequantize_block_q8_K_f16", true, 32, nb),
        _ => crate::bail!("unsupported dtype for dequantize {dtype:?}"),
    };
    let func = dev.get_or_load_func(kernel_name, &candle_kernels::QUANTIZED)?;
    let dst = unsafe { dev.alloc::<f16>(elem_count)? };
    // See e.g.
    // https://github.com/ggerganov/llama.cpp/blob/cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b/ggml-cuda.cu#L7270
    let cfg = cudarc::driver::LaunchConfig {
        grid_dim: (num_blocks as u32, 1, 1),
        block_dim: (block_dim as u32, 1, 1),
        shared_mem_bytes: 0,
    };

    if is_k {
        let mut builder = func.builder();
        builder.arg(&data.inner);
        builder.arg(&dst);
        unsafe { builder.launch(cfg) }.w()?;
    } else {
        let nb32 = match dtype {
            GgmlDType::Q5_0 | GgmlDType::Q5_1 => elem_count,
            _ => elem_count / 32,
        };
        let mut builder = func.builder();
        builder.arg(&data.inner);
        builder.arg(&dst);
        barg!(builder, nb32 as i32);
        unsafe { builder.launch(cfg) }.w()?;
    }
    Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
}
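
/// Matrix-vector product computed by dequantizing the weights on the fly
/// against the f32 vector `y`; this is the "dmmv" path used under
/// `FORCE_DMMV`.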
fn dequantize_mul_mat_vec(
    data: &PaddedCudaSlice,
    y: &CudaView<f32>,
    dtype: GgmlDType,
    ncols: usize,
    nrows: usize,
    dev: &CudaDevice,
) -> Result<CudaStorage> {
    let data_elems = data.len / dtype.type_size() * dtype.block_size();
    if data_elems < ncols * nrows {
        crate::bail!("unexpected data size {}, ncols {ncols} {nrows}", data_elems)
    }
    if y.len() != ncols {
        crate::bail!("unexpected y size {}, ncols {ncols} {nrows}", y.len())
    }
    let kernel_name = match dtype {
        GgmlDType::Q4_0 => "dequantize_mul_mat_vec_q4_0_cuda",
        GgmlDType::Q4_1 => "dequantize_mul_mat_vec_q4_1_cuda",
        GgmlDType::Q5_0 => "dequantize_mul_mat_vec_q5_0_cuda",
        GgmlDType::Q5_1 => "dequantize_mul_mat_vec_q5_1_cuda",
        GgmlDType::Q8_0 => "dequantize_mul_mat_vec_q8_0_cuda",
        GgmlDType::Q2K => "dequantize_mul_mat_vec_q2_k",
        GgmlDType::Q3K => "dequantize_mul_mat_vec_q3_k",
        GgmlDType::Q4K => "dequantize_mul_mat_vec_q4_k",
        GgmlDType::Q5K => "dequantize_mul_mat_vec_q5_k",
        GgmlDType::Q6K => "dequantize_mul_mat_vec_q6_k",
        _ => crate::bail!("unsupported dtype for quantized matmul {dtype:?}"),
    };
    let func = dev.get_or_load_func(kernel_name, &candle_kernels::QUANTIZED)?;
    let dst = unsafe { dev.alloc::<f32>(nrows)? };
    let block_num_y = ceil_div(nrows, GGML_CUDA_MMV_Y);
    let cfg = cudarc::driver::LaunchConfig {
        grid_dim: (block_num_y as u32, 1, 1),
        block_dim: (WARP_SIZE as u32, GGML_CUDA_MMV_Y as u32, 1),
        shared_mem_bytes: 0,
    };

    let mut builder = func.builder();
    builder.arg(&data.inner);
    builder.arg(y);
    builder.arg(&dst);
    barg!(builder, ncols as i32, nrows as i32);
    unsafe { builder.launch(cfg) }.w()?;
    Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
}
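
/// Matrix-vector product (batch size 1 to 8) that first quantizes the f32
/// activations `y` to Q8_1 and then runs a fully quantized kernel.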
fn mul_mat_vec_via_q8_1(
    data: &PaddedCudaSlice,
    y: &CudaView<f32>,
    dtype: GgmlDType,
    ncols: usize,
    nrows: usize,
    b_size: usize,
    dev: &CudaDevice,
) -> Result<CudaStorage> {
    let data_elems = data.len / dtype.type_size() * dtype.block_size();
    if data_elems < ncols * nrows {
        crate::bail!("unexpected data size {}, ncols {ncols} {nrows}", data_elems)
    }
    if y.len() != ncols * b_size {
        crate::bail!("unexpected y size {}, ncols {ncols} {nrows}", y.len())
    }
    if b_size == 0 || b_size > 8 {
        crate::bail!("only bsize between 1 and 8 are supported, got {b_size}")
    }
    // Start by quantizing y
    let ncols_padded = pad(ncols, MATRIX_ROW_PADDING);
    let y_size_in_bytes =
        b_size * ncols_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size();
    let mut y_q8_1 = unsafe { dev.alloc::<u8>(y_size_in_bytes)? };
    quantize_q8_1(y, &mut y_q8_1, ncols, b_size, dev)?;

    let kernel_name = match dtype {
        GgmlDType::Q4_0 => "mul_mat_vec_q4_0_q8_1_cuda",
        GgmlDType::Q4_1 => "mul_mat_vec_q4_1_q8_1_cuda",
        GgmlDType::Q5_0 => "mul_mat_vec_q5_0_q8_1_cuda",
        GgmlDType::Q5_1 => "mul_mat_vec_q5_1_q8_1_cuda",
        GgmlDType::Q8_0 => "mul_mat_vec_q8_0_q8_1_cuda",
        GgmlDType::Q2K => "mul_mat_vec_q2_K_q8_1_cuda",
        GgmlDType::Q3K => "mul_mat_vec_q3_K_q8_1_cuda",
        GgmlDType::Q4K => "mul_mat_vec_q4_K_q8_1_cuda",
        GgmlDType::Q5K => "mul_mat_vec_q5_K_q8_1_cuda",
        GgmlDType::Q6K => "mul_mat_vec_q6_K_q8_1_cuda",
        _ => crate::bail!("unsupported dtype for quantized matmul {dtype:?}"),
    };
    let kernel_name = format!("{kernel_name}{b_size}");
    let func = dev.get_or_load_func(&kernel_name, &candle_kernels::QUANTIZED)?;
    let dst = unsafe { dev.alloc::<f32>(nrows * b_size)? };
    // https://github.com/ggerganov/llama.cpp/blob/facb8b56f8fd3bb10a693bf0943ae9d69d0828ef/ggml-cuda/mmvq.cu#L98
    let (nblocks, nwarps) = match b_size {
        1 => (nrows as u32, 4),
        2..=4 => ((nrows as u32).div_ceil(2), 4),
        5..=8 => ((nrows as u32).div_ceil(2), 2),
        _ => crate::bail!("unexpected bsize {b_size}"),
    };
    let cfg = cudarc::driver::LaunchConfig {
        grid_dim: (nblocks, 1, 1),
        block_dim: (WARP_SIZE as u32, nwarps, 1),
        shared_mem_bytes: 0,
    };

    let mut builder = func.builder();
    builder.arg(&data.inner);
    builder.arg(&y_q8_1);
    builder.arg(&dst);
    barg!(
        builder,
        /* ncols_x */ ncols as i32,
        /* nrows_x */ nrows as i32,
        /* nrows_y */ ncols_padded as i32,
        /* nrows_dst */ nrows as i32
    );
    unsafe { builder.launch(cfg) }.w()?;
    Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
}
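
/// Quantized matrix multiplication: `y` is quantized to Q8_1 and multiplied
/// against the `x_rows` x `x_cols` quantized weight matrix in `data`.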
#[allow(clippy::too_many_arguments)]
fn mul_mat_via_q8_1(
    data: &PaddedCudaSlice,
    y: &CudaView<f32>,
    dtype: GgmlDType,
    x_rows: usize,
    x_cols: usize,
    y_rows: usize,
    y_cols: usize,
    dev: &CudaDevice,
) -> Result<CudaStorage> {
    let data_elems = data.len / dtype.type_size() * dtype.block_size();
    if data_elems < x_rows * x_cols {
        crate::bail!("unexpected lhs size {}, {x_rows} {x_cols}", data_elems)
    }
    if y.len() != y_rows * y_cols {
        crate::bail!("unexpected y size {}, {y_rows} {y_cols}", y.len())
    }
    if x_cols != y_rows {
        crate::bail!("unexpected x/y size {x_rows} {x_cols} {y_rows} {y_cols}")
    }
    let k = x_cols;
    // Start by quantizing y
    let k_padded = pad(k, MATRIX_ROW_PADDING);
    let y_size_in_bytes =
        k_padded * y_cols * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size();
    let mut y_q8_1 = unsafe { dev.alloc::<u8>(y_size_in_bytes)? };
    quantize_q8_1(y, &mut y_q8_1, k, y_cols, dev)?;

    let (kernel_name, mmq_x, mmq_y) = match dtype {
        GgmlDType::Q4_0 => ("mul_mat_q4_0", 64, 128),
        GgmlDType::Q4_1 => ("mul_mat_q4_1", 64, 128),
        GgmlDType::Q5_0 => ("mul_mat_q5_0", 128, 64),
        GgmlDType::Q5_1 => ("mul_mat_q5_1", 128, 64),
        GgmlDType::Q8_0 => ("mul_mat_q8_0", 128, 64),
        GgmlDType::Q2K => ("mul_mat_q2_K", 64, 128),
        GgmlDType::Q3K => ("mul_mat_q3_K", 128, 128),
        GgmlDType::Q4K => ("mul_mat_q4_K", 64, 128),
        GgmlDType::Q5K => ("mul_mat_q5_K", 64, 128),
        GgmlDType::Q6K => ("mul_mat_q6_K", 64, 64),
        _ => crate::bail!("unsupported dtype for quantized matmul {dtype:?}"),
    };
    let func = dev.get_or_load_func(kernel_name, &candle_kernels::QUANTIZED)?;
    let dst = unsafe { dev.alloc::<f32>(x_rows * y_cols)? };
    let cfg = cudarc::driver::LaunchConfig {
        grid_dim: (
            ceil_div(x_rows, mmq_y) as u32,
            ceil_div(y_cols, mmq_x) as u32,
            1,
        ),
        block_dim: (WARP_SIZE as u32, 4, 1),
        shared_mem_bytes: 0,
    };

    let mut builder = func.builder();
    builder.arg(/* vx */ &data.inner);
    builder.arg(/* vy */ &y_q8_1);
    builder.arg(/* dst */ &dst);
    barg!(
        builder,
        /* ncols_x */ x_cols as i32,
        /* nrows_x */ x_rows as i32,
        /* ncols_y */ y_cols as i32,
        /* nrows_y */ k_padded as i32,
        /* nrows_dst */ x_rows as i32
    );
    unsafe { builder.launch(cfg) }.w()?;
    Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone()))
}

impl QCudaStorage {
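    /// Allocates a zeroed, padded device buffer large enough for `el_count`
    /// elements of `dtype`.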
    pub fn zeros(device: &CudaDevice, el_count: usize, dtype: GgmlDType) -> Result<Self> {
        let size_in_bytes = ceil_div(el_count, dtype.block_size()) * dtype.type_size();
        let padded_size_in_bytes =
            ceil_div(el_count + MATRIX_ROW_PADDING, dtype.block_size()) * dtype.type_size();
        let inner = device.alloc_zeros::<u8>(padded_size_in_bytes)?;
        Ok(QCudaStorage {
            data: PaddedCudaSlice {
                inner,
                len: size_in_bytes,
            },
            device: device.clone(),
            dtype,
        })
    }

    pub fn dtype(&self) -> GgmlDType {
        self.dtype
    }

    pub fn device(&self) -> &CudaDevice {
        &self.device
    }
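
    /// Dequantizes to f32, on the GPU for the dtypes that have a dedicated
    /// kernel and on the CPU otherwise.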
    pub fn dequantize(&self, elem_count: usize) -> Result<CudaStorage> {
        fn deq<T: GgmlType>(buffer: &[u8], n: usize, dst: &mut [f32]) -> Result<()> {
            let slice = unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const T, n) };
            let vec = slice.to_vec();
            T::to_float(&vec, dst)
        }

        let fast_kernel = matches!(
            self.dtype,
            GgmlDType::Q4_0
                | GgmlDType::Q4_1
                | GgmlDType::Q5_0
                | GgmlDType::Q5_1
                | GgmlDType::Q8_0
                | GgmlDType::Q2K
                | GgmlDType::Q3K
                | GgmlDType::Q4K
                | GgmlDType::Q5K
                | GgmlDType::Q6K
                | GgmlDType::Q8K
        );
        if fast_kernel {
            return dequantize_f32(&self.data, self.dtype, elem_count, self.device());
        }
        // Run the dequantization on cpu.

        let buffer = self
            .device
            .memcpy_dtov(&self.data.inner.slice(..self.data.len))?;
        let mut out = vec![0.0; elem_count];
        let block_len = elem_count / self.dtype.block_size();
        match self.dtype {
            GgmlDType::F32 => deq::<f32>(&buffer, block_len, &mut out)?,
            GgmlDType::F16 => deq::<half::f16>(&buffer, block_len, &mut out)?,
            GgmlDType::Q4_0 => deq::<crate::quantized::BlockQ4_0>(&buffer, block_len, &mut out)?,
            GgmlDType::Q4_1 => deq::<crate::quantized::BlockQ4_1>(&buffer, block_len, &mut out)?,
            GgmlDType::Q5_0 => deq::<crate::quantized::BlockQ5_0>(&buffer, block_len, &mut out)?,
            GgmlDType::Q5_1 => deq::<crate::quantized::BlockQ5_1>(&buffer, block_len, &mut out)?,
            GgmlDType::Q8_0 => deq::<crate::quantized::BlockQ8_0>(&buffer, block_len, &mut out)?,
            GgmlDType::Q8_1 => deq::<crate::quantized::BlockQ8_1>(&buffer, block_len, &mut out)?,
            GgmlDType::Q2K => deq::<crate::quantized::BlockQ2K>(&buffer, block_len, &mut out)?,
            GgmlDType::Q3K => deq::<crate::quantized::BlockQ3K>(&buffer, block_len, &mut out)?,
            GgmlDType::Q4K => deq::<crate::quantized::BlockQ4K>(&buffer, block_len, &mut out)?,
            GgmlDType::Q5K => deq::<crate::quantized::BlockQ5K>(&buffer, block_len, &mut out)?,
            GgmlDType::Q6K => deq::<crate::quantized::BlockQ6K>(&buffer, block_len, &mut out)?,
            GgmlDType::Q8K => deq::<crate::quantized::BlockQ8K>(&buffer, block_len, &mut out)?,
        }

        self.device
            .storage_from_cpu_storage(&crate::CpuStorage::F32(out))
    }
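
    /// Dequantizes to f16 using the GPU kernels.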
    pub fn dequantize_f16(&self, elem_count: usize) -> Result<CudaStorage> {
        dequantize_f16(&self.data, self.dtype, elem_count, self.device())
    }
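
    /// Quantizes the f32 CUDA storage `src` by round-tripping through the CPU
    /// implementation, then uploads the result into a fresh padded buffer.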
    pub fn quantize(&mut self, src: &CudaStorage) -> Result<()> {
        // Run the quantization on cpu.
        let src = match &src.slice {
            crate::cuda_backend::CudaStorageSlice::F32(data) => self.device.memcpy_dtov(data)?,
            _ => crate::bail!("only f32 can be quantized"),
        };
        let src_len = src.len();
        let src = crate::Storage::Cpu(crate::CpuStorage::F32(src));
        let mut qcpu_storage = crate::Device::Cpu.qzeros(src_len, self.dtype)?;
        qcpu_storage.quantize(&src)?;
        let data = qcpu_storage.data()?;
        let padded_len =
            data.len() + MATRIX_ROW_PADDING * self.dtype.type_size() / self.dtype.block_size();
        let mut inner = unsafe { self.device.alloc::<u8>(padded_len)? };
        self.device
            .memcpy_htod(data.as_ref(), &mut inner.slice_mut(..data.len()))?;
        self.data = PaddedCudaSlice {
            inner,
            len: data.len(),
        };
        Ok(())
    }

    pub fn storage_size_in_bytes(&self) -> usize {
        self.data.len
    }
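
    /// Performs a quantized matmul of `self` (as the weight matrix) against
    /// `storage`, picking the matrix-vector kernels for small batches (up to
    /// 8 rows, or 1 when `FORCE_DMMV` is set) and the tiled matmul otherwise.
    ///
    /// A minimal sketch of the shape contract, assuming weights of shape
    /// `(n, k)` and an input of shape `(b, m, k)` (names are illustrative):
    ///
    /// ```ignore
    /// // out has shape (b, m, n): the trailing dim k is contracted away.
    /// let (out, out_shape) = qstorage.fwd(&w_shape, &input_storage, &input_layout)?;
    /// ```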
    pub fn fwd(
        &self,
        self_shape: &crate::Shape,
        storage: &CudaStorage,
        layout: &crate::Layout,
    ) -> Result<(CudaStorage, crate::Shape)> {
        let max_bm = if FORCE_DMMV.load(std::sync::atomic::Ordering::Relaxed) {
            1
        } else {
            8
        };
        let use_vec_kernel = match layout.shape().dims() {
            [b, m, _k] => b * m <= max_bm,
            [b, _k] => *b <= max_bm,
            _ => false,
        };
        if use_vec_kernel {
            self.dequantize_matmul_vec(self_shape, storage, layout)
        } else {
            self.dequantize_matmul(self_shape, storage, layout)
        }
    }
}

impl QCudaStorage {
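    /// Matrix-vector path of `fwd`: the quantized `(nrows, ncols)` weight
    /// matrix times a contiguous rhs of at most 8 rows.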
    fn dequantize_matmul_vec(
        &self,
        self_shape: &crate::Shape,
        rhs: &CudaStorage,
        rhs_l: &crate::Layout,
    ) -> Result<(CudaStorage, crate::Shape)> {
        let (nrows, ncols) = self_shape.dims2()?;
        let rhs = rhs.as_cuda_slice::<f32>()?;
        let rhs = match rhs_l.contiguous_offsets() {
            Some((o1, o2)) => rhs.slice(o1..o2),
            None => Err(crate::Error::RequiresContiguous { op: "dmmv" }.bt())?,
        };
        let (b_size, k) = match rhs_l.shape().dims() {
            [b, m, k] => (b * m, *k),
            [b, k] => (*b, *k),
            _ => crate::bail!("unexpected rhs shape in dmmv {:?}", rhs_l.shape()),
        };
        if ncols != k {
            crate::bail!("mismatch on matmul dim {self_shape:?} {:?}", rhs_l.shape())
        }

        let out = if FORCE_DMMV.load(std::sync::atomic::Ordering::Relaxed) {
            dequantize_mul_mat_vec(&self.data, &rhs, self.dtype, ncols, nrows, self.device())?
        } else {
            mul_mat_vec_via_q8_1(
                &self.data,
                &rhs,
                self.dtype,
                ncols,
                nrows,
                b_size,
                self.device(),
            )?
        };
        let mut out_shape = rhs_l.shape().dims().to_vec();
        out_shape.pop();
        out_shape.push(nrows);
        Ok((out, out_shape.into()))
    }
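
    /// General matmul path of `fwd`: either dequantizes the weights to f32
    /// and uses a plain matmul (under `FORCE_DMMV`) or runs the Q8_1 kernels.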
    fn dequantize_matmul(
        &self,
        self_shape: &crate::Shape,
        storage: &CudaStorage,
        layout: &crate::Layout,
    ) -> Result<(CudaStorage, crate::Shape)> {
        use crate::backend::BackendStorage;
        let (n, k) = self_shape.dims2()?;
        let (b, m, k2) = match layout.shape().dims() {
            &[b, m, k2] => (b, m, k2),
            &[m, k2] => (1, m, k2),
            s => crate::bail!("unexpected shape for input {s:?}"),
        };
        if k2 != k {
            crate::bail!("mismatch on matmul dim {self_shape:?} {:?}", layout.shape())
        }

        let out = if FORCE_DMMV.load(std::sync::atomic::Ordering::Relaxed) {
            let data_f32 = self.dequantize(n * k)?;
            let rhs_l = crate::Layout::new((k, n).into(), vec![1, k], 0).broadcast_as((b, k, n))?;
            storage.matmul(&data_f32, (b, m, n, k), layout, &rhs_l)?
        } else {
            let storage = storage.as_cuda_slice::<f32>()?;
            let storage = match layout.contiguous_offsets() {
                Some((o1, o2)) => storage.slice(o1..o2),
                None => Err(crate::Error::RequiresContiguous {
                    op: "quantized-matmul",
                }
                .bt())?,
            };
            mul_mat_via_q8_1(
                &self.data,
                &storage,
                self.dtype,
                /* x_rows */ n,
                /* x_cols */ k,
                /* y_rows */ k,
                /* y_cols */ b * m,
                self.device(),
            )?
        };
        let mut out_shape = layout.shape().dims().to_vec();
        out_shape.pop();
        out_shape.push(n);
        Ok((out, out_shape.into()))
    }
}
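
/// Copies pre-quantized host data into a padded device buffer and wraps it as
/// a `QStorage`.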
pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
    device: &CudaDevice,
    data: &[T],
) -> Result<super::QStorage> {
    let data = unsafe {
        std::slice::from_raw_parts(data.as_ptr() as *const u8, core::mem::size_of_val(data))
    };
    let dtype = T::DTYPE;
    let padded_len = data.len() + MATRIX_ROW_PADDING * dtype.type_size() / dtype.block_size();
    let mut inner = unsafe { device.alloc::<u8>(padded_len)? };
    device.memcpy_htod(data, &mut inner.slice_mut(..data.len()))?;
    Ok(QStorage::Cuda(QCudaStorage {
        data: PaddedCudaSlice {
            inner,
            len: data.len(),
        },
        device: device.clone(),
        dtype,
    }))
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn cuda_quantize_q8_1() -> Result<()> {
        let dev = CudaDevice::new(0)?;
        let el = 256;
        let el_padded = pad(el, MATRIX_ROW_PADDING);
        let y_size_in_bytes =
            el_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size();
        let mut y_q8_1 = unsafe { dev.alloc::<u8>(y_size_in_bytes)? };
        let vs: Vec<f32> = (0..el).map(|v| v as f32).collect();
        let y = dev.memcpy_stod(&vs)?;
        quantize_q8_1(&y.slice(..), &mut y_q8_1, el, 1, &dev)?;
        Ok(())
    }

    #[test]
    fn cuda_mmv_q8_1() -> Result<()> {
        let dev = CudaDevice::new(0)?;
        let ncols = 256;
        let vs: Vec<f32> = (0..ncols).map(|v| v as f32).collect();
        let y = dev.memcpy_stod(&vs)?;
        let mut xs = QCudaStorage::zeros(&dev, ncols, GgmlDType::Q4_0)?;
        xs.quantize(&CudaStorage::wrap_cuda_slice(y.clone(), dev.clone()))?;
        let cuda_storage = mul_mat_vec_via_q8_1(
            &xs.data,
            &y.slice(..),
            /* dtype */ GgmlDType::Q4_0,
            /* ncols */ ncols,
            /* nrows */ 1,
            /* b_size */ 1,
            &dev,
        )?;
        let vs = cuda_storage.as_cuda_slice::<f32>()?;
        let vs = dev.memcpy_dtov(&vs.slice(..))?;
        assert_eq!(vs.len(), 1);
        // for n = 255, n.(n+1).(2n+1) / 6 = 5559680
        // Q8 means 1/256 precision.
        assert_eq!(vs[0], 5561664.5);

        let cuda_storage = dequantize_mul_mat_vec(
            &xs.data,
            &y.slice(..),
            /* dtype */ GgmlDType::Q4_0,
            /* ncols */ ncols,
            /* nrows */ 1,
            &dev,
        )?;
        let vs = cuda_storage.as_cuda_slice::<f32>()?;
        let vs = dev.memcpy_dtov(&vs.slice(..))?;
        assert_eq!(vs.len(), 1);
        assert_eq!(vs[0], 5561851.0);
        Ok(())
    }

    #[test]
    fn cuda_mm_q8_1() -> Result<()> {
        let dev = CudaDevice::new(0)?;
        let ncols = 256;
        let vs: Vec<f32> = (0..ncols * 4).map(|v| v as f32 / 4.).collect();
        let y = dev.memcpy_stod(&vs)?;
        let mut xs = QCudaStorage::zeros(&dev, ncols * 4, GgmlDType::Q4_0)?;
        xs.quantize(&CudaStorage::wrap_cuda_slice(y.clone(), dev.clone()))?;
        let cuda_storage = mul_mat_via_q8_1(
            &xs.data,
            &y.slice(..),
            /* dtype */ GgmlDType::Q4_0,
            /* x_rows */ 4,
            /* x_cols */ ncols,
            /* y_rows */ ncols,
            /* y_cols */ 4,
            &dev,
        )?;
        let vs = cuda_storage.as_cuda_slice::<f32>()?;
        let vs = dev.memcpy_dtov(&vs.slice(..))?;

        /*
        x = torch.tensor([float(v) for v in range(1024)]).reshape(4, 256)
        x @ x.t() / 16
        tensor([[  347480.0000,   869720.0000,  1391960.0000,  1914200.0000],
                [  869720.0000,  2440536.0000,  4011352.0000,  5582166.5000],
                [ 1391960.0000,  4011352.0000,  6630742.0000,  9250132.0000],
                [ 1914200.0000,  5582166.5000,  9250132.0000, 12918099.0000]])
        */
        assert_eq!(vs.len(), 16);
        assert_eq!(vs[0], 347604.0);
        assert_eq!(vs[1], 888153.06);
        assert_eq!(vs[4], 869780.7);
        assert_eq!(vs[5], 2483145.0);
        assert_eq!(vs[11], 9407368.0);
        assert_eq!(vs[14], 9470856.0);
        assert_eq!(vs[15], 13138824.0);
        Ok(())
    }

    // The following test used to fail under compute-sanitizer until #2526.
    #[test]
    fn cuda_mm_q8_1_pad() -> Result<()> {
        let dev = CudaDevice::new(0)?;
        let (x_rows, ncols, y_cols) = (4, 16, 2048);
        let vs: Vec<f32> = (0..ncols * y_cols).map(|v| v as f32 / 256.).collect();
        let y = dev.memcpy_stod(&vs)?;
        let mut xs = QCudaStorage::zeros(&dev, ncols * x_rows, GgmlDType::Q4_0)?;
        xs.quantize(&CudaStorage::wrap_cuda_slice(y.clone(), dev.clone()))?;
        let cuda_storage = mul_mat_via_q8_1(
            &xs.data,
            &y.slice(..),
            /* dtype */ GgmlDType::Q4_0,
            /* x_rows */ x_rows,
            /* x_cols */ ncols,
            /* y_rows */ ncols,
            /* y_cols */ y_cols,
            &dev,
        )?;
        let vs = cuda_storage.as_cuda_slice::<f32>()?;
        let _vs = dev.memcpy_dtov(&vs.slice(..))?;
        Ok(())
    }
}