mirror of
https://github.com/huggingface/candle.git
synced 2025-06-19 11:56:45 +00:00
Fix the block size for some cuda kernels. (#1767)
This commit is contained in:
@ -25,26 +25,28 @@ fn dequantize(
|
|||||||
) -> Result<CudaStorage> {
|
) -> Result<CudaStorage> {
|
||||||
use cudarc::driver::LaunchAsync;
|
use cudarc::driver::LaunchAsync;
|
||||||
|
|
||||||
let (kernel_name, is_k) = match dtype {
|
let (kernel_name, is_k, block_dim) = match dtype {
|
||||||
GgmlDType::Q4_0 => ("dequantize_block_q4_0", false),
|
GgmlDType::Q4_0 => ("dequantize_block_q4_0", false, 32),
|
||||||
GgmlDType::Q4_1 => ("dequantize_block_q4_1", false),
|
GgmlDType::Q4_1 => ("dequantize_block_q4_1", false, 32),
|
||||||
GgmlDType::Q5_0 => ("dequantize_block_q5_0", false),
|
GgmlDType::Q5_0 => ("dequantize_block_q5_0", false, 32),
|
||||||
GgmlDType::Q5_1 => ("dequantize_block_q5_1", false),
|
GgmlDType::Q5_1 => ("dequantize_block_q5_1", false, 32),
|
||||||
GgmlDType::Q8_0 => ("dequantize_block_q8_0", false),
|
GgmlDType::Q8_0 => ("dequantize_block_q8_0", false, 32),
|
||||||
GgmlDType::Q2K => ("dequantize_block_q2_K", true),
|
GgmlDType::Q2K => ("dequantize_block_q2_K", true, 64),
|
||||||
GgmlDType::Q3K => ("dequantize_block_q3_K", true),
|
GgmlDType::Q3K => ("dequantize_block_q3_K", true, 64),
|
||||||
GgmlDType::Q4K => ("dequantize_block_q4_K", true),
|
GgmlDType::Q4K => ("dequantize_block_q4_K", true, 32),
|
||||||
GgmlDType::Q5K => ("dequantize_block_q5_K", true),
|
GgmlDType::Q5K => ("dequantize_block_q5_K", true, 64),
|
||||||
GgmlDType::Q6K => ("dequantize_block_q6_K", true),
|
GgmlDType::Q6K => ("dequantize_block_q6_K", true, 64),
|
||||||
GgmlDType::Q8K => ("dequantize_block_q8_K", true),
|
GgmlDType::Q8K => ("dequantize_block_q8_K", true, 32),
|
||||||
_ => crate::bail!("unsupported dtype for dequantize {dtype:?}"),
|
_ => crate::bail!("unsupported dtype for dequantize {dtype:?}"),
|
||||||
};
|
};
|
||||||
let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?;
|
let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?;
|
||||||
let dst = dev.alloc_zeros::<f32>(elem_count).w()?;
|
let dst = dev.alloc_zeros::<f32>(elem_count).w()?;
|
||||||
let nb = (elem_count + 255) / 256;
|
let nb = (elem_count + 255) / 256;
|
||||||
|
// See e.g.
|
||||||
|
// https://github.com/ggerganov/llama.cpp/blob/cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b/ggml-cuda.cu#L7270
|
||||||
let cfg = cudarc::driver::LaunchConfig {
|
let cfg = cudarc::driver::LaunchConfig {
|
||||||
grid_dim: (nb as u32, 1, 1),
|
grid_dim: (nb as u32, 1, 1),
|
||||||
block_dim: (32, 1, 1),
|
block_dim: (block_dim, 1, 1),
|
||||||
shared_mem_bytes: 0,
|
shared_mem_bytes: 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -178,10 +178,6 @@ test_device!(
|
|||||||
);
|
);
|
||||||
|
|
||||||
fn quantize_q4_0(device: &Device) -> Result<()> {
|
fn quantize_q4_0(device: &Device) -> Result<()> {
|
||||||
// TODO Enable this later when we enable cuda.
|
|
||||||
if device.is_cuda() {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
|
let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
|
||||||
|
|
||||||
let src = Tensor::from_slice(&src, (32 * 4,), device)?;
|
let src = Tensor::from_slice(&src, (32 * 4,), device)?;
|
||||||
@ -209,10 +205,6 @@ fn quantize_q4_0(device: &Device) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn quantize_q4_1(device: &Device) -> Result<()> {
|
fn quantize_q4_1(device: &Device) -> Result<()> {
|
||||||
// TODO Enable this later when we enable cuda.
|
|
||||||
if device.is_cuda() {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
|
let src = (0..32 * 4).map(|v| v as f32).collect::<Vec<_>>();
|
||||||
let src = Tensor::from_slice(&src, (32 * 4,), device)?;
|
let src = Tensor::from_slice(&src, (32 * 4,), device)?;
|
||||||
let quant = quantized::QTensor::quantize(&src, GgmlDType::Q4_1)?;
|
let quant = quantized::QTensor::quantize(&src, GgmlDType::Q4_1)?;
|
||||||
@ -373,10 +365,6 @@ fn ggml_quantization_error_test(dtype: GgmlDType, device: &Device, max_error: f3
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn quantize_q2k(device: &Device) -> Result<()> {
|
fn quantize_q2k(device: &Device) -> Result<()> {
|
||||||
// TODO Enable this later when we enable cuda.
|
|
||||||
if device.is_cuda() {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
let dtype = GgmlDType::Q2K;
|
let dtype = GgmlDType::Q2K;
|
||||||
|
|
||||||
let src = get_test_vector2(0.5, 1024, device)?;
|
let src = get_test_vector2(0.5, 1024, device)?;
|
||||||
@ -411,10 +399,6 @@ fn quantize_q2k(device: &Device) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn quantize_q3k(device: &Device) -> Result<()> {
|
fn quantize_q3k(device: &Device) -> Result<()> {
|
||||||
// TODO Enable this later when we enable cuda.
|
|
||||||
if device.is_cuda() {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
let dtype = GgmlDType::Q3K;
|
let dtype = GgmlDType::Q3K;
|
||||||
let src = get_test_vector2(0.5, 1024, device)?;
|
let src = get_test_vector2(0.5, 1024, device)?;
|
||||||
let quant = quantized::QTensor::quantize(&src, dtype)?;
|
let quant = quantized::QTensor::quantize(&src, dtype)?;
|
||||||
@ -448,10 +432,6 @@ fn quantize_q3k(device: &Device) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn quantize_q4k(device: &Device) -> Result<()> {
|
fn quantize_q4k(device: &Device) -> Result<()> {
|
||||||
// TODO Enable this later when we enable cuda.
|
|
||||||
if device.is_cuda() {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
let dtype = GgmlDType::Q4K;
|
let dtype = GgmlDType::Q4K;
|
||||||
let src = get_test_vector2(0.5, 1024, device)?;
|
let src = get_test_vector2(0.5, 1024, device)?;
|
||||||
let quant = quantized::QTensor::quantize(&src, dtype)?;
|
let quant = quantized::QTensor::quantize(&src, dtype)?;
|
||||||
@ -485,10 +465,6 @@ fn quantize_q4k(device: &Device) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn quantize_q5k(device: &Device) -> Result<()> {
|
fn quantize_q5k(device: &Device) -> Result<()> {
|
||||||
// TODO Enable this later when we enable cuda.
|
|
||||||
if device.is_cuda() {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
let dtype = GgmlDType::Q5K;
|
let dtype = GgmlDType::Q5K;
|
||||||
let src = get_test_vector2(0.5, 1024, device)?;
|
let src = get_test_vector2(0.5, 1024, device)?;
|
||||||
let quant = quantized::QTensor::quantize(&src, dtype)?;
|
let quant = quantized::QTensor::quantize(&src, dtype)?;
|
||||||
@ -522,10 +498,6 @@ fn quantize_q5k(device: &Device) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn quantize_q6k(device: &Device) -> Result<()> {
|
fn quantize_q6k(device: &Device) -> Result<()> {
|
||||||
// TODO Enable this later when we enable cuda.
|
|
||||||
if device.is_cuda() {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
let dtype = GgmlDType::Q6K;
|
let dtype = GgmlDType::Q6K;
|
||||||
let src = get_test_vector2(0.5, 1024, device)?;
|
let src = get_test_vector2(0.5, 1024, device)?;
|
||||||
let quant = quantized::QTensor::quantize(&src, dtype)?;
|
let quant = quantized::QTensor::quantize(&src, dtype)?;
|
||||||
@ -559,10 +531,6 @@ fn quantize_q6k(device: &Device) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn quantize_q8k(device: &Device) -> Result<()> {
|
fn quantize_q8k(device: &Device) -> Result<()> {
|
||||||
// TODO Enable this later when we enable cuda.
|
|
||||||
if device.is_cuda() {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
let dtype = GgmlDType::Q8K;
|
let dtype = GgmlDType::Q8K;
|
||||||
let src = get_test_vector2(0.5, 1024, device)?;
|
let src = get_test_vector2(0.5, 1024, device)?;
|
||||||
let quant = quantized::QTensor::quantize(&src, dtype)?;
|
let quant = quantized::QTensor::quantize(&src, dtype)?;
|
||||||
|
Reference in New Issue
Block a user