From 318cb82f1657a7ebd238e94bce134203c9fd04b4 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Mon, 1 Apr 2024 11:06:42 +0200
Subject: [PATCH] Quantized cuda tweaks. (#1981)

* Quantized cuda tweaks.

* Add some safety checks.

* Factorize the dequantization bits.
---
 candle-core/src/quantized/cuda.rs | 151 ++++++++++++------------------
 1 file changed, 62 insertions(+), 89 deletions(-)

diff --git a/candle-core/src/quantized/cuda.rs b/candle-core/src/quantized/cuda.rs
index 64404beb..07f8c13e 100644
--- a/candle-core/src/quantized/cuda.rs
+++ b/candle-core/src/quantized/cuda.rs
@@ -1,9 +1,11 @@
 use super::{GgmlDType, QStorage};
+use crate::quantized::k_quants::GgmlType;
 use crate::{backend::BackendDevice, cuda_backend::WrapErr};
 use crate::{CudaDevice, CudaStorage, Result};
 
 use cudarc::driver::{CudaSlice, CudaView, DeviceSlice};
 
+#[derive(Clone, Debug)]
 pub struct QCudaStorage {
     data: CudaSlice<u8>,
     dtype: GgmlDType,
@@ -26,6 +28,14 @@ pub const CUDA_QUANTIZE_BLOCK_SIZE: usize = 256;
 pub const CUDA_DEQUANTIZE_BLOCK_SIZE: usize = 256;
 pub const MATRIX_ROW_PADDING: usize = 512;
 
+fn ceil_div(p: usize, q: usize) -> usize {
+    (p + q - 1) / q
+}
+
+fn pad(p: usize, q: usize) -> usize {
+    ceil_div(p, q) * q
+}
+
 fn quantize_q8_1(
     src: &CudaView<f32>,
     dst: &mut CudaSlice<u8>,
@@ -35,8 +45,8 @@ fn quantize_q8_1(
     use cudarc::driver::LaunchAsync;
 
     let kx = elem_count;
-    let kx_padded = (kx + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING;
-    let num_blocks = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    let kx_padded = pad(kx, MATRIX_ROW_PADDING);
+    let num_blocks = ceil_div(kx_padded, CUDA_QUANTIZE_BLOCK_SIZE);
     let func = dev.get_or_load_func("quantize_q8_1", candle_kernels::QUANTIZED)?;
     let cfg = cudarc::driver::LaunchConfig {
         grid_dim: (num_blocks as u32, 1, 1),
@@ -60,26 +70,18 @@ fn dequantize(
     let (kernel_name, is_k, block_dim, num_blocks) = match dtype {
         GgmlDType::Q4_0 => ("dequantize_block_q4_0", false, 32, nb),
         GgmlDType::Q4_1 => ("dequantize_block_q4_1", false, 32, nb),
-        GgmlDType::Q5_0 => {
-            let nb = (elem_count + 2 * CUDA_DEQUANTIZE_BLOCK_SIZE - 1)
-                / (2 * CUDA_DEQUANTIZE_BLOCK_SIZE);
-            (
-                "dequantize_block_q5_0",
-                false,
-                CUDA_DEQUANTIZE_BLOCK_SIZE,
-                nb,
-            )
-        }
-        GgmlDType::Q5_1 => {
-            let nb = (elem_count + 2 * CUDA_DEQUANTIZE_BLOCK_SIZE - 1)
-                / (2 * CUDA_DEQUANTIZE_BLOCK_SIZE);
-            (
-                "dequantize_block_q5_1",
-                false,
-                CUDA_DEQUANTIZE_BLOCK_SIZE,
-                nb,
-            )
-        }
+        GgmlDType::Q5_0 => (
+            "dequantize_block_q5_0",
+            false,
+            CUDA_DEQUANTIZE_BLOCK_SIZE,
+            ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
+        ),
+        GgmlDType::Q5_1 => (
+            "dequantize_block_q5_1",
+            false,
+            CUDA_DEQUANTIZE_BLOCK_SIZE,
+            ceil_div(elem_count, 2 * CUDA_DEQUANTIZE_BLOCK_SIZE),
+        ),
         GgmlDType::Q8_0 => ("dequantize_block_q8_0", false, 32, nb),
         GgmlDType::Q2K => ("dequantize_block_q2_K", true, 64, nb),
         GgmlDType::Q3K => ("dequantize_block_q3_K", true, 64, nb),
@@ -123,6 +125,13 @@ fn dequantize_mul_mat_vec(
 ) -> Result<CudaStorage> {
     use cudarc::driver::LaunchAsync;
 
+    let data_elems = data.len() / dtype.type_size() * dtype.block_size();
+    if data_elems < ncols * nrows {
+        crate::bail!("unexpected data size {}, ncols {ncols} {nrows}", data_elems)
+    }
+    if y.len() != ncols {
+        crate::bail!("unexpected y size {}, ncols {ncols} {nrows}", y.len())
+    }
     let kernel_name = match dtype {
         GgmlDType::Q4_0 => "dequantize_mul_mat_vec_q4_0_cuda",
         GgmlDType::Q4_1 => "dequantize_mul_mat_vec_q4_1_cuda",
@@ -138,7 +147,7 @@ fn dequantize_mul_mat_vec(
     };
     let func = dev.get_or_load_func(kernel_name, candle_kernels::QUANTIZED)?;
     let dst = unsafe { dev.alloc::<f32>(nrows).w()? };
-    let block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    let block_num_y = ceil_div(nrows, GGML_CUDA_MMV_Y);
     let cfg = cudarc::driver::LaunchConfig {
         grid_dim: (block_num_y as u32, 1, 1),
         block_dim: (WARP_SIZE as u32, GGML_CUDA_MMV_Y as u32, 1),
@@ -160,8 +169,15 @@ fn mul_mat_vec_via_q8_1(
 ) -> Result<CudaStorage> {
     use cudarc::driver::LaunchAsync;
 
+    let data_elems = data.len() / dtype.type_size() * dtype.block_size();
+    if data_elems < ncols * nrows {
+        crate::bail!("unexpected data size {}, ncols {ncols} {nrows}", data_elems)
+    }
+    if y.len() != ncols {
+        crate::bail!("unexpected y size {}, ncols {ncols} {nrows}", y.len())
+    }
     // Start by quantizing y
-    let ncols_padded = (ncols + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING;
+    let ncols_padded = pad(ncols, MATRIX_ROW_PADDING);
     let y_size_in_bytes = ncols_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size();
     let mut y_q8_1 = unsafe { dev.alloc::<u8>(y_size_in_bytes).w()? };
     quantize_q8_1(y, &mut y_q8_1, ncols, dev)?;
@@ -202,7 +218,7 @@ fn mul_mat_vec_via_q8_1(
 
 impl QCudaStorage {
     pub fn zeros(device: &CudaDevice, el_count: usize, dtype: GgmlDType) -> Result<Self> {
-        let size_in_bytes = el_count * dtype.type_size() / dtype.block_size();
+        let size_in_bytes = ceil_div(el_count, dtype.block_size()) * dtype.type_size();
         let data = device.alloc_zeros::<u8>(size_in_bytes).w()?;
         Ok(QCudaStorage {
             data,
@@ -220,6 +236,12 @@ impl QCudaStorage {
     }
 
     pub fn dequantize(&self, elem_count: usize) -> Result<CudaStorage> {
+        fn deq<T: GgmlType>(buffer: &[u8], n: usize, dst: &mut [f32]) -> Result<()> {
+            let slice = unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const T, n) };
+            let vec = slice.to_vec();
+            T::to_float(&vec, dst)
+        }
+
         let fast_kernel = matches!(
             self.dtype,
             GgmlDType::Q4_0
@@ -238,69 +260,25 @@ impl QCudaStorage {
             return dequantize(&self.data, self.dtype, elem_count, self.device());
         }
         // Run the dequantization on cpu.
-        use crate::quantized::k_quants::GgmlType;
         let buffer = self.device.dtoh_sync_copy(&self.data).w()?;
         let mut out = vec![0.0; elem_count];
         let block_len = elem_count / self.dtype.block_size();
         match self.dtype {
-            GgmlDType::F32 => {
-                let slice =
-                    unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const f32, block_len) };
-                out.copy_from_slice(slice)
-            }
-            GgmlDType::F16 => {
-                let vec: Vec<half::f16> = read_to_vec(&buffer, block_len);
-                half::f16::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4_0 => {
-                let vec: Vec<crate::quantized::BlockQ4_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4_1 => {
-                let vec: Vec<crate::quantized::BlockQ4_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5_0 => {
-                let vec: Vec<crate::quantized::BlockQ5_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5_1 => {
-                let vec: Vec<crate::quantized::BlockQ5_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8_0 => {
-                let vec: Vec<crate::quantized::BlockQ8_0> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8_0::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8_1 => {
-                let vec: Vec<crate::quantized::BlockQ8_1> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8_1::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q2K => {
-                let vec: Vec<crate::quantized::BlockQ2K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ2K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q3K => {
-                let vec: Vec<crate::quantized::BlockQ3K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ3K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q4K => {
-                let vec: Vec<crate::quantized::BlockQ4K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ4K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q5K => {
-                let vec: Vec<crate::quantized::BlockQ5K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ5K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q6K => {
-                let vec: Vec<crate::quantized::BlockQ6K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ6K::to_float(&vec, &mut out)?;
-            }
-            GgmlDType::Q8K => {
-                let vec: Vec<crate::quantized::BlockQ8K> = read_to_vec(&buffer, block_len);
-                crate::quantized::BlockQ8K::to_float(&vec, &mut out)?;
-            }
+            GgmlDType::F32 => deq::<f32>(&buffer, block_len, &mut out)?,
+            GgmlDType::F16 => deq::<half::f16>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q4_0 => deq::<crate::quantized::BlockQ4_0>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q4_1 => deq::<crate::quantized::BlockQ4_1>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q5_0 => deq::<crate::quantized::BlockQ5_0>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q5_1 => deq::<crate::quantized::BlockQ5_1>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q8_0 => deq::<crate::quantized::BlockQ8_0>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q8_1 => deq::<crate::quantized::BlockQ8_1>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q2K => deq::<crate::quantized::BlockQ2K>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q3K => deq::<crate::quantized::BlockQ3K>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q4K => deq::<crate::quantized::BlockQ4K>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q5K => deq::<crate::quantized::BlockQ5K>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q6K => deq::<crate::quantized::BlockQ6K>(&buffer, block_len, &mut out)?,
+            GgmlDType::Q8K => deq::<crate::quantized::BlockQ8K>(&buffer, block_len, &mut out)?,
         }
 
         self.device
@@ -405,11 +383,6 @@ impl QCudaStorage {
     }
 }
 
-fn read_to_vec<T: Clone>(buffer: &[u8], n: usize) -> Vec<T> {
-    let slice = unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const T, n) };
-    slice.to_vec()
-}
-
 pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
     device: &CudaDevice,
     data: &[T],
@@ -433,7 +406,7 @@ mod test {
     fn cuda_quantize_q8_1() -> Result<()> {
         let dev = CudaDevice::new(0)?;
         let el = 256;
-        let el_padded = (el + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING * MATRIX_ROW_PADDING;
+        let el_padded = pad(el, MATRIX_ROW_PADDING);
         let y_size_in_bytes =
             el_padded * GgmlDType::Q8_1.type_size() / GgmlDType::Q8_1.block_size();
         let mut y_q8_1 = unsafe { dev.alloc::<u8>(y_size_in_bytes).w()? };