diff --git a/candle-core/src/ggml.rs b/candle-core/src/ggml.rs
index 4a5d4fa0..5796f056 100644
--- a/candle-core/src/ggml.rs
+++ b/candle-core/src/ggml.rs
@@ -110,6 +110,124 @@ struct BlockQ6K {
 }
 const _: () = assert!(3 * QK_K / 4 + QK_K / 16 + 2 == std::mem::size_of::<BlockQ6K>());
 
+// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1525
+fn dequantize_row_q4_0(xs: &[BlockQ4_0], ys: &mut [f32]) -> Result<()> {
+    let k = ys.len();
+    if k % QK4_0 != 0 {
+        crate::bail!("dequantize_row_q4_0: {k} is not divisible by {QK4_0}")
+    }
+
+    let nb = k / QK4_0;
+    for i in 0..nb {
+        let d = xs[i].d.to_f32();
+
+        for j in 0..(QK4_0 / 2) {
+            let x0 = (xs[i].qs[j] & 0x0F) as i16 - 8;
+            let x1 = (xs[i].qs[j] >> 4) as i16 - 8;
+
+            ys[i * QK4_0 + j] = (x0 as f32) * d;
+            ys[i * QK4_0 + j + QK4_0 / 2] = (x1 as f32) * d;
+        }
+    }
+    Ok(())
+}
+
+// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1545
+fn dequantize_row_q4_1(xs: &[BlockQ4_1], ys: &mut [f32]) -> Result<()> {
+    let k = ys.len();
+    if k % QK4_1 != 0 {
+        crate::bail!("dequantize_row_q4_1: {k} is not divisible by {QK4_1}");
+    }
+
+    let nb = k / QK4_1;
+    for i in 0..nb {
+        let d = xs[i].d.to_f32();
+        let m = xs[i].m.to_f32();
+
+        for j in 0..(QK4_1 / 2) {
+            let x0 = xs[i].qs[j] & 0x0F;
+            let x1 = xs[i].qs[j] >> 4;
+
+            ys[i * QK4_1 + j] = (x0 as f32) * d + m;
+            ys[i * QK4_1 + j + QK4_1 / 2] = (x1 as f32) * d + m;
+        }
+    }
+    Ok(())
+}
+
+// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1566
+fn dequantize_row_q5_0(xs: &[BlockQ5_0], ys: &mut [f32]) -> Result<()> {
+    let k = ys.len();
+    if k % QK5_0 != 0 {
+        crate::bail!("dequantize_row_q5_0: {k} is not divisible by {QK5_0}");
+    }
+
+    let nb = k / QK5_0;
+    for i in 0..nb {
+        let d = xs[i].d.to_f32();
+        let qh: u32 = unsafe { std::mem::transmute_copy(&xs[i].qh) };
+
+        for j in 0..(QK5_0 / 2) {
+            let xh_0 = (((qh >> j) << 4) & 0x10) as u8;
+            let xh_1 = ((qh >> (j + 12)) & 0x10) as u8;
+
+            let x0 = ((xs[i].qs[j] & 0x0F) | xh_0) as i32 - 16;
+            let x1 = ((xs[i].qs[j] >> 4) | xh_1) as i32 - 16;
+
+            ys[i * QK5_0 + j] = (x0 as f32) * d;
+            ys[i * QK5_0 + j + QK5_0 / 2] = (x1 as f32) * d;
+        }
+    }
+    Ok(())
+}
+
+// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1592
+fn dequantize_row_q5_1(xs: &[BlockQ5_1], ys: &mut [f32]) -> Result<()> {
+    let k = ys.len();
+    if k % QK5_1 != 0 {
+        crate::bail!("dequantize_row_q5_1: {k} is not divisible by {QK5_1}");
+    }
+
+    let nb = k / QK5_1;
+    for i in 0..nb {
+        let d = xs[i].d.to_f32();
+        let m = xs[i].m.to_f32();
+        let qh: u32 = unsafe { std::mem::transmute_copy(&xs[i].qh) };
+
+        for j in 0..(QK5_1 / 2) {
+            let xh_0 = (((qh >> j) << 4) & 0x10) as u8;
+            let xh_1 = ((qh >> (j + 12)) & 0x10) as u8;
+
+            let x0 = (xs[i].qs[j] & 0x0F) | xh_0;
+            let x1 = (xs[i].qs[j] >> 4) | xh_1;
+
+            ys[i * QK5_1 + j] = (x0 as f32) * d + m;
+            ys[i * QK5_1 + j + QK5_1 / 2] = (x1 as f32) * d + m;
+        }
+    }
+    Ok(())
+}
+
+// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1619
+fn dequantize_row_q8_0(vx: &[BlockQ8_0], ys: &mut [f32]) -> Result<()> {
+    let k = ys.len();
+    if k % QK8_0 != 0 {
+        crate::bail!("dequantize_row_q8_0: {k} is not divisible by {QK8_0}");
+    }
+
+    let nb = k / QK8_0;
+    let xs: &[BlockQ8_0] = unsafe { std::mem::transmute(vx) };
+
+    for i in 0..nb {
+        let d = xs[i].d.to_f32();
+
+        for j in 0..QK8_0 {
+            ys[i * QK8_0 + j] = xs[i].qs[j] as f32 * d;
+        }
+    }
+    Ok(())
+}
+
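+// Note on the layouts handled above (as read off the reference C code linked per function):
+// the 4- and 5-bit formats pack two values per byte in `qs`, the low nibble holding value `j`
+// and the high nibble holding value `j + QK/2`; the 5-bit formats additionally keep each
+// value's fifth bit packed in the 32-bit `qh` field, and Q8_0 stores one signed byte per value.
+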
 // https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L354
 fn dequantize_row_q2k(xs: &[BlockQ2K], ys: &mut [f32]) -> Result<()> {
     let k = ys.len();
@@ -467,23 +585,140 @@ impl GgmlDType {
     }
 }
 
-#[derive(Debug)]
-pub struct Content {
-    pub magic: VersionedMagic,
-    pub hparams: HParams,
-    pub vocab: Vocab,
-    pub tensors: Vec<(String, Tensor)>,
+fn dequantize_and_create_tensor<T, F>(
+    raw_data: &[u8],
+    tensor_elems: usize,
+    size_in_bytes: usize,
+    dims: Vec<usize>,
+    device: &Device,
+    dequantize_row: F,
+) -> Result<Tensor>
+where
+    F: Fn(&[T], &mut [f32]) -> Result<()>,
+{
+    let mut f32_data = vec![0f32; tensor_elems];
+    let raw_data_ptr = raw_data.as_ptr();
+    let n_blocks = size_in_bytes / std::mem::size_of::<T>();
+    let raw_data = unsafe { std::slice::from_raw_parts(raw_data_ptr as *const T, n_blocks) };
+    dequantize_row(raw_data, &mut f32_data)?;
+    Tensor::from_vec(f32_data, dims, device)
+}
+
+/// Creates a [Tensor] from a raw GGML tensor.
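+///
+/// For instance (an illustrative call, assuming a buffer `raw_data` holding a single Q4_0 block
+/// of `QK4_0` values), dequantizing to an f32 tensor on the CPU would look like
+/// `tensor_from_ggml(GgmlDType::Q4_0, &raw_data, vec![QK4_0], DType::F32, &Device::Cpu)`.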
+pub fn tensor_from_ggml(
+    ggml_dtype: GgmlDType,
+    raw_data: &[u8],
+    dims: Vec<usize>,
+    dtype: DType,
+    device: &Device,
+) -> Result<Tensor> {
+    let tensor_elems = dims.iter().product::<usize>();
+    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();
+
+    let tensor = match ggml_dtype {
+        GgmlDType::F32 => Tensor::from_raw_buffer(raw_data, DType::F32, &dims, device),
+        GgmlDType::F16 => Tensor::from_raw_buffer(raw_data, DType::F16, &dims, device),
+        GgmlDType::Q4_0 => dequantize_and_create_tensor(
+            raw_data,
+            tensor_elems,
+            size_in_bytes,
+            dims,
+            device,
+            dequantize_row_q4_0,
+        ),
+        GgmlDType::Q4_1 => dequantize_and_create_tensor(
+            raw_data,
+            tensor_elems,
+            size_in_bytes,
+            dims,
+            device,
+            dequantize_row_q4_1,
+        ),
+        GgmlDType::Q5_0 => dequantize_and_create_tensor(
+            raw_data,
+            tensor_elems,
+            size_in_bytes,
+            dims,
+            device,
+            dequantize_row_q5_0,
+        ),
+        GgmlDType::Q5_1 => dequantize_and_create_tensor(
+            raw_data,
+            tensor_elems,
+            size_in_bytes,
+            dims,
+            device,
+            dequantize_row_q5_1,
+        ),
+        GgmlDType::Q8_0 => dequantize_and_create_tensor(
+            raw_data,
+            tensor_elems,
+            size_in_bytes,
+            dims,
+            device,
+            dequantize_row_q8_0,
+        ),
+        GgmlDType::Q2K => dequantize_and_create_tensor(
+            raw_data,
+            tensor_elems,
+            size_in_bytes,
+            dims,
+            device,
+            dequantize_row_q2k,
+        ),
+        GgmlDType::Q3K => dequantize_and_create_tensor(
+            raw_data,
+            tensor_elems,
+            size_in_bytes,
+            dims,
+            device,
+            dequantize_row_q3k,
+        ),
+        GgmlDType::Q4K => dequantize_and_create_tensor(
+            raw_data,
+            tensor_elems,
+            size_in_bytes,
+            dims,
+            device,
+            dequantize_row_q4k,
+        ),
+        GgmlDType::Q5K => dequantize_and_create_tensor(
+            raw_data,
+            tensor_elems,
+            size_in_bytes,
+            dims,
+            device,
+            dequantize_row_q5k,
+        ),
+        GgmlDType::Q6K => dequantize_and_create_tensor(
+            raw_data,
+            tensor_elems,
+            size_in_bytes,
+            dims,
+            device,
+            dequantize_row_q6k,
+        ),
+
+        _ => crate::bail!("quantized type {ggml_dtype:?} is not supported yet"),
+    }?;
+    // We only have ggml-quant to f32 conversions, meaning we have to convert to the desired type.
+    if tensor.dtype() != dtype {
+        tensor.to_dtype(dtype)
+    } else {
+        Ok(tensor)
+    }
 }
 
 fn read_one_tensor<R: std::io::Read>(
     reader: &mut R,
     magic: VersionedMagic,
+    dtype: DType,
     device: &Device,
 ) -> Result<(String, Tensor)> {
     let n_dims = reader.read_u32::<LittleEndian>()?;
     let name_len = reader.read_u32::<LittleEndian>()?;
-    let dtype = reader.read_u32::<LittleEndian>()?;
-    let dtype = GgmlDType::from_u32(dtype)?;
+    let ggml_dtype = reader.read_u32::<LittleEndian>()?;
+    let ggml_dtype = GgmlDType::from_u32(ggml_dtype)?;
     let mut dims = vec![0u32; n_dims as usize];
     reader.read_u32_into::<LittleEndian>(&mut dims)?;
     let mut name = vec![0u8; name_len as usize];
@@ -496,68 +731,29 @@ fn read_one_tensor<R: std::io::Read>(
     }
     let dims = dims.iter().map(|&u| u as usize).collect::<Vec<_>>();
     let tensor_elems = dims.iter().product::<usize>();
-    let size_in_bytes = tensor_elems * dtype.type_size() / dtype.blck_size();
-    println!("{name} {dtype:?} {dims:?}");
+    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();
+    println!("{name} {ggml_dtype:?} {dims:?}");
     // TODO: Mmap version to avoid copying the data around?
     let mut raw_data = vec![0u8; size_in_bytes];
     reader.read_exact(&mut raw_data)?;
-    let tensor = match dtype {
-        GgmlDType::F32 => Tensor::from_raw_buffer(&raw_data, DType::F32, &dims, device)?,
-        GgmlDType::F16 => Tensor::from_raw_buffer(&raw_data, DType::F16, &dims, device)?,
-        GgmlDType::Q2K => {
-            let mut f32_data = vec![0f32; tensor_elems];
-            let raw_data_ptr = raw_data.as_ptr();
-            let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ2K>();
-            let raw_data =
-                unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ2K, n_blocks) };
-            dequantize_row_q2k(raw_data, &mut f32_data)?;
-            // Maybe we should use bf16 instead?
-            Tensor::from_vec(f32_data, dims, device)?
-        }
-        GgmlDType::Q3K => {
-            let mut f32_data = vec![0f32; tensor_elems];
-            let raw_data_ptr = raw_data.as_ptr();
-            let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ3K>();
-            let raw_data =
-                unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ3K, n_blocks) };
-            dequantize_row_q3k(raw_data, &mut f32_data)?;
-            Tensor::from_vec(f32_data, dims, device)?
-        }
-        GgmlDType::Q4K => {
-            let mut f32_data = vec![0f32; tensor_elems];
-            let raw_data_ptr = raw_data.as_ptr();
-            let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ4K>();
-            let raw_data =
-                unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ4K, n_blocks) };
-            dequantize_row_q4k(raw_data, &mut f32_data)?;
-            Tensor::from_vec(f32_data, dims, device)?
-        }
-        GgmlDType::Q5K => {
-            let mut f32_data = vec![0f32; tensor_elems];
-            let raw_data_ptr = raw_data.as_ptr();
-            let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ5K>();
-            let raw_data =
-                unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ5K, n_blocks) };
-            dequantize_row_q5k(raw_data, &mut f32_data)?;
-            Tensor::from_vec(f32_data, dims, device)?
-        }
-        GgmlDType::Q6K => {
-            let mut f32_data = vec![0f32; tensor_elems];
-            let raw_data_ptr = raw_data.as_ptr();
-            let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ6K>();
-            let raw_data =
-                unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ6K, n_blocks) };
-            dequantize_row_q6k(raw_data, &mut f32_data)?;
-            Tensor::from_vec(f32_data, dims, device)?
-        }
-        _ => crate::bail!("quantized type {dtype:?} used in {name} is not supported yet"),
-    };
-    Ok((name, tensor))
+    match tensor_from_ggml(ggml_dtype, &raw_data, dims, dtype, device) {
+        Ok(tensor) => Ok((name, tensor)),
+        Err(e) => crate::bail!("Error creating tensor {name}: {e}"),
+    }
+}
+
+#[derive(Debug)]
+pub struct Content {
+    pub magic: VersionedMagic,
+    pub hparams: HParams,
+    pub vocab: Vocab,
+    pub tensors: Vec<(String, Tensor)>,
 }
 
 impl Content {
     pub fn read<R: std::io::Seek + std::io::Read>(
         reader: &mut R,
+        dtype: DType,
         device: &Device,
     ) -> Result<Content> {
         // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.cpp#L505
@@ -569,7 +765,7 @@ impl Content {
         let mut tensors = vec![];
         while reader.stream_position()? != last_position {
-            let (name, tensor) = read_one_tensor(reader, magic, device)?;
+            let (name, tensor) = read_one_tensor(reader, magic, dtype, device)?;
             tensors.push((name, tensor))
         }
         Ok(Self {
diff --git a/candle-examples/examples/ggml/main.rs b/candle-examples/examples/ggml/main.rs
new file mode 100644
index 00000000..c3fc6b9e
--- /dev/null
+++ b/candle-examples/examples/ggml/main.rs
@@ -0,0 +1,29 @@
+use anyhow::Result;
+use clap::Parser;
+use std::fs::File;
+
+use candle::ggml::Content;
+use candle::{DType, Device};
+
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// GGML file to load.
+    #[arg(long)]
+    model: String,
+}
+
+fn main() -> Result<()> {
+    let args = Args::parse();
+
+    let mut file = File::open(args.model)?;
+    let start = std::time::Instant::now();
+    let model = Content::read(&mut file, DType::F16, &Device::Cpu)?;
+
+    println!(
+        "Loaded {:?} tensors in {:?}",
+        model.tensors.len(),
+        start.elapsed()
+    );
+    Ok(())
+}
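With the example above in place, loading a GGML checkpoint should roughly amount to
`cargo run --example ggml --release -- --model /path/to/model.ggml` (the example name is assumed
to follow its directory, the `--model` flag comes from the clap definition, and the path is a
placeholder).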