Add dequantization for ggml's q4_0, q4_1, q5_0, q5_1 and q8_0 (#407)
* Added dequantization for `q4_0`, `q4_1`, `q5_0`, `q5_1` and `q8_0`
* expose `tensor_from_ggml` for external usage
* bugfixes & example
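All of these formats store the weights in fixed-size blocks, so a tensor's byte size follows the `tensor_elems * type_size / blck_size` arithmetic used in the diff below. A rough illustration of the savings, assuming the usual ggml q4_0 layout of a 2-byte f16 scale plus 16 packed bytes per 32-element block (the constants here are assumptions for this sketch):

// Hypothetical sizes, for illustration only.
let tensor_elems = 4096 * 4096; // elements in one weight matrix
let blck_size = 32; // QK4_0: elements per q4_0 block
let type_size = 2 + 16; // assumed: f16 scale + 16 bytes of packed 4-bit quants
let size_in_bytes = tensor_elems * type_size / blck_size;
assert_eq!(size_in_bytes, 9_437_184); // ~9 MiB, versus 64 MiB for the same tensor in f32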
@@ -110,6 +110,124 @@ struct BlockQ6K {
}
const _: () = assert!(3 * QK_K / 4 + QK_K / 16 + 2 == std::mem::size_of::<BlockQ6K>());

// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1525
fn dequantize_row_q4_0(xs: &[BlockQ4_0], ys: &mut [f32]) -> Result<()> {
    let k = ys.len();
    if k % QK4_0 != 0 {
        crate::bail!("dequantize_row_q4_0: {k} is not divisible by {QK4_0}")
    }

    let nb = k / QK4_0;
    for i in 0..nb {
        let d = xs[i].d.to_f32();

        for j in 0..(QK4_0 / 2) {
            let x0 = (xs[i].qs[j] & 0x0F) as i16 - 8;
            let x1 = (xs[i].qs[j] >> 4) as i16 - 8;

            ys[i * QK4_0 + j] = (x0 as f32) * d;
            ys[i * QK4_0 + j + QK4_0 / 2] = (x1 as f32) * d;
        }
    }
    Ok(())
}
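To make the nibble unpacking above concrete, here is a tiny stand-alone check with made-up values: the low nibble of each byte feeds the first half of the block, the high nibble the second half, and both are re-centered by subtracting 8 before scaling by `d`.

// Illustrative q4_0 nibble math, values chosen by hand.
let d = 0.5f32;
let q: u8 = 0x3A; // one packed byte: low nibble 0xA, high nibble 0x3
let x0 = (q & 0x0F) as i16 - 8; // 10 - 8 = 2
let x1 = (q >> 4) as i16 - 8; // 3 - 8 = -5
assert_eq!((x0 as f32 * d, x1 as f32 * d), (1.0, -2.5));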
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1545
fn dequantize_row_q4_1(xs: &[BlockQ4_1], ys: &mut [f32]) -> Result<()> {
    let k = ys.len();
    if k % QK4_1 != 0 {
        crate::bail!("dequantize_row_q4_1: {k} is not divisible by {QK4_1}");
    }

    let nb = k / QK4_1;
    for i in 0..nb {
        let d = xs[i].d.to_f32();
        let m = xs[i].m.to_f32();

        for j in 0..(QK4_1 / 2) {
            let x0 = xs[i].qs[j] & 0x0F;
            let x1 = xs[i].qs[j] >> 4;

            ys[i * QK4_1 + j] = (x0 as f32) * d + m;
            ys[i * QK4_1 + j + QK4_1 / 2] = (x1 as f32) * d + m;
        }
    }
    Ok(())
}

// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1566
fn dequantize_row_q5_0(xs: &[BlockQ5_0], ys: &mut [f32]) -> Result<()> {
    let k = ys.len();
    if k % QK5_0 != 0 {
        crate::bail!("dequantize_row_q5_0: {k} is not divisible by {QK5_0}");
    }

    let nb = k / QK5_0;
    for i in 0..nb {
        let d = xs[i].d.to_f32();
        let qh: u32 = unsafe { std::mem::transmute_copy(&xs[i].qh) };

        for j in 0..(QK5_0 / 2) {
            let xh_0 = (((qh >> j) << 4) & 0x10) as u8;
            let xh_1 = ((qh >> (j + 12)) & 0x10) as u8;

            let x0 = ((xs[i].qs[j] & 0x0F) | xh_0) as i32 - 16;
            let x1 = ((xs[i].qs[j] >> 4) | xh_1) as i32 - 16;

            ys[i * QK5_0 + j] = (x0 as f32) * d;
            ys[i * QK5_0 + j + QK5_0 / 2] = (x1 as f32) * d;
        }
    }
    Ok(())
}
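The q5_0 (and q5_1) paths rebuild a fifth bit for every quant from the packed `qh` word: bit `j` of `qh` belongs to value `j` of the block and bit `j + 16` to value `j + QK5_0 / 2`, which is what the two shift expressions above extract. A small sanity check of that bit arithmetic with a hand-picked `qh`:

// qh with bits 0 and 16 set: both halves of position j = 0 get their fifth bit.
let qh: u32 = 0x0001_0001;
let j = 0;
let xh_0 = (((qh >> j) << 4) & 0x10) as u8; // fifth bit for value j
let xh_1 = ((qh >> (j + 12)) & 0x10) as u8; // fifth bit for value j + 16
assert_eq!((xh_0, xh_1), (0x10, 0x10));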
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1592
fn dequantize_row_q5_1(xs: &[BlockQ5_1], ys: &mut [f32]) -> Result<()> {
    let k = ys.len();
    if k % QK5_1 != 0 {
        crate::bail!("dequantize_row_q5_1: {k} is not divisible by {QK5_1}");
    }

    let nb = k / QK5_1;
    for i in 0..nb {
        let d = xs[i].d.to_f32();
        let m = xs[i].m.to_f32();
        let qh: u32 = unsafe { std::mem::transmute_copy(&xs[i].qh) };

        for j in 0..(QK5_1 / 2) {
            let xh_0 = (((qh >> j) << 4) & 0x10) as u8;
            let xh_1 = ((qh >> (j + 12)) & 0x10) as u8;

            let x0 = (xs[i].qs[j] & 0x0F) | xh_0;
            let x1 = (xs[i].qs[j] >> 4) | xh_1;

            ys[i * QK5_1 + j] = (x0 as f32) * d + m;
            ys[i * QK5_1 + j + QK5_1 / 2] = (x1 as f32) * d + m;
        }
    }
    Ok(())
}

// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1619
fn dequantize_row_q8_0(vx: &[BlockQ8_0], ys: &mut [f32]) -> Result<()> {
    let k = ys.len();
    if k % QK8_0 != 0 {
        crate::bail!("dequantize_row_q8_0: {k} is not divisible by {QK8_0}");
    }

    let nb = k / QK8_0;
    let xs: &[BlockQ8_0] = unsafe { std::mem::transmute(vx) };

    for i in 0..nb {
        let d = xs[i].d.to_f32();

        for j in 0..QK8_0 {
            ys[i * QK8_0 + j] = xs[i].qs[j] as f32 * d;
        }
    }
    Ok(())
}

// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L354
fn dequantize_row_q2k(xs: &[BlockQ2K], ys: &mut [f32]) -> Result<()> {
    let k = ys.len();
@@ -467,23 +585,140 @@ impl GgmlDType {
    }
}

#[derive(Debug)]
pub struct Content {
    pub magic: VersionedMagic,
    pub hparams: HParams,
    pub vocab: Vocab,
    pub tensors: Vec<(String, Tensor)>,
fn dequantize_and_create_tensor<T, F>(
    raw_data: &[u8],
    tensor_elems: usize,
    size_in_bytes: usize,
    dims: Vec<usize>,
    device: &Device,
    dequantize_row: F,
) -> Result<Tensor>
where
    F: Fn(&[T], &mut [f32]) -> Result<()>,
{
    let mut f32_data = vec![0f32; tensor_elems];
    let raw_data_ptr = raw_data.as_ptr();
    let n_blocks = size_in_bytes / std::mem::size_of::<T>();
    let raw_data = unsafe { std::slice::from_raw_parts(raw_data_ptr as *const T, n_blocks) };
    dequantize_row(raw_data, &mut f32_data)?;
    Tensor::from_vec(f32_data, dims, device)
}
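`dequantize_and_create_tensor` only reinterprets the raw byte buffer as a slice of blocks and forwards it to the matching `dequantize_row_*` function; `tensor_from_ggml` below wires it up for every supported dtype. A hypothetical direct call for a single-block q8_0 tensor could look like this (the zeroed buffer stands in for bytes read from a real file):

// Sketch only: one q8_0 block, dequantized straight to an f32 tensor.
let raw_bytes = vec![0u8; std::mem::size_of::<BlockQ8_0>()];
let tensor = dequantize_and_create_tensor::<BlockQ8_0, _>(
    &raw_bytes,
    QK8_0,                            // tensor_elems: one block of 32 values
    std::mem::size_of::<BlockQ8_0>(), // size_in_bytes
    vec![QK8_0],
    &Device::Cpu,
    dequantize_row_q8_0,
)?;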
/// Creates a [Tensor] from a raw GGML tensor.
pub fn tensor_from_ggml(
    ggml_dtype: GgmlDType,
    raw_data: &[u8],
    dims: Vec<usize>,
    dtype: DType,
    device: &Device,
) -> Result<Tensor> {
    let tensor_elems = dims.iter().product::<usize>();
    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();

    let tensor = match ggml_dtype {
        GgmlDType::F32 => Tensor::from_raw_buffer(raw_data, DType::F32, &dims, device),
        GgmlDType::F16 => Tensor::from_raw_buffer(raw_data, DType::F16, &dims, device),
        GgmlDType::Q4_0 => dequantize_and_create_tensor(
            raw_data,
            tensor_elems,
            size_in_bytes,
            dims,
            device,
            dequantize_row_q4_0,
        ),
        GgmlDType::Q4_1 => dequantize_and_create_tensor(
            raw_data,
            tensor_elems,
            size_in_bytes,
            dims,
            device,
            dequantize_row_q4_1,
        ),
        GgmlDType::Q5_0 => dequantize_and_create_tensor(
            raw_data,
            tensor_elems,
            size_in_bytes,
            dims,
            device,
            dequantize_row_q5_0,
        ),
        GgmlDType::Q5_1 => dequantize_and_create_tensor(
            raw_data,
            tensor_elems,
            size_in_bytes,
            dims,
            device,
            dequantize_row_q5_1,
        ),
        GgmlDType::Q8_0 => dequantize_and_create_tensor(
            raw_data,
            tensor_elems,
            size_in_bytes,
            dims,
            device,
            dequantize_row_q8_0,
        ),
        GgmlDType::Q2K => dequantize_and_create_tensor(
            raw_data,
            tensor_elems,
            size_in_bytes,
            dims,
            device,
            dequantize_row_q2k,
        ),
        GgmlDType::Q3K => dequantize_and_create_tensor(
            raw_data,
            tensor_elems,
            size_in_bytes,
            dims,
            device,
            dequantize_row_q3k,
        ),
        GgmlDType::Q4K => dequantize_and_create_tensor(
            raw_data,
            tensor_elems,
            size_in_bytes,
            dims,
            device,
            dequantize_row_q4k,
        ),
        GgmlDType::Q5K => dequantize_and_create_tensor(
            raw_data,
            tensor_elems,
            size_in_bytes,
            dims,
            device,
            dequantize_row_q5k,
        ),
        GgmlDType::Q6K => dequantize_and_create_tensor(
            raw_data,
            tensor_elems,
            size_in_bytes,
            dims,
            device,
            dequantize_row_q6k,
        ),

        _ => crate::bail!("quantized type {dtype:?} is not supported yet"),
    }?;
    // We only have ggml-quant to f32 conversions, meaning we have to convert to the desired type
    if tensor.dtype() != dtype {
        tensor.to_dtype(dtype)
    } else {
        Ok(tensor)
    }
}
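Since `tensor_from_ggml` is now public, a caller that already holds a tensor's raw data (say, from its own GGML parser) can convert it directly. A minimal sketch, assuming both `tensor_from_ggml` and `GgmlDType` are reachable from `candle::ggml` and that `raw_data` holds a valid q4_0 buffer for the given shape:

use candle::ggml::{tensor_from_ggml, GgmlDType};
use candle::{DType, Device};

let tensor = tensor_from_ggml(GgmlDType::Q4_0, &raw_data, vec![4096, 4096], DType::F16, &Device::Cpu)?;
assert_eq!(tensor.dtype(), DType::F16); // dequantized to f32, then converted to the requested dtype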
fn read_one_tensor<R: std::io::Seek + std::io::Read>(
    reader: &mut R,
    magic: VersionedMagic,
    dtype: DType,
    device: &Device,
) -> Result<(String, Tensor)> {
    let n_dims = reader.read_u32::<LittleEndian>()?;
    let name_len = reader.read_u32::<LittleEndian>()?;
    let dtype = reader.read_u32::<LittleEndian>()?;
    let dtype = GgmlDType::from_u32(dtype)?;
    let ggml_dtype = reader.read_u32::<LittleEndian>()?;
    let ggml_dtype = GgmlDType::from_u32(ggml_dtype)?;
    let mut dims = vec![0u32; n_dims as usize];
    reader.read_u32_into::<LittleEndian>(&mut dims)?;
    let mut name = vec![0u8; name_len as usize];
@@ -496,68 +731,29 @@ fn read_one_tensor<R: std::io::Seek + std::io::Read>(
    }
    let dims = dims.iter().map(|&u| u as usize).collect::<Vec<_>>();
    let tensor_elems = dims.iter().product::<usize>();
    let size_in_bytes = tensor_elems * dtype.type_size() / dtype.blck_size();
    println!("{name} {dtype:?} {dims:?}");
    let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();
    println!("{name} {ggml_dtype:?} {dims:?}");
    // TODO: Mmap version to avoid copying the data around?
    let mut raw_data = vec![0u8; size_in_bytes];
    reader.read_exact(&mut raw_data)?;
    let tensor = match dtype {
        GgmlDType::F32 => Tensor::from_raw_buffer(&raw_data, DType::F32, &dims, device)?,
        GgmlDType::F16 => Tensor::from_raw_buffer(&raw_data, DType::F16, &dims, device)?,
        GgmlDType::Q2K => {
            let mut f32_data = vec![0f32; tensor_elems];
            let raw_data_ptr = raw_data.as_ptr();
            let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ2K>();
            let raw_data =
                unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ2K, n_blocks) };
            dequantize_row_q2k(raw_data, &mut f32_data)?;
            // Maybe we should use bf16 instead?
            Tensor::from_vec(f32_data, dims, device)?
        }
        GgmlDType::Q3K => {
            let mut f32_data = vec![0f32; tensor_elems];
            let raw_data_ptr = raw_data.as_ptr();
            let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ3K>();
            let raw_data =
                unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ3K, n_blocks) };
            dequantize_row_q3k(raw_data, &mut f32_data)?;
            Tensor::from_vec(f32_data, dims, device)?
        }
        GgmlDType::Q4K => {
            let mut f32_data = vec![0f32; tensor_elems];
            let raw_data_ptr = raw_data.as_ptr();
            let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ4K>();
            let raw_data =
                unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ4K, n_blocks) };
            dequantize_row_q4k(raw_data, &mut f32_data)?;
            Tensor::from_vec(f32_data, dims, device)?
        }
        GgmlDType::Q5K => {
            let mut f32_data = vec![0f32; tensor_elems];
            let raw_data_ptr = raw_data.as_ptr();
            let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ5K>();
            let raw_data =
                unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ5K, n_blocks) };
            dequantize_row_q5k(raw_data, &mut f32_data)?;
            Tensor::from_vec(f32_data, dims, device)?
        }
        GgmlDType::Q6K => {
            let mut f32_data = vec![0f32; tensor_elems];
            let raw_data_ptr = raw_data.as_ptr();
            let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ6K>();
            let raw_data =
                unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ6K, n_blocks) };
            dequantize_row_q6k(raw_data, &mut f32_data)?;
            Tensor::from_vec(f32_data, dims, device)?
        }
        _ => crate::bail!("quantized type {dtype:?} used in {name} is not supported yet"),
    };
    Ok((name, tensor))
    match tensor_from_ggml(ggml_dtype, &raw_data, dims, dtype, device) {
        Ok(tensor) => Ok((name, tensor)),
        Err(e) => crate::bail!("Error creating tensor {name}: {e}"),
    }
}

#[derive(Debug)]
pub struct Content {
    pub magic: VersionedMagic,
    pub hparams: HParams,
    pub vocab: Vocab,
    pub tensors: Vec<(String, Tensor)>,
}

impl Content {
    pub fn read<R: std::io::Seek + std::io::Read>(
        reader: &mut R,
        dtype: DType,
        device: &Device,
    ) -> Result<Content> {
        // https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.cpp#L505
@@ -569,7 +765,7 @@ impl Content {
        let mut tensors = vec![];

        while reader.stream_position()? != last_position {
            let (name, tensor) = read_one_tensor(reader, magic, device)?;
            let (name, tensor) = read_one_tensor(reader, magic, dtype, device)?;
            tensors.push((name, tensor))
        }
        Ok(Self {
candle-examples/examples/ggml/main.rs (new file, 29 lines)
@@ -0,0 +1,29 @@
use anyhow::Result;
use clap::Parser;
use std::fs::File;

use candle::ggml::Content;
use candle::{DType, Device};

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// GGML file to load.
    #[arg(long)]
    model: String,
}

fn main() -> Result<()> {
    let args = Args::parse();

    let mut file = File::open(args.model)?;
    let start = std::time::Instant::now();
    let model = Content::read(&mut file, DType::F16, &Device::Cpu)?;

    println!(
        "Loaded {:?} tensors in {:?}",
        model.tensors.len(),
        start.elapsed()
    );
    Ok(())
}
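The example parses a single `--model` flag via clap, so it would presumably be run with something like `cargo run --example ggml --release -- --model <path-to-ggml-file>` (hypothetical invocation; the exact cargo flags depend on the workspace setup).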