Add dequantization for ggml's q4_0, q4_1, q5_0, q5_1 and q8_0 (#407)

* Added dequantization for `q4_0`, `q4_1`, `q5_0`, `q5_1` and `q8_0`

* expose `tensor_from_ggml` for external usage

* bugfixes & example
Lukas Kreussel
2023-08-14 00:22:57 +02:00
committed by GitHub
parent 8bd2b22b33
commit 9e7e6e0288
2 changed files with 288 additions and 63 deletions
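
The second file in this commit is a small loading example; for the newly exposed `tensor_from_ggml` itself, here is a minimal external-usage sketch, assuming the `candle::ggml` re-exports, the visibility of `GgmlDType`, and the `BlockQ8_0` byte layout (f16 scale followed by 32 signed 8-bit quants) implied by the diff below — not code from the commit:

use candle::ggml::{tensor_from_ggml, GgmlDType};
use candle::{DType, Device};

fn main() -> candle::Result<()> {
    // One hand-built Q8_0 block: f16 scale d = 1.0 (0x3C00, little-endian) followed by quants 0..=31.
    let mut raw_data = vec![0x00u8, 0x3C];
    raw_data.extend(0..32u8);
    // Dequantizes to f32 first, then converts to the requested dtype (a no-op for F32 here).
    let tensor = tensor_from_ggml(GgmlDType::Q8_0, &raw_data, vec![32], DType::F32, &Device::Cpu)?;
    println!("{:?}", tensor.to_vec1::<f32>()?);
    Ok(())
}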


@@ -110,6 +110,124 @@ struct BlockQ6K {
}
const _: () = assert!(3 * QK_K / 4 + QK_K / 16 + 2 == std::mem::size_of::<BlockQ6K>());
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1525
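// A Q4_0 block packs QK4_0 (32) values: an f16 scale `d` followed by 16 bytes of 4-bit quants
// (low nibbles fill the first half of the output, high nibbles the second), each dequantized as y = d * (q - 8).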
fn dequantize_row_q4_0(xs: &[BlockQ4_0], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
if k % QK4_0 != 0 {
crate::bail!("dequantize_row_q4_0: {k} is not divisible by {QK4_0}")
}
let nb = k / QK4_0;
for i in 0..nb {
let d = xs[i].d.to_f32();
for j in 0..(QK4_0 / 2) {
let x0 = (xs[i].qs[j] & 0x0F) as i16 - 8;
let x1 = (xs[i].qs[j] >> 4) as i16 - 8;
ys[i * QK4_0 + j] = (x0 as f32) * d;
ys[i * QK4_0 + j + QK4_0 / 2] = (x1 as f32) * d;
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1545
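// Q4_1 additionally stores an f16 minimum `m`; values dequantize as y = d * q + m.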
fn dequantize_row_q4_1(xs: &[BlockQ4_1], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
if k % QK4_1 != 0 {
crate::bail!("dequantize_row_q4_1: {k} is not divisible by {QK4_1}");
}
let nb = k / QK4_1;
for i in 0..nb {
let d = xs[i].d.to_f32();
let m = xs[i].m.to_f32();
for j in 0..(QK4_1 / 2) {
let x0 = xs[i].qs[j] & 0x0F;
let x1 = xs[i].qs[j] >> 4;
ys[i * QK4_1 + j] = (x0 as f32) * d + m;
ys[i * QK4_1 + j + QK4_1 / 2] = (x1 as f32) * d + m;
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1566
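// Q5_0 packs the fifth bit of each quant into the 4-byte `qh` field; values dequantize as y = d * (q - 16).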
fn dequantize_row_q5_0(xs: &[BlockQ5_0], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
if k % QK5_0 != 0 {
crate::bail!("dequantize_row_q5_0: {k} is not divisible by {QK5_0}");
}
let nb = k / QK5_0;
for i in 0..nb {
let d = xs[i].d.to_f32();
let qh: u32 = unsafe { std::mem::transmute_copy(&xs[i].qh) };
for j in 0..(QK5_0 / 2) {
let xh_0 = (((qh >> j) << 4) & 0x10) as u8;
let xh_1 = ((qh >> (j + 12)) & 0x10) as u8;
let x0 = ((xs[i].qs[j] & 0x0F) | xh_0) as i32 - 16;
let x1 = ((xs[i].qs[j] >> 4) | xh_1) as i32 - 16;
ys[i * QK5_0 + j] = (x0 as f32) * d;
ys[i * QK5_0 + j + QK5_0 / 2] = (x1 as f32) * d;
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1592
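// Q5_1 combines the 5-bit quants of Q5_0 with the offset `m` of Q4_1: y = d * q + m.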
fn dequantize_row_q5_1(xs: &[BlockQ5_1], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
if k % QK5_1 != 0 {
crate::bail!("dequantize_row_q5_1: {k} is not divisible by {QK5_1}");
}
let nb = k / QK5_1;
for i in 0..nb {
let d = xs[i].d.to_f32();
let m = xs[i].m.to_f32();
let qh: u32 = unsafe { std::mem::transmute_copy(&xs[i].qh) };
for j in 0..(QK5_1 / 2) {
let xh_0 = (((qh >> j) << 4) & 0x10) as u8;
let xh_1 = ((qh >> (j + 12)) & 0x10) as u8;
let x0 = (xs[i].qs[j] & 0x0F) | xh_0;
let x1 = (xs[i].qs[j] >> 4) | xh_1;
ys[i * QK5_1 + j] = (x0 as f32) * d + m;
ys[i * QK5_1 + j + QK5_1 / 2] = (x1 as f32) * d + m;
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/ggml.c#L1619
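// Q8_0 stores 32 signed 8-bit quants per block with a single f16 scale: y = d * q.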
fn dequantize_row_q8_0(xs: &[BlockQ8_0], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
if k % QK8_0 != 0 {
crate::bail!("dequantize_row_q8_0: {k} is not divisible by {QK8_0}");
}
let nb = k / QK8_0;
for i in 0..nb {
let d = xs[i].d.to_f32();
for j in 0..QK8_0 {
ys[i * QK8_0 + j] = xs[i].qs[j] as f32 * d;
}
}
Ok(())
}
// https://github.com/ggerganov/llama.cpp/blob/8183159cf3def112f6d1fe94815fce70e1bffa12/k_quants.c#L354
fn dequantize_row_q2k(xs: &[BlockQ2K], ys: &mut [f32]) -> Result<()> {
let k = ys.len();
@@ -467,23 +585,140 @@ impl GgmlDType {
}
}
#[derive(Debug)]
pub struct Content {
pub magic: VersionedMagic,
pub hparams: HParams,
pub vocab: Vocab,
pub tensors: Vec<(String, Tensor)>,
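// Shared helper for all quantized dtypes below: reinterprets `raw_data` as blocks of `T`,
// dequantizes them with `dequantize_row`, and builds an f32 tensor with the given `dims`.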
fn dequantize_and_create_tensor<T, F>(
raw_data: &[u8],
tensor_elems: usize,
size_in_bytes: usize,
dims: Vec<usize>,
device: &Device,
dequantize_row: F,
) -> Result<Tensor>
where
F: Fn(&[T], &mut [f32]) -> Result<()>,
{
let mut f32_data = vec![0f32; tensor_elems];
let raw_data_ptr = raw_data.as_ptr();
let n_blocks = size_in_bytes / std::mem::size_of::<T>();
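// Assumes `raw_data` holds `n_blocks` tightly packed blocks of `T`, as laid out in the GGML file.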
let raw_data = unsafe { std::slice::from_raw_parts(raw_data_ptr as *const T, n_blocks) };
dequantize_row(raw_data, &mut f32_data)?;
Tensor::from_vec(f32_data, dims, device)
}
/// Creates a [Tensor] from a raw GGML tensor.
pub fn tensor_from_ggml(
ggml_dtype: GgmlDType,
raw_data: &[u8],
dims: Vec<usize>,
dtype: DType,
device: &Device,
) -> Result<Tensor> {
let tensor_elems = dims.iter().product::<usize>();
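// Quantized tensors are stored as blocks: `blck_size` elements occupy `type_size` bytes on disk.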
let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();
let tensor = match ggml_dtype {
GgmlDType::F32 => Tensor::from_raw_buffer(raw_data, DType::F32, &dims, device),
GgmlDType::F16 => Tensor::from_raw_buffer(raw_data, DType::F16, &dims, device),
GgmlDType::Q4_0 => dequantize_and_create_tensor(
raw_data,
tensor_elems,
size_in_bytes,
dims,
device,
dequantize_row_q4_0,
),
GgmlDType::Q4_1 => dequantize_and_create_tensor(
raw_data,
tensor_elems,
size_in_bytes,
dims,
device,
dequantize_row_q4_1,
),
GgmlDType::Q5_0 => dequantize_and_create_tensor(
raw_data,
tensor_elems,
size_in_bytes,
dims,
device,
dequantize_row_q5_0,
),
GgmlDType::Q5_1 => dequantize_and_create_tensor(
raw_data,
tensor_elems,
size_in_bytes,
dims,
device,
dequantize_row_q5_1,
),
GgmlDType::Q8_0 => dequantize_and_create_tensor(
raw_data,
tensor_elems,
size_in_bytes,
dims,
device,
dequantize_row_q8_0,
),
GgmlDType::Q2K => dequantize_and_create_tensor(
raw_data,
tensor_elems,
size_in_bytes,
dims,
device,
dequantize_row_q2k,
),
GgmlDType::Q3K => dequantize_and_create_tensor(
raw_data,
tensor_elems,
size_in_bytes,
dims,
device,
dequantize_row_q3k,
),
GgmlDType::Q4K => dequantize_and_create_tensor(
raw_data,
tensor_elems,
size_in_bytes,
dims,
device,
dequantize_row_q4k,
),
GgmlDType::Q5K => dequantize_and_create_tensor(
raw_data,
tensor_elems,
size_in_bytes,
dims,
device,
dequantize_row_q5k,
),
GgmlDType::Q6K => dequantize_and_create_tensor(
raw_data,
tensor_elems,
size_in_bytes,
dims,
device,
dequantize_row_q6k,
),
_ => crate::bail!("quantized type {ggml_dtype:?} is not supported yet"),
}?;
// We only have ggml-quant to f32 dequantization, so convert to the requested dtype if needed.
if tensor.dtype() != dtype {
tensor.to_dtype(dtype)
} else {
Ok(tensor)
}
}
fn read_one_tensor<R: std::io::Seek + std::io::Read>(
reader: &mut R,
magic: VersionedMagic,
dtype: DType,
device: &Device,
) -> Result<(String, Tensor)> {
let n_dims = reader.read_u32::<LittleEndian>()?;
let name_len = reader.read_u32::<LittleEndian>()?;
let dtype = reader.read_u32::<LittleEndian>()?;
let dtype = GgmlDType::from_u32(dtype)?;
let ggml_dtype = reader.read_u32::<LittleEndian>()?;
let ggml_dtype = GgmlDType::from_u32(ggml_dtype)?;
let mut dims = vec![0u32; n_dims as usize];
reader.read_u32_into::<LittleEndian>(&mut dims)?;
let mut name = vec![0u8; name_len as usize];
@@ -496,68 +731,29 @@ fn read_one_tensor<R: std::io::Seek + std::io::Read>(
}
let dims = dims.iter().map(|&u| u as usize).collect::<Vec<_>>();
let tensor_elems = dims.iter().product::<usize>();
let size_in_bytes = tensor_elems * dtype.type_size() / dtype.blck_size();
println!("{name} {dtype:?} {dims:?}");
let size_in_bytes = tensor_elems * ggml_dtype.type_size() / ggml_dtype.blck_size();
println!("{name} {ggml_dtype:?} {dims:?}");
// TODO: Mmap version to avoid copying the data around?
let mut raw_data = vec![0u8; size_in_bytes];
reader.read_exact(&mut raw_data)?;
let tensor = match dtype {
GgmlDType::F32 => Tensor::from_raw_buffer(&raw_data, DType::F32, &dims, device)?,
GgmlDType::F16 => Tensor::from_raw_buffer(&raw_data, DType::F16, &dims, device)?,
GgmlDType::Q2K => {
let mut f32_data = vec![0f32; tensor_elems];
let raw_data_ptr = raw_data.as_ptr();
let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ2K>();
let raw_data =
unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ2K, n_blocks) };
dequantize_row_q2k(raw_data, &mut f32_data)?;
// Maybe we should use bf16 instead?
Tensor::from_vec(f32_data, dims, device)?
match tensor_from_ggml(ggml_dtype, &raw_data, dims, dtype, device) {
Ok(tensor) => Ok((name, tensor)),
Err(e) => crate::bail!("Error creating tensor {name}: {e}"),
}
GgmlDType::Q3K => {
let mut f32_data = vec![0f32; tensor_elems];
let raw_data_ptr = raw_data.as_ptr();
let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ3K>();
let raw_data =
unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ3K, n_blocks) };
dequantize_row_q3k(raw_data, &mut f32_data)?;
Tensor::from_vec(f32_data, dims, device)?
}
GgmlDType::Q4K => {
let mut f32_data = vec![0f32; tensor_elems];
let raw_data_ptr = raw_data.as_ptr();
let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ4K>();
let raw_data =
unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ4K, n_blocks) };
dequantize_row_q4k(raw_data, &mut f32_data)?;
Tensor::from_vec(f32_data, dims, device)?
}
GgmlDType::Q5K => {
let mut f32_data = vec![0f32; tensor_elems];
let raw_data_ptr = raw_data.as_ptr();
let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ5K>();
let raw_data =
unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ5K, n_blocks) };
dequantize_row_q5k(raw_data, &mut f32_data)?;
Tensor::from_vec(f32_data, dims, device)?
}
GgmlDType::Q6K => {
let mut f32_data = vec![0f32; tensor_elems];
let raw_data_ptr = raw_data.as_ptr();
let n_blocks = size_in_bytes / std::mem::size_of::<BlockQ6K>();
let raw_data =
unsafe { std::slice::from_raw_parts(raw_data_ptr as *const BlockQ6K, n_blocks) };
dequantize_row_q6k(raw_data, &mut f32_data)?;
Tensor::from_vec(f32_data, dims, device)?
}
_ => crate::bail!("quantized type {dtype:?} used in {name} is not supported yet"),
};
Ok((name, tensor))
}
#[derive(Debug)]
pub struct Content {
pub magic: VersionedMagic,
pub hparams: HParams,
pub vocab: Vocab,
pub tensors: Vec<(String, Tensor)>,
}
impl Content {
pub fn read<R: std::io::Seek + std::io::Read>(
reader: &mut R,
dtype: DType,
device: &Device,
) -> Result<Content> {
// https://github.com/ggerganov/llama.cpp/blob/468ea24fb4633a0d681f7ac84089566c1c6190cb/llama.cpp#L505
@@ -569,7 +765,7 @@ impl Content {
let mut tensors = vec![];
while reader.stream_position()? != last_position {
let (name, tensor) = read_one_tensor(reader, magic, device)?;
let (name, tensor) = read_one_tensor(reader, magic, dtype, device)?;
tensors.push((name, tensor))
}
Ok(Self {


@@ -0,0 +1,29 @@
use anyhow::Result;
use clap::Parser;
use std::fs::File;
use candle::ggml::Content;
use candle::{DType, Device};
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// GGML file to load.
#[arg(long)]
model: String,
}
fn main() -> Result<()> {
let args = Args::parse();
let mut file = File::open(args.model)?;
let start = std::time::Instant::now();
let model = Content::read(&mut file, DType::F16, &Device::Cpu)?;
println!(
"Loaded {:?} tensors in {:?}",
model.tensors.len(),
start.elapsed()
);
Ok(())
}