From 7151f2cf63b312049fa53713ebf6f0f174cf2fc9 Mon Sep 17 00:00:00 2001
From: Laurent Mazare <laurent.mazare@gmail.com>
Date: Sun, 27 Aug 2023 11:35:19 +0100
Subject: [PATCH] Add the quantize command. (#624)

* Add the quantize command.

* Bugfix for writing gguf files.

* And add a comment.
---
 candle-core/examples/tensor-tools.rs   | 76 +++++++++++++++++++++++++-
 candle-core/src/quantized/gguf_file.rs |  3 +-
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/candle-core/examples/tensor-tools.rs b/candle-core/examples/tensor-tools.rs
index 67e6aa1e..f45cbc7e 100644
--- a/candle-core/examples/tensor-tools.rs
+++ b/candle-core/examples/tensor-tools.rs
@@ -1,5 +1,16 @@
-use candle_core::Result;
+use candle_core::{Device, Result};
 use clap::{Parser, Subcommand, ValueEnum};
+use rayon::prelude::*;
+
+#[derive(ValueEnum, Debug, Clone)]
+enum Quantization {
+    Q2k,
+    Q3k,
+    Q4k,
+    Q5k,
+    Q6k,
+    Q8k,
+}
 
 #[derive(ValueEnum, Debug, Clone)]
 enum Format {
@@ -41,6 +52,17 @@ enum Command {
         #[arg(short, long)]
         verbose: bool,
     },
+
+    Quantize {
+        /// The input file, in gguf format.
+        in_file: std::path::PathBuf,
+        /// The output file, in gguf format.
+        out_file: std::path::PathBuf,
+
+        /// The quantization schema to apply.
+        #[arg(long, value_enum)]
+        quantization: Quantization,
+    },
 }
 
 #[derive(Parser, Debug, Clone)]
@@ -144,6 +166,53 @@ fn run_ls(file: &std::path::PathBuf, format: Option<Format>, verbose: bool) -> R
     Ok(())
 }
 
+fn run_quantize(
+    in_file: std::path::PathBuf,
+    out_file: std::path::PathBuf,
+    q: Quantization,
+) -> Result<()> {
+    use candle_core::quantized::{gguf_file, k_quants, QTensor};
+    // Open the out file early so as to fail directly on missing directories etc.
+    let mut out_file = std::fs::File::create(out_file)?;
+    let mut in_ = std::fs::File::open(&in_file)?;
+    let content = gguf_file::Content::read(&mut in_)?;
+    println!("tensors: {}", content.tensor_infos.len());
+
+    let qtensors = content
+        .tensor_infos
+        .par_iter()
+        .map(|(name, _)| {
+            println!("  quantizing {name}");
+            let mut in_file = std::fs::File::open(&in_file)?;
+            let tensor = content.tensor(&mut in_file, name)?;
+            let tensor = tensor.dequantize(&Device::Cpu)?;
+            // TODO: Only quantize the linear weights, and quantize the final layer weights
+            // differently from the rest.
+            let tensor = match q {
+                Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>(&tensor)?,
+                Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>(&tensor)?,
+                Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>(&tensor)?,
+                Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>(&tensor)?,
+                Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>(&tensor)?,
+                Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>(&tensor)?,
+            };
+            Ok((name, tensor))
+        })
+        .collect::<Result<Vec<_>>>()?;
+    let qtensors = qtensors
+        .iter()
+        .map(|(k, v)| (k.as_str(), v))
+        .collect::<Vec<_>>();
+
+    let metadata = content
+        .metadata
+        .iter()
+        .map(|(k, v)| (k.as_str(), v))
+        .collect::<Vec<_>>();
+    gguf_file::write(&mut out_file, metadata.as_slice(), &qtensors)?;
+    Ok(())
+}
+
 fn main() -> anyhow::Result<()> {
     let args = Args::parse();
     match args.command {
@@ -160,6 +229,11 @@ fn main() -> anyhow::Result<()> {
                 run_ls(file, format.clone(), verbose)?
             }
         }
+        Command::Quantize {
+            in_file,
+            out_file,
+            quantization,
+        } => run_quantize(in_file, out_file, quantization)?,
     }
     Ok(())
 }
diff --git a/candle-core/src/quantized/gguf_file.rs b/candle-core/src/quantized/gguf_file.rs
index 17a60b79..5fd6bc79 100644
--- a/candle-core/src/quantized/gguf_file.rs
+++ b/candle-core/src/quantized/gguf_file.rs
@@ -292,7 +292,7 @@ impl ValueType {
             7 => Self::Bool,
             8 => Self::String,
             9 => Self::Array,
-            v => crate::bail!("unrecognized value-type {v}"),
+            v => crate::bail!("unrecognized value-type {v:#08x}"),
         };
         Ok(v)
     }
@@ -393,6 +393,7 @@ pub fn write(
     w.write_u32::<LittleEndian>(0x46554747)?;
     w.write_u32::<LittleEndian>(1)?; // version 1.
     w.write_u32::<LittleEndian>(tensors.len() as u32)?;
+    w.write_u32::<LittleEndian>(metadata.len() as u32)?;
     for (name, value) in metadata.iter() {
         write_string(w, name)?;
         w.write_u32::<LittleEndian>(value.value_type().to_u32())?;