use candle_core::{Device, Result}; use clap::{Parser, Subcommand, ValueEnum}; use rayon::prelude::*; #[derive(ValueEnum, Debug, Clone)] enum Quantization { Q2k, Q3k, Q4k, Q5k, Q6k, Q8k, } #[derive(ValueEnum, Debug, Clone)] enum Format { Safetensors, Npz, Ggml, Gguf, Pth, Pickle, } impl Format { fn infer>(p: P) -> Option { p.as_ref() .extension() .and_then(|e| e.to_str()) .and_then(|e| match e { // We don't infer any format for .bin as it can be used for ggml/gguf or pytorch. "safetensors" | "safetensor" => Some(Self::Safetensors), "npz" => Some(Self::Npz), "pth" | "pt" => Some(Self::Pth), "ggml" => Some(Self::Ggml), "gguf" => Some(Self::Gguf), _ => None, }) } } #[derive(Subcommand, Debug, Clone)] enum Command { Ls { files: Vec, /// The file format to use, if unspecified infer from the file extension. #[arg(long, value_enum)] format: Option, /// Enable verbose mode. #[arg(short, long)] verbose: bool, }, Quantize { /// The input file, in gguf format. in_file: std::path::PathBuf, /// The output file, in gguf format. out_file: std::path::PathBuf, /// The quantization schema to apply. #[arg(long, value_enum)] quantization: Quantization, }, } #[derive(Parser, Debug, Clone)] struct Args { #[command(subcommand)] command: Command, } fn run_ls(file: &std::path::PathBuf, format: Option, verbose: bool) -> Result<()> { let format = match format { Some(format) => format, None => match Format::infer(file) { Some(format) => format, None => { println!( "{file:?}: cannot infer format from file extension, use the --format flag" ); return Ok(()); } }, }; match format { Format::Npz => { let tensors = candle_core::npy::NpzTensors::new(file)?; let mut names = tensors.names(); names.sort(); for name in names { let shape_dtype = match tensors.get_shape_and_dtype(name) { Ok((shape, dtype)) => format!("[{shape:?}; {dtype:?}]"), Err(err) => err.to_string(), }; println!("{name}: {shape_dtype}") } } Format::Safetensors => { let tensors = unsafe { candle_core::safetensors::MmapedFile::new(file)? }; let tensors = tensors.deserialize()?; let mut tensors = tensors.tensors(); tensors.sort_by(|a, b| a.0.cmp(&b.0)); for (name, view) in tensors.iter() { let dtype = view.dtype(); let dtype = match candle_core::DType::try_from(dtype) { Ok(dtype) => format!("{dtype:?}"), Err(_) => format!("{dtype:?}"), }; let shape = view.shape(); println!("{name}: [{shape:?}; {dtype}]") } } Format::Pth => { let mut tensors = candle_core::pickle::read_pth_tensor_info(file, verbose)?; tensors.sort_by(|a, b| a.name.cmp(&b.name)); for tensor_info in tensors.iter() { println!( "{}: [{:?}; {:?}]", tensor_info.name, tensor_info.layout.shape(), tensor_info.dtype, ); if verbose { println!(" {:?}", tensor_info); } } } Format::Pickle => { let file = std::fs::File::open(file)?; let mut reader = std::io::BufReader::new(file); let mut stack = candle_core::pickle::Stack::empty(); stack.read_loop(&mut reader)?; for (i, obj) in stack.stack().iter().enumerate() { println!("{i} {obj:?}"); } } Format::Ggml => { let mut file = std::fs::File::open(file)?; let content = candle_core::quantized::ggml_file::Content::read(&mut file)?; let mut tensors = content.tensors.into_iter().collect::>(); tensors.sort_by(|a, b| a.0.cmp(&b.0)); for (name, qtensor) in tensors.iter() { println!("{name}: [{:?}; {:?}]", qtensor.shape(), qtensor.dtype()); } } Format::Gguf => { let mut file = std::fs::File::open(file)?; let content = candle_core::quantized::gguf_file::Content::read(&mut file)?; if verbose { let mut metadata = content.metadata.into_iter().collect::>(); metadata.sort_by(|a, b| a.0.cmp(&b.0)); println!("metadata entries ({})", metadata.len()); for (key, value) in metadata.iter() { println!(" {key}: {value:?}"); } } let mut tensors = content.tensor_infos.into_iter().collect::>(); tensors.sort_by(|a, b| a.0.cmp(&b.0)); for (name, info) in tensors.iter() { println!("{name}: [{:?}; {:?}]", info.shape, info.ggml_dtype); } } } Ok(()) } fn run_quantize( in_file: std::path::PathBuf, out_file: std::path::PathBuf, q: Quantization, ) -> Result<()> { use candle_core::quantized::{gguf_file, k_quants, QTensor}; // Open the out file early so as to fail directly on missing directories etc. let mut out_file = std::fs::File::create(out_file)?; let mut in_ = std::fs::File::open(&in_file)?; let content = gguf_file::Content::read(&mut in_)?; println!("tensors: {}", content.tensor_infos.len()); let qtensors = content .tensor_infos .par_iter() .map(|(name, _)| { println!(" quantizing {name}"); let mut in_file = std::fs::File::open(&in_file)?; let tensor = content.tensor(&mut in_file, name)?; let tensor = tensor.dequantize(&Device::Cpu)?; // TODO: Only quantize the linear weights, and quantize the final layer weights // differently from the rest. let tensor = match q { Quantization::Q2k => QTensor::quantize::(&tensor)?, Quantization::Q3k => QTensor::quantize::(&tensor)?, Quantization::Q4k => QTensor::quantize::(&tensor)?, Quantization::Q5k => QTensor::quantize::(&tensor)?, Quantization::Q6k => QTensor::quantize::(&tensor)?, Quantization::Q8k => QTensor::quantize::(&tensor)?, }; Ok((name, tensor)) }) .collect::>>()?; let qtensors = qtensors .iter() .map(|(k, v)| (k.as_str(), v)) .collect::>(); let metadata = content .metadata .iter() .map(|(k, v)| (k.as_str(), v)) .collect::>(); gguf_file::write(&mut out_file, metadata.as_slice(), &qtensors)?; Ok(()) } fn main() -> anyhow::Result<()> { let args = Args::parse(); match args.command { Command::Ls { files, format, verbose, } => { let multiple_files = files.len() > 1; for file in files.iter() { if multiple_files { println!("--- {file:?} ---"); } run_ls(file, format.clone(), verbose)? } } Command::Quantize { in_file, out_file, quantization, } => run_quantize(in_file, out_file, quantization)?, } Ok(()) }