From 7151f2cf63b312049fa53713ebf6f0f174cf2fc9 Mon Sep 17 00:00:00 2001
From: Laurent Mazare <laurent.mazare@gmail.com>
Date: Sun, 27 Aug 2023 11:35:19 +0100
Subject: [PATCH] Add the quantize command. (#624)

* Add the quantize command.

* Bugfix for writing gguf files.

* And add a comment.
---
 candle-core/examples/tensor-tools.rs   | 76 +++++++++++++++++++++++++-
 candle-core/src/quantized/gguf_file.rs |  3 +-
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/candle-core/examples/tensor-tools.rs b/candle-core/examples/tensor-tools.rs
index 67e6aa1e..f45cbc7e 100644
--- a/candle-core/examples/tensor-tools.rs
+++ b/candle-core/examples/tensor-tools.rs
@@ -1,5 +1,16 @@
-use candle_core::Result;
+use candle_core::{Device, Result};
 use clap::{Parser, Subcommand, ValueEnum};
+use rayon::prelude::*;
+
+#[derive(ValueEnum, Debug, Clone)]
+enum Quantization {
+    Q2k,
+    Q3k,
+    Q4k,
+    Q5k,
+    Q6k,
+    Q8k,
+}
 
 #[derive(ValueEnum, Debug, Clone)]
 enum Format {
@@ -41,6 +52,17 @@ enum Command {
         #[arg(short, long)]
         verbose: bool,
     },
+
+    Quantize {
+        /// The input file, in gguf format.
+        in_file: std::path::PathBuf,
+        /// The output file, in gguf format.
+        out_file: std::path::PathBuf,
+
+        /// The quantization schema to apply.
+        #[arg(long, value_enum)]
+        quantization: Quantization,
+    },
 }
 
 #[derive(Parser, Debug, Clone)]
@@ -144,6 +166,53 @@ fn run_ls(file: &std::path::PathBuf, format: Option<Format>, verbose: bool) -> R
     Ok(())
 }
 
+fn run_quantize(
+    in_file: std::path::PathBuf,
+    out_file: std::path::PathBuf,
+    q: Quantization,
+) -> Result<()> {
+    use candle_core::quantized::{gguf_file, k_quants, QTensor};
+    // Open the out file early so as to fail directly on missing directories etc.
+    let mut out_file = std::fs::File::create(out_file)?;
+    let mut in_ = std::fs::File::open(&in_file)?;
+    let content = gguf_file::Content::read(&mut in_)?;
+    println!("tensors: {}", content.tensor_infos.len());
+
+    let qtensors = content
+        .tensor_infos
+        .par_iter()
+        .map(|(name, _)| {
+            println!("  quantizing {name}");
+            let mut in_file = std::fs::File::open(&in_file)?;
+            let tensor = content.tensor(&mut in_file, name)?;
+            let tensor = tensor.dequantize(&Device::Cpu)?;
+            // TODO: Only quantize the linear weights, and quantize the final layer weights
+            // differently from the rest.
+            let tensor = match q {
+                Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>(&tensor)?,
+                Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>(&tensor)?,
+                Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>(&tensor)?,
+                Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>(&tensor)?,
+                Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>(&tensor)?,
+                Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>(&tensor)?,
+            };
+            Ok((name, tensor))
+        })
+        .collect::<Result<Vec<_>>>()?;
+    let qtensors = qtensors
+        .iter()
+        .map(|(k, v)| (k.as_str(), v))
+        .collect::<Vec<_>>();
+
+    let metadata = content
+        .metadata
+        .iter()
+        .map(|(k, v)| (k.as_str(), v))
+        .collect::<Vec<_>>();
+    gguf_file::write(&mut out_file, metadata.as_slice(), &qtensors)?;
+    Ok(())
+}
+
 fn main() -> anyhow::Result<()> {
     let args = Args::parse();
     match args.command {
@@ -160,6 +229,11 @@ fn main() -> anyhow::Result<()> {
                 run_ls(file, format.clone(), verbose)?
             }
         }
+        Command::Quantize {
+            in_file,
+            out_file,
+            quantization,
+        } => run_quantize(in_file, out_file, quantization)?,
     }
     Ok(())
 }
diff --git a/candle-core/src/quantized/gguf_file.rs b/candle-core/src/quantized/gguf_file.rs
index 17a60b79..5fd6bc79 100644
--- a/candle-core/src/quantized/gguf_file.rs
+++ b/candle-core/src/quantized/gguf_file.rs
@@ -292,7 +292,7 @@ impl ValueType {
             7 => Self::Bool,
             8 => Self::String,
             9 => Self::Array,
-            v => crate::bail!("unrecognized value-type {v}"),
+            v => crate::bail!("unrecognized value-type {v:#08x}"),
         };
         Ok(v)
     }
@@ -393,6 +393,7 @@ pub fn write(
     w.write_u32::<LittleEndian>(0x46554747)?;
     w.write_u32::<LittleEndian>(1)?; // version 1.
     w.write_u32::<LittleEndian>(tensors.len() as u32)?;
+    w.write_u32::<LittleEndian>(metadata.len() as u32)?;
     for (name, value) in metadata.iter() {
         write_string(w, name)?;
         w.write_u32::<LittleEndian>(value.value_type().to_u32())?;