Mirror of https://github.com/huggingface/candle.git, synced 2025-06-17 02:58:50 +00:00
Llama quantization. (#625)
@@ -1,15 +1,63 @@
-use candle_core::{Device, Result};
+use candle_core::quantized::{gguf_file, k_quants, QTensor};
+use candle_core::{Device, Result, Tensor};
 use clap::{Parser, Subcommand, ValueEnum};
 use rayon::prelude::*;
 
+#[derive(ValueEnum, Debug, Clone)]
+enum QuantizationMode {
+    /// The default quantization includes all 2d tensors, except the output tensor which always
+    /// uses Q6_K.
+    Llama,
+}
+
+impl QuantizationMode {
+    fn quantize(
+        &self,
+        name: &str,
+        tensor: QTensor,
+        default: fn(&Tensor) -> Result<QTensor>,
+    ) -> Result<QTensor> {
+        match self {
+            Self::Llama => {
+                // Same behavior as the llama.cpp quantization.
+                let should_quantize = name.ends_with(".weight") && tensor.rank() == 2;
+                if should_quantize {
+                    let tensor = tensor.dequantize(&Device::Cpu)?;
+                    if name == "output.weight" {
+                        QTensor::quantize::<k_quants::BlockQ6K>(&tensor)
+                    } else {
+                        default(&tensor)
+                    }
+                } else {
+                    Ok(tensor)
+                }
+            }
+        }
+    }
+}
+
 #[derive(ValueEnum, Debug, Clone)]
 enum Quantization {
+    #[value(name = "q4_0")]
+    Q4_0,
+    #[value(name = "q4_1")]
+    Q4_1,
+    #[value(name = "q5_0")]
+    Q5_0,
+    #[value(name = "q5_1")]
+    Q5_1,
+    #[value(name = "q8_0")]
+    Q8_0,
+    #[value(name = "q8_1")]
+    Q8_1,
     Q2k,
     Q3k,
     Q4k,
     Q5k,
     Q6k,
     Q8k,
+    F16,
+    F32,
 }
 
 #[derive(ValueEnum, Debug, Clone)]
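A note on the hunk above: the Llama mode mirrors llama.cpp's selection rule, so only 2d `.weight` tensors are quantized, and `output.weight` is always forced to Q6_K regardless of the requested format. A minimal standalone sketch of that predicate, using plain strings and ranks instead of candle tensors (the GGUF-style tensor names are illustrative, not taken from this diff):

// Standalone mimic of the selection rule in QuantizationMode::Llama.
fn should_quantize(name: &str, rank: usize) -> bool {
    name.ends_with(".weight") && rank == 2
}

fn main() {
    let tensors = [
        ("blk.0.attn_q.weight", 2),    // 2d weight: gets the default format
        ("output.weight", 2),          // also quantized, but forced to Q6_K
        ("blk.0.attn_norm.weight", 1), // 1d norm weight: kept as-is
    ];
    for (name, rank) in tensors {
        println!("{name}: quantize = {}", should_quantize(name, rank));
    }
}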
@@ -62,6 +110,10 @@ enum Command {
         /// The quantization schema to apply.
         #[arg(long, value_enum)]
         quantization: Quantization,
+
+        /// Which tensor to quantize.
+        #[arg(long, value_enum, default_value_t = QuantizationMode::Llama)]
+        mode: QuantizationMode,
     },
 }
 
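The new `--mode` flag leans on clap's derive support for enum defaults: when `value_enum` is present, `default_value_t` renders the default through `ValueEnum::to_possible_value`, so no `Display` impl is needed. A self-contained sketch of just that flag, assuming clap 4 with the `derive` feature (the surrounding subcommand structure is omitted):

use clap::{Parser, ValueEnum};

#[derive(ValueEnum, Debug, Clone)]
enum QuantizationMode {
    Llama,
}

#[derive(Parser)]
struct Args {
    /// Which tensors to quantize; defaults to the llama.cpp-compatible rule.
    #[arg(long, value_enum, default_value_t = QuantizationMode::Llama)]
    mode: QuantizationMode,
}

fn main() {
    // Accepts `--mode llama` on the command line; falls back to Llama otherwise.
    let args = Args::parse();
    println!("mode: {:?}", args.mode);
}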
@@ -147,7 +199,7 @@ fn run_ls(file: &std::path::PathBuf, format: Option<Format>, verbose: bool) -> R
         }
         Format::Gguf => {
             let mut file = std::fs::File::open(file)?;
-            let content = candle_core::quantized::gguf_file::Content::read(&mut file)?;
+            let content = gguf_file::Content::read(&mut file)?;
             if verbose {
                 let mut metadata = content.metadata.into_iter().collect::<Vec<_>>();
                 metadata.sort_by(|a, b| a.0.cmp(&b.0));
@@ -170,14 +222,31 @@ fn run_quantize(
     in_file: std::path::PathBuf,
     out_file: std::path::PathBuf,
     q: Quantization,
+    qmode: QuantizationMode,
 ) -> Result<()> {
-    use candle_core::quantized::{gguf_file, k_quants, QTensor};
     // Open the out file early so as to fail directly on missing directories etc.
     let mut out_file = std::fs::File::create(out_file)?;
     let mut in_ = std::fs::File::open(&in_file)?;
     let content = gguf_file::Content::read(&mut in_)?;
     println!("tensors: {}", content.tensor_infos.len());
 
+    let quantize_fn = match q {
+        Quantization::Q4_0 => QTensor::quantize::<k_quants::BlockQ4_0>,
+        Quantization::Q4_1 => QTensor::quantize::<k_quants::BlockQ4_1>,
+        Quantization::Q5_0 => QTensor::quantize::<k_quants::BlockQ5_0>,
+        Quantization::Q5_1 => QTensor::quantize::<k_quants::BlockQ5_1>,
+        Quantization::Q8_0 => QTensor::quantize::<k_quants::BlockQ8_0>,
+        Quantization::Q8_1 => QTensor::quantize::<k_quants::BlockQ8_1>,
+        Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>,
+        Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>,
+        Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>,
+        Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>,
+        Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>,
+        Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>,
+        Quantization::F16 => QTensor::quantize::<half::f16>,
+        Quantization::F32 => QTensor::quantize::<f32>,
+    };
+
     let qtensors = content
         .tensor_infos
         .par_iter()
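The `quantize_fn` match works because `QTensor::quantize::<T>` has the same signature for every block type: the generic parameter only selects the block format, so each turbofished instantiation coerces to the one function-pointer type `fn(&Tensor) -> Result<QTensor>` that `QuantizationMode::quantize` takes as its `default` argument. A minimal sketch of the same pattern in plain Rust (no candle dependency; the block types and bit widths are illustrative):

struct BlockQ4;
struct BlockQ8;

trait Block {
    const BITS: u32;
}
impl Block for BlockQ4 {
    const BITS: u32 = 4;
}
impl Block for BlockQ8 {
    const BITS: u32 = 8;
}

// Same signature for every B: the generic only changes behavior.
fn quantize<B: Block>(xs: &[f32]) -> Result<Vec<i32>, String> {
    let scale = (1u32 << (B::BITS - 1)) as f32;
    Ok(xs.iter().map(|x| (x * scale).round() as i32).collect())
}

fn main() -> Result<(), String> {
    // Selected once, then passed around, like `quantize_fn` in run_quantize.
    let quantize_fn: fn(&[f32]) -> Result<Vec<i32>, String> = match "q8_0" {
        "q4_0" => quantize::<BlockQ4>,
        _ => quantize::<BlockQ8>,
    };
    println!("{:?}", quantize_fn(&[0.1, -0.5, 0.9])?);
    Ok(())
}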
@@ -185,17 +254,7 @@ fn run_quantize(
             println!(" quantizing {name}");
             let mut in_file = std::fs::File::open(&in_file)?;
             let tensor = content.tensor(&mut in_file, name)?;
-            let tensor = tensor.dequantize(&Device::Cpu)?;
-            // TODO: Only quantize the linear weights, and quantize the final layer weights
-            // differently from the rest.
-            let tensor = match q {
-                Quantization::Q2k => QTensor::quantize::<k_quants::BlockQ2K>(&tensor)?,
-                Quantization::Q3k => QTensor::quantize::<k_quants::BlockQ3K>(&tensor)?,
-                Quantization::Q4k => QTensor::quantize::<k_quants::BlockQ4K>(&tensor)?,
-                Quantization::Q5k => QTensor::quantize::<k_quants::BlockQ5K>(&tensor)?,
-                Quantization::Q6k => QTensor::quantize::<k_quants::BlockQ6K>(&tensor)?,
-                Quantization::Q8k => QTensor::quantize::<k_quants::BlockQ8K>(&tensor)?,
-            };
+            let tensor = qmode.quantize(name, tensor, quantize_fn)?;
             Ok((name, tensor))
         })
         .collect::<Result<Vec<_>>>()?;
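Note that the closure reopens the input file per tensor, so each rayon worker reads through its own handle rather than sharing one seek position, and the final `collect::<Result<Vec<_>>>()` turns the first failed tensor into an error for the whole pass. A minimal sketch of that collect pattern, assuming the rayon crate (the data is illustrative):

use rayon::prelude::*;

fn main() -> Result<(), String> {
    let names = vec!["blk.0.attn_q.weight", "blk.0.ffn_up.weight", "output.weight"];
    let processed = names
        .par_iter()
        .map(|name| {
            if name.is_empty() {
                return Err(format!("bad tensor name: {name:?}"));
            }
            Ok((name.to_string(), name.len()))
        })
        // Any Err makes the whole collect return Err.
        .collect::<Result<Vec<_>, String>>()?;
    println!("{processed:?}");
    Ok(())
}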
@@ -233,7 +292,8 @@ fn main() -> anyhow::Result<()> {
             in_file,
             out_file,
             quantization,
-        } => run_quantize(in_file, out_file, quantization)?,
+            mode,
+        } => run_quantize(in_file, out_file, quantization, mode)?,
     }
     Ok(())
 }
@@ -202,6 +202,10 @@ impl QTensor {
         self.data.dtype()
     }
 
+    pub fn rank(&self) -> usize {
+        self.shape.rank()
+    }
+
     pub fn shape(&self) -> &Shape {
         &self.shape
    }
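This last hunk lands on `QTensor` itself, so callers like `QuantizationMode::quantize` can check dimensionality without reaching into the shape. A trivial standalone mimic of the delegation (the types are illustrative stand-ins, not candle's):

struct Shape(Vec<usize>);

impl Shape {
    fn rank(&self) -> usize {
        self.0.len()
    }
}

struct QTensor {
    shape: Shape,
}

impl QTensor {
    // Forwarding accessor, mirroring the new QTensor::rank above.
    fn rank(&self) -> usize {
        self.shape.rank()
    }
}

fn main() {
    let t = QTensor { shape: Shape(vec![4096, 4096]) };
    assert_eq!(t.rank(), 2); // a 2d weight matrix
    println!("rank = {}", t.rank());
}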