mirror of https://github.com/huggingface/candle.git
Generic implementation of vecdot for q80. (#596)
* Generic implementation of vecdot for q80.
* Add support for code-llama 7b.
* Support more code-llama.
@@ -421,8 +421,24 @@ impl GgmlType for BlockQ8_0 {
         Ok(())
     }
 
-    fn vec_dot(_: usize, _: &[Self], _: &[Self::VecDotType]) -> Result<f32> {
-        todo!()
+    fn vec_dot(n: usize, xs: &[Self], ys: &[Self::VecDotType]) -> Result<f32> {
+        let qk = QK8_0;
+        if n % QK8_0 != 0 {
+            crate::bail!("vec_dot_q8_0_q8_0: {n} is not divisible by {qk}")
+        }
+
+        // Generic implementation.
+        let mut sumf = 0f32;
+        for (xs, ys) in xs.iter().zip(ys.iter()) {
+            let sum_i = xs
+                .qs
+                .iter()
+                .zip(ys.qs.iter())
+                .map(|(&x, &y)| x as i32 * y as i32)
+                .sum::<i32>();
+            sumf += sum_i as f32 * f16::to_f32(xs.d) * f16::to_f32(ys.d)
+        }
+        Ok(sumf)
     }
 }
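For context: a Q8_0 block stores QK8_0 = 32 signed 8-bit quants plus a single f16 scale d, so the dot product of two quantized rows reduces, block by block, to an integer dot product scaled by both block scales. Below is a minimal standalone sketch of the same arithmetic; the simplified Block struct is illustrative only, the real BlockQ8_0 lives in candle's quantized module.

    use half::f16;

    const QK8_0: usize = 32; // quantized values per block

    struct Block {
        d: f16,          // per-block scale
        qs: [i8; QK8_0], // quants; the original value is roughly d * q
    }

    fn vec_dot(xs: &[Block], ys: &[Block]) -> f32 {
        let mut sumf = 0f32;
        for (x, y) in xs.iter().zip(ys.iter()) {
            // sum((dx*qx_i) * (dy*qy_i)) = dx * dy * sum(qx_i * qy_i),
            // so both scales are applied once per block after the integer
            // accumulation. 32 products of two i8s cannot overflow an i32.
            let sum_i: i32 = x
                .qs
                .iter()
                .zip(y.qs.iter())
                .map(|(&qx, &qy)| qx as i32 * qy as i32)
                .sum();
            sumf += sum_i as f32 * x.d.to_f32() * y.d.to_f32();
        }
        sumf
    }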
@@ -195,10 +195,10 @@ impl WeightMap {
     }
 }
 
-fn precomput_freqs_cis(head_dim: usize) -> Result<(Tensor, Tensor)> {
+fn precomput_freqs_cis(head_dim: usize, freq_base: f32) -> Result<(Tensor, Tensor)> {
     let theta: Vec<_> = (0..head_dim)
         .step_by(2)
-        .map(|i| 1f32 / 10000f32.powf(i as f32 / head_dim as f32))
+        .map(|i| 1f32 / freq_base.powf(i as f32 / head_dim as f32))
         .collect();
     let theta = Tensor::new(theta.as_slice(), &Device::Cpu)?;
     let idx_theta = Tensor::arange(0, MAX_SEQ_LEN as u32, &Device::Cpu)?
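Threading freq_base through matters because Code Llama checkpoints ship with a much larger RoPE base (1e6 rather than Llama 2's 1e4), which slows the per-dimension rotation frequencies for long-context use. A plain-Rust sketch of the frequency computation above, stripped of the tensor machinery:

    // For a head of dimension d, the pair starting at even index i rotates
    // with frequency 1 / freq_base^(i/d); freq_base = 10000.0 reproduces
    // the previously hard-coded behaviour.
    fn rope_frequencies(head_dim: usize, freq_base: f32) -> Vec<f32> {
        (0..head_dim)
            .step_by(2)
            .map(|i| 1f32 / freq_base.powf(i as f32 / head_dim as f32))
            .collect()
    }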
@@ -214,7 +214,7 @@ impl ModelWeights {
     fn from_ggml(mut ct: ggml_file::Content, gqa: usize) -> Result<Self> {
         let cpu = &Device::Cpu;
         let head_dim = (ct.hparams.n_embd / ct.hparams.n_head) as usize;
-        let (cos, sin) = precomput_freqs_cis(head_dim)?;
+        let (cos, sin) = precomput_freqs_cis(head_dim, 10000.)?;
         let tok_embeddings = ct.remove("tok_embeddings.weight")?;
         let tok_embeddings = tok_embeddings.dequantize(cpu)?;
         let norm = RmsNorm::new(ct.remove("norm.weight")?, 1e-5)?;
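The GGML container carries no rope metadata, so the old default base of 10000 is now passed explicitly at this call site; only the GGUF path below can read the value from the file itself.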
@@ -287,7 +287,10 @@ impl ModelWeights {
         // Strangely this value is generally 1e-6 in GGUF file but used to be 1e-5 by default.
         let rms_norm_eps = md_get("llama.attention.layer_norm_rms_epsilon")?.to_f32()?;
 
-        let (cos, sin) = precomput_freqs_cis(rope_dim)?;
+        let rope_freq_base = md_get("llama.rope.freq_base")
+            .and_then(|m| m.to_f32())
+            .unwrap_or(10000f32);
+        let (cos, sin) = precomput_freqs_cis(rope_dim, rope_freq_base)?;
 
         let tok_embeddings = ct.tensor(reader, "token_embd.weight")?;
         let tok_embeddings = tok_embeddings.dequantize(cpu)?;
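The and_then/unwrap_or chain keeps older GGUF files, which predate the llama.rope.freq_base key, loading with Llama 2's historical default. The same fallback shape with a plain map standing in for the GGUF metadata (a hypothetical helper, for illustration only):

    use std::collections::HashMap;

    fn rope_freq_base(metadata: &HashMap<String, f32>) -> f32 {
        metadata
            .get("llama.rope.freq_base")
            .copied()
            // Key absent in older exports: fall back to Llama 2's base.
            .unwrap_or(10000f32)
    }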
@@ -399,6 +402,12 @@ enum Which {
     L13bChat,
     #[value(name = "70b-chat")]
     L70bChat,
+    #[value(name = "7b-code")]
+    L7bCode,
+    #[value(name = "13b-code")]
+    L13bCode,
+    #[value(name = "34b-code")]
+    L34bCode,
 }
 
 #[derive(Parser, Debug)]
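With these variants registered, clap accepts e.g. --which 7b-code on the command line and maps it to the matching repo/file pair in the hunk below.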
@@ -486,6 +495,9 @@ impl Args {
                 "TheBloke/Llama-2-70B-Chat-GGML",
                 "llama-2-70b-chat.ggmlv3.q4_0.bin",
             ),
+            Which::L7bCode => ("TheBloke/CodeLlama-7B-GGUF", "codellama-7b.Q8_0.gguf"),
+            Which::L13bCode => ("TheBloke/CodeLlama-13B-GGUF", "codellama-13b.Q8_0.gguf"),
+            Which::L34bCode => ("TheBloke/CodeLlama-34B-GGUF", "codellama-34b.Q8_0.gguf"),
         };
         let api = hf_hub::api::sync::Api::new()?;
         let api = api.model(repo.to_string());
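Note that the code-llama entries all point at Q8_0 GGUF files, which is presumably why this commit also fills in the generic vec_dot for BlockQ8_0 above: loading these weights would otherwise hit the old todo!() wherever no specialised kernel exists.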
@@ -607,7 +619,13 @@ fn main() -> anyhow::Result<()> {
         );
         println!("params: {:?}", model.hparams);
         let default_gqa = match args.which {
-            Which::L7b | Which::L13b | Which::L7bChat | Which::L13bChat => 1,
+            Which::L7b
+            | Which::L13b
+            | Which::L7bChat
+            | Which::L13bChat
+            | Which::L7bCode
+            | Which::L13bCode
+            | Which::L34bCode => 1,
             Which::L70b | Which::L70bChat => 8,
         };
         ModelWeights::from_ggml(model, args.gqa.unwrap_or(default_gqa))?
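For reference, gqa is the grouped-query-attention factor used by the GGML loader: n_head query heads share n_head / gqa key/value heads, so Llama 2 70B (gqa = 8) runs 64 query heads against 8 shared KV heads, while gqa = 1 is plain multi-head attention. The code-llama arms here mostly keep the match exhaustive, since those entries load GGUF files whose metadata carries the head counts directly. A small sketch of the relationship:

    // Each group of gqa query heads shares one key/value head.
    fn kv_heads(n_head: usize, gqa: usize) -> usize {
        assert!(n_head % gqa == 0, "n_head must be divisible by gqa");
        n_head / gqa // e.g. kv_heads(64, 8) == 8 for Llama 2 70B
    }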