mirror of
https://github.com/huggingface/candle.git
synced 2025-06-19 03:54:56 +00:00
GQA support in the quantized model. (#555)
* GQA support in the quantized model.
* Fix the reshaping.
* Fix the main llama model.
* Infer the proper GQA setting from the model kind.
This commit is contained in:
@@ -291,7 +291,7 @@ impl CausalSelfAttention {
         let x = x
             .unsqueeze(2)?
             .expand((b_sz, n_kv_head, n_rep, seq_len, head_dim))?
-            .reshape((b_sz, n_kv_head, n_rep, seq_len, head_dim))?;
+            .reshape((b_sz, n_kv_head * n_rep, seq_len, head_dim))?;
         Ok(x)
     }
 }
|
Reference in New Issue
Block a user