Quantized version of mistral. (#1009)

* Quantized version of mistral.

* Integrate the quantized mistral variant.

* Use the quantized weight files.

* Tweak the quantization command.

* Fix the dtype when computing the rotary embeddings.

* Update the readme with the quantized version.

* Fix the decoding of the remaining tokens.
This commit is contained in:
Laurent Mazare
2023-09-30 19:25:47 +02:00
committed by GitHub
parent 06207332bc
commit deee7612da
7 changed files with 507 additions and 37 deletions

View File

@@ -6,18 +6,18 @@ use std::sync::Arc;
// `Config` as it appears inside this commit's diff hunk.
// NOTE(review): the +/- markers of the diff were lost, so both sides of the
// change are listed back to back inside one struct body; as shown this is not
// a compilable definition.
#[derive(Debug, Clone, PartialEq)]
pub struct Config {
// NOTE(review): first copy of the twelve fields, with no visibility modifier
// (private to the defining module) — presumably the removed, pre-change lines.
vocab_size: usize,
hidden_size: usize,
intermediate_size: usize,
num_hidden_layers: usize,
num_attention_heads: usize,
num_key_value_heads: usize,
hidden_act: Activation,
max_position_embeddings: usize,
rms_norm_eps: f64,
rope_theta: f64,
sliding_window: usize,
use_flash_attn: bool,
// NOTE(review): second copy of the same twelve fields, same names and types,
// now `pub(crate)` (readable anywhere inside the crate) — presumably the
// added, post-change lines. Likely widened so the quantized variant named in
// the commit message can reuse this config; confirm against the full commit.
pub(crate) vocab_size: usize,
pub(crate) hidden_size: usize,
pub(crate) intermediate_size: usize,
pub(crate) num_hidden_layers: usize,
pub(crate) num_attention_heads: usize,
pub(crate) num_key_value_heads: usize,
pub(crate) hidden_act: Activation,
pub(crate) max_position_embeddings: usize,
pub(crate) rms_norm_eps: f64,
pub(crate) rope_theta: f64,
pub(crate) sliding_window: usize,
pub(crate) use_flash_attn: bool,
}
impl Config {