Quantized version of mistral. (#1009)

* Quantized version of mistral.

* Integrate the quantized mistral variant.

* Use the quantized weight files.

* Tweak the quantization command.

* Fix the dtype when computing the rotary embeddings.

* Update the readme with the quantized version.

* Fix the decoding of the remaining tokens.
This commit is contained in:
Laurent Mazare
2023-09-30 19:25:47 +02:00
committed by GitHub
parent 06207332bc
commit deee7612da
7 changed files with 507 additions and 37 deletions

View File

@ -50,8 +50,20 @@ impl TokenOutputStream {
}
}
pub fn decode_rest(&self) -> Result<String> {
self.decode(&self.tokens[self.prev_index..])
/// Decodes the text contributed by the tokens past `current_index`.
///
/// Two strings are decoded from the same starting point `prev_index`: the
/// text for `tokens[prev_index..current_index]` and the text for
/// `tokens[prev_index..]`. The part of the longer string that extends past
/// the shorter one is returned.
///
/// Returns `Ok(None)` when the longer decode is not strictly longer (in
/// bytes) than the shorter one, i.e. there is nothing new to report.
///
/// # Errors
///
/// Propagates any error returned by `self.decode`.
pub fn decode_rest(&self) -> Result<Option<String>> {
    let prev_text = if self.tokens.is_empty() {
        String::new()
    } else {
        self.decode(&self.tokens[self.prev_index..self.current_index])?
    };
    let text = self.decode(&self.tokens[self.prev_index..])?;
    if text.len() <= prev_text.len() {
        return Ok(None);
    }
    // `prev_text.len()` is a byte length taken from a *different* string, so it
    // is not guaranteed to land on a UTF-8 boundary of `text` (for instance when
    // the shorter decode ended in a multi-byte placeholder that the longer decode
    // resolved to a real character). Slicing there would panic, so step back to
    // the closest preceding boundary. Offset 0 is always a boundary, which bounds
    // the loop; when the offset is already valid this is a no-op.
    let mut start = prev_text.len();
    while !text.is_char_boundary(start) {
        start -= 1;
    }
    Ok(Some(text[start..].to_string()))
}
pub fn decode_all(&self) -> Result<String> {