Get the ggml based llama to generate some text. (#464)

* Add more stats to the ggml example.

* Build a quantized model from the file content.

* Move the tensor retrieval in the main crate.

* Start adding the forward pass.

* Add more to the forward pass of the quantized llama.

* Apply the attention layers.

* Add the sampling loop.

* Get the sampling loop to work.

* Minor tweak.

* Add a quantize/dequantize test.

* Bugfix.

* Add a comment + swap the order.

* Bugfixes.
This commit is contained in:
Laurent Mazare
2023-08-16 12:41:07 +01:00
committed by GitHub
parent fec87e86f5
commit 3071134788
7 changed files with 381 additions and 37 deletions

View File

@ -140,10 +140,6 @@ impl Cache {
}
}
fn silu(xs: &Tensor) -> Result<Tensor> {
xs / (xs.neg()?.exp()? + 1.0)?
}
fn linear(size1: usize, size2: usize, vb: VarBuilder) -> Result<Linear> {
let span = tracing::span!(tracing::Level::TRACE, "linear");
let inner = candle_nn::linear_no_bias(size1, size2, vb)?;
@ -358,7 +354,7 @@ struct Mlp {
impl Mlp {
fn forward(&self, x: &Tensor) -> Result<Tensor> {
let _enter = self.span.enter();
let x = (silu(&self.c_fc1.forward(x)?)? * self.c_fc2.forward(x)?)?;
let x = (candle_nn::ops::silu(&self.c_fc1.forward(x)?)? * self.c_fc2.forward(x)?)?;
self.c_proj.forward(&x)
}