Use the fast RmsNorm in the quantized model. (#1904)

2025-06-18 19:47:12 +00:00 · 2024-03-21 18:49:35 +01:00
parent 9563a5fee4
commit c0bdd9c7a6
3 changed files with 21 additions and 35 deletions
--- a/candle-transformers/src/models/quantized_mistral.rs
+++ b/candle-transformers/src/models/quantized_mistral.rs
@@ -327,6 +327,7 @@ impl Model {
            xs = layer.forward(&xs, attention_mask.as_ref(), seqlen_offset)?
        }
        xs.narrow(1, seq_len - 1, 1)?
+            .contiguous()?
            .apply(&self.norm)?
            .apply(&self.lm_head)
    }