mirror of
https://github.com/huggingface/candle.git
synced 2025-06-16 10:38:54 +00:00
Avoid a contiguous call in the quantized phi 3 model. (#2209)
* Simplify the KvCache api.
* Avoid a contiguous call in the quantized phi3 model.
This commit is contained in:
@ -146,7 +146,7 @@ impl LayerWeights {
|
||||
};
|
||||
let att = candle_nn::ops::softmax_last_dim(&att)?;
|
||||
// Convert to contiguous as matmul doesn't support strided vs for now.
|
||||
att.matmul(&v.contiguous()?)?
|
||||
att.matmul(&v)?
|
||||
};
|
||||
let y = y.transpose(1, 2)?.reshape(&[b_sz, seq_len, n_embd])?;
|
||||
let y = self.attn_output.forward(&y)?;
|
||||
|
Reference in New Issue
Block a user