Faster repeat penalty (#1940)

* Avoid the attention mask where possible.

* Faster repeat penalty.
This commit is contained in:
Laurent Mazare
2024-03-26 11:31:20 +01:00
committed by GitHub
parent f5dfe883d7
commit 4523ecfb2a

View File

@@ -3,9 +3,13 @@ use candle::{Result, Tensor};
pub fn apply_repeat_penalty(logits: &Tensor, penalty: f32, context: &[u32]) -> Result<Tensor> {
let device = logits.device();
let mut logits = logits.to_vec1::<f32>()?;
let context: std::collections::HashSet<_> = context.iter().collect();
for (token_id, logit) in logits.iter_mut().enumerate() {
if context.contains(&(token_id as u32)) {
let mut already_seen = std::collections::HashSet::new();
for token_id in context {
if already_seen.contains(token_id) {
continue;
}
already_seen.insert(token_id);
if let Some(logit) = logits.get_mut(*token_id as usize) {
if *logit >= 0. {
*logit /= penalty
} else {