Fixed Gemma3 model and example (#2917)

* gemma3: changed RotaryEmbedding base freq based on layer and sliding window

* Changed attention mask per layer, either normal or sliding

* made attention mask creation slightly more efficient by only creating them once per model iteration

* changed is_sliding to an Option

* clippy

* changed to stop on both <eos> and <end_of_turn> instead of either or
This commit is contained in:
Kyle Birnbaum
2025-04-24 20:35:08 -07:00
committed by GitHub
parent 82def7ae38
commit 6ff0a6999c
2 changed files with 143 additions and 54 deletions

View File

@ -124,6 +124,17 @@ impl TextGeneration {
Some(token) => token,
None => anyhow::bail!("cannot find the <eos> token"),
};
let eot_token = match self.tokenizer.get_token("<end_of_turn>") {
Some(token) => token,
None => {
println!(
"Warning: <end_of_turn> token not found in tokenizer, using <eos> as a backup"
);
eos_token
}
};
let start_gen = std::time::Instant::now();
for index in 0..sample_len {
let context_size = if index > 0 { 1 } else { tokens.len() };
@ -146,7 +157,7 @@ impl TextGeneration {
let next_token = self.logits_processor.sample(&logits)?;
tokens.push(next_token);
generated_tokens += 1;
if next_token == eos_token {
if next_token == eos_token || next_token == eot_token {
break;
}
if let Some(t) = self.tokenizer.next_token(next_token)? {
@ -350,6 +361,31 @@ fn main() -> Result<()> {
args.repeat_last_n,
&device,
);
pipeline.run(&args.prompt, args.sample_len)?;
let prompt = match args.which {
Which::Base2B
| Which::Base7B
| Which::Instruct2B
| Which::Instruct7B
| Which::InstructV1_1_2B
| Which::InstructV1_1_7B
| Which::CodeBase2B
| Which::CodeBase7B
| Which::CodeInstruct2B
| Which::CodeInstruct7B
| Which::BaseV2_2B
| Which::InstructV2_2B
| Which::BaseV2_9B
| Which::InstructV2_9B
| Which::BaseV3_1B => args.prompt,
Which::InstructV3_1B => {
format!(
"<start_of_turn> user\n{}<end_of_turn>\n<start_of_turn> model\n",
args.prompt
)
}
};
pipeline.run(&prompt, args.sample_len)?;
Ok(())
}