Add an option to split the prompt. (#1766)

2025-06-16 10:38:54 +00:00 · 2024-02-27 11:24:11 +01:00
parent badf886583
commit 32544a2ad6
1 changed file with 14 additions and 1 deletion
--- a/candle-examples/examples/quantized/main.rs
+++ b/candle-examples/examples/quantized/main.rs
@@ -212,6 +212,10 @@ struct Args {
    #[arg(long)]
    verbose_prompt: bool,

+    /// Process prompt elements separately.
+    #[arg(long)]
+    split_prompt: bool,
+
    /// Penalty to be applied for repeating tokens, 1. means no penalty.
    #[arg(long, default_value_t = 1.1)]
    repeat_penalty: f32,
@@ -487,11 +491,20 @@ fn main() -> anyhow::Result<()> {
        let mut logits_processor = LogitsProcessor::new(args.seed, temperature, args.top_p);

        let start_prompt_processing = std::time::Instant::now();
-        let mut next_token = {
+        let mut next_token = if !args.split_prompt {
            let input = Tensor::new(prompt_tokens.as_slice(), &device)?.unsqueeze(0)?;
            let logits = model.forward(&input, 0)?;
            let logits = logits.squeeze(0)?;
            logits_processor.sample(&logits)?
+        } else {
+            let mut next_token = 0;
+            for (pos, token) in prompt_tokens.iter().enumerate() {
+                let input = Tensor::new(&[*token], &device)?.unsqueeze(0)?;
+                let logits = model.forward(&input, pos)?;
+                let logits = logits.squeeze(0)?;
+                next_token = logits_processor.sample(&logits)?
+            }
+            next_token
        };
        let prompt_dt = start_prompt_processing.elapsed();
        all_tokens.push(next_token);