Include topk sampling in the quantized example. (#2005)
* Include topk sampling in the quantized example.
* Also sample with top-k on the mistral side.
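Note: both examples move from `LogitsProcessor::new(seed, temp, top_p)` to `LogitsProcessor::from_sampling(seed, sampling)`, with the `Sampling` enum selecting the decoding strategy. A minimal sketch of the new API in isolation (the toy logits and parameter values are made up for illustration):

```rust
use candle::{Device, Tensor};
use candle_transformers::generation::{LogitsProcessor, Sampling};

fn main() -> candle::Result<()> {
    // Toy logits standing in for a model's output over a 5-token vocabulary.
    let logits = Tensor::new(&[0.1f32, 2.0, 0.3, 1.5, 0.2], &Device::Cpu)?;
    // Keep the 2 most likely tokens, then sample among them at temperature 0.8.
    let sampling = Sampling::TopK { k: 2, temperature: 0.8 };
    let mut processor = LogitsProcessor::from_sampling(299792458, sampling);
    let token = processor.sample(&logits)?;
    println!("sampled token id: {token}");
    Ok(())
}
```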
@@ -13,7 +13,7 @@ use candle_transformers::models::quantized_mistral::Model as QMistral;
 use candle::{DType, Device, Tensor};
 use candle_examples::token_output_stream::TokenOutputStream;
 use candle_nn::VarBuilder;
-use candle_transformers::generation::LogitsProcessor;
+use candle_transformers::generation::{LogitsProcessor, Sampling};
 use hf_hub::{api::sync::Api, Repo, RepoType};
 use tokenizers::Tokenizer;
@@ -39,11 +39,26 @@ impl TextGeneration {
         seed: u64,
         temp: Option<f64>,
         top_p: Option<f64>,
+        top_k: Option<usize>,
         repeat_penalty: f32,
         repeat_last_n: usize,
         device: &Device,
     ) -> Self {
-        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
+        let logits_processor = {
+            let temperature = temp.unwrap_or(0.);
+            let sampling = if temperature <= 0. {
+                Sampling::ArgMax
+            } else {
+                match (top_k, top_p) {
+                    (None, None) => Sampling::All { temperature },
+                    (Some(k), None) => Sampling::TopK { k, temperature },
+                    (None, Some(p)) => Sampling::TopP { p, temperature },
+                    (Some(k), Some(p)) => Sampling::TopKThenTopP { k, p, temperature },
+                }
+            };
+            LogitsProcessor::from_sampling(seed, sampling)
+        };
+
         Self {
             model,
             tokenizer: TokenOutputStream::new(tokenizer),
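For intuition, this is the effect `Sampling::TopK` has on the logits. The sketch below is a standalone illustration of the top-k idea, not candle's actual implementation (it assumes `k >= 1` and keeps ties at the cutoff):

```rust
// Keep the k largest logits, mask the rest, and softmax the survivors.
fn top_k_probs(logits: &[f32], k: usize, temperature: f32) -> Vec<f32> {
    let mut sorted: Vec<f32> = logits.to_vec();
    sorted.sort_by(|a, b| b.total_cmp(a));
    // The k-th largest logit; everything strictly below it is masked out.
    let cutoff = sorted[k.min(sorted.len()) - 1];
    // Scale by temperature and mask everything below the cutoff.
    let scaled: Vec<f32> = logits
        .iter()
        .map(|&l| if l >= cutoff { l / temperature } else { f32::NEG_INFINITY })
        .collect();
    // Numerically stable softmax; exp(-inf) = 0, so masked logits get zero mass.
    let max = scaled.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = scaled.iter().map(|&l| (l - max).exp()).collect();
    let sum: f32 = exps.iter().sum();
    exps.iter().map(|&e| e / sum).collect()
}

fn main() {
    // With k = 2, only the two largest logits keep non-zero probability.
    println!("{:?}", top_k_probs(&[0.1, 2.0, 0.3, 1.5, 0.2], 2, 0.8));
}
```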
@@ -159,6 +174,10 @@ struct Args {
     #[arg(long)]
     top_p: Option<f64>,

+    /// Only sample among the top K samples.
+    #[arg(long)]
+    top_k: Option<usize>,
+
     /// The seed to use when generating random samples.
     #[arg(long, default_value_t = 299792458)]
     seed: u64,
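clap's derive API turns the `top_k: Option<usize>` field into an optional `--top-k <TOP_K>` flag; omitting it leaves `top_k` as `None`, so the sampler falls through to the other variants. A self-contained sketch of the same pattern:

```rust
use clap::Parser;

#[derive(Parser, Debug)]
struct Args {
    /// Only sample among the top K samples.
    #[arg(long)]
    top_k: Option<usize>,
}

fn main() {
    // `--top-k 40` yields Some(40); omitting the flag yields None.
    let args = Args::parse();
    println!("top_k = {:?}", args.top_k);
}
```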
@@ -314,6 +333,7 @@ fn main() -> Result<()> {
         args.seed,
         args.temperature,
         args.top_p,
+        args.top_k,
         args.repeat_penalty,
         args.repeat_last_n,
         &device,
@@ -10,7 +10,7 @@ use tokenizers::Tokenizer;

 use candle::quantized::{ggml_file, gguf_file};
 use candle::Tensor;
-use candle_transformers::generation::LogitsProcessor;
+use candle_transformers::generation::{LogitsProcessor, Sampling};

 use candle_examples::token_output_stream::TokenOutputStream;
 use candle_transformers::models::quantized_llama as model;
@@ -200,6 +200,10 @@ struct Args {
     #[arg(long)]
     top_p: Option<f64>,

+    /// Only sample among the top K samples.
+    #[arg(long)]
+    top_k: Option<usize>,
+
     /// The seed to use when generating random samples.
     #[arg(long, default_value_t = 299792458)]
     seed: u64,
@@ -349,11 +353,6 @@ fn main() -> anyhow::Result<()> {
     #[cfg(feature = "cuda")]
     candle::quantized::cuda::set_force_dmmv(args.force_dmmv);

-    let temperature = if args.temperature == 0. {
-        None
-    } else {
-        Some(args.temperature)
-    };
     let _guard = if args.tracing {
         let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
         tracing_subscriber::registry().with(chrome_layer).init();
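The removed block mapped a zero temperature to `None` so that the old `LogitsProcessor::new` would fall back to argmax. That conversion is now dead code: the construction in the next hunk (and its twin in the mistral example above) routes any non-positive temperature straight to `Sampling::ArgMax`. A sketch of that control flow as a hypothetical shared helper, which would also remove the duplication between the two examples:

```rust
use candle_transformers::generation::Sampling;

// Hypothetical helper mirroring the new control flow: a non-positive
// temperature means greedy decoding, anything else samples with the
// strategy implied by the top-k / top-p flags.
fn sampling_for(temperature: f64, top_k: Option<usize>, top_p: Option<f64>) -> Sampling {
    if temperature <= 0. {
        Sampling::ArgMax
    } else {
        match (top_k, top_p) {
            (None, None) => Sampling::All { temperature },
            (Some(k), None) => Sampling::TopK { k, temperature },
            (None, Some(p)) => Sampling::TopP { p, temperature },
            (Some(k), Some(p)) => Sampling::TopKThenTopP { k, p, temperature },
        }
    }
}

fn main() {
    // Temperature 0 routes to greedy decoding regardless of the other flags.
    assert!(matches!(sampling_for(0., Some(40), None), Sampling::ArgMax));
}
```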
@@ -500,7 +499,20 @@ fn main() -> anyhow::Result<()> {
         prompt_tokens
     };
     let mut all_tokens = vec![];
-    let mut logits_processor = LogitsProcessor::new(args.seed, temperature, args.top_p);
+    let mut logits_processor = {
+        let temperature = args.temperature;
+        let sampling = if temperature <= 0. {
+            Sampling::ArgMax
+        } else {
+            match (args.top_k, args.top_p) {
+                (None, None) => Sampling::All { temperature },
+                (Some(k), None) => Sampling::TopK { k, temperature },
+                (None, Some(p)) => Sampling::TopP { p, temperature },
+                (Some(k), Some(p)) => Sampling::TopKThenTopP { k, p, temperature },
+            }
+        };
+        LogitsProcessor::from_sampling(args.seed, sampling)
+    };

     let start_prompt_processing = std::time::Instant::now();
     let mut next_token = if !args.split_prompt {
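The one variant the diff uses without spelling out is `Sampling::TopKThenTopP`: restrict to the k most likely tokens first, then apply nucleus (top-p) filtering to the survivors. A rough standalone sketch of that composition, again not candle's internals (a real sampler would renormalize the kept probabilities before drawing):

```rust
// Keep the k most likely candidates, then the shortest prefix of those whose
// cumulative probability reaches p. `probs` pairs token ids with probabilities.
fn top_k_then_top_p(mut probs: Vec<(usize, f32)>, k: usize, p: f32) -> Vec<(usize, f32)> {
    probs.sort_by(|a, b| b.1.total_cmp(&a.1));
    probs.truncate(k); // top-k
    let mut cum = 0.0;
    let mut keep = probs.len();
    for (i, &(_, q)) in probs.iter().enumerate() {
        cum += q;
        if cum >= p {
            keep = i + 1;
            break;
        }
    }
    probs.truncate(keep); // top-p on the survivors
    probs
}

fn main() {
    let probs = vec![(0, 0.5), (1, 0.3), (2, 0.15), (3, 0.05)];
    // k = 3 keeps ids 0..=2; p = 0.7 then keeps ids 0 and 1 (0.5 + 0.3 >= 0.7).
    println!("{:?}", top_k_then_top_p(probs, 3, 0.7));
}
```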