Fix cuda memory error for Qwen3 non-quantized (#2987)

* Update KvCache initialization in Qwen3 model to use a fixed max position embedding value of 512

* add doc
This commit is contained in:
Akshay Ballal
2025-06-07 16:02:58 +02:00
committed by GitHub
parent 0224a749f0
commit 17313a4226

View File

@@ -157,7 +157,9 @@ impl Qwen3Attention {
     // Necessary because the hidden_size in the config isn't always accurate
     let hidden_size = head_dim * cfg.num_attention_heads;
-    let kv_cache = KvCache::new(2, cfg.max_position_embeddings);
+    // Initialize KV cache with 512 tokens capacity to reduce initial memory allocation.
+    // The cache will grow in chunks of 512 tokens when needed.
+    let kv_cache = KvCache::new(2, 512);
     Ok(Self {
         q_proj,