From 17313a4226a6c6bde444d28b4be4f0f96d155be7 Mon Sep 17 00:00:00 2001
From: Akshay Ballal <61191840+akshayballal95@users.noreply.github.com>
Date: Sat, 7 Jun 2025 16:02:58 +0200
Subject: [PATCH] Fix cuda memory error for Qwen3 non-quantized (#2987)

* Update KvCache initialization in Qwen3 model to use a fixed max position embedding value of 512

* add doc
---
 candle-transformers/src/models/qwen3.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/candle-transformers/src/models/qwen3.rs b/candle-transformers/src/models/qwen3.rs
index dd90b193..89b0b689 100644
--- a/candle-transformers/src/models/qwen3.rs
+++ b/candle-transformers/src/models/qwen3.rs
@@ -157,7 +157,9 @@ impl Qwen3Attention {
         // Necessary because the hidden_size in the config isn't always accurate
         let hidden_size = head_dim * cfg.num_attention_heads;
 
-        let kv_cache = KvCache::new(2, cfg.max_position_embeddings);
+        // Initialize KV cache with 512 tokens capacity to reduce initial memory allocation.
+        // The cache will grow in chunks of 512 tokens when needed.
+        let kv_cache = KvCache::new(2, 512);
 
         Ok(Self {
             q_proj,
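
A rough sketch of why preallocating the full context can trigger the CUDA out-of-memory error this patch fixes: the numbers below use assumed Qwen3-style shapes (28 layers, 8 KV heads, head_dim 128, bf16, max_position_embeddings 40960) purely for illustration; the real config may differ, but the ratio between a full-context preallocation and an initial 512-token chunk is the point.

```rust
// Back-of-envelope KV-cache sizing, assuming Qwen3-style shapes (illustrative only).
// K and V tensors per layer are [batch=1, num_kv_heads, seq_len, head_dim].
fn kv_cache_bytes(
    seq_len: usize,
    num_layers: usize,
    num_kv_heads: usize,
    head_dim: usize,
    dtype_bytes: usize,
) -> usize {
    2 * num_layers * num_kv_heads * seq_len * head_dim * dtype_bytes
}

fn main() {
    // Assumed values for illustration; not read from the actual Qwen3 config.
    let (num_layers, num_kv_heads, head_dim, dtype_bytes) = (28, 8, 128, 2); // bf16
    let full = kv_cache_bytes(40_960, num_layers, num_kv_heads, head_dim, dtype_bytes);
    let chunk = kv_cache_bytes(512, num_layers, num_kv_heads, head_dim, dtype_bytes);
    println!(
        "full-context preallocation: {:.2} GiB",
        full as f64 / (1u64 << 30) as f64
    );
    println!(
        "512-token initial chunk:    {:.2} MiB",
        chunk as f64 / (1u64 << 20) as f64
    );
}
```

Under these assumed shapes, that is roughly 4.4 GiB allocated up front versus about 56 MiB, which matches the motivation in the added comment: start small and grow the cache in 512-token chunks as decoding proceeds.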