From 17313a4226a6c6bde444d28b4be4f0f96d155be7 Mon Sep 17 00:00:00 2001
From: Akshay Ballal <61191840+akshayballal95@users.noreply.github.com>
Date: Sat, 7 Jun 2025 16:02:58 +0200
Subject: [PATCH] Fix cuda memory error for Qwen3 non-quantized (#2987)

* Update KvCache initialization in Qwen3 model to use a fixed max position embedding value of 512

* add doc
---
 candle-transformers/src/models/qwen3.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/candle-transformers/src/models/qwen3.rs b/candle-transformers/src/models/qwen3.rs
index dd90b193..89b0b689 100644
--- a/candle-transformers/src/models/qwen3.rs
+++ b/candle-transformers/src/models/qwen3.rs
@@ -157,7 +157,9 @@ impl Qwen3Attention {
         // Necessary because the hidden_size in the config isn't always accurate
         let hidden_size = head_dim * cfg.num_attention_heads;
 
-        let kv_cache = KvCache::new(2, cfg.max_position_embeddings);
+        // Initialize KV cache with 512 tokens capacity to reduce initial memory allocation.
+        // The cache will grow in chunks of 512 tokens when needed.
+        let kv_cache = KvCache::new(2, 512);
 
         Ok(Self {
             q_proj,
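
A rough sketch of why preallocating the full context can trigger the CUDA out-of-memory error this patch fixes: the numbers below use assumed Qwen3-style shapes (28 layers, 8 KV heads, head_dim 128, bf16, max_position_embeddings 40960) purely for illustration; the real config may differ, but the ratio between a full-context preallocation and an initial 512-token chunk is the point.

```rust
// Back-of-envelope KV-cache sizing, assuming Qwen3-style shapes (illustrative only).
// K and V tensors per layer are [batch=1, num_kv_heads, seq_len, head_dim].
fn kv_cache_bytes(
    seq_len: usize,
    num_layers: usize,
    num_kv_heads: usize,
    head_dim: usize,
    dtype_bytes: usize,
) -> usize {
    2 * num_layers * num_kv_heads * seq_len * head_dim * dtype_bytes
}

fn main() {
    // Assumed values for illustration; not read from the actual Qwen3 config.
    let (num_layers, num_kv_heads, head_dim, dtype_bytes) = (28, 8, 128, 2); // bf16
    let full = kv_cache_bytes(40_960, num_layers, num_kv_heads, head_dim, dtype_bytes);
    let chunk = kv_cache_bytes(512, num_layers, num_kv_heads, head_dim, dtype_bytes);
    println!(
        "full-context preallocation: {:.2} GiB",
        full as f64 / (1u64 << 30) as f64
    );
    println!(
        "512-token initial chunk:    {:.2} MiB",
        chunk as f64 / (1u64 << 20) as f64
    );
}
```

Under these assumed shapes, that is roughly 4.4 GiB allocated up front versus about 56 MiB, which matches the motivation in the added comment: start small and grow the cache in 512-token chunks as decoding proceeds.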