diff --git a/candle-transformers/src/models/qwen3.rs b/candle-transformers/src/models/qwen3.rs
index dd90b193..89b0b689 100644
--- a/candle-transformers/src/models/qwen3.rs
+++ b/candle-transformers/src/models/qwen3.rs
@@ -157,7 +157,9 @@ impl Qwen3Attention {
         // Necessary because the hidden_size in the config isn't always accurate
         let hidden_size = head_dim * cfg.num_attention_heads;
 
-        let kv_cache = KvCache::new(2, cfg.max_position_embeddings);
+        // Initialize KV cache with 512 tokens capacity to reduce initial memory allocation.
+        // The cache will grow in chunks of 512 tokens when needed.
+        let kv_cache = KvCache::new(2, 512);
 
         Ok(Self {
             q_proj,
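For context, here is a minimal sketch of the behavior this change relies on. It assumes the `candle_nn::kv_cache::KvCache` API (`new(dim, max_seq_len)` and `append`) together with the grow-on-demand behavior the new comment describes; the tensor shapes and the `main` harness are illustrative, not taken from `qwen3.rs`.

```rust
use candle_core::{DType, Device, Result, Tensor};
use candle_nn::kv_cache::KvCache;

fn main() -> Result<()> {
    let dev = Device::Cpu;
    // Dim 2 is the sequence axis of a (batch, heads, seq, head_dim) tensor;
    // 512 is the initial capacity, matching the diff above.
    let mut cache = KvCache::new(2, 512);

    // Simulate autoregressive decoding: append one token's worth of keys
    // and values per step. Shapes here (batch=1, heads=8, head_dim=64) are
    // placeholders for illustration only.
    for step in 0..600 {
        let k = Tensor::zeros((1, 8, 1, 64), DType::F32, &dev)?;
        let v = Tensor::zeros((1, 8, 1, 64), DType::F32, &dev)?;
        let (k_all, v_all) = cache.append(&k, &v)?;
        // Past step 512 this only holds if the cache grows beyond its
        // initial capacity, as the diff's comment says it does.
        assert_eq!(k_all.dims()[2], step + 1);
        assert_eq!(v_all.dims()[2], step + 1);
    }
    Ok(())
}
```

The trade-off is allocation timing rather than correctness: sizing the cache by `cfg.max_position_embeddings` reserves the worst case up front, while starting at 512 keeps short generations cheap and defers larger allocations until a sequence actually exceeds the current capacity.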