diff --git a/candle-transformers/src/models/qwen3.rs b/candle-transformers/src/models/qwen3.rs
index dd90b193..89b0b689 100644
--- a/candle-transformers/src/models/qwen3.rs
+++ b/candle-transformers/src/models/qwen3.rs
@@ -157,7 +157,9 @@ impl Qwen3Attention {
         // Necessary because the hidden_size in the config isn't always accurate
         let hidden_size = head_dim * cfg.num_attention_heads;
 
-        let kv_cache = KvCache::new(2, cfg.max_position_embeddings);
+        // Initialize KV cache with 512 tokens capacity to reduce initial memory allocation.
+        // The cache will grow in chunks of 512 tokens when needed.
+        let kv_cache = KvCache::new(2, 512);
 
         Ok(Self {
             q_proj,
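For context, here is a minimal sketch of the behavior this change relies on. It assumes the `candle_nn::kv_cache::KvCache` API (`new(dim, max_seq_len)` and `append`) together with the grow-on-demand behavior the new comment describes; the tensor shapes and the `main` harness are illustrative, not taken from `qwen3.rs`.

```rust
use candle_core::{DType, Device, Result, Tensor};
use candle_nn::kv_cache::KvCache;

fn main() -> Result<()> {
    let dev = Device::Cpu;
    // Dim 2 is the sequence axis of a (batch, heads, seq, head_dim) tensor;
    // 512 is the initial capacity, matching the diff above.
    let mut cache = KvCache::new(2, 512);

    // Simulate autoregressive decoding: append one token's worth of keys
    // and values per step. Shapes here (batch=1, heads=8, head_dim=64) are
    // placeholders for illustration only.
    for step in 0..600 {
        let k = Tensor::zeros((1, 8, 1, 64), DType::F32, &dev)?;
        let v = Tensor::zeros((1, 8, 1, 64), DType::F32, &dev)?;
        let (k_all, v_all) = cache.append(&k, &v)?;
        // Past step 512 this only holds if the cache grows beyond its
        // initial capacity, as the diff's comment says it does.
        assert_eq!(k_all.dims()[2], step + 1);
        assert_eq!(v_all.dims()[2], step + 1);
    }
    Ok(())
}
```

The trade-off is allocation timing rather than correctness: sizing the cache by `cfg.max_position_embeddings` reserves the worst case up front, while starting at 512 keeps short generations cheap and defers larger allocations until a sequence actually exceeds the current capacity.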