Fix cuda memory error for Qwen3 non-quantized (#2987)

* Update KvCache initialization in Qwen3 model to use a fixed max position embedding value of 512

* add doc
This commit is contained in:
Akshay Ballal
2025-06-07 16:02:58 +02:00
committed by GitHub
parent 0224a749f0
commit 17313a4226

View File

@@ -157,7 +157,9 @@ impl Qwen3Attention {
     // Necessary because the hidden_size in the config isn't always accurate
     let hidden_size = head_dim * cfg.num_attention_heads;
-    let kv_cache = KvCache::new(2, cfg.max_position_embeddings);
+    // Initialize KV cache with 512 tokens capacity to reduce initial memory allocation.
+    // The cache will grow in chunks of 512 tokens when needed.
+    let kv_cache = KvCache::new(2, 512);
     Ok(Self {
         q_proj,