Mirror of https://github.com/huggingface/candle.git, synced 2025-06-14 01:48:08 +00:00
Fix CUDA memory error for Qwen3 non-quantized (#2987)
* Update KvCache initialization in the Qwen3 model to use a fixed initial capacity of 512 tokens instead of `cfg.max_position_embeddings`
* Add documentation comments
@@ -157,7 +157,9 @@ impl Qwen3Attention {
         // Necessary because the hidden_size in the config isn't always accurate
         let hidden_size = head_dim * cfg.num_attention_heads;

-        let kv_cache = KvCache::new(2, cfg.max_position_embeddings);
+        // Initialize KV cache with 512 tokens capacity to reduce initial memory allocation.
+        // The cache will grow in chunks of 512 tokens when needed.
+        let kv_cache = KvCache::new(2, 512);

         Ok(Self {
             q_proj,
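For intuition, here is a minimal sketch of the chunked-growth strategy the new comment describes: allocate one 512-token chunk up front and extend the buffers by a whole chunk only when they fill up. The sketch uses plain `Vec<f32>` buffers instead of candle tensors, and the `GrowableKvCache` type with its `new`/`append` methods is hypothetical, illustrating the idea rather than candle's actual `KvCache` API.

// Illustrative only: a KV cache that starts at one chunk and grows in
// fixed-size chunks, rather than pre-allocating max_position_embeddings
// tokens. Not candle's real KvCache implementation.
const CHUNK: usize = 512;

struct GrowableKvCache {
    // Flattened [capacity, head_dim] buffers for keys and values.
    keys: Vec<f32>,
    values: Vec<f32>,
    head_dim: usize,
    len: usize,      // tokens currently stored
    capacity: usize, // allocated capacity in tokens
}

impl GrowableKvCache {
    fn new(head_dim: usize) -> Self {
        // Allocate a single chunk up front; the old code effectively
        // reserved max_position_embeddings tokens here, which is what
        // triggered the excessive CUDA allocation.
        Self {
            keys: vec![0.0; CHUNK * head_dim],
            values: vec![0.0; CHUNK * head_dim],
            head_dim,
            len: 0,
            capacity: CHUNK,
        }
    }

    fn append(&mut self, k: &[f32], v: &[f32]) {
        assert_eq!(k.len(), self.head_dim);
        assert_eq!(v.len(), self.head_dim);
        // Grow by a whole chunk only when the current buffer is full.
        if self.len == self.capacity {
            self.capacity += CHUNK;
            self.keys.resize(self.capacity * self.head_dim, 0.0);
            self.values.resize(self.capacity * self.head_dim, 0.0);
        }
        let off = self.len * self.head_dim;
        self.keys[off..off + self.head_dim].copy_from_slice(k);
        self.values[off..off + self.head_dim].copy_from_slice(v);
        self.len += 1;
    }
}

fn main() {
    let mut cache = GrowableKvCache::new(8);
    // Appending 600 tokens triggers exactly one growth step (512 -> 1024).
    for _ in 0..600 {
        cache.append(&[0.0; 8], &[0.0; 8]);
    }
    assert_eq!(cache.len, 600);
    assert_eq!(cache.capacity, 1024);
    println!("len = {}, capacity = {}", cache.len, cache.capacity);
}

The trade-off is the usual one for growable buffers: a model configured with a very large `max_position_embeddings` no longer pays for that worst case at construction time, at the cost of occasional reallocations during long generations.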