Separate quantized phi-3 implementation. (#2157)

* Separate quantized phi-3 implementation.

* Integrate the quantized phi3 model.

* Small fixes, get the generation to work properly.

* Keep the old llama implementation around.

* Change the default.
This commit is contained in:
Laurent Mazare
2024-05-04 10:14:57 +02:00
committed by GitHub
parent 59b18d974e
commit b13a82a438
7 changed files with 323 additions and 12 deletions

View File

@ -24,19 +24,19 @@ pub struct Config {
}
impl Config {
fn head_dim(&self) -> usize {
pub fn head_dim(&self) -> usize {
self.hidden_size / self.num_attention_heads
}
}
#[derive(Debug, Clone)]
struct RotaryEmbedding {
pub struct RotaryEmbedding {
sin: Tensor,
cos: Tensor,
}
impl RotaryEmbedding {
fn new(dtype: DType, cfg: &Config, dev: &Device) -> Result<Self> {
pub fn new(dtype: DType, cfg: &Config, dev: &Device) -> Result<Self> {
let dim = cfg.head_dim();
let max_seq_len = cfg.max_position_embeddings;
let inv_freq: Vec<_> = (0..dim)
@ -55,7 +55,7 @@ impl RotaryEmbedding {
})
}
fn apply_rotary_emb_qkv(
pub fn apply_rotary_emb_qkv(
&self,
q: &Tensor,
k: &Tensor,