Mixformer (#929)

* Sketch the mixformer model.

* More modeling code.

* More mixformers.

* MixFormer creation.

* More mixformers.
Author: Laurent Mazare
Date: 2023-09-22 16:17:14 +01:00
Committed by: GitHub
Parent: 19e52e5007
Commit: a46b1b4657
2 changed files with 218 additions and 0 deletions

candle-transformers/src/models/mixformer.rs

@@ -0,0 +1,217 @@
#![allow(unused)]
//! MixFormer model.
//! https://huggingface.co/microsoft/phi-1_5
//! https://arxiv.org/abs/2309.05463
use candle::{DType, Device, Module, Result, Tensor, D};
use candle_nn::{Activation, VarBuilder};
// https://huggingface.co/microsoft/phi-1_5/blob/main/configuration_mixformer_sequential.py
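// The defaults below appear to mirror the upstream configuration defaults;
// note that rotary_dim is min(32, head_dim), with head_dim = n_embd / n_head.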
#[derive(Debug, Clone, PartialEq)]
pub struct Config {
vocab_size: usize,
n_positions: usize,
n_embd: usize,
n_layer: usize,
n_inner: Option<usize>,
n_head: usize,
rotary_dim: usize,
activation_function: Activation,
layer_norm_epsilon: f64,
tie_word_embeddings: bool,
pad_vocab_size_multiple: usize,
}
impl Default for Config {
fn default() -> Self {
Self {
vocab_size: 50304,
n_positions: 2048,
n_embd: 1024,
n_layer: 20,
n_inner: None,
n_head: 16,
rotary_dim: usize::min(32, 1024 / 16),
activation_function: Activation::Gelu,
layer_norm_epsilon: 1e-5,
tie_word_embeddings: false,
pad_vocab_size_multiple: 64,
}
}
}
#[derive(Debug)]
struct Embedding {
wte: candle_nn::Embedding,
}
impl Embedding {
fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
let wte = candle_nn::embedding(cfg.vocab_size, cfg.n_embd, vb.pp("wte"))?;
Ok(Self { wte })
}
}
impl Module for Embedding {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
self.wte.forward(xs)
}
}
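// Placeholder for the rotary position embeddings (RoPE) applied to the
// query/key vectors; not implemented in this first sketch.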
#[derive(Debug)]
struct RotaryEmbedding {}
#[derive(Debug)]
#[allow(clippy::upper_case_acronyms)]
struct MLP {
fc1: candle_nn::Linear,
fc2: candle_nn::Linear,
act: Activation,
}
impl MLP {
fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
let n_inner = cfg.n_inner.unwrap_or(4 * cfg.n_embd);
let fc1 = candle_nn::linear(cfg.n_embd, n_inner, vb.pp("fc1"))?;
let fc2 = candle_nn::linear(n_inner, cfg.n_embd, vb.pp("fc2"))?;
Ok(Self {
fc1,
fc2,
act: cfg.activation_function,
})
}
}
impl Module for MLP {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
xs.apply(&self.fc1)?.apply(&self.act)?.apply(&self.fc2)
}
}
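// Scaffolding for the attention internals (the upstream Python model has
// matching SelfAttention/CrossAttention helpers); not wired into MHA yet.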
#[derive(Debug)]
struct SelfAttention {
causal: bool,
softmax_scale: f64,
}
#[derive(Debug)]
struct CrossAttention {
causal: bool,
softmax_scale: f64,
}
#[derive(Debug)]
struct CausalLMHead {
ln: candle_nn::LayerNorm,
linear: candle_nn::Linear,
}
impl CausalLMHead {
fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
let ln = candle_nn::layer_norm(cfg.n_embd, cfg.layer_norm_epsilon, vb.pp("ln"))?;
let linear = candle_nn::linear(cfg.n_embd, cfg.vocab_size, vb.pp("linear"))?;
Ok(Self { ln, linear })
}
}
impl Module for CausalLMHead {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
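        // Final layer norm and vocabulary projection; the logits are cast to
        // f32 so downstream sampling can run in full precision.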
xs.apply(&self.ln)?
.apply(&self.linear)?
.to_dtype(DType::F32)
}
}
#[derive(Debug)]
#[allow(clippy::upper_case_acronyms)]
struct MHA {
wqkv: candle_nn::Linear,
out_proj: candle_nn::Linear,
head_dim: usize,
}
impl MHA {
fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
let head_dim = cfg.n_embd / cfg.n_head;
let op_size = cfg.n_embd;
let wqkv = candle_nn::linear(cfg.n_embd, 3 * op_size, vb.pp("Wqkv"))?;
let out_proj = candle_nn::linear(op_size, cfg.n_embd, vb.pp("out_proj"))?;
Ok(Self {
wqkv,
out_proj,
head_dim,
})
}
}
impl Module for MHA {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        let (b_size, seq_len, _n_embd) = xs.dims3()?;
        // Packed projection, reshaped to (b_size, seq_len, 3, n_head, head_dim).
        let qkv = self
            .wqkv
            .forward(xs)?
            .reshape((b_size, seq_len, 3, (), self.head_dim))?;
        // TODO: apply the rotary embeddings and scaled dot-product attention.
        // As a shape-correct placeholder, pass the query slice through so the
        // block keeps the (b_size, seq_len, n_embd) output shape.
        let context = qkv.narrow(2, 0, 1)?.squeeze(2)?;
context.flatten_from(D::Minus2)?.apply(&self.out_proj)
}
}
#[derive(Debug)]
struct ParallelBlock {
ln: candle_nn::LayerNorm,
mixer: MHA,
mlp: MLP,
}
impl ParallelBlock {
fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
let ln = candle_nn::layer_norm(cfg.n_embd, cfg.layer_norm_epsilon, vb.pp("ln"))?;
let mixer = MHA::new(cfg, vb.pp("mixer"))?;
let mlp = MLP::new(cfg, vb.pp("mlp"))?;
Ok(Self { ln, mixer, mlp })
}
}
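// Unlike a standard transformer block, the attention and MLP branches both
// read the same layer-normed input, and their outputs are summed with the
// residual in a single parallel step.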
impl Module for ParallelBlock {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
let residual = xs;
let xs = xs.apply(&self.ln)?;
let attn_outputs = self.mixer.forward(&xs)?;
let feed_forward_hidden_states = self.mlp.forward(&xs)?;
attn_outputs + feed_forward_hidden_states + residual
}
}
#[derive(Debug)]
pub struct MixFormerSequentialForCausalLM {
embedding: Embedding,
blocks: Vec<ParallelBlock>,
head: CausalLMHead,
}
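// The weights are stored as a single "layers" sequence: index 0 holds the
// embedding, indices 1..=n_layer the parallel blocks, and index n_layer + 1
// the causal LM head; hence the vb.pp(...) indices below.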
impl MixFormerSequentialForCausalLM {
pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
let vb = vb.pp("layers");
let embedding = Embedding::new(cfg, vb.pp(0))?;
let mut blocks = Vec::new();
for i in 0..cfg.n_layer {
let block = ParallelBlock::new(cfg, vb.pp(i + 1))?;
blocks.push(block)
}
let head = CausalLMHead::new(cfg, vb.pp(cfg.n_layer + 1))?;
Ok(Self {
embedding,
blocks,
head,
})
}
}
impl Module for MixFormerSequentialForCausalLM {
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
let mut xs = xs.apply(&self.embedding)?;
for block in self.blocks.iter() {
xs = block.forward(&xs)?
}
xs.apply(&self.head)
}
}

candle-transformers/src/models/mod.rs

@@ -4,6 +4,7 @@ pub mod dinov2;
pub mod efficientnet;
pub mod falcon;
pub mod llama;
pub mod mixformer;
pub mod quantized_llama;
pub mod quantized_t5;
pub mod segment_anything;
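
As a quick sanity check of the wiring above, here is a minimal sketch that builds the model from zero-initialized weights and runs a single forward pass. It assumes the crate exposes the new file as candle_transformers::models::mixformer (per the mod.rs change above) and that the attention placeholder keeps the forward pass shape-correct; real use would load the phi-1_5 weights into the VarBuilder rather than a fresh VarMap.

use candle::{DType, Device, Module, Result, Tensor};
use candle_nn::{VarBuilder, VarMap};
use candle_transformers::models::mixformer::{Config, MixFormerSequentialForCausalLM};

fn main() -> Result<()> {
    let device = Device::Cpu;
    // Zero-initialized weights: enough to exercise shapes, not a real model.
    let varmap = VarMap::new();
    let vb = VarBuilder::from_varmap(&varmap, DType::F32, &device);
    let model = MixFormerSequentialForCausalLM::new(&Config::default(), vb)?;
    // A batch holding one sequence of five token ids.
    let tokens = Tensor::new(&[[0u32, 1, 2, 3, 4]], &device)?;
    let logits = model.forward(&tokens)?;
    println!("{:?}", logits.dims()); // expected: [1, 5, 50304]
    Ok(())
}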