Paligemma siglip vision config (#2518)

* Add the paligemma siglip vision config. * More paligemma configs.
2025-06-16 18:48:51 +00:00 · 2024-09-29 17:53:52 +02:00
parent 3a3c48b14b
commit 0ebb38813b
1 changed files with 54 additions and 0 deletions
--- a/candle-transformers/src/models/siglip.rs
+++ b/candle-transformers/src/models/siglip.rs
@ -83,6 +83,60 @@ impl TransformerConfig for VisionConfig {
    }
 }
 impl VisionConfig {
    /// Builds the vision configuration shared by the PaliGemma 3B presets.
    ///
    /// The presets below differ only in `image_size`; every other field is
    /// identical, so it is defined once here to keep the variants in sync.
    fn paligemma_3b_with_image_size(image_size: usize) -> Self {
        Self {
            patch_size: 14,
            num_attention_heads: 16,
            num_hidden_layers: 27,
            hidden_size: 1152,
            intermediate_size: 4304,
            image_size,
            // Default values.
            num_channels: 3,
            hidden_act: candle_nn::Activation::GeluPytorchTanh,
            layer_norm_eps: 1e-6,
        }
    }

    /// Vision configuration for the PaliGemma 3B preset with 224x224 inputs.
    ///
    /// Reference:
    /// <https://huggingface.co/google/paligemma-3b-pt-224/blob/main/config.json>
    pub fn paligemma_3b_224() -> Self {
        // num_image_tokens: (224 / 14)^2 = 256
        Self::paligemma_3b_with_image_size(224)
    }

    /// Vision configuration for the PaliGemma 3B preset with 448x448 inputs.
    ///
    /// Reference:
    /// <https://huggingface.co/google/paligemma-3b-pt-448/blob/main/config.json>
    pub fn paligemma_3b_448() -> Self {
        // num_image_tokens: (448 / 14)^2 = 1024
        Self::paligemma_3b_with_image_size(448)
    }

    /// Vision configuration for the PaliGemma 3B preset with 896x896 inputs.
    ///
    /// Reference:
    /// <https://huggingface.co/google/paligemma-3b-pt-896/blob/main/config.json>
    pub fn paligemma_3b_896() -> Self {
        // num_image_tokens: (896 / 14)^2 = 4096
        Self::paligemma_3b_with_image_size(896)
    }

    /// Returns `(image_size / patch_size)^2`, computed with integer division.
    ///
    /// # Panics
    ///
    /// Panics if `patch_size` is zero.
    pub fn num_patches(&self) -> usize {
        let patches_per_side = self.image_size / self.patch_size;
        patches_per_side.pow(2)
    }
 }
 // https://github.com/huggingface/transformers/blob/2e24ee4dfa39cc0bc264b89edbccc373c8337086/src/transformers/models/siglip/configuration_siglip.py#L228
 #[derive(serde::Deserialize, Clone, Debug)]
 pub struct Config {