Segment Anything - process images (#766)

* Start processing images.

* Add LayerNorm2d (see the sketch after this list).

* Properly use LayerNorm2d.

* Tweak eps.

* Use LayerNorm on inputs with a rank different from 3.

* Window partitioning (sketched below).

* Fix a couple todos.

* More todos.

* Hard-code the einsums (sketched below).

* More padding support.

* Some size tweaks.

* Use the hub to get the weights (sketched below).

* Use a batch matmul (see the einsum sketch below).

* Tweaks.

* More fixes.

* Get some first predictions generated.
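
LayerNorm2d normalizes across the channel dimension of a (batch, channels, height, width) tensor, which a plain last-dim LayerNorm does not do. A minimal candle sketch of such a layer, assuming conventional weight/bias/eps names rather than this commit's exact code:

```rust
use candle::{Module, Result, Tensor};
use candle_nn::VarBuilder;

#[derive(Debug)]
struct LayerNorm2d {
    weight: Tensor, // per-channel scale
    bias: Tensor,   // per-channel shift
    num_channels: usize,
    eps: f64,
}

impl LayerNorm2d {
    fn new(num_channels: usize, eps: f64, vb: VarBuilder) -> Result<Self> {
        let weight = vb.get(num_channels, "weight")?;
        let bias = vb.get(num_channels, "bias")?;
        Ok(Self { weight, bias, num_channels, eps })
    }
}

impl Module for LayerNorm2d {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        // Normalize across the channel dim (dim 1) of a (b, c, h, w) tensor.
        let mean = xs.mean_keepdim(1)?;
        let xs = xs.broadcast_sub(&mean)?;
        let var = xs.sqr()?.mean_keepdim(1)?;
        let xs = xs.broadcast_div(&(var + self.eps)?.sqrt()?)?;
        xs.broadcast_mul(&self.weight.reshape((1, self.num_channels, 1, 1))?)?
            .broadcast_add(&self.bias.reshape((1, self.num_channels, 1, 1))?)
    }
}
```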
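Window partitioning splits a (batch, height, width, channels) feature map into non-overlapping square windows, first zero-padding height and width up to a multiple of the window size; this is where the extra padding support comes in. A sketch of the idea, simplified rather than copied verbatim from the commit:

```rust
use candle::{Result, Tensor};

/// Split a (b, h, w, c) tensor into (b * n_windows, ws, ws, c) windows,
/// zero-padding h and w up to a multiple of the window size `ws`.
/// Returns the windows and the padded (h, w) so the partition can be undone.
fn window_partition(xs: Tensor, ws: usize) -> Result<(Tensor, (usize, usize))> {
    let (b, h, w, c) = xs.dims4()?;
    let pad_h = (ws - h % ws) % ws;
    let pad_w = (ws - w % ws) % ws;
    let xs = if pad_h > 0 { xs.pad_with_zeros(1, 0, pad_h)? } else { xs };
    let xs = if pad_w > 0 { xs.pad_with_zeros(2, 0, pad_w)? } else { xs };
    let (h_p, w_p) = (h + pad_h, w + pad_w);
    let windows = xs
        .reshape((b, h_p / ws, ws, w_p / ws, ws, c))?
        // Bring the two window-grid dims together: (b, h/ws, w/ws, ws, ws, c).
        .transpose(2, 3)?
        .contiguous()?
        // Merge batch and grid dims into a single leading windows dim.
        .flatten_to(2)?;
    Ok((windows, (h_p, w_p)))
}
```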
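Hard-coding an einsum such as bhwc,hkc->bhwk (the relative-position term in the attention) means spelling it out as explicit transposes, reshapes, and a matmul; candle's matmul batches over leading dimensions, which is also the batch matmul mentioned above. A hedged sketch with illustrative names and shapes:

```rust
use candle::{Result, Tensor};

/// Equivalent of einsum("bhwc,hkc->bhwk", q, rel) via a batched matmul.
/// q: (b, h, w, c), rel: (h, k, c) -> scores: (b, h, w, k).
fn rel_pos_scores(q: &Tensor, rel: &Tensor) -> Result<Tensor> {
    let (b, h, w, c) = q.dims4()?;
    let k = rel.dim(1)?;
    // Make h the batch dim: (h, b * w, c).
    let q = q.transpose(0, 1)?.contiguous()?.reshape((h, b * w, c))?;
    // (h, c, k) so the contraction runs over c.
    let rel_t = rel.transpose(1, 2)?.contiguous()?;
    // candle's matmul batches over the leading h dim: (h, b * w, k).
    let scores = q.matmul(&rel_t)?;
    scores.reshape((h, b, w, k))?.transpose(0, 1)
}
```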
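Getting the weights from the Hugging Face hub can be done with the hf-hub crate, then memory-mapping the safetensors file into a VarBuilder. The repo and file names below are placeholders, not necessarily the ones used here:

```rust
use candle::{DType, Device};
use candle_nn::VarBuilder;

fn main() -> anyhow::Result<()> {
    let api = hf_hub::api::sync::Api::new()?;
    // Placeholder repo and file names for illustration.
    let weights = api
        .model("facebook/sam-vit-base".to_string())
        .get("model.safetensors")?;
    let device = Device::Cpu;
    // Memory-map the safetensors file into a VarBuilder.
    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[weights], DType::F32, &device)? };
    // ... build the model from vb ...
    let _ = vb;
    Ok(())
}
```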
Laurent Mazare, 2023-09-07 19:22:45 +01:00 (committed by GitHub)
parent 7b50f3e106, commit 7396b8ed1a
10 changed files with 303 additions and 105 deletions

@@ -36,7 +36,8 @@ impl Attention {
     fn separate_heads(&self, x: &Tensor) -> Result<Tensor> {
         let (b, n, c) = x.dims3()?;
         x.reshape((b, n, self.num_heads, c / self.num_heads))?
-            .transpose(1, 2)
+            .transpose(1, 2)?
+            .contiguous()
     }
 
     fn recombine_heads(&self, x: &Tensor) -> Result<Tensor> {
@@ -102,8 +103,12 @@ impl TwoWayAttentionBlock {
             2,
             vb.pp("cross_attn_image_to_token"),
         )?;
-        // TODO: use relu in this mlp
-        let mlp = crate::MlpBlock::new(embedding_dim, mlp_dim, vb.pp("mlp"))?;
+        let mlp = crate::MlpBlock::new(
+            embedding_dim,
+            mlp_dim,
+            candle_nn::Activation::Relu,
+            vb.pp("mlp"),
+        )?;
         Ok(Self {
             self_attn,
             norm1,
@@ -126,7 +131,7 @@ impl TwoWayAttentionBlock {
     ) -> Result<(Tensor, Tensor)> {
         // Self attention block
         let queries = if self.skip_first_layer_pe {
-            self.self_attn.forward(queries, keys, queries)?
+            self.self_attn.forward(queries, queries, queries)?
         } else {
             let q = (queries + query_pe)?;
             let attn_out = self.self_attn.forward(&q, &q, queries)?;
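
The second hunk above threads an activation through crate::MlpBlock::new. A minimal sketch of what such a block can look like; the field and layer names (lin1, lin2) are assumptions, not taken from the commit:

```rust
use candle::{Module, Result, Tensor};
use candle_nn::{Activation, Linear, VarBuilder};

#[derive(Debug)]
pub struct MlpBlock {
    lin1: Linear,
    lin2: Linear,
    activation: Activation,
}

impl MlpBlock {
    pub fn new(
        embedding_dim: usize,
        mlp_dim: usize,
        activation: Activation,
        vb: VarBuilder,
    ) -> Result<Self> {
        Ok(Self {
            lin1: candle_nn::linear(embedding_dim, mlp_dim, vb.pp("lin1"))?,
            lin2: candle_nn::linear(mlp_dim, embedding_dim, vb.pp("lin2"))?,
            activation,
        })
    }
}

impl Module for MlpBlock {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        // linear -> activation (Relu here) -> linear
        xs.apply(&self.lin1)?
            .apply(&self.activation)?
            .apply(&self.lin2)
    }
}
```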