Add a stable diffusion example (#328)

* Start adding a stable-diffusion example.

* Proper computation of the causal mask (see the sketch after this list).

* Add the chunk operation.

* Work in progress: port the attention module.

* Add some dummy modules for conv2d and group-norm, get the attention module to compile.

* Re-enable the 2d convolution.

* Add the embeddings module.

* Add the resnet module.

* Add the unet blocks.

* Add the unet.

* And add the variational auto-encoder.

* Use the pad function from utils.
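
One of the steps above is the causal mask used by the text-encoder attention: position i may only attend to positions j <= i, which is enforced by adding a mask of zeros (allowed) and very large negative values (blocked) to the attention scores before the softmax. The sketch below shows one possible construction, assuming candle's Tensor::from_slice constructor; the helper name causal_mask is made up here and this is not necessarily how the example in this PR builds it.

use candle::{Device, Result, Tensor};

// Hypothetical helper: builds a (seq_len, seq_len) mask with 0 on and below the
// diagonal and a very large negative value above it, so that future positions
// get (close to) zero weight after the softmax.
fn causal_mask(seq_len: usize, device: &Device) -> Result<Tensor> {
    let data: Vec<f32> = (0..seq_len)
        .flat_map(|i| (0..seq_len).map(move |j| if j > i { f32::MIN } else { 0.0 }))
        .collect();
    Tensor::from_slice(&data, (seq_len, seq_len), device)
}
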
Laurent Mazare, 2023-08-06 18:49:43 +02:00 (committed by GitHub)
parent 93cfe5642f
commit d34039e352
14 changed files with 2722 additions and 1 deletion

@@ -48,3 +48,84 @@ impl Conv1d {
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Conv2dConfig {
pub padding: usize,
pub stride: usize,
}
impl Default for Conv2dConfig {
fn default() -> Self {
Self {
padding: 0,
stride: 1,
}
}
}
#[allow(dead_code)]
#[derive(Debug)]
pub struct Conv2d {
weight: Tensor,
bias: Option<Tensor>,
config: Conv2dConfig,
}
impl Conv2d {
pub fn new(weight: Tensor, bias: Option<Tensor>, config: Conv2dConfig) -> Self {
Self {
weight,
bias,
config,
}
}
pub fn config(&self) -> &Conv2dConfig {
&self.config
}
pub fn forward(&self, _x: &Tensor) -> Result<Tensor> {
todo!()
}
}
pub fn conv1d(
in_channels: usize,
out_channels: usize,
kernel_size: usize,
cfg: Conv1dConfig,
vs: crate::VarBuilder,
) -> Result<Conv1d> {
let init_ws = crate::init::DEFAULT_KAIMING_NORMAL;
let ws = vs.get_or_init((out_channels, in_channels, kernel_size), "weight", init_ws)?;
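    // The bias is initialized uniformly in [-bound, bound] with bound = 1/sqrt(in_channels);
    // the conv2d builder below uses the same scheme.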
let bound = 1. / (in_channels as f64).sqrt();
let init_bs = crate::Init::Uniform {
lo: -bound,
up: bound,
};
let bs = vs.get_or_init(out_channels, "bias", init_bs)?;
Ok(Conv1d::new(ws, Some(bs), cfg))
}
pub fn conv2d(
in_channels: usize,
out_channels: usize,
kernel_size: usize,
cfg: Conv2dConfig,
vs: crate::VarBuilder,
) -> Result<Conv2d> {
let init_ws = crate::init::DEFAULT_KAIMING_NORMAL;
let ws = vs.get_or_init(
(out_channels, in_channels, kernel_size, kernel_size),
"weight",
init_ws,
)?;
let bound = 1. / (in_channels as f64).sqrt();
let init_bs = crate::Init::Uniform {
lo: -bound,
up: bound,
};
let bs = vs.get_or_init(out_channels, "bias", init_bs)?;
Ok(Conv2d::new(ws, Some(bs), cfg))
}
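
Conv2d::forward is still a todo!() stub in this commit, so the struct mostly carries weights in the (out_channels, in_channels, kernel, kernel) layout expected by the builder above. As a hedged illustration of how a layer could be wired up directly (a sketch, not code from this PR; Tensor::zeros, DType::F32 and Device are assumed from the candle core crate):

use candle::{DType, Device, Result, Tensor};

// Hypothetical construction of a 3x3 convolution with 4 input and 8 output channels.
// forward() is not called here since it is still unimplemented at this point.
fn dummy_conv2d(device: &Device) -> Result<Conv2d> {
    let weight = Tensor::zeros((8, 4, 3, 3), DType::F32, device)?;
    let bias = Tensor::zeros(8, DType::F32, device)?;
    let cfg = Conv2dConfig { padding: 1, stride: 1 };
    Ok(Conv2d::new(weight, Some(bias), cfg))
}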

@@ -0,0 +1,48 @@
//! Group Normalization.
//!
//! This layer applies Group Normalization over a mini-batch of inputs.
use candle::{Result, Tensor};
// This group-norm version handles both a weight and a bias, and removes the mean.
#[allow(dead_code)]
#[derive(Debug)]
pub struct GroupNorm {
weight: Tensor,
bias: Tensor,
eps: f64,
num_channels: usize,
num_groups: usize,
}
impl GroupNorm {
pub fn new(
weight: Tensor,
bias: Tensor,
num_channels: usize,
num_groups: usize,
eps: f64,
) -> Self {
Self {
weight,
bias,
eps,
num_channels,
num_groups,
}
}
pub fn forward(&self, _: &Tensor) -> Result<Tensor> {
todo!()
}
}
pub fn group_norm(
num_channels: usize,
num_groups: usize,
eps: f64,
vb: crate::VarBuilder,
) -> Result<GroupNorm> {
let weight = vb.get_or_init(num_channels, "weight", crate::Init::Const(1.))?;
let bias = vb.get_or_init(num_channels, "bias", crate::Init::Const(0.))?;
Ok(GroupNorm::new(weight, bias, num_channels, num_groups, eps))
}
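
GroupNorm::forward is likewise left as todo!() for now. For reference, group normalization splits the channels into num_groups groups, normalizes each group by its own mean and variance taken over the channels and spatial positions in that group, and then applies the learned per-channel weight and bias. Below is a plain-Rust sketch of that math for a single sample stored channel-major as (channels, spatial); group_norm_ref is a hypothetical reference helper, not part of the diff.

// Reference computation for group normalization over a flat (channels, spatial) buffer.
fn group_norm_ref(
    x: &[f32],
    num_channels: usize,
    num_groups: usize,
    weight: &[f32],
    bias: &[f32],
    eps: f32,
) -> Vec<f32> {
    let spatial = x.len() / num_channels;
    let chans_per_group = num_channels / num_groups;
    let group_len = chans_per_group * spatial;
    let mut out = vec![0f32; x.len()];
    for g in 0..num_groups {
        let grp = &x[g * group_len..(g + 1) * group_len];
        // Statistics are shared by every channel and position within the group.
        let mean = grp.iter().sum::<f32>() / group_len as f32;
        let var = grp.iter().map(|v| (v - mean).powi(2)).sum::<f32>() / group_len as f32;
        let inv_std = 1.0 / (var + eps).sqrt();
        for c in 0..chans_per_group {
            let ch = g * chans_per_group + c;
            for s in 0..spatial {
                let idx = ch * spatial + s;
                // Normalize, then apply the per-channel affine transform.
                out[idx] = (x[idx] - mean) * inv_std * weight[ch] + bias[ch];
            }
        }
    }
    out
}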

@@ -3,6 +3,7 @@
pub mod activation;
pub mod conv;
pub mod embedding;
pub mod group_norm;
pub mod init;
pub mod layer_norm;
pub mod linear;
@@ -12,8 +13,9 @@ pub mod optim;
pub mod var_builder;
pub use activation::Activation;
-pub use conv::{Conv1d, Conv1dConfig};
+pub use conv::{conv1d, conv2d, Conv1d, Conv1dConfig, Conv2d, Conv2dConfig};
pub use embedding::{embedding, Embedding};
pub use group_norm::{group_norm, GroupNorm};
pub use init::Init;
pub use layer_norm::{layer_norm, LayerNorm};
pub use linear::{linear, linear_no_bias, Linear};

@@ -32,3 +32,7 @@ pub fn log_softmax<D: candle::shape::Dim>(xs: &Tensor, d: D) -> Result<Tensor> {
let log_sm = diff.broadcast_sub(&sum_exp.log()?)?;
Ok(log_sm)
}
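// Sigmoid Linear Unit: silu(x) = x * sigmoid(x) = x / (1 + exp(-x)).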
pub fn silu(xs: &Tensor) -> Result<Tensor> {
xs / (xs.neg()?.exp()? + 1.0)?
}