From 33c23c19b6f4821c00a47758f7841baf52ba9081 Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Tue, 29 Aug 2023 09:00:04 +0100
Subject: [PATCH] Preliminary support for SDXL. (#647)

* Preliminary support for SDXL.

* More SDXL support.

* More SDXL.

* Use the proper clip config.

* Querying for existing tensors.

* More robust test.
---
 .../examples/stable-diffusion/attention.rs    |   2 +-
 .../examples/stable-diffusion/clip.rs         |  30 ++++
 .../examples/stable-diffusion/main.rs         | 170 +++++++++++++-----
 .../stable-diffusion/stable_diffusion.rs      | 110 ++++++++++--
 candle-nn/src/var_builder.rs                  |  44 +++++
 5 files changed, 298 insertions(+), 58 deletions(-)

diff --git a/candle-examples/examples/stable-diffusion/attention.rs b/candle-examples/examples/stable-diffusion/attention.rs
index 58f5e87e..797542aa 100644
--- a/candle-examples/examples/stable-diffusion/attention.rs
+++ b/candle-examples/examples/stable-diffusion/attention.rs
@@ -473,7 +473,7 @@ impl AttentionBlock {
         let num_heads = channels / num_head_channels;
         let group_norm =
             nn::group_norm(config.num_groups, channels, config.eps, vs.pp("group_norm"))?;
-        let (q_path, k_path, v_path, out_path) = if vs.dtype() == DType::F16 {
+        let (q_path, k_path, v_path, out_path) = if vs.contains_tensor("to_q.weight") {
             ("to_q", "to_k", "to_v", "to_out.0")
         } else {
             ("query", "key", "value", "proj_attn")
diff --git a/candle-examples/examples/stable-diffusion/clip.rs b/candle-examples/examples/stable-diffusion/clip.rs
index 2927a404..d26c1c46 100644
--- a/candle-examples/examples/stable-diffusion/clip.rs
+++ b/candle-examples/examples/stable-diffusion/clip.rs
@@ -69,6 +69,36 @@ impl Config {
             activation: Activation::Gelu,
         }
     }
+
+    // https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/text_encoder/config.json
+    pub fn sdxl() -> Self {
+        Self {
+            vocab_size: 49408,
+            embed_dim: 768,
+            intermediate_size: 3072,
+            max_position_embeddings: 77,
+            pad_with: Some("!".to_string()),
+            num_hidden_layers: 12,
+            num_attention_heads: 12,
+            projection_dim: 768,
+            activation: Activation::QuickGelu,
+        }
+    }
+
+    // https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/text_encoder_2/config.json
+    pub fn sdxl2() -> Self {
+        Self {
+            vocab_size: 49408,
+            embed_dim: 1280,
+            intermediate_size: 5120,
+            max_position_embeddings: 77,
+            pad_with: Some("!".to_string()),
+            num_hidden_layers: 32,
+            num_attention_heads: 20,
+            projection_dim: 1280,
+            activation: Activation::Gelu,
+        }
+    }
 }
 
 // CLIP Text Model
diff --git a/candle-examples/examples/stable-diffusion/main.rs b/candle-examples/examples/stable-diffusion/main.rs
index 1443986c..8372edcd 100644
--- a/candle-examples/examples/stable-diffusion/main.rs
+++ b/candle-examples/examples/stable-diffusion/main.rs
@@ -17,7 +17,7 @@ mod utils;
 mod vae;
 
 use anyhow::{Error as E, Result};
-use candle::{DType, Device, IndexOp, Tensor};
+use candle::{DType, Device, IndexOp, Tensor, D};
 use clap::Parser;
 use tokenizers::Tokenizer;
 
@@ -102,12 +102,16 @@ struct Args {
 enum StableDiffusionVersion {
     V1_5,
     V2_1,
+    Xl,
 }
 
+#[allow(unused)]
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 enum ModelFile {
     Tokenizer,
+    Tokenizer2,
     Clip,
+    Clip2,
     Unet,
     Vae,
 }
@@ -115,6 +119,7 @@ enum ModelFile {
 impl StableDiffusionVersion {
     fn repo(&self) -> &'static str {
         match self {
+            Self::Xl => "stabilityai/stable-diffusion-xl-base-1.0",
             Self::V2_1 => "stabilityai/stable-diffusion-2-1",
             Self::V1_5 => "runwayml/stable-diffusion-v1-5",
         }
@@ -122,7 +127,7 @@ impl StableDiffusionVersion {
 
     fn unet_file(&self, use_f16: bool) -> &'static str {
         match self {
-            Self::V1_5 | Self::V2_1 => {
+            Self::V1_5 | Self::V2_1 | Self::Xl => {
                 if use_f16 {
                     "unet/diffusion_pytorch_model.fp16.safetensors"
                 } else {
@@ -134,7 +139,7 @@ impl StableDiffusionVersion {
 
     fn vae_file(&self, use_f16: bool) -> &'static str {
         match self {
-            Self::V1_5 | Self::V2_1 => {
+            Self::V1_5 | Self::V2_1 | Self::Xl => {
                 if use_f16 {
                     "vae/diffusion_pytorch_model.fp16.safetensors"
                 } else {
@@ -146,7 +151,7 @@ impl StableDiffusionVersion {
 
     fn clip_file(&self, use_f16: bool) -> &'static str {
         match self {
-            Self::V1_5 | Self::V2_1 => {
+            Self::V1_5 | Self::V2_1 | Self::Xl => {
                 if use_f16 {
                     "text_encoder/model.fp16.safetensors"
                 } else {
@@ -155,12 +160,21 @@ impl StableDiffusionVersion {
             }
         }
     }
+
+    fn clip2_file(&self, use_f16: bool) -> &'static str {
+        match self {
+            Self::V1_5 | Self::V2_1 | Self::Xl => {
+                if use_f16 {
+                    "text_encoder_2/model.fp16.safetensors"
+                } else {
+                    "text_encoder_2/model.safetensors"
+                }
+            }
+        }
+    }
 }
 
 impl ModelFile {
-    const TOKENIZER_REPO: &str = "openai/clip-vit-base-patch32";
-    const TOKENIZER_PATH: &str = "tokenizer.json";
-
     fn get(
         &self,
         filename: Option<String>,
@@ -172,8 +186,24 @@ impl ModelFile {
             Some(filename) => Ok(std::path::PathBuf::from(filename)),
             None => {
                 let (repo, path) = match self {
-                    Self::Tokenizer => (Self::TOKENIZER_REPO, Self::TOKENIZER_PATH),
+                    Self::Tokenizer => {
+                        let tokenizer_repo = match version {
+                            StableDiffusionVersion::V1_5 | StableDiffusionVersion::V2_1 => {
+                                "openai/clip-vit-base-patch32"
+                            }
+                            StableDiffusionVersion::Xl => {
+                                // This seems similar to the patch32 version except for some very
+                                // small differences in the split regex.
+                                "openai/clip-vit-large-patch14"
+                            }
+                        };
+                        (tokenizer_repo, "tokenizer.json")
+                    }
+                    Self::Tokenizer2 => {
+                        ("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", "tokenizer.json")
+                    }
                     Self::Clip => (version.repo(), version.clip_file(use_f16)),
+                    Self::Clip2 => (version.repo(), version.clip2_file(use_f16)),
                     Self::Unet => (version.repo(), version.unet_file(use_f16)),
                     Self::Vae => (version.repo(), version.vae_file(use_f16)),
                 };
@@ -211,6 +241,71 @@ fn output_filename(
     }
 }
 
+#[allow(clippy::too_many_arguments)]
+fn text_embeddings(
+    prompt: &str,
+    uncond_prompt: &str,
+    tokenizer: Option<String>,
+    clip_weights: Option<String>,
+    sd_version: StableDiffusionVersion,
+    sd_config: &stable_diffusion::StableDiffusionConfig,
+    use_f16: bool,
+    device: &Device,
+    dtype: DType,
+    first: bool,
+) -> Result<Tensor> {
+    let tokenizer_file = if first {
+        ModelFile::Tokenizer
+    } else {
+        ModelFile::Tokenizer2
+    };
+    let tokenizer = tokenizer_file.get(tokenizer, sd_version, use_f16)?;
+    let tokenizer = Tokenizer::from_file(tokenizer).map_err(E::msg)?;
+    let pad_id = match &sd_config.clip.pad_with {
+        Some(padding) => *tokenizer.get_vocab(true).get(padding.as_str()).unwrap(),
+        None => *tokenizer.get_vocab(true).get("<|endoftext|>").unwrap(),
+    };
+    println!("Running with prompt \"{prompt}\".");
+    let mut tokens = tokenizer
+        .encode(prompt, true)
+        .map_err(E::msg)?
+        .get_ids()
+        .to_vec();
+    while tokens.len() < sd_config.clip.max_position_embeddings {
+        tokens.push(pad_id)
+    }
+    let tokens = Tensor::new(tokens.as_slice(), device)?.unsqueeze(0)?;
+
+    let mut uncond_tokens = tokenizer
+        .encode(uncond_prompt, true)
+        .map_err(E::msg)?
+        .get_ids()
+        .to_vec();
+    while uncond_tokens.len() < sd_config.clip.max_position_embeddings {
+        uncond_tokens.push(pad_id)
+    }
+    let uncond_tokens = Tensor::new(uncond_tokens.as_slice(), device)?.unsqueeze(0)?;
+
+    println!("Building the Clip transformer.");
+    let clip_weights_file = if first {
+        ModelFile::Clip
+    } else {
+        ModelFile::Clip2
+    };
+    let clip_weights = clip_weights_file.get(clip_weights, sd_version, false)?;
+    let clip_config = if first {
+        &sd_config.clip
+    } else {
+        sd_config.clip2.as_ref().unwrap()
+    };
+    let text_model =
+        stable_diffusion::build_clip_transformer(clip_config, clip_weights, device, DType::F32)?;
+    let text_embeddings = text_model.forward(&tokens)?;
+    let uncond_embeddings = text_model.forward(&uncond_tokens)?;
+    let text_embeddings = Tensor::cat(&[uncond_embeddings, text_embeddings], 0)?.to_dtype(dtype)?;
+    Ok(text_embeddings)
+}
+
 fn run(args: Args) -> Result<()> {
     use tracing_chrome::ChromeLayerBuilder;
     use tracing_subscriber::prelude::*;
@@ -252,46 +347,37 @@ fn run(args: Args) -> Result<()> {
         StableDiffusionVersion::V2_1 => {
             stable_diffusion::StableDiffusionConfig::v2_1(sliced_attention_size, height, width)
         }
+        StableDiffusionVersion::Xl => {
+            stable_diffusion::StableDiffusionConfig::sdxl(sliced_attention_size, height, width)
+        }
     };
 
     let scheduler = sd_config.build_scheduler(n_steps)?;
     let device = candle_examples::device(cpu)?;
 
-    let tokenizer = ModelFile::Tokenizer.get(tokenizer, sd_version, use_f16)?;
-    let tokenizer = Tokenizer::from_file(tokenizer).map_err(E::msg)?;
-    let pad_id = match &sd_config.clip.pad_with {
-        Some(padding) => *tokenizer.get_vocab(true).get(padding.as_str()).unwrap(),
-        None => *tokenizer.get_vocab(true).get("<|endoftext|>").unwrap(),
-    };
-    println!("Running with prompt \"{prompt}\".");
-    let mut tokens = tokenizer
-        .encode(prompt, true)
-        .map_err(E::msg)?
-        .get_ids()
-        .to_vec();
-    while tokens.len() < sd_config.clip.max_position_embeddings {
-        tokens.push(pad_id)
-    }
-    let tokens = Tensor::new(tokens.as_slice(), &device)?.unsqueeze(0)?;
-
-    let mut uncond_tokens = tokenizer
-        .encode(uncond_prompt, true)
-        .map_err(E::msg)?
-        .get_ids()
-        .to_vec();
-    while uncond_tokens.len() < sd_config.clip.max_position_embeddings {
-        uncond_tokens.push(pad_id)
-    }
-    let uncond_tokens = Tensor::new(uncond_tokens.as_slice(), &device)?.unsqueeze(0)?;
-
-    println!("Building the Clip transformer.");
-    let text_embeddings = {
-        let clip_weights = ModelFile::Clip.get(clip_weights, sd_version, false)?;
-        let text_model = sd_config.build_clip_transformer(&clip_weights, &device, DType::F32)?;
-        let text_embeddings = text_model.forward(&tokens)?;
-        let uncond_embeddings = text_model.forward(&uncond_tokens)?;
-        Tensor::cat(&[uncond_embeddings, text_embeddings], 0)?.to_dtype(dtype)?
+    let which = match sd_version {
+        StableDiffusionVersion::Xl => vec![true, false],
+        _ => vec![true],
     };
+    let text_embeddings = which
+        .iter()
+        .map(|first| {
+            text_embeddings(
+                &prompt,
+                &uncond_prompt,
+                tokenizer.clone(),
+                clip_weights.clone(),
+                sd_version,
+                &sd_config,
+                use_f16,
+                &device,
+                dtype,
+                *first,
+            )
+        })
+        .collect::<Result<Vec<_>>>()?;
+    let text_embeddings = Tensor::cat(&text_embeddings, D::Minus1)?;
+    println!("{text_embeddings:?}");
 
     println!("Building the autoencoder.");
     let vae_weights = ModelFile::Vae.get(vae_weights, sd_version, use_f16)?;
diff --git a/candle-examples/examples/stable-diffusion/stable_diffusion.rs b/candle-examples/examples/stable-diffusion/stable_diffusion.rs
index e159fa0a..bed60161 100644
--- a/candle-examples/examples/stable-diffusion/stable_diffusion.rs
+++ b/candle-examples/examples/stable-diffusion/stable_diffusion.rs
@@ -8,6 +8,7 @@ pub struct StableDiffusionConfig {
     pub width: usize,
     pub height: usize,
     pub clip: clip::Config,
+    pub clip2: Option<clip::Config>,
     autoencoder: vae::AutoEncoderKLConfig,
     unet: unet_2d::UNet2DConditionModelConfig,
     scheduler: ddim::DDIMSchedulerConfig,
@@ -51,7 +52,7 @@ impl StableDiffusionConfig {
             norm_num_groups: 32,
         };
         let height = if let Some(height) = height {
-            assert_eq!(height % 8, 0, "heigh has to be divisible by 8");
+            assert_eq!(height % 8, 0, "height has to be divisible by 8");
             height
         } else {
             512
@@ -68,6 +69,7 @@ impl StableDiffusionConfig {
             width,
             height,
             clip: clip::Config::v1_5(),
+            clip2: None,
             autoencoder,
             scheduler: Default::default(),
             unet,
@@ -118,7 +120,7 @@ impl StableDiffusionConfig {
         };
 
         let height = if let Some(height) = height {
-            assert_eq!(height % 8, 0, "heigh has to be divisible by 8");
+            assert_eq!(height % 8, 0, "height has to be divisible by 8");
             height
         } else {
             768
@@ -135,6 +137,7 @@ impl StableDiffusionConfig {
             width,
             height,
             clip: clip::Config::v2_1(),
+            clip2: None,
             autoencoder,
             scheduler,
             unet,
@@ -155,6 +158,83 @@ impl StableDiffusionConfig {
         )
     }
 
+    fn sdxl_(
+        sliced_attention_size: Option<usize>,
+        height: Option<usize>,
+        width: Option<usize>,
+        prediction_type: PredictionType,
+    ) -> Self {
+        let bc = |out_channels, use_cross_attn, attention_head_dim| unet_2d::BlockConfig {
+            out_channels,
+            use_cross_attn,
+            attention_head_dim,
+        };
+        // https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/unet/config.json
+        let unet = unet_2d::UNet2DConditionModelConfig {
+            blocks: vec![bc(320, false, 5), bc(640, false, 10), bc(1280, true, 20)],
+            center_input_sample: false,
+            cross_attention_dim: 2048,
+            downsample_padding: 1,
+            flip_sin_to_cos: true,
+            freq_shift: 0.,
+            layers_per_block: 2,
+            mid_block_scale_factor: 1.,
+            norm_eps: 1e-5,
+            norm_num_groups: 32,
+            sliced_attention_size,
+            use_linear_projection: true,
+        };
+        // https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/vae/config.json
+        let autoencoder = vae::AutoEncoderKLConfig {
+            block_out_channels: vec![128, 256, 512, 512],
+            layers_per_block: 2,
+            latent_channels: 4,
+            norm_num_groups: 32,
+        };
+        let scheduler = ddim::DDIMSchedulerConfig {
+            prediction_type,
+            ..Default::default()
+        };
+
+        let height = if let Some(height) = height {
+            assert_eq!(height % 8, 0, "height has to be divisible by 8");
+            height
+        } else {
+            1024
+        };
+
+        let width = if let Some(width) = width {
+            assert_eq!(width % 8, 0, "width has to be divisible by 8");
+            width
+        } else {
+            1024
+        };
+
+        Self {
+            width,
+            height,
+            clip: clip::Config::sdxl(),
+            clip2: Some(clip::Config::sdxl2()),
+            autoencoder,
+            scheduler,
+            unet,
+        }
+    }
+
+    pub fn sdxl(
+        sliced_attention_size: Option<usize>,
+        height: Option<usize>,
+        width: Option<usize>,
+    ) -> Self {
+        Self::sdxl_(
+            sliced_attention_size,
+            height,
+            width,
+            // https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/scheduler/scheduler_config.json
+            PredictionType::Epsilon,
+        )
+    }
+
     pub fn build_vae<P: AsRef<std::path::Path>>(
         &self,
         vae_weights: P,
@@ -193,17 +273,17 @@ impl StableDiffusionConfig {
     pub fn build_scheduler(&self, n_steps: usize) -> Result<ddim::DDIMScheduler> {
         ddim::DDIMScheduler::new(n_steps, self.scheduler)
     }
-
-    pub fn build_clip_transformer<P: AsRef<std::path::Path>>(
-        &self,
-        clip_weights: P,
-        device: &Device,
-        dtype: DType,
-    ) -> Result<clip::ClipTextTransformer> {
-        let weights = unsafe { candle::safetensors::MmapedFile::new(clip_weights)? };
-        let weights = weights.deserialize()?;
-        let vs = nn::VarBuilder::from_safetensors(vec![weights], dtype, device);
-        let text_model = clip::ClipTextTransformer::new(vs, &self.clip)?;
-        Ok(text_model)
-    }
+}
+
+pub fn build_clip_transformer<P: AsRef<std::path::Path>>(
+    clip: &clip::Config,
+    clip_weights: P,
+    device: &Device,
+    dtype: DType,
+) -> Result<clip::ClipTextTransformer> {
+    let weights = unsafe { candle::safetensors::MmapedFile::new(clip_weights)? };
+    let weights = weights.deserialize()?;
+    let vs = nn::VarBuilder::from_safetensors(vec![weights], dtype, device);
+    let text_model = clip::ClipTextTransformer::new(vs, clip)?;
+    Ok(text_model)
 }
diff --git a/candle-nn/src/var_builder.rs b/candle-nn/src/var_builder.rs
index c593960b..c372897a 100644
--- a/candle-nn/src/var_builder.rs
+++ b/candle-nn/src/var_builder.rs
@@ -52,6 +52,8 @@ pub trait Backend {
         dtype: DType,
         dev: &Device,
     ) -> Result<Tensor>;
+
+    fn contains_tensor(&self, name: &str) -> bool;
 }
 
 pub trait SimpleBackend {
@@ -64,6 +66,8 @@ pub trait SimpleBackend {
         dtype: DType,
         dev: &Device,
     ) -> Result<Tensor>;
+
+    fn contains_tensor(&self, name: &str) -> bool;
 }
 
 impl<'a> Backend for Box<dyn SimpleBackend + 'a> {
@@ -78,6 +82,10 @@ impl<'a> Backend for Box<dyn SimpleBackend + 'a> {
     ) -> Result<Tensor> {
         self.as_ref().get(s, name, h, dtype, dev)
     }
+
+    fn contains_tensor(&self, name: &str) -> bool {
+        self.as_ref().contains_tensor(name)
+    }
 }
 
 impl<'a, B: Backend> VarBuilderArgs<'a, B> {
@@ -94,6 +102,8 @@ impl<'a, B: Backend> VarBuilderArgs<'a, B> {
         }
     }
 
+    /// Return a new `VarBuilder` adding `s` to the current prefix. This can be thought of as `cd`
+    /// into a directory.
     pub fn push_prefix<S: ToString>(&self, s: S) -> Self {
         let mut path = self.path.clone();
         path.push(s.to_string());
@@ -109,10 +119,12 @@ impl<'a, B: Backend> VarBuilderArgs<'a, B> {
         self.push_prefix(s)
     }
 
+    /// The device used by default.
     pub fn device(&self) -> &Device {
         &self.data.device
     }
 
+    /// The dtype used by default.
     pub fn dtype(&self) -> DType {
         self.data.dtype
     }
@@ -125,6 +137,14 @@ impl<'a, B: Backend> VarBuilderArgs<'a, B> {
         }
     }
 
+    /// This returns true only if a tensor with the passed in name is available. E.g. when passed
+    /// `a`, true is returned if `prefix.a` exists but false is returned if only `prefix.a.b`
+    /// exists.
+    pub fn contains_tensor(&self, tensor_name: &str) -> bool {
+        let path = self.path(tensor_name);
+        self.data.backend.contains_tensor(&path)
+    }
+
     /// Retrieve the tensor associated with the given name at the current path.
     pub fn get_with_hints<S: Into<Shape>>(
         &self,
@@ -149,6 +169,10 @@ impl SimpleBackend for Zeros {
     fn get(&self, s: Shape, _: &str, _: crate::Init, dtype: DType, dev: &Device) -> Result<Tensor> {
         Tensor::zeros(s, dtype, dev)
     }
+
+    fn contains_tensor(&self, _name: &str) -> bool {
+        true
+    }
 }
 
 impl SimpleBackend for HashMap<String, Tensor> {
@@ -179,6 +203,10 @@ impl SimpleBackend for HashMap<String, Tensor> {
         }
         tensor.to_device(dev)?.to_dtype(dtype)
     }
+
+    fn contains_tensor(&self, name: &str) -> bool {
+        self.contains_key(name)
+    }
 }
 
 impl SimpleBackend for VarMap {
@@ -192,6 +220,10 @@ impl SimpleBackend for VarMap {
     ) -> Result<Tensor> {
         VarMap::get(self, s, name, h, dtype, dev)
     }
+
+    fn contains_tensor(&self, name: &str) -> bool {
+        self.data().lock().unwrap().contains_key(name)
+    }
 }
 
 struct SafeTensorWithRouting<'a> {
@@ -228,6 +260,10 @@ impl<'a> SimpleBackend for SafeTensorWithRouting<'a> {
         }
         Ok(tensor)
     }
+
+    fn contains_tensor(&self, name: &str) -> bool {
+        self.routing.contains_key(name)
+    }
 }
 
 impl SimpleBackend for candle::npy::NpzTensors {
@@ -257,6 +293,10 @@ impl SimpleBackend for candle::npy::NpzTensors {
         }
         Ok(tensor)
     }
+
+    fn contains_tensor(&self, name: &str) -> bool {
+        self.get(name).map_or(false, |v| v.is_some())
+    }
 }
 
 impl<'a> VarBuilder<'a> {
@@ -425,4 +465,8 @@ impl<'a> Backend for ShardedSafeTensors<'a> {
         let raw: Vec<u8> = iterator.into_iter().flatten().cloned().collect();
         Tensor::from_raw_buffer(&raw, view_dtype, &shape, dev)?.to_dtype(dtype)
     }
+
+    fn contains_tensor(&self, name: &str) -> bool {
+        self.0.routing.contains_key(name)
+    }
 }
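
Usage note (illustration only, not part of the patch above): the sketch below shows how the
`contains_tensor` query added to candle-nn's `VarBuilder` can be used to probe which naming
scheme a set of weights follows, mirroring how attention.rs now picks between the
diffusers-style ("to_q", ...) and the legacy ("query", ...) attention tensor names. The crate
paths (`candle_core`, `candle_nn`) and the `Init::Const` initializer are assumptions about the
surrounding API rather than content of this commit; inside the candle workspace the core crate
is imported simply as `candle`.

    // Standalone sketch: probe a VarBuilder for an existing tensor name.
    use candle_core::{DType, Device, Result};
    use candle_nn::{Init, VarBuilder, VarMap};

    fn main() -> Result<()> {
        let device = Device::Cpu;

        // Back a VarBuilder with a VarMap and register a weight under the
        // diffusers-style name `attn.to_q.weight`.
        let varmap = VarMap::new();
        let vb = VarBuilder::from_varmap(&varmap, DType::F32, &device);
        let attn = vb.pp("attn");
        let _to_q = attn.get_with_hints((4, 4), "to_q.weight", Init::Const(0.))?;

        // The query added by this PR checks the fully prefixed path, so it is
        // asked on the same `attn` sub-builder that was used to create the tensor.
        let (q_path, k_path, v_path, out_path) = if attn.contains_tensor("to_q.weight") {
            ("to_q", "to_k", "to_v", "to_out.0")
        } else {
            ("query", "key", "value", "proj_attn")
        };
        println!("attention tensor names: {q_path} {k_path} {v_path} {out_path}");
        Ok(())
    }

Probing for an existing tensor is also more robust than the `vs.dtype() == DType::F16` check it
replaces in attention.rs, since the tensor naming of a checkpoint does not actually depend on the
floating-point precision it was saved in.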