mirror of
https://github.com/huggingface/candle.git
synced 2025-06-15 18:28:24 +00:00
Add inpainting to the stable diffusion example (#2735)
* Update the stable diffusion example with inpainting support for 1.5, 2 and XL. * Apply cargo fmt. * Clippy fixes. --------- Co-authored-by: laurent <laurent.mazare@gmail.com>
This commit is contained in:
@ -5,10 +5,12 @@ extern crate accelerate_src;
|
||||
extern crate intel_mkl_src;
|
||||
|
||||
use candle_transformers::models::stable_diffusion;
|
||||
use std::ops::Div;
|
||||
|
||||
use anyhow::{Error as E, Result};
|
||||
use candle::{DType, Device, IndexOp, Module, Tensor, D};
|
||||
use clap::Parser;
|
||||
use rand::Rng;
|
||||
use stable_diffusion::vae::AutoEncoderKL;
|
||||
use tokenizers::Tokenizer;
|
||||
|
||||
@ -49,6 +51,10 @@ struct Args {
|
||||
#[arg(long, value_name = "FILE")]
|
||||
clip_weights: Option<String>,
|
||||
|
||||
/// The CLIP2 weight file, in .safetensors format.
|
||||
#[arg(long, value_name = "FILE")]
|
||||
clip2_weights: Option<String>,
|
||||
|
||||
/// The VAE weight file, in .safetensors format.
|
||||
#[arg(long, value_name = "FILE")]
|
||||
vae_weights: Option<String>,
|
||||
@ -93,6 +99,11 @@ struct Args {
|
||||
#[arg(long)]
|
||||
guidance_scale: Option<f64>,
|
||||
|
||||
/// Path to the mask image for inpainting.
|
||||
#[arg(long, value_name = "FILE")]
|
||||
mask_path: Option<String>,
|
||||
|
||||
/// Path to the image used to initialize the latents. For inpainting, this is the image to be masked.
|
||||
#[arg(long, value_name = "FILE")]
|
||||
img2img: Option<String>,
|
||||
|
||||
@ -105,13 +116,20 @@ struct Args {
|
||||
/// The seed to use when generating random samples.
|
||||
#[arg(long)]
|
||||
seed: Option<u64>,
|
||||
|
||||
/// Force the saved image to update only the masked region
|
||||
#[arg(long)]
|
||||
only_update_masked: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, clap::ValueEnum, PartialEq, Eq)]
|
||||
enum StableDiffusionVersion {
|
||||
V1_5,
|
||||
V1_5Inpaint,
|
||||
V2_1,
|
||||
V2Inpaint,
|
||||
Xl,
|
||||
XlInpaint,
|
||||
Turbo,
|
||||
}
|
||||
|
||||
@ -128,16 +146,25 @@ enum ModelFile {
|
||||
impl StableDiffusionVersion {
|
||||
fn repo(&self) -> &'static str {
|
||||
match self {
|
||||
Self::XlInpaint => "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
|
||||
Self::Xl => "stabilityai/stable-diffusion-xl-base-1.0",
|
||||
Self::V2Inpaint => "stabilityai/stable-diffusion-2-inpainting",
|
||||
Self::V2_1 => "stabilityai/stable-diffusion-2-1",
|
||||
Self::V1_5 => "runwayml/stable-diffusion-v1-5",
|
||||
Self::V1_5Inpaint => "stable-diffusion-v1-5/stable-diffusion-inpainting",
|
||||
Self::Turbo => "stabilityai/sdxl-turbo",
|
||||
}
|
||||
}
|
||||
|
||||
fn unet_file(&self, use_f16: bool) -> &'static str {
|
||||
match self {
|
||||
Self::V1_5 | Self::V2_1 | Self::Xl | Self::Turbo => {
|
||||
Self::V1_5
|
||||
| Self::V1_5Inpaint
|
||||
| Self::V2_1
|
||||
| Self::V2Inpaint
|
||||
| Self::Xl
|
||||
| Self::XlInpaint
|
||||
| Self::Turbo => {
|
||||
if use_f16 {
|
||||
"unet/diffusion_pytorch_model.fp16.safetensors"
|
||||
} else {
|
||||
@ -149,7 +176,13 @@ impl StableDiffusionVersion {
|
||||
|
||||
fn vae_file(&self, use_f16: bool) -> &'static str {
|
||||
match self {
|
||||
Self::V1_5 | Self::V2_1 | Self::Xl | Self::Turbo => {
|
||||
Self::V1_5
|
||||
| Self::V1_5Inpaint
|
||||
| Self::V2_1
|
||||
| Self::V2Inpaint
|
||||
| Self::Xl
|
||||
| Self::XlInpaint
|
||||
| Self::Turbo => {
|
||||
if use_f16 {
|
||||
"vae/diffusion_pytorch_model.fp16.safetensors"
|
||||
} else {
|
||||
@ -161,7 +194,13 @@ impl StableDiffusionVersion {
|
||||
|
||||
fn clip_file(&self, use_f16: bool) -> &'static str {
|
||||
match self {
|
||||
Self::V1_5 | Self::V2_1 | Self::Xl | Self::Turbo => {
|
||||
Self::V1_5
|
||||
| Self::V1_5Inpaint
|
||||
| Self::V2_1
|
||||
| Self::V2Inpaint
|
||||
| Self::Xl
|
||||
| Self::XlInpaint
|
||||
| Self::Turbo => {
|
||||
if use_f16 {
|
||||
"text_encoder/model.fp16.safetensors"
|
||||
} else {
|
||||
@ -173,7 +212,13 @@ impl StableDiffusionVersion {
|
||||
|
||||
fn clip2_file(&self, use_f16: bool) -> &'static str {
|
||||
match self {
|
||||
Self::V1_5 | Self::V2_1 | Self::Xl | Self::Turbo => {
|
||||
Self::V1_5
|
||||
| Self::V1_5Inpaint
|
||||
| Self::V2_1
|
||||
| Self::V2Inpaint
|
||||
| Self::Xl
|
||||
| Self::XlInpaint
|
||||
| Self::Turbo => {
|
||||
if use_f16 {
|
||||
"text_encoder_2/model.fp16.safetensors"
|
||||
} else {
|
||||
@ -198,10 +243,13 @@ impl ModelFile {
|
||||
let (repo, path) = match self {
|
||||
Self::Tokenizer => {
|
||||
let tokenizer_repo = match version {
|
||||
StableDiffusionVersion::V1_5 | StableDiffusionVersion::V2_1 => {
|
||||
"openai/clip-vit-base-patch32"
|
||||
}
|
||||
StableDiffusionVersion::Xl | StableDiffusionVersion::Turbo => {
|
||||
StableDiffusionVersion::V1_5
|
||||
| StableDiffusionVersion::V2_1
|
||||
| StableDiffusionVersion::V1_5Inpaint
|
||||
| StableDiffusionVersion::V2Inpaint => "openai/clip-vit-base-patch32",
|
||||
StableDiffusionVersion::Xl
|
||||
| StableDiffusionVersion::XlInpaint
|
||||
| StableDiffusionVersion::Turbo => {
|
||||
// This seems similar to the patch32 version except some very small
|
||||
// difference in the split regex.
|
||||
"openai/clip-vit-large-patch14"
|
||||
@ -299,6 +347,7 @@ fn text_embeddings(
|
||||
uncond_prompt: &str,
|
||||
tokenizer: Option<String>,
|
||||
clip_weights: Option<String>,
|
||||
clip2_weights: Option<String>,
|
||||
sd_version: StableDiffusionVersion,
|
||||
sd_config: &stable_diffusion::StableDiffusionConfig,
|
||||
use_f16: bool,
|
||||
@ -342,7 +391,11 @@ fn text_embeddings(
|
||||
} else {
|
||||
ModelFile::Clip2
|
||||
};
|
||||
let clip_weights = clip_weights_file.get(clip_weights, sd_version, false)?;
|
||||
let clip_weights = if first {
|
||||
clip_weights_file.get(clip_weights, sd_version, use_f16)?
|
||||
} else {
|
||||
clip_weights_file.get(clip2_weights, sd_version, use_f16)?
|
||||
};
|
||||
let clip_config = if first {
|
||||
&sd_config.clip
|
||||
} else {
|
||||
@ -399,6 +452,82 @@ fn image_preprocess<T: AsRef<std::path::Path>>(path: T) -> anyhow::Result<Tensor
|
||||
Ok(img)
|
||||
}
|
||||
|
||||
/// Convert the mask image to a single channel tensor. Also ensure the image is a multiple of 32 in both dimensions.
|
||||
fn mask_preprocess<T: AsRef<std::path::Path>>(path: T) -> anyhow::Result<Tensor> {
|
||||
let img = image::open(path)?.to_luma8();
|
||||
let (new_width, new_height) = {
|
||||
let (width, height) = img.dimensions();
|
||||
(width - width % 32, height - height % 32)
|
||||
};
|
||||
let img = image::imageops::resize(
|
||||
&img,
|
||||
new_width,
|
||||
new_height,
|
||||
image::imageops::FilterType::CatmullRom,
|
||||
)
|
||||
.into_raw();
|
||||
let mask = Tensor::from_vec(img, (new_height as usize, new_width as usize), &Device::Cpu)?
|
||||
.unsqueeze(0)?
|
||||
.to_dtype(DType::F32)?
|
||||
.div(255.0)?
|
||||
.unsqueeze(0)?;
|
||||
Ok(mask)
|
||||
}
|
||||
|
||||
/// Generates the mask latents, scaled mask and mask_4 for inpainting. Returns a tuple of None if inpainting is not
|
||||
/// being used.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn inpainting_tensors(
|
||||
sd_version: StableDiffusionVersion,
|
||||
mask_path: Option<String>,
|
||||
dtype: DType,
|
||||
device: &Device,
|
||||
use_guide_scale: bool,
|
||||
vae: &AutoEncoderKL,
|
||||
image: Option<Tensor>,
|
||||
vae_scale: f64,
|
||||
) -> Result<(Option<Tensor>, Option<Tensor>, Option<Tensor>)> {
|
||||
match sd_version {
|
||||
StableDiffusionVersion::XlInpaint
|
||||
| StableDiffusionVersion::V2Inpaint
|
||||
| StableDiffusionVersion::V1_5Inpaint => {
|
||||
let inpaint_mask = mask_path.ok_or_else(|| {
|
||||
anyhow::anyhow!("An inpainting model was requested but mask-path is not provided.")
|
||||
})?;
|
||||
// Get the mask image with shape [1, 1, 128, 128]
|
||||
let mask = mask_preprocess(inpaint_mask)?
|
||||
.to_device(device)?
|
||||
.to_dtype(dtype)?;
|
||||
// Generate the masked image from the image and the mask with shape [1, 3, 1024, 1024]
|
||||
let xmask = mask.le(0.5)?.repeat(&[1, 3, 1, 1])?.to_dtype(dtype)?;
|
||||
let image = &image
|
||||
.ok_or_else(|| anyhow::anyhow!(
|
||||
"An inpainting model was requested but img2img which is used as the input image is not provided."
|
||||
))?;
|
||||
let masked_img = (image * xmask)?;
|
||||
// Scale down the mask
|
||||
let shape = masked_img.shape();
|
||||
let (w, h) = (shape.dims()[3] / 8, shape.dims()[2] / 8);
|
||||
let mask = mask.interpolate2d(w, h)?;
|
||||
// shape: [1, 4, 128, 128]
|
||||
let mask_latents = vae.encode(&masked_img)?;
|
||||
let mask_latents = (mask_latents.sample()? * vae_scale)?.to_device(device)?;
|
||||
|
||||
let mask_4 = mask.as_ref().repeat(&[1, 4, 1, 1])?;
|
||||
let (mask_latents, mask) = if use_guide_scale {
|
||||
(
|
||||
Tensor::cat(&[&mask_latents, &mask_latents], 0)?,
|
||||
Tensor::cat(&[&mask, &mask], 0)?,
|
||||
)
|
||||
} else {
|
||||
(mask_latents, mask)
|
||||
};
|
||||
Ok((Some(mask_latents), Some(mask), Some(mask_4)))
|
||||
}
|
||||
_ => Ok((None, None, None)),
|
||||
}
|
||||
}
|
||||
|
||||
fn run(args: Args) -> Result<()> {
|
||||
use tracing_chrome::ChromeLayerBuilder;
|
||||
use tracing_subscriber::prelude::*;
|
||||
@ -417,12 +546,14 @@ fn run(args: Args) -> Result<()> {
|
||||
bsize,
|
||||
sd_version,
|
||||
clip_weights,
|
||||
clip2_weights,
|
||||
vae_weights,
|
||||
unet_weights,
|
||||
tracing,
|
||||
use_f16,
|
||||
guidance_scale,
|
||||
use_flash_attn,
|
||||
mask_path,
|
||||
img2img,
|
||||
img2img_strength,
|
||||
seed,
|
||||
@ -445,7 +576,10 @@ fn run(args: Args) -> Result<()> {
|
||||
Some(guidance_scale) => guidance_scale,
|
||||
None => match sd_version {
|
||||
StableDiffusionVersion::V1_5
|
||||
| StableDiffusionVersion::V1_5Inpaint
|
||||
| StableDiffusionVersion::V2_1
|
||||
| StableDiffusionVersion::V2Inpaint
|
||||
| StableDiffusionVersion::XlInpaint
|
||||
| StableDiffusionVersion::Xl => 7.5,
|
||||
StableDiffusionVersion::Turbo => 0.,
|
||||
},
|
||||
@ -454,20 +588,23 @@ fn run(args: Args) -> Result<()> {
|
||||
Some(n_steps) => n_steps,
|
||||
None => match sd_version {
|
||||
StableDiffusionVersion::V1_5
|
||||
| StableDiffusionVersion::V1_5Inpaint
|
||||
| StableDiffusionVersion::V2_1
|
||||
| StableDiffusionVersion::V2Inpaint
|
||||
| StableDiffusionVersion::XlInpaint
|
||||
| StableDiffusionVersion::Xl => 30,
|
||||
StableDiffusionVersion::Turbo => 1,
|
||||
},
|
||||
};
|
||||
let dtype = if use_f16 { DType::F16 } else { DType::F32 };
|
||||
let sd_config = match sd_version {
|
||||
StableDiffusionVersion::V1_5 => {
|
||||
StableDiffusionVersion::V1_5 | StableDiffusionVersion::V1_5Inpaint => {
|
||||
stable_diffusion::StableDiffusionConfig::v1_5(sliced_attention_size, height, width)
|
||||
}
|
||||
StableDiffusionVersion::V2_1 => {
|
||||
StableDiffusionVersion::V2_1 | StableDiffusionVersion::V2Inpaint => {
|
||||
stable_diffusion::StableDiffusionConfig::v2_1(sliced_attention_size, height, width)
|
||||
}
|
||||
StableDiffusionVersion::Xl => {
|
||||
StableDiffusionVersion::Xl | StableDiffusionVersion::XlInpaint => {
|
||||
stable_diffusion::StableDiffusionConfig::sdxl(sliced_attention_size, height, width)
|
||||
}
|
||||
StableDiffusionVersion::Turbo => stable_diffusion::StableDiffusionConfig::sdxl_turbo(
|
||||
@ -479,13 +616,16 @@ fn run(args: Args) -> Result<()> {
|
||||
|
||||
let mut scheduler = sd_config.build_scheduler(n_steps)?;
|
||||
let device = candle_examples::device(cpu)?;
|
||||
if let Some(seed) = seed {
|
||||
device.set_seed(seed)?;
|
||||
}
|
||||
// If a seed is not given, generate a random seed and print it
|
||||
let seed = seed.unwrap_or(rand::thread_rng().gen_range(0u64..u64::MAX));
|
||||
println!("Using seed {seed}");
|
||||
device.set_seed(seed)?;
|
||||
let use_guide_scale = guidance_scale > 1.0;
|
||||
|
||||
let which = match sd_version {
|
||||
StableDiffusionVersion::Xl | StableDiffusionVersion::Turbo => vec![true, false],
|
||||
StableDiffusionVersion::Xl
|
||||
| StableDiffusionVersion::XlInpaint
|
||||
| StableDiffusionVersion::Turbo => vec![true, false],
|
||||
_ => vec![true],
|
||||
};
|
||||
let text_embeddings = which
|
||||
@ -496,6 +636,7 @@ fn run(args: Args) -> Result<()> {
|
||||
&uncond_prompt,
|
||||
tokenizer.clone(),
|
||||
clip_weights.clone(),
|
||||
clip2_weights.clone(),
|
||||
sd_version,
|
||||
&sd_config,
|
||||
use_f16,
|
||||
@ -514,16 +655,26 @@ fn run(args: Args) -> Result<()> {
|
||||
println!("Building the autoencoder.");
|
||||
let vae_weights = ModelFile::Vae.get(vae_weights, sd_version, use_f16)?;
|
||||
let vae = sd_config.build_vae(vae_weights, &device, dtype)?;
|
||||
let init_latent_dist = match &img2img {
|
||||
None => None,
|
||||
|
||||
let (image, init_latent_dist) = match &img2img {
|
||||
None => (None, None),
|
||||
Some(image) => {
|
||||
let image = image_preprocess(image)?.to_device(&device)?;
|
||||
Some(vae.encode(&image)?)
|
||||
let image = image_preprocess(image)?
|
||||
.to_device(&device)?
|
||||
.to_dtype(dtype)?;
|
||||
(Some(image.clone()), Some(vae.encode(&image)?))
|
||||
}
|
||||
};
|
||||
|
||||
println!("Building the unet.");
|
||||
let unet_weights = ModelFile::Unet.get(unet_weights, sd_version, use_f16)?;
|
||||
let unet = sd_config.build_unet(unet_weights, &device, 4, use_flash_attn, dtype)?;
|
||||
let in_channels = match sd_version {
|
||||
StableDiffusionVersion::XlInpaint
|
||||
| StableDiffusionVersion::V2Inpaint
|
||||
| StableDiffusionVersion::V1_5Inpaint => 9,
|
||||
_ => 4,
|
||||
};
|
||||
let unet = sd_config.build_unet(unet_weights, &device, in_channels, use_flash_attn, dtype)?;
|
||||
|
||||
let t_start = if img2img.is_some() {
|
||||
n_steps - (n_steps as f64 * img2img_strength) as usize
|
||||
@ -533,11 +684,25 @@ fn run(args: Args) -> Result<()> {
|
||||
|
||||
let vae_scale = match sd_version {
|
||||
StableDiffusionVersion::V1_5
|
||||
| StableDiffusionVersion::V1_5Inpaint
|
||||
| StableDiffusionVersion::V2_1
|
||||
| StableDiffusionVersion::V2Inpaint
|
||||
| StableDiffusionVersion::XlInpaint
|
||||
| StableDiffusionVersion::Xl => 0.18215,
|
||||
StableDiffusionVersion::Turbo => 0.13025,
|
||||
};
|
||||
|
||||
let (mask_latents, mask, mask_4) = inpainting_tensors(
|
||||
sd_version,
|
||||
mask_path,
|
||||
dtype,
|
||||
&device,
|
||||
use_guide_scale,
|
||||
&vae,
|
||||
image,
|
||||
vae_scale,
|
||||
)?;
|
||||
|
||||
for idx in 0..num_samples {
|
||||
let timesteps = scheduler.timesteps().to_vec();
|
||||
let latents = match &init_latent_dist {
|
||||
@ -576,6 +741,22 @@ fn run(args: Args) -> Result<()> {
|
||||
};
|
||||
|
||||
let latent_model_input = scheduler.scale_model_input(latent_model_input, timestep)?;
|
||||
|
||||
let latent_model_input = match sd_version {
|
||||
StableDiffusionVersion::XlInpaint
|
||||
| StableDiffusionVersion::V2Inpaint
|
||||
| StableDiffusionVersion::V1_5Inpaint => Tensor::cat(
|
||||
&[
|
||||
&latent_model_input,
|
||||
mask.as_ref().unwrap(),
|
||||
mask_latents.as_ref().unwrap(),
|
||||
],
|
||||
1,
|
||||
)?,
|
||||
_ => latent_model_input,
|
||||
}
|
||||
.to_device(&device)?;
|
||||
|
||||
let noise_pred =
|
||||
unet.forward(&latent_model_input, timestep as f64, &text_embeddings)?;
|
||||
|
||||
@ -592,6 +773,18 @@ fn run(args: Args) -> Result<()> {
|
||||
let dt = start_time.elapsed().as_secs_f32();
|
||||
println!("step {}/{n_steps} done, {:.2}s", timestep_index + 1, dt);
|
||||
|
||||
// Replace all pixels in the unmasked region with the original pixels discarding any changes.
|
||||
if args.only_update_masked {
|
||||
let mask = mask_4.as_ref().unwrap();
|
||||
let latent_to_keep = mask_latents
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_on_dim(0, 0)? // shape: [4, H, W]
|
||||
.unsqueeze(0)?; // shape: [1, 4, H, W]
|
||||
|
||||
latents = ((&latents * mask)? + &latent_to_keep * (1.0 - mask))?;
|
||||
}
|
||||
|
||||
if args.intermediary_images {
|
||||
save_image(
|
||||
&vae,
|
||||
|
Reference in New Issue
Block a user