diff --git a/README.md b/README.md index 1208956c..0fdcedca 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,8 @@ We also provide a some command line based examples using state of the art models dedicated submodels for hand-writing and printed recognition. - [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation model, generates the translated text from the input text. +- [Moondream](./candle-examples/examples/moondream/): tiny computer-vision model + that can answer real-world questions about images. Run them using commands like: ``` diff --git a/candle-examples/examples/moondream/README.md b/candle-examples/examples/moondream/README.md new file mode 100644 index 00000000..e202de7c --- /dev/null +++ b/candle-examples/examples/moondream/README.md @@ -0,0 +1,26 @@ +# candle-moondream + +[Moondream](https://github.com/vikhyat/moondream) is a computer-vision model that can answer real-world questions about images. It's tiny by today's standards, with only 1.6B parameters. That enables it to run on a variety of devices, including mobile phones and edge devices. + +## Running some examples +First download an example image: +```bash +$ wget https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg +``` + + + +Now you can run Moondream from the `candle-examples` crate: +```bash +$ cargo run --example moondream --release -- --prompt "What is the girl eating?" 
--image "./demo-1.jpg"
+
+avx: false, neon: true, simd128: false, f16c: false
+temp: 0.00 repeat-penalty: 1.00 repeat-last-n: 64
+retrieved the files in 3.395583ms
+Running on CPU, to run on GPU(metal), build this example with `--features metal`
+loaded the model in 5.485493792s
+loaded and encoded the image Tensor[dims 3, 378, 378; f32] in 4.801396417s
+starting the inference loop
+ The girl is eating a hamburger.<
+9 tokens generated (0.68 token/s)
+```
\ No newline at end of file
diff --git a/candle-examples/examples/moondream/main.rs b/candle-examples/examples/moondream/main.rs
new file mode 100644
index 00000000..7ea6570f
--- /dev/null
+++ b/candle-examples/examples/moondream/main.rs
@@ -0,0 +1,257 @@
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+
+use anyhow::{Error as E, Result};
+use clap::Parser;
+
+use candle::{DType, Device, Tensor};
+use candle_nn::VarBuilder;
+use candle_transformers::{generation::LogitsProcessor, models::moondream};
+use tokenizers::Tokenizer;
+
+/// Holds the model, tokenizer and sampling settings used by the generation loop.
+struct TextGeneration {
+    model: moondream::Model,
+    device: Device,
+    tokenizer: Tokenizer,
+    logits_processor: LogitsProcessor,
+    repeat_penalty: f32,
+    repeat_last_n: usize,
+    verbose_prompt: bool,
+}
+
+impl TextGeneration {
+    #[allow(clippy::too_many_arguments)]
+    fn new(
+        model: moondream::Model,
+        tokenizer: Tokenizer,
+        seed: u64,
+        temp: Option<f64>,
+        top_p: Option<f64>,
+        repeat_penalty: f32,
+        repeat_last_n: usize,
+        verbose_prompt: bool,
+        device: &Device,
+    ) -> Self {
+        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
+        Self {
+            model,
+            tokenizer,
+            logits_processor,
+            repeat_penalty,
+            repeat_last_n,
+            verbose_prompt,
+            device: device.clone(),
+        }
+    }
+
+    /// Tokenizes `prompt`, runs the first step on the prompt plus `image_embeds`, then
+    /// samples up to `sample_len` tokens one at a time, printing each as it is decoded.
+    fn run(&mut self, prompt: &str, image_embeds: &Tensor, sample_len: usize) -> Result<()> {
+        use std::io::Write;
+        println!("starting the inference loop");
+        let tokens = self.tokenizer.encode(prompt, true).map_err(E::msg)?;
+        if tokens.is_empty() {
+            anyhow::bail!("Empty prompts are not supported in the Moondream model.")
+        }
+        if self.verbose_prompt {
+            for (token, id) in tokens.get_tokens().iter().zip(tokens.get_ids().iter()) {
+                let token = token.replace('▁', " ").replace("<0x0A>", "\n");
+                println!("{id:7} -> '{token}'");
+            }
+        }
+
+        let mut tokens = tokens.get_ids().to_vec();
+        let mut generated_tokens = 0usize;
+
+        // NOTE(review): this looks up the literal vocabulary entry "END"; confirm that it
+        // is the end-of-sequence marker the tokenizer actually emits.
+        let eos_token = match self.tokenizer.get_vocab(true).get("END") {
+            Some(token) => *token,
+            None => anyhow::bail!("cannot find the EOS token"),
+        };
+
+        let start_gen = std::time::Instant::now();
+        for index in 0..sample_len {
+            // Only the newest token is fed to the model after the first step.
+            let context_size = if index > 0 { 1 } else { tokens.len() };
+            let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
+            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
+            let logits = if index > 0 {
+                self.model.text_model.forward(&input)?
+            } else {
+                self.model
+                    .text_model
+                    .forward_with_img(&input, image_embeds)?
+            };
+            let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
+            let logits = if self.repeat_penalty == 1. {
+                logits
+            } else {
+                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
+                candle_transformers::utils::apply_repeat_penalty(
+                    &logits,
+                    self.repeat_penalty,
+                    &tokens[start_at..],
+                )?
+            };
+            let next_token = self.logits_processor.sample(&logits)?;
+            tokens.push(next_token);
+            generated_tokens += 1;
+            if next_token == eos_token {
+                break;
+            }
+            let token = self.tokenizer.decode(&[next_token], true).map_err(E::msg)?;
+            print!("{token}");
+            std::io::stdout().flush()?;
+        }
+
+        let dt = start_gen.elapsed();
+        println!(
+            "\n{generated_tokens} tokens generated ({:.2} token/s)",
+            generated_tokens as f64 / dt.as_secs_f64()
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Parser)]
+struct Args {
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    cpu: bool,
+
+    /// Enable tracing (generates a trace-timestamp.json file).
+    #[arg(long)]
+    tracing: bool,
+
+    /// Display the token for the specified prompt.
+    #[arg(long)]
+    verbose_prompt: bool,
+
+    /// The question to ask about the image.
+    #[arg(long)]
+    prompt: String,
+
+    /// Path of the image file to load.
+    #[arg(long)]
+    image: String,
+
+    /// The temperature used to generate samples.
+    #[arg(long)]
+    temperature: Option<f64>,
+
+    /// Nucleus sampling probability cutoff.
+    #[arg(long)]
+    top_p: Option<f64>,
+
+    /// The seed to use when generating random samples.
+    #[arg(long, default_value_t = 299792458)]
+    seed: u64,
+
+    /// Maximum number of tokens to generate.
+    #[arg(long, default_value_t = 5000)]
+    sample_len: usize,
+
+    /// Penalty to be applied for repeating tokens, 1. means no penalty.
+    #[arg(long, default_value_t = 1.0)]
+    repeat_penalty: f32,
+
+    /// The context size to consider for the repeat penalty.
+    #[arg(long, default_value_t = 64)]
+    repeat_last_n: usize,
+}
+
+/// Loads an image from disk using the image crate, this returns a tensor with shape
+/// (3, 378, 378).
+pub fn load_image<P: AsRef<std::path::Path>>(p: P) -> candle::Result<Tensor> {
+    let img = image::io::Reader::open(p)?
+        .decode()
+        .map_err(candle::Error::wrap)?
+        .resize_to_fill(378, 378, image::imageops::FilterType::Triangle); // Adjusted to 378x378
+    let img = img.to_rgb8();
+    let data = img.into_raw();
+    let data = Tensor::from_vec(data, (378, 378, 3), &Device::Cpu)?.permute((2, 0, 1))?;
+    let mean = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?;
+    let std = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?;
+    (data.to_dtype(candle::DType::F32)? / 255.)?
+        .broadcast_sub(&mean)?
+        .broadcast_div(&std)
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    use tracing_chrome::ChromeLayerBuilder;
+    use tracing_subscriber::prelude::*;
+
+    let args = Args::parse();
+
+    let _guard = if args.tracing {
+        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
+        tracing_subscriber::registry().with(chrome_layer).init();
+        Some(guard)
+    } else {
+        None
+    };
+    println!(
+        "avx: {}, neon: {}, simd128: {}, f16c: {}",
+        candle::utils::with_avx(),
+        candle::utils::with_neon(),
+        candle::utils::with_simd128(),
+        candle::utils::with_f16c()
+    );
+    println!(
+        "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
+        args.temperature.unwrap_or(0.),
+        args.repeat_penalty,
+        args.repeat_last_n
+    );
+
+    let start = std::time::Instant::now();
+    let api = hf_hub::api::tokio::Api::new()?;
+    let repo = api.model("vikhyatk/moondream2".to_string());
+    let model_file = repo.get("model.safetensors").await?;
+    let tokenizer = repo.get("tokenizer.json").await?;
+    println!("retrieved the files in {:?}", start.elapsed());
+    let tokenizer = Tokenizer::from_file(tokenizer).map_err(E::msg)?;
+
+    let start = std::time::Instant::now();
+    let device = candle_examples::device(args.cpu)?;
+    // SAFETY: `from_mmaped_safetensors` memory-maps the weights file; this presumably relies
+    // on the file not being modified while the mapping is alive (confirm in the candle docs).
+    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
+    let config = moondream::Config::v2();
+    let model = moondream::Model::new(&config, vb)?;
+    println!("loaded the model in {:?}", start.elapsed());
+
+    let start = std::time::Instant::now();
+    let image = load_image(args.image)?.to_device(&device)?;
+    // Add a leading batch dimension before running the vision encoder.
+    let image_embeds = image.unsqueeze(0)?;
+    let image_embeds = image_embeds.apply(model.vision_encoder())?;
+    println!(
+        "loaded and encoded the image {image:?} in {:?}",
+        start.elapsed()
+    );
+
+    let prompt = format!("\n\nQuestion: {0}\n\nAnswer:", args.prompt);
+
+    let mut pipeline = TextGeneration::new(
+        model,
+        tokenizer,
+        args.seed,
+        args.temperature,
+        args.top_p,
+        args.repeat_penalty,
+        args.repeat_last_n,
+        args.verbose_prompt,
+        &device,
+    );
+    pipeline.run(&prompt, &image_embeds, args.sample_len)?;
+
+    Ok(())
+}
diff --git a/candle-transformers/src/models/mixformer.rs b/candle-transformers/src/models/mixformer.rs
index f7eb0abe..edca8b9d 100644
--- a/candle-transformers/src/models/mixformer.rs
+++ b/candle-transformers/src/models/mixformer.rs
@@ -438,6 +438,28 @@ impl MixFormerSequentialForCausalLM {
         xs.narrow(1, seq_len - 1, 1)?.apply(&self.head)?.squeeze(1)
     }
 
+    /// Embeds the tokens in `xs`, prepends `img_embeds` along dimension 1, runs every block
+    /// and returns the head output for the last sequence position.
+    pub fn forward_with_img(&mut self, xs: &Tensor, img_embeds: &Tensor) -> Result<Tensor> {
+        let _enter = self.span.enter();
+        let xs = xs.apply(&self.embedding)?;
+        let mut xs = Tensor::cat(&[img_embeds.clone(), xs], 1)?;
+        let (_b_size, seq_len, _embds) = xs.dims3()?;
+        let mask = if seq_len <= 1 {
+            None
+        } else {
+            Some(get_mask(seq_len, xs.device())?)
+        };
+        for block in self.blocks.iter_mut() {
+            xs = block.forward(&xs, mask.as_ref())?
+        }
+        let xs = xs
+            .narrow(1, seq_len - 1, 1)?
+            .apply(&self.head)?
+            .squeeze(1)?;
+        Ok(xs)
+    }
+
     pub fn clear_kv_cache(&mut self) {
         self.blocks.iter_mut().for_each(|b| b.clear_kv_cache())
     }
diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs
index 980ba535..ed0e0de7 100644
--- a/candle-transformers/src/models/mod.rs
+++ b/candle-transformers/src/models/mod.rs
@@ -24,6 +24,7 @@ pub mod mistral;
 pub mod mixformer;
 pub mod mixtral;
 pub mod mobileone;
+pub mod moondream;
 pub mod mpt;
 pub mod persimmon;
 pub mod phi;
diff --git a/candle-transformers/src/models/moondream.rs b/candle-transformers/src/models/moondream.rs
new file mode 100644
index 00000000..1172bf71
--- /dev/null
+++ b/candle-transformers/src/models/moondream.rs
@@ -0,0 +1,323 @@
+use crate::models::mixformer::{Config as PhiConfig, MixFormerSequentialForCausalLM as PhiModel};
+use candle::{IndexOp, Result, Tensor, D};
+use candle_nn::{layer_norm, linear_b, Linear, Module, VarBuilder};
+
+/// Configuration of the text model and of the vision encoder.
+pub struct Config {
+    pub phi_config: PhiConfig,
+    pub vision_config: VisionConfig,
+}
+
+impl Config {
+    /// Returns the configuration built from `PhiConfig::v1_5` and `VisionConfig::v2`.
+    pub fn v2() -> Self {
+        Self {
+            phi_config: PhiConfig::v1_5(),
+            vision_config: VisionConfig::v2(),
+        }
+    }
+}
+
+/// Computes `softmax(q @ k^T / sqrt(dim)) @ v`, where `dim` is the last dimension of `q`.
+fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result<Tensor> {
+    let dim = q.dim(D::Minus1)?;
+    let scale_factor = 1.0 / (dim as f64).sqrt();
+    let k = k.transpose(D::Minus2, D::Minus1)?.contiguous()?;
+    let mut attn_weights = (q.contiguous()?.matmul(&k)? * scale_factor)?;
+    attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?.contiguous()?;
+    let attn_weights = attn_weights.matmul(&v.contiguous()?)?;
+    Ok(attn_weights)
+}
+
+/// Hyper-parameters of the vision transformer and of the projection MLP.
+#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
+pub struct VisionConfig {
+    image_embedding_dim: usize,
+    model_dim: usize,
+    hidden_dim: usize,
+    hidden_features: usize,
+    embed_len: usize,
+    embed_dim: usize,
+    num_blocks: usize,
+    num_heads: usize,
+    act: candle_nn::Activation,
+}
+
+impl VisionConfig {
+    /// Returns the hard-coded values used by `Config::v2`.
+    pub fn v2() -> Self {
+        Self {
+            image_embedding_dim: 1152,
+            model_dim: 2048,
+            hidden_dim: 2048 * 4,
+            hidden_features: 4304,
+            embed_len: 729,
+            embed_dim: 1152,
+            num_blocks: 27,
+            num_heads: 16,
+            act: candle_nn::Activation::Gelu,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+struct LinearPatchEmbedding {
+    linear: Linear,
+}
+
+impl LinearPatchEmbedding {
+    fn new(vb: VarBuilder) -> Result<Self> {
+        let linear = linear_b(588, 1152, true, vb.pp("linear"))?;
+        Ok(Self { linear })
+    }
+}
+
+impl Module for LinearPatchEmbedding {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        xs.apply(&self.linear)
+    }
+}
+
+#[derive(Debug, Clone)]
+struct Attention {
+    num_heads: usize,
+    head_dim: usize,
+    qkv: Linear,
+    proj: Linear,
+}
+
+impl Attention {
+    pub fn new(vb: VarBuilder, dim: usize, num_heads: usize) -> Result<Self> {
+        let qkv = linear_b(dim, dim * 3, true, vb.pp("qkv"))?;
+        let proj = linear_b(dim, dim, true, vb.pp("proj"))?;
+        Ok(Self {
+            num_heads,
+            head_dim: dim / num_heads,
+            qkv,
+            proj,
+        })
+    }
+}
+
+impl Module for Attention {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let (b, n, c) = xs.dims3()?;
+        let qkv = xs
+            .apply(&self.qkv)?
+            .reshape((b, n, 3, self.num_heads, self.head_dim))?
+            .permute((2, 0, 3, 1, 4))?;
+        let (q, k, v) = (qkv.i(0)?, qkv.i(1)?, qkv.i(2)?);
+        let attn_weights = scaled_dot_product_attention(&q, &k, &v)?;
+        let attn_weights = attn_weights.transpose(1, 2)?.reshape((b, n, c))?;
+        attn_weights.apply(&self.proj)
+    }
+}
+
+#[derive(Debug, Clone)]
+struct VitBlock {
+    attn: Attention,
+    mlp: Mlp,
+    norm1: candle_nn::LayerNorm,
+    norm2: candle_nn::LayerNorm,
+}
+
+impl VitBlock {
+    fn new(vb: VarBuilder, dim: usize, num_heads: usize, cfg: &VisionConfig) -> Result<Self> {
+        let attn = Attention::new(vb.pp("attn"), dim, num_heads)?;
+        let mlp = Mlp::new(vb.pp("mlp"), dim, cfg.hidden_features, dim, cfg.act)?;
+        let norm1 = layer_norm(dim, 1e-5, vb.pp("norm1"))?;
+        let norm2 = layer_norm(dim, 1e-5, vb.pp("norm2"))?;
+        Ok(Self {
+            attn,
+            mlp,
+            norm1,
+            norm2,
+        })
+    }
+}
+
+impl Module for VitBlock {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let ys = xs.apply(&self.norm1)?.apply(&self.attn)?;
+        let xs = (xs + &ys)?;
+        let ys = xs.apply(&self.norm2)?.apply(&self.mlp)?;
+        let xs = (&xs + &ys)?;
+        Ok(xs)
+    }
+}
+
+#[derive(Debug, Clone)]
+struct VisionTransformer {
+    patch_embed: LinearPatchEmbedding,
+    pos_embed: Tensor,
+    blocks: Vec<VitBlock>,
+    norm: candle_nn::LayerNorm,
+}
+
+impl VisionTransformer {
+    fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
+        let patch_embed = LinearPatchEmbedding::new(vb.pp("patch_embed"))?;
+        let pos_embed = vb.get((1, cfg.embed_len, cfg.embed_dim), "pos_embed")?;
+        let blocks = (0..cfg.num_blocks)
+            .map(|i| {
+                VitBlock::new(
+                    vb.pp(&format!("blocks.{}", i)),
+                    cfg.embed_dim,
+                    cfg.num_heads,
+                    cfg,
+                )
+            })
+            .collect::<Result<_>>()?;
+        let norm = layer_norm(cfg.embed_dim, 1e-5, vb.pp("norm"))?;
+        Ok(Self {
+            patch_embed,
+            pos_embed,
+            blocks,
+            norm,
+        })
+    }
+}
+
+impl Module for VisionTransformer {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let mut xs = (&xs.apply(&self.patch_embed)? + &self.pos_embed)?;
+        for block in self.blocks.iter() {
+            xs = xs.apply(block)?;
+        }
+        xs.apply(&self.norm)
+    }
+}
+
+/// Wraps the vision transformer, loaded from the `model.visual` weight prefix.
+#[derive(Debug, Clone)]
+pub struct Encoder {
+    model: VisionTransformer,
+}
+
+impl Encoder {
+    fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
+        let model = VisionTransformer::new(cfg, vb.pp("model.visual"))?;
+        Ok(Self { model })
+    }
+}
+
+impl Module for Encoder {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        xs.apply(&self.model)
+    }
+}
+
+#[derive(Debug, Clone)]
+struct Mlp {
+    fc1: Linear,
+    act: candle_nn::Activation,
+    fc2: Linear,
+}
+
+impl Mlp {
+    fn new(
+        vb: VarBuilder,
+        in_features: usize,
+        hidden_features: usize,
+        out_features: usize,
+        act: candle_nn::Activation,
+    ) -> Result<Self> {
+        let fc1 = linear_b(in_features, hidden_features, true, vb.pp("fc1"))?;
+        let fc2 = linear_b(hidden_features, out_features, true, vb.pp("fc2"))?;
+        Ok(Self { fc1, act, fc2 })
+    }
+}
+
+impl Module for Mlp {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        xs.apply(&self.fc1)?.apply(&self.act)?.apply(&self.fc2)
+    }
+}
+
+#[derive(Debug, Clone)]
+struct VisionProjection {
+    mlp: Mlp,
+}
+
+impl VisionProjection {
+    fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
+        let mlp = Mlp::new(
+            vb.pp("mlp"),
+            cfg.image_embedding_dim,
+            cfg.hidden_dim,
+            cfg.model_dim,
+            cfg.act,
+        )?;
+        Ok(Self { mlp })
+    }
+}
+
+impl Module for VisionProjection {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        xs.apply(&self.mlp)
+    }
+}
+
+/// Vision transformer encoder followed by the projection MLP.
+#[derive(Debug, Clone)]
+pub struct VisionEncoder {
+    encoder: Encoder,
+    projection: VisionProjection,
+}
+
+impl VisionEncoder {
+    /// Builds the encoder from the `encoder` prefix and the projection from `projection`.
+    pub fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
+        let encoder = Encoder::new(cfg, vb.pp("encoder"))?;
+        let projection = VisionProjection::new(cfg, vb.pp("projection"))?;
+        Ok(Self {
+            encoder,
+            projection,
+        })
+    }
+}
+
+impl Module for VisionEncoder {
+    /// Splits a `(batch, channels, height, width)` input into 14x14 patches, flattens each
+    /// patch, then applies the encoder followed by the projection.
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let (b, c, hp1, wp2) = xs.dims4()?;
+        let (p1, p2) = (14, 14);
+        let h = hp1 / p1;
+        let w = wp2 / p2;
+        // The height axis is split into `h` patches and the width axis into `w` patches.
+        let xs = xs
+            .reshape((b, c, h, p1, w, p2))?
+            .permute((0, 2, 4, 1, 3, 5))?
+            .reshape((b, h * w, c * p1 * p2))?;
+        xs.apply(&self.encoder)?.apply(&self.projection)
+    }
+}
+
+/// The Phi text model together with the vision encoder.
+pub struct Model {
+    pub text_model: PhiModel,
+    pub vision_encoder: VisionEncoder,
+}
+
+impl Model {
+    /// Loads the text model and the vision encoder from their respective weight prefixes.
+    pub fn new(config: &Config, vb: VarBuilder) -> Result<Self> {
+        let text_model = PhiModel::new_v2(&config.phi_config, vb.pp("text_model"))?;
+        let vision_encoder = VisionEncoder::new(&config.vision_config, vb.pp("vision_encoder"))?;
+        Ok(Self {
+            text_model,
+            vision_encoder,
+        })
+    }
+
+    /// Returns a shared reference to the vision encoder.
+    pub fn vision_encoder(&self) -> &VisionEncoder {
+        &self.vision_encoder
+    }
+
+    /// Returns a mutable reference to the text model.
+    pub fn text_model(&mut self) -> &mut PhiModel {
+        &mut self.text_model
+    }
+}