diff --git a/README.md b/README.md
index 1208956c..0fdcedca 100644
--- a/README.md
+++ b/README.md
@@ -131,6 +131,8 @@ We also provide a some command line based examples using state of the art models
dedicated submodels for hand-writing and printed recognition.
- [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation
model, generates the translated text from the input text.
+- [Moondream](./candle-examples/examples/moondream/): tiny computer-vision model
+ that can answer real-world questions about images.
Run them using commands like:
```
diff --git a/candle-examples/examples/moondream/README.md b/candle-examples/examples/moondream/README.md
new file mode 100644
index 00000000..e202de7c
--- /dev/null
+++ b/candle-examples/examples/moondream/README.md
@@ -0,0 +1,26 @@
+# candle-moondream
+
+[Moondream](https://github.com/vikhyat/moondream) is a computer-vision model that can answer real-world questions about images. It's tiny by today's standards, with only 1.6B parameters. That enables it to run on a variety of devices, including mobile phones and edge devices.
+
+## Running some examples
+First download an example image:
+```bash
+$ wget https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg
+```
+
+
+
+Now you can run Moondream from the `candle-examples` crate:
+```bash
+$ cargo run --example moondream --release -- --prompt "What is the girl eating?" --image "./demo-1.jpg"
+
+avx: false, neon: true, simd128: false, f16c: false
+temp: 0.00 repeat-penalty: 1.00 repeat-last-n: 64
+retrieved the files in 3.395583ms
+Running on CPU, to run on GPU(metal), build this example with `--features metal`
+loaded the model in 5.485493792s
+loaded and encoded the image Tensor[dims 3, 378, 378; f32] in 4.801396417s
+starting the inference loop
+ The girl is eating a hamburger.<
+9 tokens generated (0.68 token/s)
+```
\ No newline at end of file
diff --git a/candle-examples/examples/moondream/main.rs b/candle-examples/examples/moondream/main.rs
new file mode 100644
index 00000000..7ea6570f
--- /dev/null
+++ b/candle-examples/examples/moondream/main.rs
@@ -0,0 +1,245 @@
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+
+use anyhow::{Error as E, Result};
+use clap::Parser;
+
+use candle::{DType, Device, Tensor};
+use candle_nn::VarBuilder;
+use candle_transformers::{generation::LogitsProcessor, models::moondream};
+use tokenizers::Tokenizer;
+
+struct TextGeneration {
+ model: moondream::Model,
+ device: Device, // device on which the per-step input tensors are created
+ tokenizer: Tokenizer,
+ logits_processor: LogitsProcessor, // samples the next token from the logits
+ repeat_penalty: f32, // exactly 1.0 skips the repeat penalty
+ repeat_last_n: usize, // number of trailing tokens the penalty looks at
+ verbose_prompt: bool, // print the tokenized prompt before generating
+}
+
+impl TextGeneration {
+ #[allow(clippy::too_many_arguments)] // plain constructor: every argument maps onto one field or the sampler
+ fn new(
+ model: moondream::Model,
+ tokenizer: Tokenizer,
+ seed: u64,
+ temp: Option<f64>,
+ top_p: Option<f64>,
+ repeat_penalty: f32,
+ repeat_last_n: usize,
+ verbose_prompt: bool,
+ device: &Device,
+ ) -> Self {
+ let logits_processor = LogitsProcessor::new(seed, temp, top_p);
+ Self {
+ model,
+ tokenizer,
+ logits_processor,
+ repeat_penalty,
+ repeat_last_n,
+ verbose_prompt,
+ device: device.clone(),
+ }
+ }
+
+ fn run(&mut self, prompt: &str, image_embeds: &Tensor, sample_len: usize) -> Result<()> {
+ use std::io::Write;
+ println!("starting the inference loop");
+ let tokens = self.tokenizer.encode(prompt, true).map_err(E::msg)?;
+ if tokens.is_empty() {
+ anyhow::bail!("Empty prompts are not supported in the Moondream model.")
+ }
+ if self.verbose_prompt {
+ for (token, id) in tokens.get_tokens().iter().zip(tokens.get_ids().iter()) {
+ let token = token.replace('▁', " ").replace("<0x0A>", "\n");
+ println!("{id:7} -> '{token}'");
+ }
+ }
+
+ let mut tokens = tokens.get_ids().to_vec();
+ let mut generated_tokens = 0usize;
+
+ let eos_token = match self.tokenizer.get_vocab(true).get("END") {
+ Some(token) => *token,
+ None => anyhow::bail!("cannot find the EOS token"),
+ };
+
+ let start_gen = std::time::Instant::now();
+ for index in 0..sample_len {
+ let context_size = if index > 0 { 1 } else { tokens.len() }; // whole prompt first, then only the newest token
+ let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
+ let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
+ let logits = if index > 0 {
+ self.model.text_model.forward(&input)?
+ } else {
+ self.model
+ .text_model
+ .forward_with_img(&input, image_embeds)? // image embeddings are only fed on the first step
+ };
+ let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
+ let logits = if self.repeat_penalty == 1. {
+ logits
+ } else {
+ let start_at = tokens.len().saturating_sub(self.repeat_last_n);
+ candle_transformers::utils::apply_repeat_penalty(
+ &logits,
+ self.repeat_penalty,
+ &tokens[start_at..],
+ )?
+ };
+ let next_token = self.logits_processor.sample(&logits)?;
+ tokens.push(next_token);
+ generated_tokens += 1;
+ if next_token == eos_token {
+ break; // the EOS token is counted but never printed
+ }
+ let token = self.tokenizer.decode(&[next_token], true).map_err(E::msg)?;
+ print!("{token}");
+ std::io::stdout().flush()?; // show each token as soon as it is generated
+ }
+
+ let dt = start_gen.elapsed();
+ println!(
+ "\n{generated_tokens} tokens generated ({:.2} token/s)",
+ generated_tokens as f64 / dt.as_secs_f64()
+ );
+
+ Ok(())
+ }
+}
+
+#[derive(Parser)]
+struct Args {
+ /// Run on CPU rather than on GPU.
+ #[arg(long)]
+ cpu: bool,
+
+ /// Enable tracing (generates a trace-timestamp.json file).
+ #[arg(long)]
+ tracing: bool,
+
+ /// Display the token for the specified prompt.
+ #[arg(long)]
+ verbose_prompt: bool,
+
+ #[arg(long)]
+ prompt: String, // question inserted into the Question/Answer template in main
+
+ #[arg(long)]
+ image: String, // path of the image file handed to load_image
+
+ /// The temperature used to generate samples.
+ #[arg(long)]
+ temperature: Option<f64>,
+
+ /// Nucleus sampling probability cutoff.
+ #[arg(long)]
+ top_p: Option<f64>,
+
+ /// The seed to use when generating random samples.
+ #[arg(long, default_value_t = 299792458)]
+ seed: u64,
+
+ #[arg(long, default_value_t = 5000)]
+ sample_len: usize, // upper bound on the number of generation steps
+
+ /// Penalty to be applied for repeating tokens, 1. means no penalty.
+ #[arg(long, default_value_t = 1.0)]
+ repeat_penalty: f32,
+
+ /// The context size to consider for the repeat penalty.
+ #[arg(long, default_value_t = 64)]
+ repeat_last_n: usize,
+}
+
+/// Loads an image from disk using the image crate, this returns a tensor with shape
+/// (3, 378, 378).
+pub fn load_image<P: AsRef<std::path::Path>>(p: P) -> candle::Result<Tensor> {
+ let img = image::io::Reader::open(p)?
+ .decode()
+ .map_err(candle::Error::wrap)?
+ .resize_to_fill(378, 378, image::imageops::FilterType::Triangle); // Adjusted to 378x378
+ let img = img.to_rgb8();
+ let data = img.into_raw();
+ let data = Tensor::from_vec(data, (378, 378, 3), &Device::Cpu)?.permute((2, 0, 1))?; // (378, 378, 3) -> (3, 378, 378)
+ let mean = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?;
+ let std = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?;
+ (data.to_dtype(candle::DType::F32)? / 255.)? // scale to [0, 1], then apply (x - mean) / std below
+ .broadcast_sub(&mean)?
+ .broadcast_div(&std)
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+ use tracing_chrome::ChromeLayerBuilder;
+ use tracing_subscriber::prelude::*;
+
+ let args = Args::parse();
+
+ let _guard = if args.tracing { // the guard stays bound until main returns
+ let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
+ tracing_subscriber::registry().with(chrome_layer).init();
+ Some(guard)
+ } else {
+ None
+ };
+ println!(
+ "avx: {}, neon: {}, simd128: {}, f16c: {}",
+ candle::utils::with_avx(),
+ candle::utils::with_neon(),
+ candle::utils::with_simd128(),
+ candle::utils::with_f16c()
+ );
+ println!(
+ "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
+ args.temperature.unwrap_or(0.),
+ args.repeat_penalty,
+ args.repeat_last_n
+ );
+
+ let start = std::time::Instant::now();
+ let api = hf_hub::api::tokio::Api::new()?;
+ let repo = api.model("vikhyatk/moondream2".to_string()); // weights and tokenizer come from this hub repo
+ let model_file = repo.get("model.safetensors").await?;
+ let tokenizer = repo.get("tokenizer.json").await?;
+ println!("retrieved the files in {:?}", start.elapsed());
+ let tokenizer = Tokenizer::from_file(tokenizer).map_err(E::msg)?;
+
+ let start = std::time::Instant::now();
+ let device = candle_examples::device(args.cpu)?;
+ let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? }; // NOTE(review): unsafe because the weights are memory-mapped; presumably the file must not change while mapped - confirm
+ let config = moondream::Config::v2();
+ let model = moondream::Model::new(&config, vb)?;
+ println!("loaded the model in {:?}", start.elapsed());
+
+ let start = std::time::Instant::now();
+ let image = load_image(args.image)?.to_device(&device)?;
+ let image_embeds = image.unsqueeze(0)?; // add a leading dimension of size 1
+ let image_embeds = image_embeds.apply(model.vision_encoder())?; // run the vision encoder on the image
+ println!(
+ "loaded and encoded the image {image:?} in {:?}",
+ start.elapsed()
+ );
+
+ let prompt = format!("\n\nQuestion: {0}\n\nAnswer:", args.prompt); // wrap the user prompt in the Question/Answer template
+
+ let mut pipeline = TextGeneration::new(
+ model,
+ tokenizer,
+ args.seed,
+ args.temperature,
+ args.top_p,
+ args.repeat_penalty,
+ args.repeat_last_n,
+ args.verbose_prompt,
+ &device,
+ );
+ pipeline.run(&prompt, &image_embeds, args.sample_len)?;
+
+ Ok(())
+}
diff --git a/candle-transformers/src/models/mixformer.rs b/candle-transformers/src/models/mixformer.rs
index f7eb0abe..edca8b9d 100644
--- a/candle-transformers/src/models/mixformer.rs
+++ b/candle-transformers/src/models/mixformer.rs
@@ -438,6 +438,26 @@ impl MixFormerSequentialForCausalLM {
xs.narrow(1, seq_len - 1, 1)?.apply(&self.head)?.squeeze(1)
}
+ pub fn forward_with_img(&mut self, xs: &Tensor, img_embeds: &Tensor) -> Result<Tensor> {
+ let _enter = self.span.enter();
+ let xs = xs.apply(&self.embedding)?;
+ let mut xs = Tensor::cat(&[img_embeds.clone(), xs], 1)?; // image embeddings go in front of the token embeddings along dim 1
+ let (_b_size, seq_len, _embds) = xs.dims3()?;
+ let mask = if seq_len <= 1 {
+ None
+ } else {
+ Some(get_mask(seq_len, xs.device())?)
+ };
+ for block in self.blocks.iter_mut() {
+ xs = block.forward(&xs, mask.as_ref())?
+ }
+ let xs = xs
+ .narrow(1, seq_len - 1, 1)? // keep only the last position
+ .apply(&self.head)?
+ .squeeze(1)?;
+ Ok(xs)
+ }
+
pub fn clear_kv_cache(&mut self) {
self.blocks.iter_mut().for_each(|b| b.clear_kv_cache())
}
diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs
index 980ba535..ed0e0de7 100644
--- a/candle-transformers/src/models/mod.rs
+++ b/candle-transformers/src/models/mod.rs
@@ -24,6 +24,7 @@ pub mod mistral;
pub mod mixformer;
pub mod mixtral;
pub mod mobileone;
+pub mod moondream;
pub mod mpt;
pub mod persimmon;
pub mod phi;
diff --git a/candle-transformers/src/models/moondream.rs b/candle-transformers/src/models/moondream.rs
new file mode 100644
index 00000000..1172bf71
--- /dev/null
+++ b/candle-transformers/src/models/moondream.rs
@@ -0,0 +1,308 @@
+use crate::models::mixformer::{Config as PhiConfig, MixFormerSequentialForCausalLM as PhiModel};
+use candle::{IndexOp, Result, Tensor, D};
+use candle_nn::{layer_norm, linear_b, Linear, Module, VarBuilder};
+
+pub struct Config {
+ pub phi_config: PhiConfig, // configuration used to build the text model
+ pub vision_config: VisionConfig, // configuration used to build the vision encoder
+}
+
+impl Config {
+ pub fn v2() -> Self { // pairs PhiConfig::v1_5() with VisionConfig::v2()
+ Self {
+ phi_config: PhiConfig::v1_5(),
+ vision_config: VisionConfig::v2(),
+ }
+ }
+}
+
+fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result<Tensor> {
+ let dim = q.dim(D::Minus1)?;
+ let scale_factor = 1.0 / (dim as f64).sqrt(); // 1 / sqrt(size of q's last dimension)
+ let k = k.transpose(D::Minus2, D::Minus1)?.contiguous()?;
+ let mut attn_weights = (q.contiguous()?.matmul(&k)? * scale_factor)?;
+ attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?.contiguous()?;
+ let attn_weights = attn_weights.matmul(&v.contiguous()?)?; // no mask is applied: softmax(q k^T * scale) v
+ Ok(attn_weights)
+}
+
+#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
+pub struct VisionConfig {
+ image_embedding_dim: usize, // input width of the projection MLP
+ model_dim: usize, // output width of the projection MLP
+ hidden_dim: usize, // hidden width of the projection MLP
+ hidden_features: usize, // hidden width of the MLP inside each ViT block
+ embed_len: usize, // second dimension of the pos_embed tensor
+ embed_dim: usize, // last dimension of pos_embed and width of the ViT blocks
+ num_blocks: usize, // number of ViT blocks
+ num_heads: usize, // attention heads per ViT block
+ act: candle_nn::Activation, // activation used by both MLP kinds
+}
+
+impl VisionConfig {
+ pub fn v2() -> Self { // the preset selected by Config::v2()
+ Self {
+ image_embedding_dim: 1152,
+ model_dim: 2048,
+ hidden_dim: 2048 * 4,
+ hidden_features: 4304,
+ embed_len: 729,
+ embed_dim: 1152,
+ num_blocks: 27,
+ num_heads: 16,
+ act: candle_nn::Activation::Gelu,
+ }
+ }
+}
+
+#[derive(Debug, Clone)]
+struct LinearPatchEmbedding {
+ linear: Linear,
+}
+
+impl LinearPatchEmbedding {
+ fn new(vb: VarBuilder) -> Result<Self> {
+ let linear = linear_b(588, 1152, true, vb.pp("linear"))?; // 588 = 3 * 14 * 14; presumably channels * patch area - confirm
+ Ok(Self { linear })
+ }
+}
+
+impl Module for LinearPatchEmbedding {
+ fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+ xs.apply(&self.linear)
+ }
+}
+
+#[derive(Debug, Clone)]
+struct Attention {
+ num_heads: usize,
+ head_dim: usize, // dim / num_heads
+ qkv: Linear, // one projection producing q, k and v together
+ proj: Linear, // output projection
+}
+
+impl Attention {
+ pub fn new(vb: VarBuilder, dim: usize, num_heads: usize) -> Result<Self> {
+ let qkv = linear_b(dim, dim * 3, true, vb.pp("qkv"))?;
+ let proj = linear_b(dim, dim, true, vb.pp("proj"))?;
+ Ok(Self {
+ num_heads,
+ head_dim: dim / num_heads,
+ qkv,
+ proj,
+ })
+ }
+}
+
+impl Module for Attention {
+ fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+ let (b, n, c) = xs.dims3()?;
+ let qkv = xs
+ .apply(&self.qkv)?
+ .reshape((b, n, 3, self.num_heads, self.head_dim))?
+ .permute((2, 0, 3, 1, 4))?; // -> (3, b, num_heads, n, head_dim)
+ let (q, k, v) = (qkv.i(0)?, qkv.i(1)?, qkv.i(2)?);
+ let attn_weights = scaled_dot_product_attention(&q, &k, &v)?;
+ let attn_weights = attn_weights.transpose(1, 2)?.reshape((b, n, c))?; // merge the heads back into (b, n, c)
+ attn_weights.apply(&self.proj)
+ }
+}
+
+#[derive(Debug, Clone)]
+struct VitBlock {
+ attn: Attention,
+ mlp: Mlp,
+ norm1: candle_nn::LayerNorm, // applied before the attention
+ norm2: candle_nn::LayerNorm, // applied before the MLP
+}
+
+impl VitBlock {
+ fn new(vb: VarBuilder, dim: usize, num_heads: usize, cfg: &VisionConfig) -> Result<Self> {
+ let attn = Attention::new(vb.pp("attn"), dim, num_heads)?;
+ let mlp = Mlp::new(vb.pp("mlp"), dim, cfg.hidden_features, dim, cfg.act)?;
+ let norm1 = layer_norm(dim, 1e-5, vb.pp("norm1"))?;
+ let norm2 = layer_norm(dim, 1e-5, vb.pp("norm2"))?;
+ Ok(Self {
+ attn,
+ mlp,
+ norm1,
+ norm2,
+ })
+ }
+}
+
+impl Module for VitBlock {
+ fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+ let ys = xs.apply(&self.norm1)?.apply(&self.attn)?;
+ let xs = (xs + &ys)?; // residual connection around the attention
+ let ys = xs.apply(&self.norm2)?.apply(&self.mlp)?;
+ let xs = (&xs + &ys)?; // residual connection around the MLP
+ Ok(xs)
+ }
+}
+
+#[derive(Debug, Clone)]
+struct VisionTransformer {
+ patch_embed: LinearPatchEmbedding,
+ pos_embed: Tensor, // loaded with shape (1, embed_len, embed_dim)
+ blocks: Vec<VitBlock>,
+ norm: candle_nn::LayerNorm, // applied after the last block
+}
+
+impl VisionTransformer {
+ fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
+ let patch_embed = LinearPatchEmbedding::new(vb.pp("patch_embed"))?;
+ let pos_embed = vb.get((1, cfg.embed_len, cfg.embed_dim), "pos_embed")?;
+ let blocks = (0..cfg.num_blocks)
+ .map(|i| {
+ VitBlock::new(
+ vb.pp(format!("blocks.{i}")),
+ cfg.embed_dim,
+ cfg.num_heads,
+ cfg,
+ )
+ })
+ .collect::<Result<_>>()?; // stops at the first block that fails to load
+ let norm = layer_norm(cfg.embed_dim, 1e-5, vb.pp("norm"))?;
+ Ok(Self {
+ patch_embed,
+ pos_embed,
+ blocks,
+ norm,
+ })
+ }
+}
+
+impl Module for VisionTransformer {
+ fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+ let mut xs = (&xs.apply(&self.patch_embed)? + &self.pos_embed)?; // embed the patches, then add pos_embed
+ for block in self.blocks.iter() {
+ xs = xs.apply(block)?;
+ }
+ xs.apply(&self.norm)
+ }
+}
+
+#[derive(Debug, Clone)]
+pub struct Encoder {
+ model: VisionTransformer,
+}
+
+impl Encoder {
+ fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
+ let model = VisionTransformer::new(cfg, vb.pp("model.visual"))?; // weights are read under the "model.visual" prefix
+ Ok(Self { model })
+ }
+}
+
+impl Module for Encoder {
+ fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+ xs.apply(&self.model)
+ }
+}
+
+#[derive(Debug, Clone)]
+struct Mlp {
+ fc1: Linear, // in_features -> hidden_features
+ act: candle_nn::Activation,
+ fc2: Linear, // hidden_features -> out_features
+}
+
+impl Mlp {
+ fn new(
+ vb: VarBuilder,
+ in_features: usize,
+ hidden_features: usize,
+ out_features: usize,
+ act: candle_nn::Activation,
+ ) -> Result<Self> {
+ let fc1 = linear_b(in_features, hidden_features, true, vb.pp("fc1"))?;
+ let fc2 = linear_b(hidden_features, out_features, true, vb.pp("fc2"))?;
+ Ok(Self { fc1, act, fc2 })
+ }
+}
+
+impl Module for Mlp {
+ fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+ xs.apply(&self.fc1)?.apply(&self.act)?.apply(&self.fc2) // fc1, then activation, then fc2
+ }
+}
+
+#[derive(Debug, Clone)]
+struct VisionProjection {
+ mlp: Mlp,
+}
+
+impl VisionProjection {
+ fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
+ let mlp = Mlp::new(
+ vb.pp("mlp"),
+ cfg.image_embedding_dim, // input width
+ cfg.hidden_dim, // hidden width
+ cfg.model_dim, // output width
+ cfg.act,
+ )?;
+ Ok(Self { mlp })
+ }
+}
+
+impl Module for VisionProjection {
+ fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+ xs.apply(&self.mlp)
+ }
+}
+
+#[derive(Debug, Clone)]
+pub struct VisionEncoder {
+ encoder: Encoder,
+ projection: VisionProjection, // applied to the encoder output
+}
+
+impl VisionEncoder {
+ pub fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
+ let encoder = Encoder::new(cfg, vb.pp("encoder"))?;
+ let projection = VisionProjection::new(cfg, vb.pp("projection"))?;
+ Ok(Self {
+ encoder,
+ projection,
+ })
+ }
+}
+
+impl Module for VisionEncoder {
+ fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+ let (b, c, hp1, wp2) = xs.dims4()?;
+ let (p1, p2) = (14, 14); // patch height and width
+ let h = hp1 / p1; // number of patch rows
+ let w = wp2 / p2; // number of patch columns
+ let xs = xs
+ .reshape((b, c, h, p1, w, p2))? // the width axis splits into w patches, so h != w reshapes correctly
+ .permute((0, 2, 4, 1, 3, 5))? // -> (b, h, w, c, p1, p2)
+ .reshape((b, h * w, c * p1 * p2))?; // one flattened row per patch
+ xs.apply(&self.encoder)?.apply(&self.projection)
+ }
+}
+
+pub struct Model {
+ pub text_model: PhiModel,
+ pub vision_encoder: VisionEncoder,
+}
+
+impl Model {
+ pub fn new(config: &Config, vb: VarBuilder) -> Result<Self> {
+ let text_model = PhiModel::new_v2(&config.phi_config, vb.pp("text_model"))?; // weights under "text_model"
+ let vision_encoder = VisionEncoder::new(&config.vision_config, vb.pp("vision_encoder"))?; // weights under "vision_encoder"
+ Ok(Self {
+ text_model,
+ vision_encoder,
+ })
+ }
+
+ pub fn vision_encoder(&self) -> &VisionEncoder { // shared borrow of the vision encoder
+ &self.vision_encoder
+ }
+
+ pub fn text_model(&mut self) -> &mut PhiModel { // mutable borrow of the text model
+ &mut self.text_model
+ }
+}