mirror of
https://github.com/huggingface/candle.git
synced 2025-06-15 10:26:33 +00:00
Add Moondream transformer implementation and example (#1970)
* moondream implementation * add moondream example * change config default activation * Add assets and integrate phi mixformer with example * Make use of kv cache and fix seq_len bug; Clean up example code * Add README link to example * Remove pos_embed scaling; Remove assets; Add to README; Expand VisionConfig * Delete image * Use apply instead of forward
This commit is contained in:
@ -131,6 +131,8 @@ We also provide a some command line based examples using state of the art models
|
|||||||
dedicated submodels for hand-writing and printed recognition.
|
dedicated submodels for hand-writing and printed recognition.
|
||||||
- [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation
|
- [Marian-MT](./candle-examples/examples/marian-mt/): neural machine translation
|
||||||
model, generates the translated text from the input text.
|
model, generates the translated text from the input text.
|
||||||
|
- [Moondream](./candle-examples/examples/moondream/): tiny computer-vision model
|
||||||
|
that can answer real-world questions about images.
|
||||||
|
|
||||||
Run them using commands like:
|
Run them using commands like:
|
||||||
```
|
```
|
||||||
|
26
candle-examples/examples/moondream/README.md
Normal file
26
candle-examples/examples/moondream/README.md
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# candle-moondream
|
||||||
|
|
||||||
|
[Moondream](https://github.com/vikhyat/moondream) is a computer-vision model that can answer real-world questions about images. It's tiny by today's standards, with only 1.6B parameters. That enables it to run on a variety of devices, including mobile phones and edge devices.
|
||||||
|
|
||||||
|
## Running some examples
|
||||||
|
First download an example image
|
||||||
|
```bash
|
||||||
|
$ wget https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg
|
||||||
|
```
|
||||||
|
|
||||||
|
<img src="https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg" width="200">
|
||||||
|
|
||||||
|
Now you can run Moondream from the `candle-examples` crate:
|
||||||
|
```bash
|
||||||
|
$ cargo run --example moondream --release -- --prompt "What is the girl eating?" --image "./demo-1.jpg"
|
||||||
|
|
||||||
|
avx: false, neon: true, simd128: false, f16c: false
|
||||||
|
temp: 0.00 repeat-penalty: 1.00 repeat-last-n: 64
|
||||||
|
retrieved the files in 3.395583ms
|
||||||
|
Running on CPU, to run on GPU(metal), build this example with `--features metal`
|
||||||
|
loaded the model in 5.485493792s
|
||||||
|
loaded and encoded the image Tensor[dims 3, 378, 378; f32] in 4.801396417s
|
||||||
|
starting the inference loop
|
||||||
|
The girl is eating a hamburger.<
|
||||||
|
9 tokens generated (0.68 token/s)
|
||||||
|
```
|
245
candle-examples/examples/moondream/main.rs
Normal file
245
candle-examples/examples/moondream/main.rs
Normal file
@ -0,0 +1,245 @@
|
|||||||
|
#[cfg(feature = "mkl")]
|
||||||
|
extern crate intel_mkl_src;
|
||||||
|
|
||||||
|
#[cfg(feature = "accelerate")]
|
||||||
|
extern crate accelerate_src;
|
||||||
|
|
||||||
|
use anyhow::{Error as E, Result};
|
||||||
|
use clap::Parser;
|
||||||
|
|
||||||
|
use candle::{DType, Device, Tensor};
|
||||||
|
use candle_nn::VarBuilder;
|
||||||
|
use candle_transformers::{generation::LogitsProcessor, models::moondream};
|
||||||
|
use tokenizers::Tokenizer;
|
||||||
|
|
||||||
|
struct TextGeneration {
|
||||||
|
model: moondream::Model,
|
||||||
|
device: Device,
|
||||||
|
tokenizer: Tokenizer,
|
||||||
|
logits_processor: LogitsProcessor,
|
||||||
|
repeat_penalty: f32,
|
||||||
|
repeat_last_n: usize,
|
||||||
|
verbose_prompt: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TextGeneration {
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
fn new(
|
||||||
|
model: moondream::Model,
|
||||||
|
tokenizer: Tokenizer,
|
||||||
|
seed: u64,
|
||||||
|
temp: Option<f64>,
|
||||||
|
top_p: Option<f64>,
|
||||||
|
repeat_penalty: f32,
|
||||||
|
repeat_last_n: usize,
|
||||||
|
verbose_prompt: bool,
|
||||||
|
device: &Device,
|
||||||
|
) -> Self {
|
||||||
|
let logits_processor = LogitsProcessor::new(seed, temp, top_p);
|
||||||
|
Self {
|
||||||
|
model,
|
||||||
|
tokenizer,
|
||||||
|
logits_processor,
|
||||||
|
repeat_penalty,
|
||||||
|
repeat_last_n,
|
||||||
|
verbose_prompt,
|
||||||
|
device: device.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run(&mut self, prompt: &str, image_embeds: &Tensor, sample_len: usize) -> Result<()> {
|
||||||
|
use std::io::Write;
|
||||||
|
println!("starting the inference loop");
|
||||||
|
let tokens = self.tokenizer.encode(prompt, true).map_err(E::msg)?;
|
||||||
|
if tokens.is_empty() {
|
||||||
|
anyhow::bail!("Empty prompts are not supported in the Moondream model.")
|
||||||
|
}
|
||||||
|
if self.verbose_prompt {
|
||||||
|
for (token, id) in tokens.get_tokens().iter().zip(tokens.get_ids().iter()) {
|
||||||
|
let token = token.replace('▁', " ").replace("<0x0A>", "\n");
|
||||||
|
println!("{id:7} -> '{token}'");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut tokens = tokens.get_ids().to_vec();
|
||||||
|
let mut generated_tokens = 0usize;
|
||||||
|
|
||||||
|
let eos_token = match self.tokenizer.get_vocab(true).get("END") {
|
||||||
|
Some(token) => *token,
|
||||||
|
None => anyhow::bail!("cannot find the EOS token"),
|
||||||
|
};
|
||||||
|
|
||||||
|
let start_gen = std::time::Instant::now();
|
||||||
|
for index in 0..sample_len {
|
||||||
|
let context_size = if index > 0 { 1 } else { tokens.len() };
|
||||||
|
let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
|
||||||
|
let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
|
||||||
|
let logits = if index > 0 {
|
||||||
|
self.model.text_model.forward(&input)?
|
||||||
|
} else {
|
||||||
|
self.model
|
||||||
|
.text_model
|
||||||
|
.forward_with_img(&input, &image_embeds)?
|
||||||
|
};
|
||||||
|
let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
|
||||||
|
let logits = if self.repeat_penalty == 1. {
|
||||||
|
logits
|
||||||
|
} else {
|
||||||
|
let start_at = tokens.len().saturating_sub(self.repeat_last_n);
|
||||||
|
candle_transformers::utils::apply_repeat_penalty(
|
||||||
|
&logits,
|
||||||
|
self.repeat_penalty,
|
||||||
|
&tokens[start_at..],
|
||||||
|
)?
|
||||||
|
};
|
||||||
|
let next_token = self.logits_processor.sample(&logits)?;
|
||||||
|
tokens.push(next_token);
|
||||||
|
generated_tokens += 1;
|
||||||
|
if next_token == eos_token {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let token = self.tokenizer.decode(&[next_token], true).map_err(E::msg)?;
|
||||||
|
print!("{token}");
|
||||||
|
std::io::stdout().flush()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let dt = start_gen.elapsed();
|
||||||
|
println!(
|
||||||
|
"\n{generated_tokens} tokens generated ({:.2} token/s)",
|
||||||
|
generated_tokens as f64 / dt.as_secs_f64()
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Parser)]
|
||||||
|
struct Args {
|
||||||
|
/// Run on CPU rather than on GPU.
|
||||||
|
#[arg(long)]
|
||||||
|
cpu: bool,
|
||||||
|
|
||||||
|
/// Enable tracing (generates a trace-timestamp.json file).
|
||||||
|
#[arg(long)]
|
||||||
|
tracing: bool,
|
||||||
|
|
||||||
|
/// Display the token for the specified prompt.
|
||||||
|
#[arg(long)]
|
||||||
|
verbose_prompt: bool,
|
||||||
|
|
||||||
|
#[arg(long)]
|
||||||
|
prompt: String,
|
||||||
|
|
||||||
|
#[arg(long)]
|
||||||
|
image: String,
|
||||||
|
|
||||||
|
/// The temperature used to generate samples.
|
||||||
|
#[arg(long)]
|
||||||
|
temperature: Option<f64>,
|
||||||
|
|
||||||
|
/// Nucleus sampling probability cutoff.
|
||||||
|
#[arg(long)]
|
||||||
|
top_p: Option<f64>,
|
||||||
|
|
||||||
|
/// The seed to use when generating random samples.
|
||||||
|
#[arg(long, default_value_t = 299792458)]
|
||||||
|
seed: u64,
|
||||||
|
|
||||||
|
#[arg(long, default_value_t = 5000)]
|
||||||
|
sample_len: usize,
|
||||||
|
|
||||||
|
/// Penalty to be applied for repeating tokens, 1. means no penalty.
|
||||||
|
#[arg(long, default_value_t = 1.0)]
|
||||||
|
repeat_penalty: f32,
|
||||||
|
|
||||||
|
/// The context size to consider for the repeat penalty.
|
||||||
|
#[arg(long, default_value_t = 64)]
|
||||||
|
repeat_last_n: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Loads an image from disk using the image crate, this returns a tensor with shape
|
||||||
|
/// (3, 378, 378).
|
||||||
|
pub fn load_image<P: AsRef<std::path::Path>>(p: P) -> candle::Result<Tensor> {
|
||||||
|
let img = image::io::Reader::open(p)?
|
||||||
|
.decode()
|
||||||
|
.map_err(candle::Error::wrap)?
|
||||||
|
.resize_to_fill(378, 378, image::imageops::FilterType::Triangle); // Adjusted to 378x378
|
||||||
|
let img = img.to_rgb8();
|
||||||
|
let data = img.into_raw();
|
||||||
|
let data = Tensor::from_vec(data, (378, 378, 3), &Device::Cpu)?.permute((2, 0, 1))?;
|
||||||
|
let mean = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?;
|
||||||
|
let std = Tensor::new(&[0.5f32, 0.5, 0.5], &Device::Cpu)?.reshape((3, 1, 1))?;
|
||||||
|
(data.to_dtype(candle::DType::F32)? / 255.)?
|
||||||
|
.broadcast_sub(&mean)?
|
||||||
|
.broadcast_div(&std)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> anyhow::Result<()> {
|
||||||
|
use tracing_chrome::ChromeLayerBuilder;
|
||||||
|
use tracing_subscriber::prelude::*;
|
||||||
|
|
||||||
|
let args = Args::parse();
|
||||||
|
|
||||||
|
let _guard = if args.tracing {
|
||||||
|
let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
|
||||||
|
tracing_subscriber::registry().with(chrome_layer).init();
|
||||||
|
Some(guard)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
println!(
|
||||||
|
"avx: {}, neon: {}, simd128: {}, f16c: {}",
|
||||||
|
candle::utils::with_avx(),
|
||||||
|
candle::utils::with_neon(),
|
||||||
|
candle::utils::with_simd128(),
|
||||||
|
candle::utils::with_f16c()
|
||||||
|
);
|
||||||
|
println!(
|
||||||
|
"temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
|
||||||
|
args.temperature.unwrap_or(0.),
|
||||||
|
args.repeat_penalty,
|
||||||
|
args.repeat_last_n
|
||||||
|
);
|
||||||
|
|
||||||
|
let start = std::time::Instant::now();
|
||||||
|
let api = hf_hub::api::tokio::Api::new()?;
|
||||||
|
let repo = api.model("vikhyatk/moondream2".to_string());
|
||||||
|
let model_file = repo.get("model.safetensors").await?;
|
||||||
|
let tokenizer = repo.get("tokenizer.json").await?;
|
||||||
|
println!("retrieved the files in {:?}", start.elapsed());
|
||||||
|
let tokenizer = Tokenizer::from_file(tokenizer).map_err(E::msg)?;
|
||||||
|
|
||||||
|
let start = std::time::Instant::now();
|
||||||
|
let device = candle_examples::device(args.cpu)?;
|
||||||
|
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
|
||||||
|
let config = moondream::Config::v2();
|
||||||
|
let model = moondream::Model::new(&config, vb)?;
|
||||||
|
println!("loaded the model in {:?}", start.elapsed());
|
||||||
|
|
||||||
|
let start = std::time::Instant::now();
|
||||||
|
let image = load_image(args.image)?.to_device(&device)?;
|
||||||
|
let image_embeds = image.unsqueeze(0)?;
|
||||||
|
let image_embeds = image_embeds.apply(model.vision_encoder())?;
|
||||||
|
println!(
|
||||||
|
"loaded and encoded the image {image:?} in {:?}",
|
||||||
|
start.elapsed()
|
||||||
|
);
|
||||||
|
|
||||||
|
let prompt = format!("\n\nQuestion: {0}\n\nAnswer:", args.prompt);
|
||||||
|
|
||||||
|
let mut pipeline = TextGeneration::new(
|
||||||
|
model,
|
||||||
|
tokenizer,
|
||||||
|
args.seed,
|
||||||
|
args.temperature,
|
||||||
|
args.top_p,
|
||||||
|
args.repeat_penalty,
|
||||||
|
args.repeat_last_n,
|
||||||
|
args.verbose_prompt,
|
||||||
|
&device,
|
||||||
|
);
|
||||||
|
pipeline.run(&prompt, &image_embeds, args.sample_len)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
@ -438,6 +438,26 @@ impl MixFormerSequentialForCausalLM {
|
|||||||
xs.narrow(1, seq_len - 1, 1)?.apply(&self.head)?.squeeze(1)
|
xs.narrow(1, seq_len - 1, 1)?.apply(&self.head)?.squeeze(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn forward_with_img(&mut self, xs: &Tensor, img_embeds: &Tensor) -> Result<Tensor> {
|
||||||
|
let _enter = self.span.enter();
|
||||||
|
let xs = xs.apply(&self.embedding)?;
|
||||||
|
let mut xs = Tensor::cat(&[img_embeds.clone(), xs], 1)?;
|
||||||
|
let (_b_size, seq_len, _embds) = xs.dims3()?;
|
||||||
|
let mask = if seq_len <= 1 {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(get_mask(seq_len, xs.device())?)
|
||||||
|
};
|
||||||
|
for block in self.blocks.iter_mut() {
|
||||||
|
xs = block.forward(&xs, mask.as_ref())?
|
||||||
|
}
|
||||||
|
let xs = xs
|
||||||
|
.narrow(1, seq_len - 1, 1)?
|
||||||
|
.apply(&self.head)?
|
||||||
|
.squeeze(1)?;
|
||||||
|
Ok(xs)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn clear_kv_cache(&mut self) {
|
pub fn clear_kv_cache(&mut self) {
|
||||||
self.blocks.iter_mut().for_each(|b| b.clear_kv_cache())
|
self.blocks.iter_mut().for_each(|b| b.clear_kv_cache())
|
||||||
}
|
}
|
||||||
|
@ -24,6 +24,7 @@ pub mod mistral;
|
|||||||
pub mod mixformer;
|
pub mod mixformer;
|
||||||
pub mod mixtral;
|
pub mod mixtral;
|
||||||
pub mod mobileone;
|
pub mod mobileone;
|
||||||
|
pub mod moondream;
|
||||||
pub mod mpt;
|
pub mod mpt;
|
||||||
pub mod persimmon;
|
pub mod persimmon;
|
||||||
pub mod phi;
|
pub mod phi;
|
||||||
|
308
candle-transformers/src/models/moondream.rs
Normal file
308
candle-transformers/src/models/moondream.rs
Normal file
@ -0,0 +1,308 @@
|
|||||||
|
use crate::models::mixformer::{Config as PhiConfig, MixFormerSequentialForCausalLM as PhiModel};
|
||||||
|
use candle::{IndexOp, Result, Tensor, D};
|
||||||
|
use candle_nn::{layer_norm, linear_b, Linear, Module, VarBuilder};
|
||||||
|
|
||||||
|
pub struct Config {
|
||||||
|
pub phi_config: PhiConfig,
|
||||||
|
pub vision_config: VisionConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Config {
|
||||||
|
pub fn v2() -> Self {
|
||||||
|
Self {
|
||||||
|
phi_config: PhiConfig::v1_5(),
|
||||||
|
vision_config: VisionConfig::v2(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result<Tensor> {
|
||||||
|
let dim = q.dim(D::Minus1)?;
|
||||||
|
let scale_factor = 1.0 / (dim as f64).sqrt();
|
||||||
|
let k = k.transpose(D::Minus2, D::Minus1)?.contiguous()?;
|
||||||
|
let mut attn_weights = (q.contiguous()?.matmul(&k)? * scale_factor)?;
|
||||||
|
attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?.contiguous()?;
|
||||||
|
let attn_weights = attn_weights.matmul(&v.contiguous()?)?;
|
||||||
|
Ok(attn_weights)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
|
||||||
|
pub struct VisionConfig {
|
||||||
|
image_embedding_dim: usize,
|
||||||
|
model_dim: usize,
|
||||||
|
hidden_dim: usize,
|
||||||
|
hidden_features: usize,
|
||||||
|
embed_len: usize,
|
||||||
|
embed_dim: usize,
|
||||||
|
num_blocks: usize,
|
||||||
|
num_heads: usize,
|
||||||
|
act: candle_nn::Activation,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VisionConfig {
|
||||||
|
pub fn v2() -> Self {
|
||||||
|
Self {
|
||||||
|
image_embedding_dim: 1152,
|
||||||
|
model_dim: 2048,
|
||||||
|
hidden_dim: 2048 * 4,
|
||||||
|
hidden_features: 4304,
|
||||||
|
embed_len: 729,
|
||||||
|
embed_dim: 1152,
|
||||||
|
num_blocks: 27,
|
||||||
|
num_heads: 16,
|
||||||
|
act: candle_nn::Activation::Gelu,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct LinearPatchEmbedding {
|
||||||
|
linear: Linear,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LinearPatchEmbedding {
|
||||||
|
fn new(vb: VarBuilder) -> Result<Self> {
|
||||||
|
let linear = linear_b(588, 1152, true, vb.pp("linear"))?;
|
||||||
|
Ok(Self { linear })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Module for LinearPatchEmbedding {
|
||||||
|
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
|
||||||
|
xs.apply(&self.linear)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct Attention {
|
||||||
|
num_heads: usize,
|
||||||
|
head_dim: usize,
|
||||||
|
qkv: Linear,
|
||||||
|
proj: Linear,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Attention {
|
||||||
|
pub fn new(vb: VarBuilder, dim: usize, num_heads: usize) -> Result<Self> {
|
||||||
|
let qkv = linear_b(dim, dim * 3, true, vb.pp("qkv"))?;
|
||||||
|
let proj = linear_b(dim, dim, true, vb.pp("proj"))?;
|
||||||
|
Ok(Self {
|
||||||
|
num_heads,
|
||||||
|
head_dim: dim / num_heads,
|
||||||
|
qkv,
|
||||||
|
proj,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Module for Attention {
|
||||||
|
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
|
||||||
|
let (b, n, c) = xs.dims3()?;
|
||||||
|
let qkv = xs
|
||||||
|
.apply(&self.qkv)?
|
||||||
|
.reshape((b, n, 3, self.num_heads, self.head_dim))?
|
||||||
|
.permute((2, 0, 3, 1, 4))?;
|
||||||
|
let (q, k, v) = (qkv.i(0)?, qkv.i(1)?, qkv.i(2)?);
|
||||||
|
let attn_weights = scaled_dot_product_attention(&q, &k, &v)?;
|
||||||
|
let attn_weights = attn_weights.transpose(1, 2)?.reshape((b, n, c))?;
|
||||||
|
attn_weights.apply(&self.proj)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct VitBlock {
|
||||||
|
attn: Attention,
|
||||||
|
mlp: Mlp,
|
||||||
|
norm1: candle_nn::LayerNorm,
|
||||||
|
norm2: candle_nn::LayerNorm,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VitBlock {
|
||||||
|
fn new(vb: VarBuilder, dim: usize, num_heads: usize, cfg: &VisionConfig) -> Result<Self> {
|
||||||
|
let attn = Attention::new(vb.pp("attn"), dim, num_heads)?;
|
||||||
|
let mlp = Mlp::new(vb.pp("mlp"), dim, cfg.hidden_features, dim, cfg.act)?;
|
||||||
|
let norm1 = layer_norm(dim, 1e-5, vb.pp("norm1"))?;
|
||||||
|
let norm2 = layer_norm(dim, 1e-5, vb.pp("norm2"))?;
|
||||||
|
Ok(Self {
|
||||||
|
attn,
|
||||||
|
mlp,
|
||||||
|
norm1,
|
||||||
|
norm2,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Module for VitBlock {
|
||||||
|
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
|
||||||
|
let ys = xs.apply(&self.norm1)?.apply(&self.attn)?;
|
||||||
|
let xs = (xs + &ys)?;
|
||||||
|
let ys = xs.apply(&self.norm2)?.apply(&self.mlp)?;
|
||||||
|
let xs = (&xs + &ys)?;
|
||||||
|
Ok(xs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct VisionTransformer {
|
||||||
|
patch_embed: LinearPatchEmbedding,
|
||||||
|
pos_embed: Tensor,
|
||||||
|
blocks: Vec<VitBlock>,
|
||||||
|
norm: candle_nn::LayerNorm,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VisionTransformer {
|
||||||
|
fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
|
||||||
|
let patch_embed = LinearPatchEmbedding::new(vb.pp("patch_embed"))?;
|
||||||
|
let pos_embed = vb.get((1, cfg.embed_len, cfg.embed_dim), "pos_embed")?;
|
||||||
|
let blocks = (0..cfg.num_blocks)
|
||||||
|
.map(|i| {
|
||||||
|
VitBlock::new(
|
||||||
|
vb.pp(&format!("blocks.{}", i)),
|
||||||
|
cfg.embed_dim,
|
||||||
|
cfg.num_heads,
|
||||||
|
cfg,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect::<Result<_>>()?;
|
||||||
|
let norm = layer_norm(cfg.embed_dim, 1e-5, vb.pp("norm"))?;
|
||||||
|
Ok(Self {
|
||||||
|
patch_embed,
|
||||||
|
pos_embed,
|
||||||
|
blocks,
|
||||||
|
norm,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Module for VisionTransformer {
|
||||||
|
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
|
||||||
|
let mut xs = (&xs.apply(&self.patch_embed)? + &self.pos_embed)?;
|
||||||
|
for block in self.blocks.iter() {
|
||||||
|
xs = xs.apply(block)?;
|
||||||
|
}
|
||||||
|
xs.apply(&self.norm)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct Encoder {
|
||||||
|
model: VisionTransformer,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Encoder {
|
||||||
|
fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
|
||||||
|
let model = VisionTransformer::new(cfg, vb.pp("model.visual"))?;
|
||||||
|
Ok(Self { model })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Module for Encoder {
|
||||||
|
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
|
||||||
|
xs.apply(&self.model)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct Mlp {
|
||||||
|
fc1: Linear,
|
||||||
|
act: candle_nn::Activation,
|
||||||
|
fc2: Linear,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Mlp {
|
||||||
|
fn new(
|
||||||
|
vb: VarBuilder,
|
||||||
|
in_features: usize,
|
||||||
|
hidden_features: usize,
|
||||||
|
out_features: usize,
|
||||||
|
act: candle_nn::Activation,
|
||||||
|
) -> Result<Self> {
|
||||||
|
let fc1 = linear_b(in_features, hidden_features, true, vb.pp("fc1"))?;
|
||||||
|
let fc2 = linear_b(hidden_features, out_features, true, vb.pp("fc2"))?;
|
||||||
|
Ok(Self { fc1, act, fc2 })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Module for Mlp {
|
||||||
|
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
|
||||||
|
xs.apply(&self.fc1)?.apply(&self.act)?.apply(&self.fc2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct VisionProjection {
|
||||||
|
mlp: Mlp,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VisionProjection {
|
||||||
|
fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
|
||||||
|
let mlp = Mlp::new(
|
||||||
|
vb.pp("mlp"),
|
||||||
|
cfg.image_embedding_dim,
|
||||||
|
cfg.hidden_dim,
|
||||||
|
cfg.model_dim,
|
||||||
|
cfg.act,
|
||||||
|
)?;
|
||||||
|
Ok(Self { mlp })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Module for VisionProjection {
|
||||||
|
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
|
||||||
|
xs.apply(&self.mlp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct VisionEncoder {
|
||||||
|
encoder: Encoder,
|
||||||
|
projection: VisionProjection,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VisionEncoder {
|
||||||
|
pub fn new(cfg: &VisionConfig, vb: VarBuilder) -> Result<Self> {
|
||||||
|
let encoder = Encoder::new(cfg, vb.pp("encoder"))?;
|
||||||
|
let projection = VisionProjection::new(cfg, vb.pp("projection"))?;
|
||||||
|
Ok(Self {
|
||||||
|
encoder,
|
||||||
|
projection,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Module for VisionEncoder {
|
||||||
|
fn forward(&self, xs: &Tensor) -> Result<Tensor> {
|
||||||
|
let (b, c, hp1, wp2) = xs.dims4()?;
|
||||||
|
let (p1, p2) = (14, 14);
|
||||||
|
let h = hp1 / p1;
|
||||||
|
let w = wp2 / p2;
|
||||||
|
let xs = xs
|
||||||
|
.reshape((b, c, h, p1, h, p2))?
|
||||||
|
.permute((0, 2, 4, 1, 3, 5))?
|
||||||
|
.reshape((b, h * w, c * p1 * p2))?;
|
||||||
|
xs.apply(&self.encoder)?.apply(&self.projection)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct Model {
|
||||||
|
pub text_model: PhiModel,
|
||||||
|
pub vision_encoder: VisionEncoder,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Model {
|
||||||
|
pub fn new(config: &Config, vb: VarBuilder) -> Result<Self> {
|
||||||
|
let text_model = PhiModel::new_v2(&config.phi_config, vb.pp("text_model"))?;
|
||||||
|
let vision_encoder = VisionEncoder::new(&config.vision_config, vb.pp("vision_encoder"))?;
|
||||||
|
Ok(Self {
|
||||||
|
text_model,
|
||||||
|
vision_encoder,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn vision_encoder(&self) -> &VisionEncoder {
|
||||||
|
&self.vision_encoder
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn text_model(&mut self) -> &mut PhiModel {
|
||||||
|
&mut self.text_model
|
||||||
|
}
|
||||||
|
}
|
Reference in New Issue
Block a user