Add LLaVA support (#2234)

* first commit

* llava

* clippy and fmt

* some fixes

* minor fixes

* remove useless file

* refactor: Remove llava/constants.rs and update llava/mod.rs

* modify variable name

* modify code after clippy

* Minor tweaks.

---------

Co-authored-by: laurent <laurent.mazare@gmail.com>
Author: chenwanqq
Date: 2024-06-03 17:54:09 +08:00
Committed by: GitHub
Parent: 03344d3c19
Commit: cd4d941ed1
12 changed files with 1567 additions and 0 deletions

@@ -262,6 +262,20 @@ impl ClipEncoder {
        }
        Ok(xs)
    }
    // required by LLaVA
    pub fn output_hidden_states(
        &self,
        xs: &Tensor,
        causal_attention_mask: Option<&Tensor>,
    ) -> Result<Vec<Tensor>> {
        let mut xs = xs.clone();
        let mut hidden_states = Vec::new();
        for layer in self.layers.iter() {
            xs = layer.forward(&xs, causal_attention_mask)?;
            hidden_states.push(xs.clone());
        }
        Ok(hidden_states)
    }
}

/// A CLIP transformer based model.
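
For context on how this method gets used: LLaVA-style pipelines do not take the encoder's final pooled output but instead select the hidden state of one particular layer. Below is a minimal sketch of such a selection, assuming the `ClipEncoder` type above is in scope and using the in-repo `candle` alias for candle-core; the `select_encoder_layer` helper and its negative-index convention are illustrative and not part of this commit.

use candle::{Result, Tensor};

// Illustrative only: pick one layer's hidden state from the per-layer list
// returned by `output_hidden_states`. `select_layer` counts from the end
// (e.g. -2 selects the next-to-last encoder layer). The list holds one
// tensor per encoder layer and does not include the input embeddings.
fn select_encoder_layer(
    encoder: &ClipEncoder,
    xs: &Tensor,
    causal_attention_mask: Option<&Tensor>,
    select_layer: isize,
) -> Result<Tensor> {
    let hidden_states = encoder.output_hidden_states(xs, causal_attention_mask)?;
    let index = hidden_states.len() as isize + select_layer;
    Ok(hidden_states[index as usize].clone())
}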

@@ -46,6 +46,19 @@ impl ClipVisionConfig {
            patch_size: 32,
        }
    }
    pub fn clip_vit_large_patch14_336() -> Self {
        Self {
            embed_dim: 1024,
            activation: Activation::QuickGelu,
            intermediate_size: 4096,
            num_hidden_layers: 24,
            num_attention_heads: 16,
            projection_dim: 768,
            num_channels: 3,
            image_size: 336,
            patch_size: 14,
        }
    }
}

// https://github.com/huggingface/transformers/blob/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip/modeling_clip.py#L112
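
Worth noting about this preset: 336-pixel inputs with 14-pixel patches give 336 / 14 = 24 patches per side, i.e. 24 × 24 = 576 patch tokens per image (plus a class token), which is the number of visual tokens a LLaVA-1.5 style model injects into the prompt. A quick sanity-check sketch, assuming the config fields are public as the struct literal above suggests:

// Illustrative check, not part of the commit.
let cfg = ClipVisionConfig::clip_vit_large_patch14_336();
let patches_per_side = cfg.image_size / cfg.patch_size; // 336 / 14 = 24
assert_eq!(patches_per_side * patches_per_side, 576); // patch tokens per image, class token excluded
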
@@ -130,6 +143,17 @@ impl ClipVisionTransformer {
            pre_layer_norm,
        })
    }
    // required by LLaVA
    pub fn output_hidden_states(&self, pixel_values: &Tensor) -> Result<Vec<Tensor>> {
        let hidden_states = pixel_values
            .apply(&self.embeddings)?
            .apply(&self.pre_layer_norm)?;
        let mut result = self.encoder.output_hidden_states(&hidden_states, None)?;
        let encoder_outputs = result.last().unwrap();
        let pooled_output = encoder_outputs.i((.., 0, ..))?;
        result.push(self.final_layer_norm.forward(&pooled_output)?.clone());
        Ok(result)
    }
}

impl Module for ClipVisionTransformer {
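
To sketch how the pieces fit together (illustrative only; the commit's actual LLaVA code lives in the new llava module and may differ): run the vision tower on preprocessed pixel values, pick one of the returned hidden states, and drop the leading class token before the features go to the multimodal projector. Note that the returned vector ends with an extra element, the layer-normed pooled output pushed above, so the per-layer sequence outputs occupy indices 0..len-1. The `llava_image_features` name and `select_layer` parameter below are assumptions for the sketch, and `ClipVisionTransformer` is assumed to be in scope.

use candle::{Result, Tensor};

// Illustrative only: turn preprocessed pixel values into per-patch features.
fn llava_image_features(
    tower: &ClipVisionTransformer,
    pixel_values: &Tensor, // (batch, 3, 336, 336), already resized and normalized
    select_layer: isize,   // negative index over the per-layer outputs
) -> Result<Tensor> {
    let states = tower.output_hidden_states(pixel_values)?;
    // The last element is the layer-normed pooled output pushed above; the
    // per-layer sequence outputs occupy indices 0 .. states.len() - 1.
    let num_layers = states.len() - 1;
    let selected = &states[(num_layers as isize + select_layer) as usize];
    // Shape (batch, 1 + num_patches, embed_dim): drop the leading class token
    // so only the patch tokens reach the multimodal projector.
    let num_patches = selected.dim(1)? - 1;
    selected.narrow(1, 1, num_patches)
}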