Added new language pairs to marian-mt example. (#2860)

* added new language pairs to marian-mt

* lint

* separated the Python code for converting tokenizers into its own file and added a requirements.txt for its dependencies; updated the instructions in the readme and included the Python version (see the loading sketch below)

* Cleanup.

---------

Co-authored-by: Laurent <laurent.mazare@gmail.com>
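
Since the tokenizer conversion now lives in its own Python script, the Rust side only needs to load the resulting tokenizer.json files. Below is a minimal, hedged sketch of that loading step using the tokenizers crate; the function name and path arguments are illustrative, not the example's actual code.

// A hedged sketch, not the example's actual code: load the source- and
// target-language tokenizers produced by the conversion script.
use tokenizers::Tokenizer;

fn load_tokenizers(src_path: &str, tgt_path: &str) -> anyhow::Result<(Tokenizer, Tokenizer)> {
    // Tokenizer::from_file returns a boxed error type; map it into anyhow::Error.
    let src = Tokenizer::from_file(src_path).map_err(anyhow::Error::msg)?;
    let tgt = Tokenizer::from_file(tgt_path).map_err(anyhow::Error::msg)?;
    Ok((src, tgt))
}

Two files are loaded because Marian checkpoints use separate source- and target-language vocabularies.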
Author: Kyle Birnbaum
Date: 2025-04-02 14:50:14 -07:00 (committed by GitHub)
parent b4daa03e59
commit d6db305829
6 changed files with 311 additions and 1435 deletions

@@ -81,6 +81,126 @@ impl Config {
            vocab_size: 59514,
        }
    }
    pub fn opus_mt_en_zh() -> Self {
        Self {
            activation_function: candle_nn::Activation::Swish,
            d_model: 512,
            decoder_attention_heads: 8,
            decoder_ffn_dim: 2048,
            decoder_layers: 6,
            decoder_start_token_id: 65000,
            decoder_vocab_size: Some(65001),
            encoder_attention_heads: 8,
            encoder_ffn_dim: 2048,
            encoder_layers: 6,
            eos_token_id: 0,
            forced_eos_token_id: 0,
            is_encoder_decoder: true,
            max_position_embeddings: 512,
            pad_token_id: 65000,
            scale_embedding: true,
            share_encoder_decoder_embeddings: true,
            use_cache: true,
            vocab_size: 65001,
        }
    }
    pub fn opus_mt_en_hi() -> Self {
        Self {
            activation_function: candle_nn::Activation::Swish,
            d_model: 512,
            decoder_attention_heads: 8,
            decoder_ffn_dim: 2048,
            decoder_layers: 6,
            decoder_start_token_id: 61949,
            decoder_vocab_size: Some(61950),
            encoder_attention_heads: 8,
            encoder_ffn_dim: 2048,
            encoder_layers: 6,
            eos_token_id: 0,
            forced_eos_token_id: 0,
            is_encoder_decoder: true,
            max_position_embeddings: 512,
            pad_token_id: 61949,
            scale_embedding: true,
            share_encoder_decoder_embeddings: true,
            use_cache: true,
            vocab_size: 61950,
        }
    }
    pub fn opus_mt_en_es() -> Self {
        Self {
            activation_function: candle_nn::Activation::Swish,
            d_model: 512,
            decoder_attention_heads: 8,
            decoder_ffn_dim: 2048,
            decoder_layers: 6,
            decoder_start_token_id: 65000,
            decoder_vocab_size: Some(65001),
            encoder_attention_heads: 8,
            encoder_ffn_dim: 2048,
            encoder_layers: 6,
            eos_token_id: 0,
            forced_eos_token_id: 0,
            is_encoder_decoder: true,
            max_position_embeddings: 512,
            pad_token_id: 65000,
            scale_embedding: true,
            share_encoder_decoder_embeddings: true,
            use_cache: true,
            vocab_size: 65001,
        }
    }
    pub fn opus_mt_en_fr() -> Self {
        Self {
            activation_function: candle_nn::Activation::Swish,
            d_model: 512,
            decoder_attention_heads: 8,
            decoder_ffn_dim: 2048,
            decoder_layers: 6,
            decoder_start_token_id: 59513,
            decoder_vocab_size: Some(59514),
            encoder_attention_heads: 8,
            encoder_ffn_dim: 2048,
            encoder_layers: 6,
            eos_token_id: 0,
            forced_eos_token_id: 0,
            is_encoder_decoder: true,
            max_position_embeddings: 512,
            pad_token_id: 59513,
            scale_embedding: true,
            share_encoder_decoder_embeddings: true,
            use_cache: true,
            vocab_size: 59514,
        }
    }
    pub fn opus_mt_en_ru() -> Self {
        Self {
            activation_function: candle_nn::Activation::Swish,
            d_model: 512,
            decoder_attention_heads: 8,
            decoder_ffn_dim: 2048,
            decoder_layers: 6,
            decoder_start_token_id: 62517,
            decoder_vocab_size: Some(62518),
            encoder_attention_heads: 8,
            encoder_ffn_dim: 2048,
            encoder_layers: 6,
            eos_token_id: 0,
            forced_eos_token_id: 0,
            is_encoder_decoder: true,
            max_position_embeddings: 512,
            pad_token_id: 62517,
            scale_embedding: true,
            share_encoder_decoder_embeddings: true,
            use_cache: true,
            vocab_size: 62518,
        }
    }
}
#[derive(Debug, Clone)]
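
For context, here is a minimal sketch of how an example binary might select among the newly added constructors. The Which enum and its variants are assumptions for illustration; only the Config constructors themselves come from this commit.

// Hypothetical selector over the newly added language pairs; the enum and
// its variants are illustrative, the constructors are from this commit.
use candle_transformers::models::marian;

#[derive(Clone, Copy, Debug)]
enum Which {
    EnZh,
    EnHi,
    EnEs,
    EnFr,
    EnRu,
}

impl Which {
    fn config(self) -> marian::Config {
        match self {
            Which::EnZh => marian::Config::opus_mt_en_zh(),
            Which::EnHi => marian::Config::opus_mt_en_hi(),
            Which::EnEs => marian::Config::opus_mt_en_es(),
            Which::EnFr => marian::Config::opus_mt_en_fr(),
            Which::EnRu => marian::Config::opus_mt_en_ru(),
        }
    }
}

Note that in each new config the pad token id, the decoder start token id, and the last vocabulary index coincide (e.g. 65000 with a vocabulary of 65001 entries), a pattern shared by the opus-mt checkpoints these constructors mirror.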