mirror of
https://github.com/huggingface/candle.git
synced 2025-06-16 02:38:10 +00:00
Added new language pairs to marian-mt example. (#2860)
* added new language pairs to marian-mt * lint * separated python code for converting tokenizers into its own file and added a requirements.txt for dependencies, updated instructions in readme and included python version * Cleanup. --------- Co-authored-by: Laurent <laurent.mazare@gmail.com>
This commit is contained in:
@ -0,0 +1,53 @@
|
||||
from pathlib import Path
|
||||
import warnings
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
from transformers.convert_slow_tokenizer import SpmConverter, requires_backends, import_protobuf
|
||||
|
||||
class MarianConverter(SpmConverter):
    """Build a fast tokenizer from one SentencePiece model of a slow Marian tokenizer.

    The slow tokenizer exposes a list of SentencePiece model files
    (``spm_files``); ``index`` selects which one is parsed. Token ids are not
    taken from the SentencePiece model but from the ``vocab.json`` file stored
    in the same directory as ``spm_files[0]``.
    """

    def __init__(self, *args, index: int = 0):
        """Parse ``spm_files[index]`` and load the accompanying ``vocab.json``.

        Args:
            *args: Forwarded unchanged to the initializer that follows
                ``SpmConverter`` in the MRO.
            index: Position in ``original_tokenizer.spm_files`` of the
                SentencePiece model to convert.
        """
        requires_backends(self, "protobuf")

        # Deliberately bypass SpmConverter.__init__ and run the next
        # initializer in the MRO instead.
        # NOTE(review): presumably because SpmConverter.__init__ would load the
        # model from a single vocab file, whereas here the model file is
        # selected through `index` -- confirm against transformers.
        super(SpmConverter, self).__init__(*args)

        model_pb2 = import_protobuf()

        m = model_pb2.ModelProto()
        print(self.original_tokenizer.spm_files)
        with open(self.original_tokenizer.spm_files[index], "rb") as f:
            m.ParseFromString(f.read())
        self.proto = m
        print(self.original_tokenizer)

        # The token -> id mapping lives in vocab.json next to the SentencePiece
        # files; it is located via the first spm file regardless of `index`.
        import json

        dir_path = Path(self.original_tokenizer.spm_files[0]).parents[0]
        # Explicit encoding so the read does not depend on the platform default.
        with open(dir_path / "vocab.json", "r", encoding="utf-8") as f:
            self._vocab = json.load(f)

        if self.proto.trainer_spec.byte_fallback:
            if not getattr(self, "handle_byte_fallback", None):
                warnings.warn(
                    "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
                    " which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
                    " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
                    "unknown tokens into a sequence of byte tokens matching the original piece of text."
                )

    def vocab(self, proto):
        """Return ``(piece, score)`` pairs ordered by the ids found in ``vocab.json``.

        The list has ``max(id) + 1`` entries. Slots whose id is not claimed by
        any piece of ``proto`` keep the placeholder ``("<NIL>", -100)``.

        Args:
            proto: Parsed SentencePiece model; its ``pieces`` are iterated and
                each piece's ``piece`` and ``score`` fields are read.

        Returns:
            A list of ``(piece, score)`` tuples indexed by token id.
        """
        vocab_size = max(self._vocab.values()) + 1
        vocab = [("<NIL>", -100) for _ in range(vocab_size)]
        for piece in proto.pieces:
            try:
                index = self._vocab[piece.piece]
            except KeyError:
                print(f"Ignored missing piece {piece.piece}")
                # Skip the piece entirely: without this, the assignment below
                # would reuse the index of the previous piece (or fail with a
                # NameError when the very first piece is missing).
                continue
            vocab[index] = (piece.piece, piece.score)
        return vocab
|
||||
|
||||
|
||||
# Load the slow (SentencePiece-backed) Marian tokenizer once, then convert each
# of its SentencePiece models into a standalone fast-tokenizer JSON file.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en", use_fast=False)

# (position in tokenizer.spm_files, output file name)
for spm_index, output_name in (
    (0, "tokenizer-marian-base-fr.json"),
    (1, "tokenizer-marian-base-en.json"),
):
    fast_tokenizer = MarianConverter(tokenizer, index=spm_index).converted()
    fast_tokenizer.save(output_name)
|
22
candle-examples/examples/marian-mt/python/requirements.txt
Normal file
22
candle-examples/examples/marian-mt/python/requirements.txt
Normal file
@ -0,0 +1,22 @@
|
||||
certifi==2025.1.31
|
||||
charset-normalizer==3.4.1
|
||||
click==8.1.8
|
||||
filelock==3.18.0
|
||||
fsspec==2025.3.2
|
||||
huggingface-hub==0.30.1
|
||||
idna==3.10
|
||||
joblib==1.4.2
|
||||
numpy==2.2.4
|
||||
packaging==24.2
|
||||
protobuf==6.30.2
|
||||
pyyaml==6.0.2
|
||||
regex==2024.11.6
|
||||
requests==2.32.3
|
||||
sacremoses==0.1.1
|
||||
safetensors==0.5.3
|
||||
sentencepiece==0.2.0
|
||||
tokenizers==0.21.1
|
||||
tqdm==4.67.1
|
||||
transformers==4.50.3
|
||||
typing-extensions==4.13.0
|
||||
urllib3==2.3.0
|
Reference in New Issue
Block a user