Remove the end of text tokens. (#289)

This commit is contained in:
Laurent Mazare
2023-07-31 20:43:57 +01:00
committed by GitHub
parent 9ae1f6afee
commit 6b98b66eb3
2 changed files with 61 additions and 2 deletions

View File

@@ -266,7 +266,8 @@ fn run_eval(tokenizer: Tokenizer, config_path: &std::path::PathBuf, args: Args)
let file = std::io::BufReader::new(file);
let mut tokens = vec![];
for line in file.lines() {
let line = tokenizer.encode(line?, false).map_err(E::msg)?;
let line = line?.replace("<|endoftext|>", "");
let line = tokenizer.encode(line, false).map_err(E::msg)?;
tokens.push(line.get_ids().to_vec())
}
let tokens = tokens.concat();