Allow lazy loading of npz files and use it in llama to reduce memory usage in the CPU version. (#141)

Author: Laurent Mazare
Date: 2023-07-11 20:22:34 +01:00
Committed by: GitHub
parent 37cad85869
commit fa760759e5
4 changed files with 77 additions and 9 deletions


@@ -145,11 +145,7 @@ fn main() -> Result<()> {
     let cache = model::Cache::new(!args.no_kv_cache, &config, &device);
     let (llama, tokenizer_filename) = match args.npy {
         Some(filename) => {
-            let tensors = Tensor::read_npz(filename)?
-                .into_iter()
-                .map(|(n, t)| Ok((n, t.to_dtype(DTYPE)?)))
-                .collect::<Result<std::collections::HashMap<String, Tensor>>>()?;
-            let vb = VarBuilder::from_tensors(tensors, DTYPE, &device);
+            let vb = VarBuilder::from_npz(filename, DTYPE, &device)?;
             let tokenizer = std::path::PathBuf::from("llama-tokenizer.json");
             (Llama::load(vb, &cache, &config)?, tokenizer)
         }
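For context, here is a minimal sketch of the lazy-loading idea behind this change. An .npz file is a zip archive of .npy entries, so a loader can index only the entry names up front and decompress a single tensor when it is first requested, instead of materializing every array in memory as the removed `Tensor::read_npz` path did. This is not candle's actual implementation: `LazyNpz` and `read_raw` are hypothetical names, and the sketch assumes the `zip` crate for archive access.

use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
use std::path::PathBuf;

// Hypothetical lazy npz reader: keeps only the archive path and an
// entry-name index in memory; tensor bytes stay on disk until asked for.
struct LazyNpz {
    path: PathBuf,
    // Maps a tensor name ("w" for the entry "w.npy") to its index in
    // the archive, so lookups do not rescan the file.
    index: HashMap<String, usize>,
}

impl LazyNpz {
    fn open(path: impl Into<PathBuf>) -> zip::result::ZipResult<Self> {
        let path = path.into();
        // Scan entry names once; no tensor data is read here.
        let archive = zip::ZipArchive::new(File::open(&path)?)?;
        let index = archive
            .file_names()
            .enumerate()
            .map(|(i, name)| (name.trim_end_matches(".npy").to_string(), i))
            .collect();
        Ok(Self { path, index })
    }

    // Decompresses a single entry on demand; the rest of the archive is
    // untouched. A real loader would parse the .npy header here and
    // build a tensor of the requested dtype.
    fn read_raw(&self, name: &str) -> zip::result::ZipResult<Option<Vec<u8>>> {
        let Some(&i) = self.index.get(name) else {
            return Ok(None);
        };
        let mut archive = zip::ZipArchive::new(File::open(&self.path)?)?;
        let mut entry = archive.by_index(i)?;
        let mut buf = Vec::with_capacity(entry.size() as usize);
        entry.read_to_end(&mut buf)?;
        Ok(Some(buf))
    }
}

At the call site this is why the diff shrinks to the single `VarBuilder::from_npz` line: the dtype conversion and HashMap collection move behind the builder, and with a lazy backend each weight only needs to be resident while `Llama::load` consumes it, which is what reduces peak memory in the CPU version.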