Avoid copying the data on squeeze and unsqueeze. (#1884)

* Avoid copying the data on squeeze and unsqueeze.

* Fix the quantized llama example.

* Unrelated fix for the quantized stable-lm example on cuda.

* Fix for mamba on cuda (unrelated to the PR).
This commit is contained in:
Laurent Mazare
2024-03-20 13:04:36 +01:00
committed by GitHub
parent 2a8679509e
commit 455c42aa72
5 changed files with 47 additions and 8 deletions

View File

@@ -288,12 +288,12 @@ fn main() -> Result<()> {
 };
 let device = candle_examples::device(args.cpu)?;
-let (model, device) = if args.quantized {
+let model = if args.quantized {
 let filename = &filenames[0];
 let vb =
 candle_transformers::quantized_var_builder::VarBuilder::from_gguf(filename, &device)?;
 let model = QStableLM::new(&config, vb)?;
-(Model::Quantized(model), Device::Cpu)
+Model::Quantized(model)
 } else {
 let dtype = if device.is_cuda() {
 DType::BF16
@@ -302,7 +302,7 @@ fn main() -> Result<()> {
 };
 let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
 let model = StableLM::new(&config, vb)?;
-(Model::StableLM(model), device)
+Model::StableLM(model)
 };
 println!("loaded the model in {:?}", start.elapsed());