Mirror of https://github.com/huggingface/candle.git
Avoid copying the data on squeeze and unsqueeze. (#1884)
* Avoid copying the data on squeeze and unsqueeze.
* Fix the quantized llama example.
* Unrelated fix for the quantized stable-lm example on cuda.
* Fix for mamba on cuda (unrelated to the PR).
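The headline change makes `squeeze` and `unsqueeze` cheap shape operations rather than data copies. A minimal sketch of the behavior in question, assuming candle's public `Tensor` API from `candle_core` (the shapes here are illustrative):

```rust
use candle_core::{DType, Device, Result, Tensor};

fn main() -> Result<()> {
    let device = Device::Cpu;
    // A (2, 1, 3) tensor; the middle dimension has size 1.
    let t = Tensor::zeros((2, 1, 3), DType::F32, &device)?;

    // squeeze drops a size-1 dimension: (2, 1, 3) -> (2, 3).
    // With this change the result can reuse the same underlying
    // storage instead of copying the data.
    let squeezed = t.squeeze(1)?;
    assert_eq!(squeezed.dims(), &[2, 3]);

    // unsqueeze inserts a size-1 dimension: (2, 3) -> (2, 1, 3).
    let unsqueezed = squeezed.unsqueeze(1)?;
    assert_eq!(unsqueezed.dims(), &[2, 1, 3]);
    Ok(())
}
```

Since a size-1 dimension contributes nothing to the element layout, both operations only need to rewrite the shape metadata, which is what lets the copy be avoided.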
```diff
@@ -288,12 +288,12 @@ fn main() -> Result<()> {
     };
 
     let device = candle_examples::device(args.cpu)?;
-    let (model, device) = if args.quantized {
+    let model = if args.quantized {
         let filename = &filenames[0];
         let vb =
             candle_transformers::quantized_var_builder::VarBuilder::from_gguf(filename, &device)?;
         let model = QStableLM::new(&config, vb)?;
-        (Model::Quantized(model), Device::Cpu)
+        Model::Quantized(model)
     } else {
         let dtype = if device.is_cuda() {
             DType::BF16
@@ -302,7 +302,7 @@ fn main() -> Result<()> {
         };
         let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
         let model = StableLM::new(&config, vb)?;
-        (Model::StableLM(model), device)
+        Model::StableLM(model)
     };
 
     println!("loaded the model in {:?}", start.elapsed());
```
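On the stable-lm side, the quantized branch used to hand back `Device::Cpu` through the `(model, device)` tuple even though the weights were loaded with the device picked by `candle_examples::device`, which is exactly the kind of device mismatch that breaks the example on cuda. After the fix, the single up-front `device` binding is used throughout. A hedged sketch of that flow, with a made-up `Args` struct standing in for the example's CLI arguments:

```rust
use candle_core::{DType, Device, Result};

// Illustrative stand-in for the example's clap-derived arguments.
struct Args {
    cpu: bool,
    quantized: bool,
}

fn main() -> Result<()> {
    let args = Args { cpu: false, quantized: true };
    // The device is picked once and shared by both branches, so a
    // quantized model is no longer silently pinned to Device::Cpu.
    let device = candle_examples::device(args.cpu)?;
    // Mirrors the example's dtype choice: BF16 on cuda, F32 otherwise.
    let dtype = if device.is_cuda() { DType::BF16 } else { DType::F32 };
    println!("quantized={} device={device:?} dtype={dtype:?}", args.quantized);
    Ok(())
}
```

Dropping the tuple return also removes the shadowing of `device`, so the device the weights were loaded on and the device later tensors are created on can no longer drift apart.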