mirror of
https://github.com/huggingface/candle.git
synced 2025-06-16 02:38:10 +00:00
Add a flag to force running the quantized model on CPUs. (#1778)

* Add a flag to force running the quantized model on CPUs.
* Add encodec to the readme.
This commit is contained in:
@@ -83,6 +83,8 @@ We also provide a some command line based examples using state of the art models
 - [Replit-code-v1.5](./candle-examples/examples/replit-code/): a 3.3b LLM specialized for code completion.
 - [Yi-6B / Yi-34B](./candle-examples/examples/yi/): two bilingual
   (English/Chinese) general LLMs with 6b and 34b parameters.
+- [EnCodec](./candle-examples/examples/encodec/): high-quality audio compression
+  model using residual vector quantization.
 - [Quantized LLaMA](./candle-examples/examples/quantized/): quantized version of
   the LLaMA model using the same quantization techniques as
   [llama.cpp](https://github.com/ggerganov/llama.cpp).
@@ -210,13 +212,15 @@ If you have an addition to this list, please submit a pull request.
 - Text to text.
     - T5 and its variants: FlanT5, UL2, MADLAD400 (translation), CoEdit (Grammar correction).
     - Marian MT (Machine Translation).
-    - Whisper (multi-lingual support).
 - Text to image.
     - Stable Diffusion v1.5, v2.1, XL v1.0.
     - Wurstchen v2.
 - Image to text.
     - BLIP.
     - TrOCR.
+- Audio.
+    - Whisper, multi-lingual text-to-speech.
+    - EnCodec, audio compression model.
 - Computer Vision Models.
     - DINOv2, ConvMixer, EfficientNet, ResNet, ViT, VGG, RepVGG, ConvNeXT,
       ConvNeXTv2.
@@ -216,6 +216,10 @@ struct Args {
     #[arg(long)]
     split_prompt: bool,

+    /// Run on CPU rather than GPU even if a GPU is available.
+    #[arg(long)]
+    cpu: bool,
+
     /// Penalty to be applied for repeating tokens, 1. means no penalty.
     #[arg(long, default_value_t = 1.1)]
     repeat_penalty: f32,
@@ -365,7 +369,7 @@ fn main() -> anyhow::Result<()> {
     let model_path = args.model()?;
     let mut file = std::fs::File::open(&model_path)?;
     let start = std::time::Instant::now();
-    let device = candle_examples::device(false)?;
+    let device = candle_examples::device(args.cpu)?;

     let mut model = match model_path.extension().and_then(|v| v.to_str()) {
         Some("gguf") => {
|
Reference in New Issue
Block a user