Support flash-attn in quantized phi3. (#2194)

Laurent Mazare
2024-05-18 17:12:56 +02:00
committed by GitHub
parent 01545f7303
commit eefc1c77ef
2 changed files with 50 additions and 11 deletions


@@ -90,6 +90,9 @@ struct Args {
     /// The model size to use.
     #[arg(long, default_value = "phi-3b")]
     which: Which,
+
+    #[arg(long)]
+    use_flash_attn: bool,
 }
 
 impl Args {
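
The hunk above adds a `--use-flash-attn` switch to the example's clap-derived `Args`. As a standalone sketch (not part of the commit, and assuming clap's derive feature), a bare `#[arg(long)]` on a `bool` field becomes an optional flag that defaults to `false` when omitted:

// Standalone sketch; not part of the diff above.
use clap::Parser;

#[derive(Parser, Debug)]
struct Args {
    /// Enable the flash-attention kernels in the attention layers.
    #[arg(long)]
    use_flash_attn: bool,
}

fn main() {
    // Passing `--use-flash-attn` sets the field to true;
    // omitting it leaves the default of false.
    let args = Args::parse();
    println!("use_flash_attn = {}", args.use_flash_attn);
}
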
@@ -213,7 +216,13 @@ fn main() -> anyhow::Result<()> {
         );
         match args.which {
             Which::Phi2 => Model::Phi2(Phi2::from_gguf(model, &mut file, &device)?),
-            Which::Phi3 => Model::Phi3(Phi3::from_gguf(1, model, &mut file, &device)?),
+            Which::Phi3 => Model::Phi3(Phi3::from_gguf(
+                1,
+                args.use_flash_attn,
+                model,
+                &mut file,
+                &device,
+            )?),
             Which::Phi3b => Model::Phi3b(Phi3b::from_gguf(model, &mut file, &device)?),
         }
     };
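
The second hunk threads the flag through to `Phi3::from_gguf`, which now takes `use_flash_attn` as its second argument. Below is a minimal sketch of calling the updated signature from user code; it assumes `Phi3` is `candle_transformers::models::quantized_phi3::ModelWeights` (as aliased in the example) and that the core crate is referenced as `candle_core`. Candle's flash-attention kernels also require building with the `flash-attn` feature on a CUDA device.

// Minimal sketch of the updated call; the leading `1` mirrors the example above.
use candle_core::quantized::gguf_file;
use candle_core::Device;
use candle_transformers::models::quantized_phi3::ModelWeights as Phi3;

fn load_phi3(path: &str, use_flash_attn: bool, device: &Device) -> anyhow::Result<Phi3> {
    let mut file = std::fs::File::open(path)?;
    // Read the GGUF metadata first, then hand the same reader over so the
    // tensor data can be loaded from it.
    let content = gguf_file::Content::read(&mut file)?;
    let model = Phi3::from_gguf(1, use_flash_attn, content, &mut file, device)?;
    Ok(model)
}
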