Support flash-attn in quantized phi3. (#2194)

Laurent Mazare
2024-05-18 17:12:56 +02:00
committed by GitHub
parent 01545f7303
commit eefc1c77ef
2 changed files with 50 additions and 11 deletions


@@ -90,6 +90,9 @@ struct Args {
     /// The model size to use.
     #[arg(long, default_value = "phi-3b")]
     which: Which,
+
+    #[arg(long)]
+    use_flash_attn: bool,
 }
 
 impl Args {
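
The hunk above adds a `--use-flash-attn` switch to the example's clap-derived `Args`. As a standalone sketch (not part of the commit, and assuming clap's derive feature), a bare `#[arg(long)]` on a `bool` field becomes an optional flag that defaults to `false` when omitted:

// Standalone sketch; not part of the diff above.
use clap::Parser;

#[derive(Parser, Debug)]
struct Args {
    /// Enable the flash-attention kernels in the attention layers.
    #[arg(long)]
    use_flash_attn: bool,
}

fn main() {
    // Passing `--use-flash-attn` sets the field to true;
    // omitting it leaves the default of false.
    let args = Args::parse();
    println!("use_flash_attn = {}", args.use_flash_attn);
}
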
@@ -213,7 +216,13 @@ fn main() -> anyhow::Result<()> {
         );
         match args.which {
             Which::Phi2 => Model::Phi2(Phi2::from_gguf(model, &mut file, &device)?),
-            Which::Phi3 => Model::Phi3(Phi3::from_gguf(1, model, &mut file, &device)?),
+            Which::Phi3 => Model::Phi3(Phi3::from_gguf(
+                1,
+                args.use_flash_attn,
+                model,
+                &mut file,
+                &device,
+            )?),
             Which::Phi3b => Model::Phi3b(Phi3b::from_gguf(model, &mut file, &device)?),
         }
     };
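
The second hunk threads the flag through to `Phi3::from_gguf`, which now takes `use_flash_attn` as its second argument. Below is a minimal sketch of calling the updated signature from user code; it assumes `Phi3` is `candle_transformers::models::quantized_phi3::ModelWeights` (as aliased in the example) and that the core crate is referenced as `candle_core`. Candle's flash-attention kernels also require building with the `flash-attn` feature on a CUDA device.

// Minimal sketch of the updated call; the leading `1` mirrors the example above.
use candle_core::quantized::gguf_file;
use candle_core::Device;
use candle_transformers::models::quantized_phi3::ModelWeights as Phi3;

fn load_phi3(path: &str, use_flash_attn: bool, device: &Device) -> anyhow::Result<Phi3> {
    let mut file = std::fs::File::open(path)?;
    // Read the GGUF metadata first, then hand the same reader over so the
    // tensor data can be loaded from it.
    let content = gguf_file::Content::read(&mut file)?;
    let model = Phi3::from_gguf(1, use_flash_attn, content, &mut file, device)?;
    Ok(model)
}
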