Mirror of https://github.com/huggingface/candle.git (synced 2025-06-16 18:48:51 +00:00)
Support flash-attn in quantized phi3. (#2194)
@@ -90,6 +90,9 @@ struct Args {
     /// The model size to use.
     #[arg(long, default_value = "phi-3b")]
     which: Which,
+
+    #[arg(long)]
+    use_flash_attn: bool,
 }
 
 impl Args {
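For reference, a minimal standalone sketch of what the new switch does, assuming the example's Args struct is a standard clap derive parser (the real `Which` enum is replaced by a plain String so the sketch is self-contained):

    use clap::Parser;

    #[derive(Parser, Debug)]
    struct Args {
        /// The model size to use. (The real example uses a `Which` enum;
        /// a String stands in here to keep the sketch self-contained.)
        #[arg(long, default_value = "phi-3b")]
        which: String,

        /// New switch: off by default, enabled by passing --use-flash-attn.
        #[arg(long)]
        use_flash_attn: bool,
    }

    fn main() {
        let args = Args::parse();
        println!("which={}, use_flash_attn={}", args.which, args.use_flash_attn);
    }

With clap, `#[arg(long)]` on a bool produces a flag that defaults to false, so existing invocations are unaffected; enabling it would look roughly like `cargo run --example quantized-phi --release --features flash-attn -- --use-flash-attn` (example name and cargo feature assumed, not shown in this diff).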
@@ -213,7 +216,13 @@ fn main() -> anyhow::Result<()> {
         );
         match args.which {
             Which::Phi2 => Model::Phi2(Phi2::from_gguf(model, &mut file, &device)?),
-            Which::Phi3 => Model::Phi3(Phi3::from_gguf(1, model, &mut file, &device)?),
+            Which::Phi3 => Model::Phi3(Phi3::from_gguf(
+                1,
+                args.use_flash_attn,
+                model,
+                &mut file,
+                &device,
+            )?),
             Which::Phi3b => Model::Phi3b(Phi3b::from_gguf(model, &mut file, &device)?),
         }
     };
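The flag is threaded into `Phi3::from_gguf` so the attention layer can pick between the fused flash-attn kernel and the unfused fallback. A sketch of that dispatch, assuming the quantized Phi-3 attention follows the same pattern as candle's other flash-attn integrations (the `attention` helper and its exact signature are illustrative, not this file's code):

    use candle_core::{Result, Tensor};

    /// q, k, v: (batch, num_heads, seq_len, head_dim).
    fn attention(
        q: &Tensor,
        k: &Tensor,
        v: &Tensor,
        head_dim: usize,
        use_flash_attn: bool,
    ) -> Result<Tensor> {
        if use_flash_attn {
            // candle_flash_attn::flash_attn expects (batch, seq_len, num_heads,
            // head_dim), hence the transposes; `true` requests a causal mask.
            let q = q.transpose(1, 2)?;
            let k = k.transpose(1, 2)?;
            let v = v.transpose(1, 2)?;
            let scale = 1f32 / (head_dim as f32).sqrt();
            candle_flash_attn::flash_attn(&q, &k, &v, scale, true)?.transpose(1, 2)
        } else {
            // Unfused fallback: softmax(q k^T / sqrt(d)) v.
            let attn = (q.matmul(&k.t()?)? / (head_dim as f64).sqrt())?;
            let attn = candle_nn::ops::softmax_last_dim(&attn)?;
            attn.matmul(v)
        }
    }

candle-flash-attn is CUDA-only, which is presumably why the kernel sits behind an opt-in flag rather than being the default path.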