Add flag to run Moondream in f16 precision (#2015)
* moondream implementation
* add moondream example
* change config default activation
* Add assets and integrate phi mixformer with example
* Make use of kv cache and fix seq_len bug; Clean up example code
* Add README link to example
* Remove pos_embed scaling; Remove assets; Add to README; Expand VisionConfig
* Delete image
* Use apply instead of forward
* Use latest release special token; Fix token/s accuracy; Use GeluPytorchTanh in VisionConfig v2
* Add flag to use f16
* Avoid breaking the quantized version on cuda.

---------

Co-authored-by: laurent <laurent.mazare@gmail.com>
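For readers who want the new precision logic outside the diff below, here is a minimal sketch of the dtype selection this commit introduces, written as a free-standing function. The helper name `select_dtype` and the crate paths (`candle_core`, `anyhow`) are illustrative assumptions, not part of the commit itself; the branching mirrors the hunk in main().

use anyhow::bail;
use candle_core::DType;

// Hypothetical helper mirroring the precision selection added by this commit:
// quantized weights stay in f32 and reject --f16, CUDA or an explicit --f16
// selects f16, and everything else falls back to f32.
fn select_dtype(is_cuda: bool, quantized: bool, f16: bool) -> anyhow::Result<DType> {
    if quantized {
        if f16 {
            bail!("Quantized model does not support f16");
        }
        Ok(DType::F32)
    } else if is_cuda || f16 {
        Ok(DType::F16)
    } else {
        Ok(DType::F32)
    }
}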
@@ -194,6 +194,10 @@ struct Args {
     #[arg(long)]
     quantized: bool,
 
+    /// Use f16 precision for all the computations rather than f32.
+    #[arg(long)]
+    f16: bool,
+
     #[arg(long)]
     model_file: Option<String>,
 
@@ -283,7 +287,12 @@ async fn main() -> anyhow::Result<()> {
     let start = std::time::Instant::now();
     let device = candle_examples::device(args.cpu)?;
     let config = moondream::Config::v2();
-    let dtype = if device.is_cuda() && !args.quantized {
+    let dtype = if args.quantized {
+        if args.f16 {
+            anyhow::bail!("Quantized model does not support f16");
+        }
+        DType::F32
+    } else if device.is_cuda() || args.f16 {
         DType::F16
     } else {
         DType::F32
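As a follow-up, this is roughly where the selected dtype is consumed in the non-quantized path: candle examples typically memory-map the safetensors weights through candle-nn's VarBuilder, which converts tensors to the requested dtype on retrieval. The function name `load_weights` and the `model_file` argument are illustrative assumptions; only `VarBuilder::from_mmaped_safetensors` is an actual candle-nn API.

use candle_core::{DType, Device};
use candle_nn::VarBuilder;

// Hypothetical loader showing where the chosen dtype flows: tensors pulled
// from this VarBuilder come back in `dtype` (f16 or f32 as selected above).
fn load_weights(model_file: &str, dtype: DType, device: &Device) -> anyhow::Result<VarBuilder<'static>> {
    // unsafe because the file is memory-mapped; this is the standard candle-nn entry point.
    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], dtype, device)? };
    Ok(vb)
}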