Mirror of https://github.com/huggingface/candle.git, synced 2025-06-16 10:38:54 +00:00
More ggml cuda kernels (#1977)
* Add more cuda kernels for quantized matmul.
* Add the vec-dot bits.
* Expose the quantized matmul-vec kernels.
* Also include the quantize-q8-1 kernel.
* Glue code for the q8-1 quantization.
* mm-vec product via q8-1 quantization.
* Add a test.
* Add a mm test.
* Get the test to return some sensible results.
* Also test dmmv.
* Fix the launch params.
* Allow for tweaking the force_dmmv parameter while it's experimental.
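For context on the q8-1 quantization these kernels rely on: each block of 32 activations is stored as signed 8-bit values plus a per-block scale, and the scaled sum of the block is cached so the q*_1 vec-dot kernels can apply their bias term cheaply. A minimal CPU sketch following the ggml block layout (the BlockQ8_1 name and the use of f32 instead of f16 for the scale are illustrative, not candle's internal API):

    const QK8_1: usize = 32;

    // One quantized block: 32 signed bytes plus a scale `d` and the cached
    // value `s = d * sum(qs)` used by the q*_1 dot kernels.
    struct BlockQ8_1 {
        d: f32,
        s: f32,
        qs: [i8; QK8_1],
    }

    fn quantize_q8_1(xs: &[f32; QK8_1]) -> BlockQ8_1 {
        // Scale so the largest magnitude maps to 127.
        let amax = xs.iter().fold(0f32, |m, &x| m.max(x.abs()));
        let d = amax / 127.0;
        let id = if d > 0.0 { 1.0 / d } else { 0.0 };
        let mut qs = [0i8; QK8_1];
        let mut sum = 0i32;
        for (q, &x) in qs.iter_mut().zip(xs.iter()) {
            let v = (x * id).round().clamp(-128.0, 127.0) as i32;
            *q = v as i8;
            sum += v;
        }
        BlockQ8_1 { d, s: d * sum as f32, qs }
    }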
@@ -235,6 +235,10 @@ struct Args {
     /// Group-Query Attention, use 8 for the 70B version of LLaMAv2.
     #[arg(long)]
     gqa: Option<usize>,
+
+    /// Use the (experimental) fast cuda kernels.
+    #[arg(long)]
+    fast_cuda: bool,
 }
 
 impl Args {
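With #[arg(long)], clap exposes the new field as a --fast-cuda flag. Assuming this hunk is from the quantized llama example (the gqa and temperature arguments suggest so; the exact binary name is an assumption), opting into the new kernels might look like:

    cargo run --example quantized --features cuda --release -- --fast-cuda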
@@ -341,6 +345,10 @@ fn main() -> anyhow::Result<()> {
     use tracing_subscriber::prelude::*;
 
     let args = Args::parse();
+
+    #[cfg(feature = "cuda")]
+    candle::quantized::cuda::set_force_dmmv(!args.fast_cuda);
+
     let temperature = if args.temperature == 0. {
         None
     } else {