Prebuild all our kernels.

Expose more conv1d functions/structs. (#1726 )
Make the r, k, v tensors contiguous. (#1719 )
2025-06-19 03:54:56 +00:00 · 2024-03-18 16:39:38 +01:00 · 2024-02-17 18:50:55 +01:00 · 2024-02-16 09:17:35 +01:00 · 2024-02-15 16:47:33 +01:00
14 changed files with 109 additions and 213 deletions
--- a/candle-core/src/metal_backend.rs
+++ b/candle-core/src/metal_backend.rs
@ -588,6 +588,7 @@ impl BackendStorage for MetalStorage {
                (DType::U32, DType::F32) => "cast_u32_f32",
                (DType::U32, DType::U8) => "cast_u32_u8",
                (DType::U32, DType::I64) => "cast_u32_i64",
                (DType::U32, DType::F16) => "cast_u32_f16",
                (DType::U32, DType::BF16) => "cast_u32_bf16",
                (DType::U8, DType::U32) => "cast_u8_u32",
--- a/candle-examples/examples/llama/main.rs
+++ b/candle-examples/examples/llama/main.rs
@ -57,7 +57,7 @@ struct Args {
    seed: u64,
    /// The length of the sample to generate (in tokens).
-    #[arg(long, default_value_t = 100)]
+    #[arg(long, default_value_t = 10000)]
    sample_len: usize,
    /// Disable the key-value cache.
@ -143,7 +143,6 @@ fn main() -> Result<()> {
            }
            Which::TinyLlama1_1BChat => vec![api.get("model.safetensors")?],
        };
        println!("building the model");
        let cache = model::Cache::new(!args.no_kv_cache, dtype, &config, &device)?;
        let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
@ -157,6 +156,7 @@ fn main() -> Result<()> {
        .map_err(E::msg)?
        .get_ids()
        .to_vec();
    let mut tokenizer = candle_examples::token_output_stream::TokenOutputStream::new(tokenizer);
    println!("starting the inference loop");
    print!("{prompt}");
@ -190,18 +190,16 @@ fn main() -> Result<()> {
        token_generated += 1;
        tokens.push(next_token);
        // Extracting the last token as a string is complicated, here we just apply some simple
        // heuristics as it seems to work well enough for this example. See the following for more
        // details:
        // https://github.com/huggingface/tokenizers/issues/1141#issuecomment-1562644141
        if let Some(text) = tokenizer.id_to_token(next_token) {
            let text = text.replace('▁', " ").replace("<0x0A>", "\n");
            print!("{text}");
            std::io::stdout().flush()?;
        }
        if Some(next_token) == eos_token_id {
            break;
        }
        if let Some(t) = tokenizer.next_token(next_token)? {
            print!("{t}");
            std::io::stdout().flush()?;
        }
    }
    if let Some(rest) = tokenizer.decode_rest().map_err(E::msg)? {
        print!("{rest}");
    }
    let dt = start_gen.elapsed();
    println!(
--- a/candle-examples/examples/llama2-c/main.rs
+++ b/candle-examples/examples/llama2-c/main.rs
@ -328,6 +328,7 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {
        .map_err(E::msg)?
        .get_ids()
        .to_vec();
    let mut tokenizer = candle_examples::token_output_stream::TokenOutputStream::new(tokenizer);
    let start_gen = std::time::Instant::now();
    for index in 0.. {
@ -353,16 +354,14 @@ fn run_inference(args: &InferenceCmd, common_args: &Args) -> Result<()> {
        let next_token = logits_processor.sample(&logits)?;
        tokens.push(next_token);
-        // Extracting the last token as a string is complicated, here we just apply some simple
+        if let Some(t) = tokenizer.next_token(next_token)? {
-        // heuristics as it seems to work well enough for this example. See the following for more
+            print!("{t}");
        // details:
        // https://github.com/huggingface/tokenizers/issues/1141#issuecomment-1562644141
        if let Some(text) = tokenizer.id_to_token(next_token) {
            let text = text.replace('▁', " ").replace("<0x0A>", "\n");
            print!("{text}");
            std::io::stdout().flush()?;
        }
    }
    if let Some(rest) = tokenizer.decode_rest().map_err(E::msg)? {
        print!("{rest}");
    }
    let dt = start_gen.elapsed();
    println!(
        "\n{} tokens generated ({:.2} token/s)\n",
--- a/candle-examples/examples/mistral/main.rs
+++ b/candle-examples/examples/mistral/main.rs
@ -152,7 +152,7 @@ struct Args {
    seed: u64,
    /// The length of the sample to generate (in tokens).
-    #[arg(long, short = 'n', default_value_t = 100)]
+    #[arg(long, short = 'n', default_value_t = 10000)]
    sample_len: usize,
    #[arg(long)]
--- a/candle-examples/examples/mixtral/main.rs
+++ b/candle-examples/examples/mixtral/main.rs
@ -143,7 +143,7 @@ struct Args {
    seed: u64,
    /// The length of the sample to generate (in tokens).
-    #[arg(long, short = 'n', default_value_t = 100)]
+    #[arg(long, short = 'n', default_value_t = 10000)]
    sample_len: usize,
    #[arg(long, default_value = "mistralai/Mixtral-8x7B-v0.1")]
--- a/candle-metal-kernels/.gitignore
+++ b/candle-metal-kernels/.gitignore
@ -0,0 +1,2 @@
 src/compiled/
--- a/candle-metal-kernels/build.rs
+++ b/candle-metal-kernels/build.rs
@ -0,0 +1,45 @@
 use std::path::Path;
 use std::process::Command;
 /// Build-script entry point: compiles every `.metal` file found directly in `src/`.
 ///
 /// Only the top level of `src/` is scanned (no recursion). Each matching file is
 /// handed to `build_kernel`.
 ///
 /// # Errors
 /// Returns an error if `src/` cannot be listed, if a directory entry cannot be
 /// read, or if `build_kernel` fails for one of the kernels.
 fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Propagate I/O failures instead of panicking: this function already returns `Result`.
    for entry in std::fs::read_dir("src/")? {
        let path = entry?.path();
        let is_kernel = path.extension().map_or(false, |ext| ext == "metal");
        if is_kernel {
            build_kernel(&path)?;
        }
    }
    Ok(())
 }
 /// Compiles one Metal source file into `src/compiled/<stem>.metallib`.
 ///
 /// Two `xcrun` invocations are chained: `metal -c` produces `src/compiled/<stem>.air`,
 /// then `metallib` packages that file into `src/compiled/<stem>.metallib`.
 ///
 /// # Errors
 /// Returns an error if `path` has no UTF-8 file stem, if the output directory cannot
 /// be created, if `xcrun` cannot be spawned, or if either tool exits unsuccessfully.
 fn build_kernel(path: &Path) -> Result<(), Box<dyn std::error::Error>> {
    let stem = path
        .file_stem()
        .and_then(|stem| stem.to_str())
        .ok_or_else(|| format!("{path:?} has no UTF-8 file stem"))?;

    // The output directory is not guaranteed to exist before the first build.
    let out_dir = Path::new("src/compiled");
    std::fs::create_dir_all(out_dir)?;
    let air = out_dir.join(format!("{stem}.air"));
    let metallib = out_dir.join(format!("{stem}.metallib"));

    run_checked(
        Command::new("xcrun")
            .arg("metal")
            .arg("-c")
            .arg(path)
            .args(["-I", "src/", "-o"])
            .arg(&air),
    )?;
    run_checked(
        Command::new("xcrun")
            .arg("metallib")
            .arg(&air)
            .arg("-o")
            .arg(&metallib),
    )?;
    Ok(())
 }

 /// Runs `command` to completion and converts an unsuccessful exit status into an
 /// error that carries the tool's stderr, so compiler diagnostics are not lost.
 fn run_checked(command: &mut Command) -> Result<(), Box<dyn std::error::Error>> {
    let output = command.output()?;
    if output.status.success() {
        return Ok(());
    }
    let stderr = String::from_utf8_lossy(&output.stderr);
    Err(format!("{command:?} failed ({}): {}", output.status, stderr.trim()).into())
 }
--- a/candle-metal-kernels/src/cast.metal
+++ b/candle-metal-kernels/src/cast.metal
@ -73,6 +73,7 @@ kernel void FN_NAME_STRIDED( \
 } \
 CAST(cast_u32_f32, cast_u32_f32_strided, uint32_t, float)
 CAST(cast_u32_f16, cast_u32_f16_strided, uint32_t, half)
 CAST(cast_u32_u8, cast_u32_u8_strided, uint32_t, uint8_t)
 CAST(cast_u8_u32, cast_u8_u32_strided, uint8_t, uint32_t)
 CAST(cast_u8_f32, cast_u8_f32_strided, uint8_t, float)
--- a/candle-metal-kernels/src/lib.rs
+++ b/candle-metal-kernels/src/lib.rs
@ -1,22 +1,22 @@
 use metal::{
-    Buffer, CommandBufferRef, CompileOptions, ComputeCommandEncoderRef, ComputePipelineState,
+    Buffer, CommandBufferRef, ComputeCommandEncoderRef, ComputePipelineState, Device, Function,
-    Device, Function, FunctionConstantValues, Library, MTLDataType, MTLSize, NSUInteger,
+    FunctionConstantValues, Library, MTLDataType, MTLSize, NSUInteger,
 };
 use std::collections::HashMap;
 use std::ffi::c_void;
 use std::sync::RwLock;
-const AFFINE: &str = include_str!("affine.metal");
+const AFFINE: &[u8] = include_bytes!("compiled/affine.metallib");
-const INDEXING: &str = include_str!("indexing.metal");
+const INDEXING: &[u8] = include_bytes!("compiled/indexing.metallib");
-const UNARY: &str = include_str!("unary.metal");
+const UNARY: &[u8] = include_bytes!("compiled/unary.metallib");
-const BINARY: &str = include_str!("binary.metal");
+const BINARY: &[u8] = include_bytes!("compiled/binary.metallib");
-const TERNARY: &str = include_str!("ternary.metal");
+const TERNARY: &[u8] = include_bytes!("compiled/ternary.metallib");
-const CAST: &str = include_str!("cast.metal");
+const CAST: &[u8] = include_bytes!("compiled/cast.metallib");
-const CONV: &str = include_str!("conv.metal");
+const CONV: &[u8] = include_bytes!("compiled/conv.metallib");
-const REDUCE: &str = include_str!("reduce.metal");
+const REDUCE: &[u8] = include_bytes!("compiled/reduce.metallib");
-const RANDOM: &str = include_str!("random.metal");
+const RANDOM: &[u8] = include_bytes!("compiled/random.metallib");
 const MFA: &[u8] = include_bytes!("libMetalFlashAttention.metallib");
-const QUANTIZED: &str = include_str!("quantized.metal");
+const QUANTIZED: &[u8] = include_bytes!("compiled/quantized.metallib");
 /// Most kernels apply similarly across the tensors
 /// This creates a strategy that uses the maximum amount of threads per threadgroup (capped at the
@ -235,7 +235,7 @@ impl Kernels {
        }
    }
-    fn get_library_source(&self, source: Source) -> &'static str {
+    fn get_library_source(&self, source: Source) -> &'static [u8] {
        match source {
            Source::Affine => AFFINE,
            Source::Unary => UNARY,
@ -247,7 +247,7 @@ impl Kernels {
            Source::Conv => CONV,
            Source::Random => RANDOM,
            Source::Quantized => QUANTIZED,
-            Source::Mfa => panic!("Invalid lib"),
+            Source::Mfa => MFA,
        }
    }
@ -262,22 +262,12 @@ impl Kernels {
        if let Some(lib) = libraries.get(&source) {
            Ok(lib.clone())
        } else {
-            let lib = match source {
+            let source_data = self.get_library_source(source);
-                Source::Mfa => {
+            let lib = device.new_library_with_data(source_data).map_err(|e| {
                    let source_data = MFA;
                    device.new_library_with_data(source_data).map_err(|e| {
                MetalKernelError::LoadLibraryError(format!(
                    "Candle metal requires macosx > 13.0 or higher, cannot load mfa: {e}"
                ))
-                    })?
+            })?;
                }
                source => {
                    let source_content = self.get_library_source(source);
                    device
                        .new_library_with_source(source_content, &CompileOptions::new())
                        .map_err(|e| MetalKernelError::LoadLibraryError(e.to_string()))?
                }
            };
            libraries.insert(source, lib.clone());
            Ok(lib)
        }
--- a/candle-nn/src/conv.rs
+++ b/candle-nn/src/conv.rs
@ -302,6 +302,22 @@ pub fn conv1d(
    Ok(Conv1d::new(ws, Some(bs), cfg))
 }
 /// Creates a [`Conv1d`] layer that has a kernel but no bias term.
 ///
 /// The kernel is fetched from `vb` under the name `"weight"` with shape
 /// `(out_channels, in_channels / cfg.groups, kernel_size)`, using
 /// `DEFAULT_KAIMING_NORMAL` as the initialization hint.
 ///
 /// # Errors
 /// Propagates any error returned by `vb.get_with_hints`.
 pub fn conv1d_no_bias(
    in_channels: usize,
    out_channels: usize,
    kernel_size: usize,
    cfg: Conv1dConfig,
    vb: crate::VarBuilder,
 ) -> Result<Conv1d> {
    let kernel_shape = (out_channels, in_channels / cfg.groups, kernel_size);
    let kernel = vb.get_with_hints(
        kernel_shape,
        "weight",
        crate::init::DEFAULT_KAIMING_NORMAL,
    )?;
    Ok(Conv1d::new(kernel, None, cfg))
 }
 pub fn conv_transpose1d(
    in_channels: usize,
    out_channels: usize,
--- a/candle-nn/src/lib.rs
+++ b/candle-nn/src/lib.rs
@ -19,8 +19,9 @@ pub mod var_map;
 pub use activation::{prelu, Activation, PReLU};
 pub use batch_norm::{batch_norm, BatchNorm, BatchNormConfig};
 pub use conv::{
-    conv1d, conv2d, conv2d_no_bias, conv_transpose2d, conv_transpose2d_no_bias, Conv1d,
+    conv1d, conv1d_no_bias, conv2d, conv2d_no_bias, conv_transpose1d, conv_transpose1d_no_bias,
-    Conv1dConfig, Conv2d, Conv2dConfig, ConvTranspose2d, ConvTranspose2dConfig,
+    conv_transpose2d, conv_transpose2d_no_bias, Conv1d, Conv1dConfig, Conv2d, Conv2dConfig,
    ConvTranspose1d, ConvTranspose1dConfig, ConvTranspose2d, ConvTranspose2dConfig,
 };
 pub use embedding::{embedding, Embedding};
 pub use func::{func, func_t, Func, FuncT};
--- a/candle-transformers/src/models/mod.rs
+++ b/candle-transformers/src/models/mod.rs
@ -42,7 +42,6 @@ pub mod t5;
 pub mod trocr;
 pub mod vgg;
 pub mod vit;
 pub mod vocos;
 pub mod whisper;
 pub mod with_tracing;
 pub mod wuerstchen;
--- a/candle-transformers/src/models/rwkv_v5.rs
+++ b/candle-transformers/src/models/rwkv_v5.rs
@ -165,9 +165,9 @@ impl SelfAttention {
        let mut out: Vec<Tensor> = Vec::with_capacity(t);
        for t_ in 0..t {
            //
-            let rt = receptance.i((.., .., t_..t_ + 1))?;
+            let rt = receptance.i((.., .., t_..t_ + 1))?.contiguous()?;
-            let kt = key.i((.., .., .., t_..t_ + 1))?;
+            let kt = key.i((.., .., .., t_..t_ + 1))?.contiguous()?;
-            let vt = value.i((.., .., t_..t_ + 1))?;
+            let vt = value.i((.., .., t_..t_ + 1))?.contiguous()?;
            let at = kt.matmul(&vt)?;
            let rhs = (time_faaaa.broadcast_mul(&at)? + &state_)?;
            let out_ = rt.matmul(&rhs)?.squeeze(2)?;
--- a/candle-transformers/src/models/vocos.rs
+++ b/candle-transformers/src/models/vocos.rs
@ -1,156 +0,0 @@
 #![allow(unused)]
 use candle::{DType, Module, Result, Tensor, D};
 use candle_nn::{conv1d, embedding, linear, Conv1d, Conv1dConfig, Embedding, Linear, VarBuilder};
 /// Layer normalization whose scale and shift are looked up in embedding tables,
 /// indexed by a conditioning id passed to `forward`.
 pub struct AdaLayerNorm {
    /// Epsilon handed to `layer_norm` in `forward`.
    eps: f64,
    /// Embedding dimension given to the constructor; stored but not read by the
    /// methods visible in this file.
    dim: usize,
    /// Embedding table providing the multiplicative term, loaded under `"scale"`.
    scale: Embedding,
    /// Embedding table providing the additive term, loaded under `"shift"`.
    shift: Embedding,
 }
 /// Normalizes `x` over its last dimension, without any learned scale or shift.
 ///
 /// `F16` and `BF16` inputs are converted to `F32` for the computation; the result
 /// is converted back to the dtype of `x` before being returned. The mean of the
 /// last dimension is subtracted, then the result is divided by
 /// `sqrt(mean(centered^2) + eps)`.
 fn layer_norm(x: &Tensor, eps: f64) -> Result<Tensor> {
    let input_dtype = x.dtype();
    let compute_dtype = if matches!(input_dtype, DType::F16 | DType::BF16) {
        DType::F32
    } else {
        input_dtype
    };
    let width = x.dim(D::Minus1)? as f64;
    let xs = x.to_dtype(compute_dtype)?;

    let mean = (xs.sum_keepdim(D::Minus1)? / width)?;
    let centered = xs.broadcast_sub(&mean)?;

    let variance = (centered.sqr()?.sum_keepdim(D::Minus1)? / width)?;
    let denom = (variance + eps)?.sqrt()?;
    centered.broadcast_div(&denom)?.to_dtype(input_dtype)
 }
 impl AdaLayerNorm {
    /// Loads the `"scale"` and `"shift"` embedding tables from `vb`, each built with
    /// `num_embeddings` entries of size `embedding_dim`.
    pub fn new(
        num_embeddings: usize,
        embedding_dim: usize,
        eps: f64,
        vb: VarBuilder,
    ) -> Result<Self> {
        // Field initializers run in source order, so `scale` is loaded before `shift`.
        Ok(Self {
            eps,
            dim: embedding_dim,
            scale: embedding(num_embeddings, embedding_dim, vb.pp("scale"))?,
            shift: embedding(num_embeddings, embedding_dim, vb.pp("shift"))?,
        })
    }

    /// Applies `layer_norm` to `xs`, then multiplies by the scale embedding and adds
    /// the shift embedding, both looked up with `cond_embedding_id`.
    pub fn forward(&self, xs: &Tensor, cond_embedding_id: &Tensor) -> Result<Tensor> {
        let gain = self.scale.forward(cond_embedding_id)?;
        let offset = self.shift.forward(cond_embedding_id)?;
        let normed = layer_norm(xs, self.eps)?;
        normed * gain + offset
    }
 }
 /// One residual block: a grouped 1d convolution followed by two linear layers
 /// with a GELU in between, and an optional per-channel scaling.
 pub struct ConvNeXtBlock {
    /// 1d convolution built with kernel size 7, padding 3 and `groups == dim`.
    dwconv: Conv1d,
    /// Linear layer from `dim` to `intermediate_dim`.
    pwconv1: Linear,
    /// Linear layer from `intermediate_dim` back to `dim`.
    pwconv2: Linear,
    /// Optional scaling tensor of size `dim`; only loaded when the constructor's
    /// `layer_scale_init_value` is strictly positive.
    gamma: Option<Tensor>,
 }
 impl ConvNeXtBlock {
    /// Loads the block's weights from `vb`.
    ///
    /// The `"gamma"` tensor is only fetched when `layer_scale_init_value > 0`.
    /// `adanorm_num_embeddings` is accepted but not used yet.
    pub fn new(
        dim: usize,
        intermediate_dim: usize,
        layer_scale_init_value: f64,
        adanorm_num_embeddings: Option<usize>,
        vb: VarBuilder,
    ) -> Result<Self> {
        let depthwise_cfg = Conv1dConfig {
            padding: 3,
            groups: dim,
            ..Default::default()
        };
        let dwconv = conv1d(dim, dim, 7, depthwise_cfg, vb.pp("dwconv"))?;
        let pwconv1 = linear(dim, intermediate_dim, vb.pp("pwconv1"))?;
        let pwconv2 = linear(intermediate_dim, dim, vb.pp("pwconv2"))?;
        let gamma = (layer_scale_init_value > 0.)
            .then(|| vb.get(dim, "gamma"))
            .transpose()?;
        Ok(Self {
            dwconv,
            pwconv1,
            pwconv2,
            gamma,
        })
    }

    /// Runs the block on `xs` and adds `xs` back as a residual.
    ///
    /// The convolution output is transposed on dims 1 and 2 before the linear
    /// layers and transposed back before the residual addition.
    pub fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        let hidden = xs.apply(&self.dwconv)?.transpose(1, 2)?;
        // TODO: norm
        let hidden = hidden.apply(&self.pwconv1)?;
        let hidden = hidden.gelu()?;
        let hidden = hidden.apply(&self.pwconv2)?;
        let scaled = if let Some(gamma) = &self.gamma {
            (gamma * hidden)?
        } else {
            hidden
        };
        scaled.transpose(1, 2)? + xs
    }
 }
 /// Backbone made of an input convolution, a stack of `ConvNeXtBlock`s and a final
 /// layer norm.
 struct VocosBackbone {
    /// Input 1d convolution built with kernel size 7 and padding 3.
    embed: Conv1d,
    /// Blocks applied in order by `forward`.
    convnext: Vec<ConvNeXtBlock>,
    /// Layer norm applied to the output of the last block.
    final_layer_norm: candle_nn::LayerNorm,
 }
 impl VocosBackbone {
    /// Loads the backbone weights from `vb`.
    ///
    /// `num_layers` blocks are created under the `"convnext"` prefix, block `i`
    /// using the sub-prefix `i`. The input convolution is loaded from `"embed"`
    /// and the final layer norm (eps `1e-6`) from `"final_layer_norm"`.
    ///
    /// # Errors
    /// Propagates the first error hit while loading any of the weights.
    pub fn new(
        input_channels: usize,
        dim: usize,
        intermediate_dim: usize,
        num_layers: usize,
        layer_scale_init_value: f64,
        adanorm_num_embeddings: Option<usize>,
        vb: VarBuilder,
    ) -> Result<Self> {
        let embed = {
            let cfg = Conv1dConfig {
                padding: 3,
                ..Default::default()
            };
            conv1d(input_channels, dim, 7, cfg, vb.pp("embed"))?
        };
        let vb_c = vb.pp("convnext");
        // Collect every block into the stack; stops at the first block that fails to load.
        let convnext = (0..num_layers)
            .map(|i| {
                ConvNeXtBlock::new(
                    dim,
                    intermediate_dim,
                    layer_scale_init_value,
                    adanorm_num_embeddings,
                    vb_c.pp(i),
                )
            })
            .collect::<Result<Vec<_>>>()?;
        let final_layer_norm = candle_nn::layer_norm(dim, 1e-6, vb.pp("final_layer_norm"))?;
        Ok(Self {
            embed,
            convnext,
            final_layer_norm,
        })
    }

    /// Applies the input convolution, every block in order, then the final layer norm.
    pub fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        let xs = xs.apply(&self.embed)?;
        // TODO: norm
        // NOTE(review): `ConvNeXtBlock::forward` starts with a Conv1d on its input, while
        // `xs` has just been transposed on dims 1 and 2 here. Confirm the intended layout
        // when the missing norm is added.
        let mut xs = xs.transpose(1, 2)?;
        for conv_block in self.convnext.iter() {
            xs = conv_block.forward(&xs)?
        }
        xs.apply(&self.final_layer_norm)
    }
 }
Author	SHA1	Message	Date
Nicolas Patry	5ac3302fac	Prebuild all our kernels.	2024-03-18 16:39:38 +01:00
Laurent Mazare	41416d2376	Expose more conv1d functions/structs. (#1726 )	2024-02-17 18:50:55 +01:00
Laurent Mazare	5ebcfeaf0f	Make the r, k, v tensors contiguous. (#1719 )	2024-02-16 09:17:35 +01:00
Laurent Mazare	7c7400fb63	Use the tokenizer-output-stream in the llama example. (#1715 ) * Use the tokenizer-output-stream in the llama example. * Also use tokenizer-output-stream for llama2-c.	2024-02-15 16:47:33 +01:00