Fixing order.

Splitting the features to enable different mkl linking.
Fix reinforcement learning example (#2837)
2025-06-18 03:28:50 +00:00 · 2025-03-28 11:43:33 +01:00 · 2025-03-28 10:13:13 +01:00 · 2025-03-26 16:27:45 +01:00 · 2025-03-26 08:10:03 +01:00 · 2025-03-26 08:09:27 +01:00
33 changed files with 116 additions and 79 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -51,7 +51,7 @@ half = { version = "2.5.0", features = ["num-traits", "use-intrinsics", "rand_di
 hound = "3.5.1"
 image = { version = "0.25.2", default-features = false, features = ["jpeg", "png"] }
 imageproc = { version = "0.24.0", default-features = false }
-intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
+intel-mkl-src = { version = "0.8.1" }
 libc = { version = "0.2.147" }
 log = "0.4"
 memmap2 = { version = "0.9.3", features = ["stable_deref_trait"] }
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@@ -45,7 +45,8 @@ criterion = { workspace = true }
 default = []
 cuda = ["cudarc", "dep:candle-kernels", "dep:ug-cuda"]
 cudnn = ["cuda", "cudarc/cudnn"]
-mkl = ["dep:libc", "dep:intel-mkl-src"]
+_mkl = ["dep:libc", "dep:intel-mkl-src"]
+mkl = ["_mkl", "intel-mkl-src?/mkl-static-lp64-iomp"] 
 accelerate = ["dep:libc", "dep:accelerate-src"]
 metal = ["dep:metal", "dep:candle-metal-kernels", "dep:ug-metal"]

--- a/candle-core/benches/benchmarks/mod.rs
+++ b/candle-core/benches/benchmarks/mod.rs
@@ -39,7 +39,7 @@ impl BenchDevice for Device {
            Device::Cpu => {
                let cpu_type = if cfg!(feature = "accelerate") {
                    "accelerate"
-                } else if cfg!(feature = "mkl") {
+                } else if cfg!(feature = "_mkl") {
                    "mkl"
                } else {
                    "cpu"
--- a/candle-core/examples/basics.rs
+++ b/candle-core/examples/basics.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-core/examples/cuda_basics.rs
+++ b/candle-core/examples/cuda_basics.rs
@@ -1,7 +1,7 @@
 #[cfg(feature = "accelerate")]
 extern crate accelerate_src;

-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 use anyhow::Result;
--- a/candle-core/examples/cuda_sum_benchmark.rs
+++ b/candle-core/examples/cuda_sum_benchmark.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-core/examples/metal_basics.rs
+++ b/candle-core/examples/metal_basics.rs
@@ -1,7 +1,7 @@
 #[cfg(feature = "accelerate")]
 extern crate accelerate_src;

-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 use anyhow::Result;
--- a/candle-core/src/cpu_backend/mod.rs
+++ b/candle-core/src/cpu_backend/mod.rs
@@ -1246,7 +1246,7 @@ impl MatMul {
 impl Map2 for MatMul {
    const OP: &'static str = "mat_mul";

-    #[cfg(all(not(feature = "mkl"), not(feature = "accelerate")))]
+    #[cfg(all(not(feature = "_mkl"), not(feature = "accelerate")))]
    fn f<T: 'static + WithDType + num_traits::Num + Copy>(
        &self,
        lhs: &[T],
@@ -1411,7 +1411,7 @@ impl Map2 for MatMul {
        Ok(dst)
    }

-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
    fn f<T: 'static + WithDType + num_traits::Num + Copy>(
        &self,
        lhs: &[T],
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@@ -68,7 +68,7 @@ mod indexer;
 pub mod layout;
 #[cfg(feature = "metal")]
 pub mod metal_backend;
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 mod mkl;
 pub mod npy;
 pub mod op;
@@ -118,7 +118,7 @@ pub use metal_backend::{MetalDevice, MetalError, MetalStorage};
 #[cfg(not(feature = "metal"))]
 pub use dummy_metal_backend::{MetalDevice, MetalError, MetalStorage};

-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@@ -294,16 +294,16 @@ macro_rules! bin_op {
                $e(v1, v2)
            }

-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
            const F32_VEC: bool = true;
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
            const F64_VEC: bool = true;
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
            #[inline(always)]
            fn f32_vec(xs1: &[f32], xs2: &[f32], ys: &mut [f32]) {
                crate::mkl::$f32_vec(xs1, xs2, ys)
            }
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
            #[inline(always)]
            fn f64_vec(xs1: &[f64], xs2: &[f64], ys: &mut [f64]) {
                crate::mkl::$f64_vec(xs1, xs2, ys)
@@ -418,16 +418,16 @@ macro_rules! unary_op {
                todo!("no unary function for i64")
            }

-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
            const F32_VEC: bool = true;
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
            const F64_VEC: bool = true;
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
            #[inline(always)]
            fn f32_vec(xs: &[f32], ys: &mut [f32]) {
                crate::mkl::$f32_vec(xs, ys)
            }
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
            #[inline(always)]
            fn f64_vec(xs: &[f64], ys: &mut [f64]) {
                crate::mkl::$f64_vec(xs, ys)
@@ -518,19 +518,19 @@ impl UnaryOpT for Gelu {
    }
    const KERNEL: &'static str = "ugelu";

-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
    const F32_VEC: bool = true;

-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
    #[inline(always)]
    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
        crate::mkl::vs_gelu(xs, ys)
    }

-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
    const F64_VEC: bool = true;

-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
    #[inline(always)]
    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
        crate::mkl::vd_gelu(xs, ys)
@@ -625,19 +625,19 @@ impl UnaryOpT for Silu {
    }
    const KERNEL: &'static str = "usilu";

-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
    const F32_VEC: bool = true;

-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
    #[inline(always)]
    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
        crate::mkl::vs_silu(xs, ys)
    }

-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
    const F64_VEC: bool = true;

-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
    #[inline(always)]
    fn f64_vec(xs: &[f64], ys: &mut [f64]) {
        crate::mkl::vd_silu(xs, ys)
--- a/candle-core/src/pickle.rs
+++ b/candle-core/src/pickle.rs
@@ -45,6 +45,7 @@ pub enum OpCode {
    BinFloat = b'G',
    Append = b'a',
    Appends = b'e',
+    Long1 = 0x8a,
 }

 // Avoid using FromPrimitive so as not to drag another dependency.
@@ -84,6 +85,7 @@ impl TryFrom<u8> for OpCode {
            b'G' => Ok(Self::BinFloat),
            b'a' => Ok(Self::Append),
            b'e' => Ok(Self::Appends),
+            0x8a => Ok(Self::Long1),
            value => Err(value),
        }
    }
@@ -106,6 +108,7 @@ pub enum Object {
        class_name: String,
    },
    Int(i32),
+    Long(i64),
    Float(f64),
    Unicode(String),
    Bool(bool),
@@ -170,6 +173,14 @@ impl Object {
        }
    }

+    pub fn int_or_long(self) -> OResult<i64> {
+        match self {
+            Self::Int(t) => Ok(t as i64),
+            Self::Long(t) => Ok(t),
+            _ => Err(self),
+        }
+    }
+
    pub fn tuple(self) -> OResult<Vec<Self>> {
        match self {
            Self::Tuple(t) => Ok(t),
@@ -590,6 +601,15 @@ impl Stack {
                let obj = self.new_obj(class, args)?;
                self.push(obj)
            }
+            OpCode::Long1 => {
+                let n_bytes = r.read_u8()?;
+                let mut v = 0;
+                // Decode the next n bytes in little endian
+                for i in 0..n_bytes {
+                    v |= (r.read_u8()? as i64) << (i * 8);
+                }
+                self.push(Object::Long(v))
+            }
        }
        Ok(false)
    }
@@ -607,10 +627,10 @@ fn rebuild_args(args: Object) -> Result<(Layout, DType, String, usize)> {
    let mut args = args.tuple()?;
    let stride = Vec::<usize>::try_from(args.remove(3))?;
    let size = Vec::<usize>::try_from(args.remove(2))?;
-    let offset = args.remove(1).int()? as usize;
+    let offset = args.remove(1).int_or_long()? as usize;
    let storage = args.remove(0).persistent_load()?;
    let mut storage = storage.tuple()?;
-    let storage_size = storage.remove(4).int()? as usize;
+    let storage_size = storage.remove(4).int_or_long()? as usize;
    let path = storage.remove(2).unicode()?;
    let (_module_name, class_name) = storage.remove(1).class()?;
    let dtype = match class_name.as_str() {
@@ -624,7 +644,11 @@ fn rebuild_args(args: Object) -> Result<(Layout, DType, String, usize)> {
            crate::bail!("unsupported storage type {other}")
        }
    };
-    let layout = Layout::new(crate::Shape::from(size), stride, offset);
+    let layout = Layout::new(
+        crate::Shape::from(size),
+        stride,
+        offset * dtype.size_in_bytes(),
+    );
    Ok((layout, dtype, path, storage_size))
 }

--- a/candle-core/src/utils.rs
+++ b/candle-core/src/utils.rs
@@ -17,7 +17,7 @@ pub fn has_accelerate() -> bool {
 }

 pub fn has_mkl() -> bool {
-    cfg!(feature = "mkl")
+    cfg!(feature = "_mkl")
 }

 pub fn cuda_is_available() -> bool {
--- a/candle-examples/examples/gemma/main.rs
+++ b/candle-examples/examples/gemma/main.rs
@@ -50,6 +50,8 @@ enum Which {
    InstructV2_9B,
    #[value(name = "3-1b")]
    BaseV3_1B,
+    #[value(name = "3-1b-it")]
+    InstructV3_1B,
 }

 enum Model {
@@ -272,6 +274,7 @@ fn main() -> Result<()> {
            Which::BaseV2_9B => "google/gemma-2-9b".to_string(),
            Which::InstructV2_9B => "google/gemma-2-9b-it".to_string(),
            Which::BaseV3_1B => "google/gemma-3-1b-pt".to_string(),
+            Which::InstructV3_1B => "google/gemma-3-1b-it".to_string(),
        },
    };
    let repo = api.repo(Repo::with_revision(
@@ -292,13 +295,10 @@ fn main() -> Result<()> {
            .split(',')
            .map(std::path::PathBuf::from)
            .collect::<Vec<_>>(),
-        None => {
-            if args.which == Which::BaseV3_1B {
-                vec![repo.get("model.safetensors")?]
-            } else {
-                candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?
-            }
-        }
+        None => match args.which {
+            Which::BaseV3_1B | Which::InstructV3_1B => vec![repo.get("model.safetensors")?],
+            _ => candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?,
+        },
    };
    println!("retrieved the files in {:?}", start.elapsed());
    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
@@ -331,7 +331,7 @@ fn main() -> Result<()> {
            let model = Model2::new(args.use_flash_attn, &config, vb)?;
            Model::V2(model)
        }
-        Which::BaseV3_1B => {
+        Which::BaseV3_1B | Which::InstructV3_1B => {
            let config: Config3 = serde_json::from_reader(std::fs::File::open(config_filename)?)?;
            let model = Model3::new(args.use_flash_attn, &config, vb)?;
            Model::V3(model)
--- a/candle-examples/examples/mnist-training/main.rs
+++ b/candle-examples/examples/mnist-training/main.rs
@@ -7,6 +7,7 @@ extern crate accelerate_src;

 use clap::{Parser, ValueEnum};
 use rand::prelude::*;
+use rand::rng;

 use candle::{DType, Result, Tensor, D};
 use candle_nn::{loss, ops, Conv2d, Linear, Module, ModuleT, Optimizer, VarBuilder, VarMap};
@@ -138,7 +139,7 @@ fn training_loop_cnn(
    let mut batch_idxs = (0..n_batches).collect::<Vec<usize>>();
    for epoch in 1..args.epochs {
        let mut sum_loss = 0f32;
-        batch_idxs.shuffle(&mut thread_rng());
+        batch_idxs.shuffle(&mut rng());
        for batch_idx in batch_idxs.iter() {
            let train_images = train_images.narrow(0, batch_idx * BSIZE, BSIZE)?;
            let train_labels = train_labels.narrow(0, batch_idx * BSIZE, BSIZE)?;
--- a/candle-examples/examples/reinforcement-learning/ddpg.rs
+++ b/candle-examples/examples/reinforcement-learning/ddpg.rs
@@ -5,7 +5,7 @@ use candle_nn::{
    func, linear, sequential::seq, Activation, AdamW, Optimizer, ParamsAdamW, Sequential,
    VarBuilder, VarMap,
 };
-use rand::{distributions::Uniform, thread_rng, Rng};
+use rand::{distr::Uniform, rng, Rng};

 use super::gym_env::GymEnv;

@@ -103,8 +103,8 @@ impl ReplayBuffer {
        if self.size < batch_size {
            Ok(None)
        } else {
-            let transitions: Vec<&Transition> = thread_rng()
-                .sample_iter(Uniform::from(0..self.size))
+            let transitions: Vec<&Transition> = rng()
+                .sample_iter(Uniform::try_from(0..self.size).map_err(Error::wrap)?)
                .take(batch_size)
                .map(|i| self.buffer.get(i).unwrap())
                .collect();
@@ -498,11 +498,11 @@ pub fn run() -> Result<()> {
        OuNoise::new(MU, THETA, SIGMA, size_action)?,
    )?;

-    let mut rng = rand::thread_rng();
+    let mut rng = rand::rng();

    for episode in 0..MAX_EPISODES {
        // let mut state = env.reset(episode as u64)?;
-        let mut state = env.reset(rng.gen::<u64>())?;
+        let mut state = env.reset(rng.random::<u64>())?;

        let mut total_reward = 0.0;
        for _ in 0..EPISODE_LENGTH {
@@ -538,7 +538,7 @@ pub fn run() -> Result<()> {
    agent.train = false;
    for episode in 0..10 {
        // let mut state = env.reset(episode as u64)?;
-        let mut state = env.reset(rng.gen::<u64>())?;
+        let mut state = env.reset(rng.random::<u64>())?;
        let mut total_reward = 0.0;
        for _ in 0..EPISODE_LENGTH {
            let mut action = 2.0 * agent.actions(&state)?;
--- a/candle-examples/examples/reinforcement-learning/dqn.rs
+++ b/candle-examples/examples/reinforcement-learning/dqn.rs
@@ -1,9 +1,8 @@
 use std::collections::VecDeque;

-use rand::distributions::Uniform;
-use rand::{thread_rng, Rng};
+use rand::{distr::Uniform, rng, Rng};

-use candle::{DType, Device, Module, Result, Tensor};
+use candle::{DType, Device, Error, Module, Result, Tensor};
 use candle_nn::loss::mse;
 use candle_nn::{linear, seq, Activation, AdamW, Optimizer, VarBuilder, VarMap};

@@ -65,8 +64,8 @@ pub fn run() -> Result<()> {
        // fed to the model so that it performs a backward pass.
        if memory.len() > BATCH_SIZE {
            // Sample randomly from the memory.
-            let batch = thread_rng()
-                .sample_iter(Uniform::from(0..memory.len()))
+            let batch = rng()
+                .sample_iter(Uniform::try_from(0..memory.len()).map_err(Error::wrap)?)
                .take(BATCH_SIZE)
                .map(|i| memory.get(i).unwrap().clone())
                .collect::<Vec<_>>();
--- a/candle-examples/examples/reinforcement-learning/policy_gradient.rs
+++ b/candle-examples/examples/reinforcement-learning/policy_gradient.rs
@@ -4,7 +4,7 @@ use candle_nn::{
    linear, ops::log_softmax, ops::softmax, sequential::seq, Activation, AdamW, Optimizer,
    ParamsAdamW, VarBuilder, VarMap,
 };
-use rand::{distributions::Distribution, rngs::ThreadRng, Rng};
+use rand::{distr::Distribution, rngs::ThreadRng, Rng};

 fn new_model(
    input_shape: &[usize],
@@ -39,7 +39,7 @@ fn accumulate_rewards(steps: &[Step<i64>]) -> Vec<f64> {
 }

 fn weighted_sample(probs: Vec<f32>, rng: &mut ThreadRng) -> Result<usize> {
-    let distribution = rand::distributions::WeightedIndex::new(probs).map_err(Error::wrap)?;
+    let distribution = rand::distr::weighted::WeightedIndex::new(probs).map_err(Error::wrap)?;
    let mut rng = rng;
    Ok(distribution.sample(&mut rng))
 }
@@ -65,10 +65,10 @@ pub fn run() -> Result<()> {

    let mut optimizer = AdamW::new(varmap.all_vars(), optimizer_params)?;

-    let mut rng = rand::thread_rng();
+    let mut rng = rand::rng();

    for epoch_idx in 0..100 {
-        let mut state = env.reset(rng.gen::<u64>())?;
+        let mut state = env.reset(rng.random::<u64>())?;
        let mut steps: Vec<Step<i64>> = vec![];

        loop {
@@ -84,7 +84,7 @@ pub fn run() -> Result<()> {
            steps.push(step.copy_with_obs(&state));

            if step.terminated || step.truncated {
-                state = env.reset(rng.gen::<u64>())?;
+                state = env.reset(rng.random::<u64>())?;
                if steps.len() > 5000 {
                    break;
                }
--- a/candle-examples/examples/whisper-microphone/main.rs
+++ b/candle-examples/examples/whisper-microphone/main.rs
@@ -9,7 +9,7 @@ use candle::{Device, IndexOp, Tensor};
 use candle_nn::{ops::softmax, VarBuilder};
 use clap::{Parser, ValueEnum};
 use hf_hub::{api::sync::Api, Repo, RepoType};
-use rand::{distributions::Distribution, SeedableRng};
+use rand::{distr::Distribution, SeedableRng};
 use tokenizers::Tokenizer;

 mod multilingual;
@@ -204,7 +204,7 @@ impl Decoder {
            let next_token = if t > 0f64 {
                let prs = softmax(&(&logits / t)?, 0)?;
                let logits_v: Vec<f32> = prs.to_vec1()?;
-                let distr = rand::distributions::WeightedIndex::new(&logits_v)?;
+                let distr = rand::distr::weighted::WeightedIndex::new(&logits_v)?;
                distr.sample(&mut self.rng) as u32
            } else {
                let logits_v: Vec<f32> = logits.to_vec1()?;
--- a/candle-examples/examples/whisper/main.rs
+++ b/candle-examples/examples/whisper/main.rs
@@ -14,7 +14,9 @@ use candle::{Device, IndexOp, Tensor};
 use candle_nn::{ops::softmax, VarBuilder};
 use clap::{Parser, ValueEnum};
 use hf_hub::{api::sync::Api, Repo, RepoType};
-use rand::{distributions::Distribution, SeedableRng};
+use rand::distr::weighted::WeightedIndex;
+use rand::distr::Distribution;
+use rand::SeedableRng;
 use tokenizers::Tokenizer;

 mod multilingual;
@@ -208,7 +210,7 @@ impl Decoder {
            let next_token = if t > 0f64 {
                let prs = softmax(&(&logits / t)?, 0)?;
                let logits_v: Vec<f32> = prs.to_vec1()?;
-                let distr = rand::distributions::WeightedIndex::new(&logits_v)?;
+                let distr = WeightedIndex::new(&logits_v)?;
                distr.sample(&mut self.rng) as u32
            } else {
                let logits_v: Vec<f32> = logits.to_vec1()?;
--- a/candle-flash-attn/build.rs
+++ b/candle-flash-attn/build.rs
@@ -88,19 +88,26 @@ fn main() -> Result<()> {
        .arg("--use_fast_math")
        .arg("--verbose");

+    let mut is_target_msvc = false;
    if let Ok(target) = std::env::var("TARGET") {
        if target.contains("msvc") {
+            is_target_msvc = true;
            builder = builder.arg("-D_USE_MATH_DEFINES");
        }
    }

+    if !is_target_msvc {
+        builder = builder.arg("-Xcompiler").arg("-fPIC");
+    }
+
    let out_file = build_dir.join("libflashattention.a");
    builder.build_lib(out_file);

    println!("cargo:rustc-link-search={}", build_dir.display());
    println!("cargo:rustc-link-lib=flashattention");
    println!("cargo:rustc-link-lib=dylib=cudart");
-    println!("cargo:rustc-link-lib=dylib=stdc++");
-
+    if !is_target_msvc {
+        println!("cargo:rustc-link-lib=dylib=stdc++");
+    }
    Ok(())
 }
--- a/candle-nn/Cargo.toml
+++ b/candle-nn/Cargo.toml
@@ -33,7 +33,8 @@ criterion = { workspace = true }
 default = []
 accelerate = ["dep:accelerate-src", "candle/accelerate"]
 cuda = ["candle/cuda"]
-mkl = ["dep:intel-mkl-src", "candle/mkl"]
+_mkl = ["dep:intel-mkl-src", "candle/_mkl"]
+mkl = ["candle/mkl"]
 metal = ["candle/metal", "dep:candle-metal-kernels", "dep:metal"]

 [[bench]]
--- a/candle-nn/benches/benchmarks/mod.rs
+++ b/candle-nn/benches/benchmarks/mod.rs
@@ -34,7 +34,7 @@ impl BenchDevice for Device {
            Device::Cpu => {
                let cpu_type = if cfg!(feature = "accelerate") {
                    "accelerate"
-                } else if cfg!(feature = "mkl") {
+                } else if cfg!(feature = "_mkl") {
                    "mkl"
                } else {
                    "cpu"
--- a/candle-nn/examples/basic_optimizer.rs
+++ b/candle-nn/examples/basic_optimizer.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-nn/examples/cpu_benchmarks.rs
+++ b/candle-nn/examples/cpu_benchmarks.rs
@@ -1,5 +1,5 @@
 /// This example contains some simple benchmarks so that it's easy to run them in perf etc.
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-nn/src/kv_cache.rs
+++ b/candle-nn/src/kv_cache.rs
@@ -11,6 +11,7 @@ pub struct Cache {
    all_data: Option<Tensor>,
    dim: usize,
    current_seq_len: usize,
+    grow_by: usize,
    max_seq_len: usize,
 }

@@ -20,6 +21,7 @@ impl Cache {
            all_data: None,
            dim,
            current_seq_len: 0,
+            grow_by: max_seq_len,
            max_seq_len,
        }
    }
@@ -65,11 +67,11 @@ impl Cache {
        };
        let ad = self.all_data.as_mut().unwrap();
        if self.current_seq_len + seq_len > self.max_seq_len {
-            candle::bail!(
-                "kv-cache: above max-seq-len {}+{seq_len}>{}",
-                self.current_seq_len,
-                self.max_seq_len
-            )
+            let mut shape = src.dims().to_vec();
+            shape[self.dim] = self.grow_by;
+            let next_ad = Tensor::zeros(shape, src.dtype(), src.device())?;
+            *ad = Tensor::cat(&[&*ad, &next_ad], self.dim)?;
+            self.max_seq_len += self.grow_by;
        }
        ad.slice_set(src, self.dim, self.current_seq_len)?;
        self.current_seq_len += seq_len;
--- a/candle-nn/tests/batch_norm.rs
+++ b/candle-nn/tests/batch_norm.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-nn/tests/group_norm.rs
+++ b/candle-nn/tests/group_norm.rs
@@ -18,7 +18,7 @@ t = torch.tensor(
 print(group_norm(t, num_groups=2))
 print(group_norm(t, num_groups=3))
 */
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-nn/tests/kv_cache.rs
+++ b/candle-nn/tests/kv_cache.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-nn/tests/layer_norm.rs
+++ b/candle-nn/tests/layer_norm.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-nn/tests/loss.rs
+++ b/candle-nn/tests/loss.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-nn/tests/ops.rs
+++ b/candle-nn/tests/ops.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-nn/tests/optim.rs
+++ b/candle-nn/tests/optim.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
--- a/candle-nn/tests/rnn.rs
+++ b/candle-nn/tests/rnn.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;

 #[cfg(feature = "accelerate")]
Author	SHA1	Message	Date
Nicolas Patry	2c0f6b008e	Fixing order.	2025-03-28 11:43:33 +01:00
Nicolas Patry	9862cd3ba2	Splitting the features to enable different mkl linking.	2025-03-28 10:13:13 +01:00
LongYinan	cb02b389d5	Fix reinforcement learning example (#2837)	2025-03-26 16:27:45 +01:00
Kyle Birnbaum	0d4097031c	fixed rand import for mnist-training (#2833)	2025-03-26 08:10:03 +01:00
Kyle Birnbaum	10853b803c	fixed rand imports for whisper-microphone example (#2834)	2025-03-26 08:09:27 +01:00
xkeyC	f3d472952f	fix: `candle-flash-attn` linux and `msvc` build (#2829) * fix: candle-flash-attn linux and msvc build * Missing newline at eof. --------- Co-authored-by: laurent <laurent.mazare@gmail.com>	2025-03-25 08:45:12 +01:00
Christian Balcom	67b85f79f1	Pickle decoder fix and Long1 opcode addition. (#2824) * Pickle decoder changes: added Long1 opcode, fixed tensor offset calculation * Apply rustfmt. --------- Co-authored-by: Laurent <laurent.mazare@gmail.com>	2025-03-23 08:10:08 +01:00
Benjamin Beurdouche	0b24f7f0a4	Fix for whisper example. rand::distribution is now rand::distr (#2811)	2025-03-16 19:14:55 +01:00
Laurent Mazare	3afb04925a	Allow for growing the default KV cache when needed. (#2810)	2025-03-16 17:30:25 +01:00
André Cipriani Bandarra	cbf5fc80c2	Add Gemma 3 1b IT toe Gemma examples (#2809) - Updates the Gemma example to include Gemma 3 1b instruction tuned.	2025-03-16 17:00:48 +01:00