diff --git a/Cargo.toml b/Cargo.toml
index cd597eb4..44bc5412 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -51,7 +51,7 @@ half = { version = "2.5.0", features = ["num-traits", "use-intrinsics", "rand_di
 hound = "3.5.1"
 image = { version = "0.25.2", default-features = false, features = ["jpeg", "png"] }
 imageproc = { version = "0.24.0", default-features = false }
-intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
+intel-mkl-src = { version = "0.8.1" }
 libc = { version = "0.2.147" }
 log = "0.4"
 memmap2 = { version = "0.9.3", features = ["stable_deref_trait"] }
diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml
index d5d5bde0..66e456ed 100644
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@@ -45,7 +45,8 @@ criterion = { workspace = true }
 default = []
 cuda = ["cudarc", "dep:candle-kernels", "dep:ug-cuda"]
 cudnn = ["cuda", "cudarc/cudnn"]
-mkl = ["dep:libc", "dep:intel-mkl-src"]
+_mkl = ["dep:libc", "dep:intel-mkl-src"]
+mkl = ["_mkl", "intel-mkl-src?/mkl-static-lp64-iomp"] # `_mkl` gates the code; `mkl` additionally requests this intel-mkl-src feature
 accelerate = ["dep:libc", "dep:accelerate-src"]
 metal = ["dep:metal", "dep:candle-metal-kernels", "dep:ug-metal"]
 
diff --git a/candle-core/benches/benchmarks/mod.rs b/candle-core/benches/benchmarks/mod.rs
index 721b292d..66597ae1 100644
--- a/candle-core/benches/benchmarks/mod.rs
+++ b/candle-core/benches/benchmarks/mod.rs
@@ -39,7 +39,7 @@ impl BenchDevice for Device {
             Device::Cpu => {
                 let cpu_type = if cfg!(feature = "accelerate") {
                     "accelerate"
-                } else if cfg!(feature = "mkl") {
+                } else if cfg!(feature = "_mkl") {
                     "mkl"
                 } else {
                     "cpu"
diff --git a/candle-core/examples/basics.rs b/candle-core/examples/basics.rs
index fe15187b..5cce0c6b 100644
--- a/candle-core/examples/basics.rs
+++ b/candle-core/examples/basics.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-core/examples/cuda_basics.rs b/candle-core/examples/cuda_basics.rs
index 9af1b006..c09365b6 100644
--- a/candle-core/examples/cuda_basics.rs
+++ b/candle-core/examples/cuda_basics.rs
@@ -1,7 +1,7 @@
 #[cfg(feature = "accelerate")]
 extern crate accelerate_src;
 
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 use anyhow::Result;
diff --git a/candle-core/examples/cuda_sum_benchmark.rs b/candle-core/examples/cuda_sum_benchmark.rs
index d6d182e8..f7cf97ec 100644
--- a/candle-core/examples/cuda_sum_benchmark.rs
+++ b/candle-core/examples/cuda_sum_benchmark.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-core/examples/metal_basics.rs b/candle-core/examples/metal_basics.rs
index f9ff81ad..44fe98da 100644
--- a/candle-core/examples/metal_basics.rs
+++ b/candle-core/examples/metal_basics.rs
@@ -1,7 +1,7 @@
 #[cfg(feature = "accelerate")]
 extern crate accelerate_src;
 
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 use anyhow::Result;
diff --git a/candle-core/src/cpu_backend/mod.rs b/candle-core/src/cpu_backend/mod.rs
index 612359f4..008ff887 100644
--- a/candle-core/src/cpu_backend/mod.rs
+++ b/candle-core/src/cpu_backend/mod.rs
@@ -1246,7 +1246,7 @@ impl MatMul {
 impl Map2 for MatMul {
     const OP: &'static str = "mat_mul";
 
-    #[cfg(all(not(feature = "mkl"), not(feature = "accelerate")))]
+    #[cfg(all(not(feature = "_mkl"), not(feature = "accelerate")))]
     fn f<T: 'static + WithDType + num_traits::Num + Copy>(
         &self,
         lhs: &[T],
@@ -1411,7 +1411,7 @@ impl Map2 for MatMul {
         Ok(dst)
     }
 
-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
     fn f<T: 'static + WithDType + num_traits::Num + Copy>(
         &self,
         lhs: &[T],
diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs
index 16dc8e02..acdd3461 100644
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@@ -68,7 +68,7 @@ mod indexer;
 pub mod layout;
 #[cfg(feature = "metal")]
 pub mod metal_backend;
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 mod mkl;
 pub mod npy;
 pub mod op;
@@ -118,7 +118,7 @@ pub use metal_backend::{MetalDevice, MetalError, MetalStorage};
 #[cfg(not(feature = "metal"))]
 pub use dummy_metal_backend::{MetalDevice, MetalError, MetalStorage};
 
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-core/src/op.rs b/candle-core/src/op.rs
index c5fc3fc4..501b7843 100644
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@@ -294,16 +294,16 @@ macro_rules! bin_op {
                 $e(v1, v2)
             }
 
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
             const F32_VEC: bool = true;
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
             const F64_VEC: bool = true;
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
             #[inline(always)]
             fn f32_vec(xs1: &[f32], xs2: &[f32], ys: &mut [f32]) {
                 crate::mkl::$f32_vec(xs1, xs2, ys)
             }
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
             #[inline(always)]
             fn f64_vec(xs1: &[f64], xs2: &[f64], ys: &mut [f64]) {
                 crate::mkl::$f64_vec(xs1, xs2, ys)
@@ -418,16 +418,16 @@ macro_rules! unary_op {
                 todo!("no unary function for i64")
             }
 
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
             const F32_VEC: bool = true;
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
             const F64_VEC: bool = true;
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
             #[inline(always)]
             fn f32_vec(xs: &[f32], ys: &mut [f32]) {
                 crate::mkl::$f32_vec(xs, ys)
             }
-            #[cfg(feature = "mkl")]
+            #[cfg(feature = "_mkl")]
             #[inline(always)]
             fn f64_vec(xs: &[f64], ys: &mut [f64]) {
                 crate::mkl::$f64_vec(xs, ys)
@@ -518,19 +518,19 @@ impl UnaryOpT for Gelu {
     }
     const KERNEL: &'static str = "ugelu";
 
-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
     const F32_VEC: bool = true;
 
-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
     #[inline(always)]
     fn f32_vec(xs: &[f32], ys: &mut [f32]) {
         crate::mkl::vs_gelu(xs, ys)
     }
 
-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
     const F64_VEC: bool = true;
 
-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
     #[inline(always)]
     fn f64_vec(xs: &[f64], ys: &mut [f64]) {
         crate::mkl::vd_gelu(xs, ys)
@@ -625,19 +625,19 @@ impl UnaryOpT for Silu {
     }
     const KERNEL: &'static str = "usilu";
 
-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
     const F32_VEC: bool = true;
 
-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
     #[inline(always)]
     fn f32_vec(xs: &[f32], ys: &mut [f32]) {
         crate::mkl::vs_silu(xs, ys)
     }
 
-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
     const F64_VEC: bool = true;
 
-    #[cfg(feature = "mkl")]
+    #[cfg(feature = "_mkl")]
     #[inline(always)]
     fn f64_vec(xs: &[f64], ys: &mut [f64]) {
         crate::mkl::vd_silu(xs, ys)
diff --git a/candle-core/src/utils.rs b/candle-core/src/utils.rs
index aa4d2705..9e0a9026 100644
--- a/candle-core/src/utils.rs
+++ b/candle-core/src/utils.rs
@@ -17,7 +17,7 @@ pub fn has_accelerate() -> bool {
 }
 
 pub fn has_mkl() -> bool {
-    cfg!(feature = "mkl")
+    cfg!(feature = "_mkl")
 }
 
 pub fn cuda_is_available() -> bool {
diff --git a/candle-nn/Cargo.toml b/candle-nn/Cargo.toml
index e62f4c32..c3f3cba0 100644
--- a/candle-nn/Cargo.toml
+++ b/candle-nn/Cargo.toml
@@ -33,7 +33,8 @@ criterion = { workspace = true }
 default = []
 accelerate = ["dep:accelerate-src", "candle/accelerate"]
 cuda = ["candle/cuda"]
-mkl = ["dep:intel-mkl-src", "candle/mkl"]
+mkl = ["_mkl", "candle/mkl"] # imply the local `_mkl` so cfg(feature = "_mkl") gates in this crate stay active under `mkl`
+_mkl = ["dep:intel-mkl-src", "candle/_mkl"]
 metal = ["candle/metal", "dep:candle-metal-kernels", "dep:metal"]
 
 [[bench]]
diff --git a/candle-nn/benches/benchmarks/mod.rs b/candle-nn/benches/benchmarks/mod.rs
index a34d8884..3620cc04 100644
--- a/candle-nn/benches/benchmarks/mod.rs
+++ b/candle-nn/benches/benchmarks/mod.rs
@@ -34,7 +34,7 @@ impl BenchDevice for Device {
             Device::Cpu => {
                 let cpu_type = if cfg!(feature = "accelerate") {
                     "accelerate"
-                } else if cfg!(feature = "mkl") {
+                } else if cfg!(feature = "_mkl") {
                     "mkl"
                 } else {
                     "cpu"
diff --git a/candle-nn/examples/basic_optimizer.rs b/candle-nn/examples/basic_optimizer.rs
index 810f7a7a..d0d23ae1 100644
--- a/candle-nn/examples/basic_optimizer.rs
+++ b/candle-nn/examples/basic_optimizer.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-nn/examples/cpu_benchmarks.rs b/candle-nn/examples/cpu_benchmarks.rs
index 430316b8..b3d5bcca 100644
--- a/candle-nn/examples/cpu_benchmarks.rs
+++ b/candle-nn/examples/cpu_benchmarks.rs
@@ -1,5 +1,5 @@
 /// This example contains some simple benchmarks so that it's easy to run them in perf etc.
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-nn/tests/batch_norm.rs b/candle-nn/tests/batch_norm.rs
index 8ce49c92..3d3905b3 100644
--- a/candle-nn/tests/batch_norm.rs
+++ b/candle-nn/tests/batch_norm.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-nn/tests/group_norm.rs b/candle-nn/tests/group_norm.rs
index 8145a220..c5fde03c 100644
--- a/candle-nn/tests/group_norm.rs
+++ b/candle-nn/tests/group_norm.rs
@@ -18,7 +18,7 @@ t = torch.tensor(
 print(group_norm(t, num_groups=2))
 print(group_norm(t, num_groups=3))
 */
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-nn/tests/kv_cache.rs b/candle-nn/tests/kv_cache.rs
index b8d2ec48..42a34ad3 100644
--- a/candle-nn/tests/kv_cache.rs
+++ b/candle-nn/tests/kv_cache.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-nn/tests/layer_norm.rs b/candle-nn/tests/layer_norm.rs
index 30f598b3..51be5af7 100644
--- a/candle-nn/tests/layer_norm.rs
+++ b/candle-nn/tests/layer_norm.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-nn/tests/loss.rs b/candle-nn/tests/loss.rs
index ccfc029f..964b58ee 100644
--- a/candle-nn/tests/loss.rs
+++ b/candle-nn/tests/loss.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-nn/tests/ops.rs b/candle-nn/tests/ops.rs
index 6c66f39f..eea01a0a 100644
--- a/candle-nn/tests/ops.rs
+++ b/candle-nn/tests/ops.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-nn/tests/optim.rs b/candle-nn/tests/optim.rs
index 4eb14ed8..ee2faa5e 100644
--- a/candle-nn/tests/optim.rs
+++ b/candle-nn/tests/optim.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]
diff --git a/candle-nn/tests/rnn.rs b/candle-nn/tests/rnn.rs
index 498c9188..058a99b7 100644
--- a/candle-nn/tests/rnn.rs
+++ b/candle-nn/tests/rnn.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "mkl")]
+#[cfg(feature = "_mkl")]
 extern crate intel_mkl_src;
 
 #[cfg(feature = "accelerate")]