Rename the candle crate to candle-core (#301)

* Rename to candle-core.

* More candle-core renaming.
Author: Laurent Mazare
Date: 2023-08-02 08:20:22 +01:00
Committed by: GitHub
Parent: 6e33ff62d6
Commit: 51e51da896
23 changed files with 77 additions and 76 deletions

View File

@@ -5,8 +5,8 @@ We will now create the hello world of the ML world, building a model capable of
 Open `src/main.rs` and fill in this content:
 ```rust
-# extern crate candle;
-use candle::{DType, Device, Result, Tensor};
+# extern crate candle_core;
+use candle_core::{DType, Device, Result, Tensor};
 struct Model {
     first: Tensor,
@@ -49,8 +49,8 @@ Now that we have this, we might want to complexify things a bit, for instance by
 the classical `Linear` layer. We can do as such
 ```rust
-# extern crate candle;
-# use candle::{DType, Device, Result, Tensor};
+# extern crate candle_core;
+# use candle_core::{DType, Device, Result, Tensor};
 struct Linear{
     weight: Tensor,
     bias: Tensor,
@@ -79,8 +79,8 @@ impl Model {
 This will change the model running code into a new function
 ```rust
-# extern crate candle;
-# use candle::{DType, Device, Result, Tensor};
+# extern crate candle_core;
+# use candle_core::{DType, Device, Result, Tensor};
 # struct Linear{
 # weight: Tensor,
 # bias: Tensor,
@@ -144,9 +144,9 @@ cargo add --git https://github.com/LaurentMazare/candle.git candle-nn
 And rewrite our examples using it
 ```rust
-# extern crate candle;
+# extern crate candle_core;
 # extern crate candle_nn;
-use candle::{DType, Device, Result, Tensor};
+use candle_core::{DType, Device, Result, Tensor};
 use candle_nn::Linear;
 struct Model {

View File

@@ -1,5 +1,5 @@
 [package]
-name = "candle"
+name = "candle-core"
 version = "0.1.0"
 edition = "2021"

View File

@@ -2,7 +2,7 @@
 extern crate intel_mkl_src;
 use anyhow::Result;
-use candle::{Device, Tensor};
+use candle_core::{Device, Tensor};
 fn main() -> Result<()> {
     let a = Tensor::randn(0f32, 1., (2, 3), &Device::Cpu)?;

View File

@@ -2,7 +2,7 @@
 extern crate intel_mkl_src;
 use anyhow::Result;
-use candle::{Device, Tensor};
+use candle_core::{Device, Tensor};
 fn main() -> Result<()> {
     let device = Device::new_cuda(0)?;

View File

@@ -4,7 +4,7 @@ extern crate intel_mkl_src;
 use std::str::FromStr;
 use anyhow::Result;
-use candle::{Device, Tensor};
+use candle_core::{Device, Tensor};
 fn cos_sin(n: usize, device: &Device) -> Result<Tensor> {
     let thetas: Vec<_> = (0..n).map(|i| (i as f32 / n as f32)).collect();

View File

@@ -7,7 +7,7 @@ impl Tensor {
     /// Intended to be use by the trait `.i()`
     ///
     /// ```
-    /// # use candle::{Tensor, DType, Device, IndexOp};
+    /// # use candle_core::{Tensor, DType, Device, IndexOp};
     /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
     ///
     /// let c = a.i(0..1)?;
@@ -22,7 +22,7 @@ impl Tensor {
     /// let c = a.i((.., ..=2))?;
     /// assert_eq!(c.shape().dims(), &[2, 3]);
     ///
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     fn index(&self, indexers: &[TensorIndexer]) -> Result<Self, Error> {
         let mut x = self.clone();
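The `# Ok::<(), candle_core::Error>(())` line this commit renames in dozens of doc tests is the standard rustdoc idiom for using `?` without writing an explicit `fn main`: the hidden, turbofish-typed final expression gives the example body a concrete `Result` type. The same trick written as ordinary Rust, with a standard-library error type standing in for `candle_core::Error`:

```rust
// The closure plays the role of the doctest body: `?` works because the
// final turbofish expression pins the closure's Result type.
fn main() {
    let result = (|| {
        let n: i32 = "42".parse()?;
        assert_eq!(n, 42);
        Ok::<(), std::num::ParseIntError>(())
    })();
    assert!(result.is_ok());
}
```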

View File

@@ -1,8 +1,8 @@
 //! ML framework for Rust
 //!
 //! ```rust
-//! use candle::{Tensor, DType, Device};
-//! # use candle::Error;
+//! use candle_core::{Tensor, DType, Device};
+//! # use candle_core::Error;
 //! # fn main() -> Result<(), Error>{
 //!
 //! let a = Tensor::arange(0f32, 6f32, &Device::Cpu)?.reshape((2, 3))?;

View File

@@ -54,13 +54,13 @@ impl AsRef<Tensor> for Tensor {
 /// The core struct for manipulating tensors.
 ///
 /// ```rust
-/// use candle::{Tensor, DType, Device};
+/// use candle_core::{Tensor, DType, Device};
 ///
 /// let a = Tensor::arange(0f32, 6f32, &Device::Cpu)?.reshape((2, 3))?;
 /// let b = Tensor::arange(0f32, 12f32, &Device::Cpu)?.reshape((3, 4))?;
 ///
 /// let c = a.matmul(&b)?;
-/// # Ok::<(), candle::Error>(())
+/// # Ok::<(), candle_core::Error>(())
 /// ```
 ///
 /// Tensors are reference counted with [`Arc`] so cloning them is cheap.
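The context line above documents that `Tensor` clones are shallow `Arc` bumps. A minimal sketch of what that means in practice, using only calls that appear elsewhere in this diff:

```rust
use candle_core::{DType, Device, Result, Tensor};

fn main() -> Result<()> {
    let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
    // `clone` only bumps the reference count on the shared storage;
    // no tensor data is copied.
    let b = a.clone();
    assert_eq!(b.shape().dims(), &[2, 3]);
    Ok(())
}
```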
@@ -163,11 +163,11 @@ impl Tensor {
     /// Creates a new tensor filled with ones.
     ///
     /// ```rust
-    /// use candle::{Tensor, DType, Device};
+    /// use candle_core::{Tensor, DType, Device};
     /// let a = Tensor::ones((2, 3), DType::F32, &Device::Cpu)?;
     /// let b = Tensor::from_slice(&[1.0f32, 1.0, 1.0, 1.0, 1.0, 1.0], (2, 3), &Device::Cpu)?;
     /// // a == b
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn ones<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -> Result<Self> {
         Self::ones_impl(shape, dtype, device, false)
@@ -176,11 +176,11 @@ impl Tensor {
     /// Creates a new tensor filled with ones with same shape, dtype, and device as the other tensor.
     ///
     /// ```rust
-    /// use candle::{Tensor, DType, Device};
+    /// use candle_core::{Tensor, DType, Device};
     /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
     /// let b = a.ones_like()?;
     /// // b == a + 1
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn ones_like(&self) -> Result<Self> {
         Tensor::ones(self.shape(), self.dtype(), self.device())
@@ -208,11 +208,11 @@ impl Tensor {
     /// Creates a new tensor filled with zeros.
     ///
     /// ```rust
-    /// use candle::{Tensor, DType, Device};
+    /// use candle_core::{Tensor, DType, Device};
     /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
     /// let b = Tensor::from_slice(&[0.0f32, 0.0, 0.0, 0.0, 0.0, 0.0], (2, 3), &Device::Cpu)?;
     /// // a == b
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn zeros<S: Into<Shape>>(shape: S, dtype: DType, device: &Device) -> Result<Self> {
         Self::zeros_impl(shape, dtype, device, false)
@@ -222,11 +222,11 @@ impl Tensor {
     /// tensor.
     ///
     /// ```rust
-    /// use candle::{Tensor, DType, Device};
+    /// use candle_core::{Tensor, DType, Device};
     /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
     /// let b = a.zeros_like()?;
     /// // b is on CPU f32.
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn zeros_like(&self) -> Result<Self> {
         Tensor::zeros(self.shape(), self.dtype(), self.device())
@@ -516,11 +516,11 @@ impl Tensor {
     /// be performed.
     ///
     /// ```rust
-    /// use candle::{Tensor, Device};
+    /// use candle_core::{Tensor, Device};
     /// let a = Tensor::new(&[[0f32, 1.], [2., 3.]], &Device::Cpu)?;
     /// let a = a.affine(4., -2.)?;
    /// assert_eq!(a.to_vec2::<f32>()?, &[[-2.0, 2.0], [6.0, 10.0]]);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn affine(&self, mul: f64, add: f64) -> Result<Self> {
         let storage = self.storage().affine(self.layout(), mul, add)?;
@@ -642,7 +642,7 @@ impl Tensor {
     /// that the number of elements for each dimension index in `sum_dims` is 1.
     ///
     /// ```rust
-    /// use candle::{Tensor, Device};
+    /// use candle_core::{Tensor, Device};
     /// let a = Tensor::new(&[[0f32, 1.], [2., 3.]], &Device::Cpu)?;
     /// let s = a.sum_keepdim(0)?;
     /// assert_eq!(s.to_vec2::<f32>()?, &[[2., 4.]]);
@@ -650,7 +650,7 @@ impl Tensor {
     /// assert_eq!(s.to_vec2::<f32>()?, &[[1.], [5.]]);
     /// let s = a.sum_keepdim((0, 1))?;
     /// assert_eq!(s.to_vec2::<f32>()?, &[[6.]]);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn sum_keepdim<D: Dims>(&self, sum_dims: D) -> Result<Self> {
         self.sum_impl(sum_dims, true)
@@ -854,12 +854,12 @@ impl Tensor {
     /// vocabulary size, and `h` the hidden size.
     ///
     /// ```rust
-    /// use candle::{Tensor, Device};
+    /// use candle_core::{Tensor, Device};
     /// let values = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
     /// let ids = Tensor::new(&[2u32, 1u32, 2u32], &Device::Cpu)?;
     /// let emb = values.embedding(&ids)?;
     /// assert_eq!(emb.to_vec2::<f32>()?, &[[4., 5.], [2., 3.], [4., 5.]]);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn embedding(&self, ids: &Self) -> Result<Self> {
         if self.rank() != 2 || ids.rank() != 1 {
@@ -1191,11 +1191,11 @@ impl Tensor {
     /// scalar with zero dimensions.
     ///
     /// ```rust
-    /// use candle::{Tensor, Device};
+    /// use candle_core::{Tensor, Device};
     /// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
     /// let tensor = tensor.sum_all()?;
     /// assert_eq!(tensor.to_scalar::<f32>()?, 15.);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn sum_all(&self) -> Result<Tensor> {
         let dims: Vec<_> = (0..self.rank()).collect();
@@ -1252,11 +1252,11 @@ impl Tensor {
     /// Flattens the input tensor by reshaping it into a one dimension tensor.
     ///
     /// ```rust
-    /// use candle::{Tensor, Device};
+    /// use candle_core::{Tensor, Device};
     /// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
     /// let tensor = tensor.flatten_all()?;
     /// assert_eq!(tensor.to_vec1::<f32>()?, &[0., 1., 2., 3., 4., 5.]);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn flatten_all(&self) -> Result<Tensor> {
         self.flatten_(None::<usize>, None::<usize>)
@@ -1265,13 +1265,13 @@ impl Tensor {
     /// Returns the sub-tensor fixing the index at `i` on the first dimension.
     ///
     /// ```rust
-    /// use candle::{Tensor, Device};
+    /// use candle_core::{Tensor, Device};
     /// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
     /// let t = tensor.get(0)?;
     /// assert_eq!(t.to_vec1::<f32>()?, &[0., 1.]);
     /// let t = tensor.get(1)?;
     /// assert_eq!(t.to_vec1::<f32>()?, &[2., 3.]);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn get(&self, i: usize) -> Result<Tensor> {
         let dims = self.dims();
@@ -1286,11 +1286,11 @@ impl Tensor {
     /// input are swapped.
     ///
     /// ```rust
-    /// use candle::{Tensor, Device};
+    /// use candle_core::{Tensor, Device};
     /// let tensor = Tensor::new(&[[0f32, 1.], [2., 3.], [4., 5.]], &Device::Cpu)?;
     /// let tensor = tensor.t()?;
     /// assert_eq!(tensor.to_vec2::<f32>()?, &[[0.0, 2.0, 4.0], [1.0, 3.0, 5.0]]);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn t(&self) -> Result<Tensor> {
         let rank = self.rank();
@@ -1433,12 +1433,12 @@ impl Tensor {
     /// Casts the input tensor to the target `dtype`.
     ///
     /// ```rust
-    /// use candle::{Tensor, Device};
+    /// use candle_core::{Tensor, Device};
     /// let tensor = Tensor::new(3.14159265358979f64, &Device::Cpu)?;
     /// assert_eq!(tensor.to_scalar::<f64>()?, 3.14159265358979);
-    /// let tensor = tensor.to_dtype(candle::DType::F32)?;
+    /// let tensor = tensor.to_dtype(candle_core::DType::F32)?;
     /// assert_eq!(tensor.to_scalar::<f32>()?, 3.1415927);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn to_dtype(&self, dtype: DType) -> Result<Self> {
         if self.dtype() == dtype {
@@ -1483,7 +1483,7 @@ impl Tensor {
     /// a new storage and copies the data over, the returned tensor is always contiguous.
     ///
     /// ```rust
-    /// # use candle::{Tensor, DType, Device, D};
+    /// # use candle_core::{Tensor, DType, Device, D};
     /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
     ///
     /// let c = a.reshape((1, 6))?;
@@ -1491,7 +1491,7 @@ impl Tensor {
     ///
     /// let c = a.reshape((3, 2))?;
     /// assert_eq!(c.shape().dims(), &[3, 2]);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn reshape<S: Into<Shape>>(&self, shape: S) -> Result<Tensor> {
         let shape = shape.into();
@@ -1526,7 +1526,7 @@ impl Tensor {
     /// Creates a new tensor with the specified dimension removed if its size was one.
     ///
     /// ```rust
-    /// # use candle::{Tensor, DType, Device, D};
+    /// # use candle_core::{Tensor, DType, Device, D};
     /// let a = Tensor::zeros((2, 3, 1), DType::F32, &Device::Cpu)?;
     ///
     /// let c = a.squeeze(2)?;
@@ -1534,7 +1534,7 @@ impl Tensor {
     ///
     /// let c = a.squeeze(D::Minus1)?;
     /// assert_eq!(c.shape().dims(), &[2, 3]);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn squeeze<D: Dim>(&self, dim: D) -> Result<Self> {
         // The PyTorch semantics are to return the same tensor if the target dimension
@@ -1553,7 +1553,7 @@ impl Tensor {
     /// Creates a new tensor with a dimension of size one inserted at the specified position.
     ///
     /// ```rust
-    /// # use candle::{Tensor, DType, Device, D};
+    /// # use candle_core::{Tensor, DType, Device, D};
     /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
     ///
     /// let c = a.unsqueeze(0)?;
@@ -1561,7 +1561,7 @@ impl Tensor {
     ///
     /// let c = a.unsqueeze(D::Minus1)?;
     /// assert_eq!(c.shape().dims(), &[2, 3, 1]);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn unsqueeze<D: Dim>(&self, dim: D) -> Result<Self> {
         let mut dims = self.dims().to_vec();
@@ -1576,7 +1576,7 @@ impl Tensor {
     /// All tensors must have the same rank, and the output has one additional rank
     ///
     /// ```rust
-    /// # use candle::{Tensor, DType, Device};
+    /// # use candle_core::{Tensor, DType, Device};
     /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
     /// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
     ///
@@ -1585,7 +1585,7 @@ impl Tensor {
     ///
     /// let c = Tensor::stack(&[&a, &b], 2)?;
     /// assert_eq!(c.shape().dims(), &[2, 3, 2]);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn stack<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
         if args.is_empty() {
@@ -1605,7 +1605,7 @@ impl Tensor {
     /// the same rank
     ///
     /// ```rust
-    /// # use candle::{Tensor, DType, Device};
+    /// # use candle_core::{Tensor, DType, Device};
     /// let a = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
     /// let b = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
     ///
@@ -1614,7 +1614,7 @@ impl Tensor {
     ///
     /// let c = Tensor::cat(&[&a, &b], 1)?;
     /// assert_eq!(c.shape().dims(), &[2, 6]);
-    /// # Ok::<(), candle::Error>(())
+    /// # Ok::<(), candle_core::Error>(())
     /// ```
     pub fn cat<A: AsRef<Tensor>, D: Dim>(args: &[A], dim: D) -> Result<Self> {
         if args.is_empty() {

View File

@@ -1,6 +1,6 @@
-use candle::backend::BackendStorage;
-use candle::cpu_backend;
-use candle::{CpuStorage, CustomOp1, DType, Device, Error, Layout, Result, Shape, Tensor};
+use candle_core::backend::BackendStorage;
+use candle_core::cpu_backend;
+use candle_core::{CpuStorage, CustomOp1, DType, Device, Error, Layout, Result, Shape, Tensor};
 mod test_utils;
 use test_utils::to_vec1_round;
@@ -24,7 +24,7 @@ impl CustomOp1 for Elu {
     }
     fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Shape)> {
-        let storage = candle::map_dtype!(
+        let storage = candle_core::map_dtype!(
             "elu",
             s,
             |s| cpu_backend::unary_map(s, l, |v| fwd(v, self.alpha)),
@@ -67,7 +67,7 @@ impl CustomOp1 for EluBackward {
     }
     fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Shape)> {
-        let storage = candle::map_dtype!(
+        let storage = candle_core::map_dtype!(
             "elu-bwd",
             s,
             |s| cpu_backend::unary_map(s, l, |v| bwd(v, self.alpha)),
@@ -104,7 +104,7 @@ impl CustomOp1 for EluWithBackward {
 #[test]
 fn custom_op1_with_backward() -> Result<()> {
     let cpu = &Device::Cpu;
-    let t = candle::Var::new(&[-2f32, 0f32, 2f32], cpu)?;
+    let t = candle_core::Var::new(&[-2f32, 0f32, 2f32], cpu)?;
     let elu_t = t.custom_op1(EluWithBackward::new(2.))?;
     assert_eq!(to_vec1_round(&elu_t, 4)?, &[-1.7293, 0.0, 2.0]);

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use candle::{DType, Device::Cpu, Tensor};
+use candle_core::{DType, Device::Cpu, Tensor};
 #[test]
 fn display_scalar() -> Result<()> {

View File

@@ -1,5 +1,5 @@
 use anyhow::{Context, Result};
-use candle::{Device, Shape, Tensor, Var};
+use candle_core::{Device, Shape, Tensor, Var};
 mod test_utils;
 fn simple_grad(device: &Device) -> Result<()> {

View File

@@ -1,5 +1,5 @@
 use anyhow::Result;
-use candle::{Device, IndexOp, Tensor};
+use candle_core::{Device, IndexOp, Tensor};
 mod test_utils;

View File

@@ -1,5 +1,6 @@
 mod test_utils;
 use candle::{Device, IndexOp, Result, Tensor};
+use candle_core as candle;
 fn contiguous(device: &Device) -> Result<()> {
     let tensor = Tensor::arange(0u32, 24u32, device)?.reshape((2, 3, 4))?;
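This hunk takes a different route from the other test files: instead of rewriting every `candle::` path, it aliases the renamed crate. A minimal sketch of the same idiom, with a hypothetical `zeros_len` helper built from calls that appear in this diff:

```rust
// One alias at the top of the file; every existing `candle::` path
// below it keeps compiling against the renamed crate.
use candle_core as candle;

use candle::{DType, Device, Result, Tensor};

fn zeros_len() -> Result<usize> {
    let t = Tensor::zeros((2, 3), DType::F32, &Device::Cpu)?;
    Ok(t.elem_count())
}

fn main() -> Result<()> {
    assert_eq!(zeros_len()?, 6);
    Ok(())
}
```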

View File

@@ -1,5 +1,5 @@
 mod test_utils;
-use candle::{DType, Device, IndexOp, Result, Tensor};
+use candle_core::{DType, Device, IndexOp, Result, Tensor};
 fn zeros(device: &Device) -> Result<()> {
     let tensor = Tensor::zeros((5, 2), DType::F32, device)?;

View File

@@ -1,6 +1,6 @@
 #![allow(dead_code)]
-use candle::{Result, Tensor};
+use candle_core::{Result, Tensor};
 #[macro_export]
 macro_rules! test_device {

View File

@@ -11,7 +11,7 @@ license = "MIT/Apache-2.0"
 readme = "README.md"
 [dependencies]
-candle = { path = "../candle-core" }
+candle = { path = "../candle-core", package = "candle-core" }
 candle-nn = { path = "../candle-nn" }
 candle-transformers = { path = "../candle-transformers" }
 candle-flash-attn = { path = "../candle-flash-attn", optional = true }
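The `package = "candle-core"` key is Cargo's dependency-renaming feature: the dependency is still exposed under the name on the left (`candle`) while the crate actually resolved is `candle-core`, which is why the example sources need no import rewrite in this commit. A sketch of what consumer code can keep doing under that manifest entry:

```rust
// With `candle = { path = "../candle-core", package = "candle-core" }`
// in Cargo.toml, source code still refers to the crate as `candle`.
use candle::{Device, Tensor};

fn main() -> candle::Result<()> {
    let a = Tensor::randn(0f32, 1., (2, 3), &Device::Cpu)?;
    println!("{a}");
    Ok(())
}
```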

View File

@@ -106,7 +106,7 @@ impl TensorParallelRowLinear {
         let rank = comm.rank();
         let size = comm.world_size();
         let weight = vb.get_sharded("weight", 1, rank, size)?;
-        Ok(Self::new(Linear::new(weight, None), comm.clone()))
+        Ok(Self::new(Linear::new(weight, None), comm))
     }
 }
@@ -296,8 +296,8 @@ impl CausalSelfAttention {
         let k = k.transpose(1, 2)?;
         let v = v.transpose(1, 2)?;
         let softmax_scale = 1f32 / (self.head_dim as f32).sqrt();
-        let y =
-            candle_flash_attn::flash_attn(q, k, v, softmax_scale, seq_len > 1)?.transpose(1, 2)?;
+        let y = candle_flash_attn::flash_attn(&q, &k, &v, softmax_scale, seq_len > 1)?
+            .transpose(1, 2)?;
         // Convert to contiguous as matmul doesn't support strided vs for now.
         let y = y.transpose(1, 2)?.reshape(&[b_sz, seq_len, n_embd])?;
         let y = self.o_proj.forward(&y)?;
@@ -363,7 +363,7 @@ impl Mlp {
     fn load(vb: VarBuilder, _cfg: &Config, comm: Rc<Comm>) -> Result<Self> {
         let c_fc1 = TensorParallelColumnLinear::load(vb.pp("gate_proj"), comm.clone())?;
         let c_fc2 = TensorParallelColumnLinear::load(vb.pp("up_proj"), comm.clone())?;
-        let c_proj = TensorParallelRowLinear::load(vb.pp("down_proj"), comm.clone())?;
+        let c_proj = TensorParallelRowLinear::load(vb.pp("down_proj"), comm)?;
         Ok(Self::new(c_fc1, c_fc2, c_proj))
     }
 }
@@ -396,7 +396,7 @@ impl Block {
     fn load(vb: VarBuilder, cache: &Cache, cfg: &Config, comm: Rc<Comm>) -> Result<Self> {
         let attn = CausalSelfAttention::load(vb.pp("self_attn"), cache, cfg, comm.clone())?;
-        let mlp = Mlp::load(vb.pp("mlp"), cfg, comm.clone())?;
+        let mlp = Mlp::load(vb.pp("mlp"), cfg, comm)?;
         let input_layernorm = RmsNorm::load(cfg.hidden_size, vb.pp("input_layernorm"))?;
         let post_attention_layernorm =
             RmsNorm::load(cfg.hidden_size, vb.pp("post_attention_layernorm"))?;
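The `comm.clone()` to `comm` changes above are a clippy-style cleanup: an `Rc` clone is only a reference-count bump, but the final use of the value can simply be moved. A standalone sketch of the pattern (the `Comm` type here is a hypothetical stand-in for the communicator in the diff):

```rust
use std::rc::Rc;

// Hypothetical stand-in for the communicator shared across layers.
struct Comm;

fn load_layer(_comm: Rc<Comm>) {}

fn main() {
    let comm = Rc::new(Comm);
    load_layer(comm.clone()); // earlier uses must clone (cheap refcount bump)
    load_layer(comm); // last use: move the Rc, no clone needed
}
```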

View File

@@ -11,7 +11,7 @@ license = "MIT/Apache-2.0"
 readme = "README.md"
 [dependencies]
-candle = { path = "../candle-core", features = ["cuda"] }
+candle = { path = "../candle-core", features = ["cuda"], package = "candle-core" }
 half = { version = "2.3.1", features = ["num-traits"] }
 [build-dependencies]

View File

@@ -11,7 +11,7 @@ license = "MIT/Apache-2.0"
 readme = "README.md"
 [dependencies]
-candle = { path = "../candle-core" }
+candle = { path = "../candle-core", package = "candle-core" }
 thiserror = { workspace = true }
 intel-mkl-src = { workspace = true, optional = true }
 safetensors = { workspace = true }

View File

@@ -16,7 +16,7 @@ crate-type = ["cdylib"]
 doc = false
 [dependencies]
-candle = { path = "../candle-core" }
+candle = { path = "../candle-core", package = "candle-core" }
 pyo3 = { version = "0.19.0", features = ["extension-module"] }
 half = { workspace = true }

View File

@@ -11,7 +11,7 @@ license = "MIT/Apache-2.0"
 readme = "README.md"
 [dependencies]
-candle = { path = "../candle-core" }
+candle = { path = "../candle-core", package = "candle-core" }
 hf-hub = { workspace = true }
 candle-nn = { path = "../candle-nn" }
 intel-mkl-src = { workspace = true, optional = true }

View File

@@ -11,7 +11,7 @@ license = "MIT/Apache-2.0"
 readme = "README.md"
 [dependencies]
-candle = { path = "../../candle-core" }
+candle = { path = "../../candle-core", package = "candle-core" }
 candle-nn = { path = "../../candle-nn" }
 num-traits = { workspace = true }

View File

@@ -11,7 +11,7 @@ license = "MIT/Apache-2.0"
 readme = "README.md"
 [dependencies]
-candle = { path = "../../candle-core" }
+candle = { path = "../../candle-core", package = "candle-core" }
 candle-nn = { path = "../../candle-nn" }
 num-traits = { workspace = true }
 tokenizers = { workspace = true, features = ["unstable_wasm"] }