Add more to the binary operators.

2025-06-17 11:08:52 +00:00 · 2023-06-20 09:49:40 +01:00
parent 7a31ba93e4
commit 786544292d
4 changed files with 129 additions and 14 deletions
--- a/src/error.rs
+++ b/src/error.rs
@ -1,4 +1,4 @@
-use crate::{DType, Shape};
+use crate::{DType, Device, Shape};

 /// Main library error type.
 #[derive(thiserror::Error, Debug)]
@ -13,6 +13,20 @@ pub enum Error {
        op: &'static str,
    },

+    #[error("device mismatch in {op}, lhs: {lhs:?}, rhs: {rhs:?}")]
+    DeviceMismatchBinaryOp {
+        lhs: Device,
+        rhs: Device,
+        op: &'static str,
+    },
+
+    #[error("dtype mismatch in {op}, lhs: {lhs:?}, rhs: {rhs:?}")]
+    DTypeMismatchBinaryOp {
+        lhs: DType,
+        rhs: DType,
+        op: &'static str,
+    },
+
    #[error("unexpected rank, expected: {expected}, got: {got} ({shape:?})")]
    UnexpectedNumberOfDims {
        expected: usize,
--- a/src/shape.rs
+++ b/src/shape.rs
@ -144,6 +144,7 @@ impl Shape {
    pub(crate) fn stride_contiguous(&self) -> Vec<usize> {
        self.0
            .iter()
+            .rev()
            .scan(1, |prod, u| {
                let prod_pre_mult = *prod;
                *prod *= u;
--- a/src/storage.rs
+++ b/src/storage.rs
@ -1,4 +1,4 @@
-use crate::{DType, Device};
+use crate::{DType, Device, Error, Result, Shape};

 // TODO: Think about whether we would be better off with a dtype and
 // a buffer as an owned slice of bytes.
@ -35,4 +35,75 @@ impl Storage {
            Self::Cpu(storage) => storage.dtype(),
        }
    }
+
+    pub(crate) fn same_device(&self, rhs: &Self, op: &'static str) -> Result<()> {
+        let lhs = self.device();
+        let rhs = rhs.device();
+        if lhs != rhs {
+            Err(Error::DeviceMismatchBinaryOp { lhs, rhs, op })
+        } else {
+            Ok(())
+        }
+    }
+
+    pub(crate) fn same_dtype(&self, rhs: &Self, op: &'static str) -> Result<()> {
+        let lhs = self.dtype();
+        let rhs = rhs.dtype();
+        if lhs != rhs {
+            Err(Error::DTypeMismatchBinaryOp { lhs, rhs, op })
+        } else {
+            Ok(())
+        }
+    }
+
+    pub(crate) fn add_impl(
+        &self,
+        rhs: &Self,
+        shape: &Shape,
+        _lhs_stride: &[usize],
+        _rhs_stride: &[usize],
+    ) -> Result<Self> {
+        self.same_device(rhs, "add")?;
+        self.same_dtype(rhs, "add")?;
+        // The ggml implementation has different paths based on whether the rhs is contiguous
+        // or not, for now we only consider the general case but we should benchmark and do the
+        // same if it helps.
+        // https://github.com/ggerganov/llama.cpp/blob/aacdbd40562684665b6f7b8ba6695b7a2088bbb0/ggml.c#L7895
+        match (self, rhs) {
+            (Storage::Cpu(lhs), Storage::Cpu(rhs)) => match (lhs, rhs) {
+                (CpuStorage::F32(_), CpuStorage::F32(_)) => {
+                    let elem_count = shape.elem_count();
+                    let data = vec![0f32; elem_count];
+                    // TODO: properly fill data with the sum
+                    Ok(Storage::Cpu(CpuStorage::F32(data)))
+                }
+                (CpuStorage::F64(_), CpuStorage::F64(_)) => {
+                    let elem_count = shape.elem_count();
+                    let data = vec![0f64; elem_count];
+                    // TODO: properly fill data with the sum
+                    Ok(Storage::Cpu(CpuStorage::F64(data)))
+                }
+                _ => {
+                    // This should be covered by the dtype check above.
+                    Err(Error::DTypeMismatchBinaryOp {
+                        lhs: lhs.dtype(),
+                        rhs: rhs.dtype(),
+                        op: "add",
+                    })
+                }
+            },
+        }
+    }
+
+    pub(crate) fn mul_impl(
+        &self,
+        rhs: &Self,
+        _shape: &Shape,
+        _lhs_stride: &[usize],
+        _rhs_stride: &[usize],
+    ) -> Result<Self> {
+        self.same_device(rhs, "mul")?;
+        self.same_dtype(rhs, "mul")?;
+        todo!()
+    }
 }
--- a/src/tensor.rs
+++ b/src/tensor.rs
@ -2,7 +2,7 @@ use crate::{op::Op, storage::Storage, DType, Device, Error, Result, Shape};
 use std::sync::Arc;

 #[allow(dead_code)]
-pub(crate) struct Tensor_ {
+pub struct Tensor_ {
    storage: Storage,
    shape: Shape,
    // The strides are given in number of elements and not in bytes.
@ -10,8 +10,17 @@ pub(crate) struct Tensor_ {
    op: Option<Op>,
 }

+#[derive(Clone)]
 pub struct Tensor(Arc<Tensor_>);

+impl std::ops::Deref for Tensor {
+    type Target = Tensor_;
+
+    fn deref(&self) -> &Self::Target {
+        self.0.as_ref()
+    }
+}
+
 impl std::fmt::Debug for Tensor {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "[{:?}, {:?}]", &self.shape().dims(), self.device())
@ -45,7 +54,7 @@ impl Tensor {
        Ok(Self(Arc::new(tensor_)))
    }

-    pub(crate) fn same_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<()> {
+    pub(crate) fn same_shape_binary_op(&self, rhs: &Self, op: &'static str) -> Result<&Shape> {
        let lhs = self.shape();
        let rhs = rhs.shape();
        if lhs != rhs {
@ -55,18 +64,38 @@ impl Tensor {
                op,
            })
        } else {
-            Ok(())
+            Ok(lhs)
        }
    }

+    // TODO: Also make an inplace version or a pre-allocated? This could be tricky
+    // if this can create cycles in the compute graph.
    pub fn add(&self, rhs: &Self) -> Result<Self> {
-        self.same_shape_binary_op(rhs, "add")?;
-        todo!()
+        let shape = self.same_shape_binary_op(rhs, "add")?;
+        let storage = self
+            .storage
+            .add_impl(&rhs.storage, shape, self.stride(), rhs.stride())?;
+        let tensor_ = Tensor_ {
+            storage,
+            shape: shape.clone(),
+            stride: shape.stride_contiguous(),
+            op: Some(Op::Add(self.clone(), rhs.clone())),
+        };
+        Ok(Self(Arc::new(tensor_)))
    }

    pub fn mul(&self, rhs: &Self) -> Result<Self> {
-        self.same_shape_binary_op(rhs, "mul")?;
-        todo!()
+        let shape = self.same_shape_binary_op(rhs, "mul")?;
+        let storage = self
+            .storage
+            .mul_impl(&rhs.storage, shape, self.stride(), rhs.stride())?;
+        let tensor_ = Tensor_ {
+            storage,
+            shape: shape.clone(),
+            stride: shape.stride_contiguous(),
+            op: Some(Op::Mul(self.clone(), rhs.clone())),
+        };
+        Ok(Self(Arc::new(tensor_)))
    }

    pub fn to_scalar<S: crate::WithDType>(&self) -> Result<S> {
@ -77,7 +106,7 @@ impl Tensor {
                shape: self.shape().clone(),
            });
        }
-        match &self.0.storage {
+        match &self.storage {
            Storage::Cpu(cpu_storage) => {
                let data = S::cpu_storage_as_slice(cpu_storage)?;
                Ok(data[0])
@ -96,15 +125,15 @@ impl Tensor {
    }

    pub fn dtype(&self) -> DType {
-        self.0.storage.dtype()
+        self.storage.dtype()
    }

    pub fn device(&self) -> Device {
-        self.0.storage.device()
+        self.storage.device()
    }

    pub fn shape(&self) -> &Shape {
-        &self.0.shape
+        &self.shape
    }

    pub fn dims(&self) -> &[usize] {
@ -112,7 +141,7 @@ impl Tensor {
    }

    pub fn stride(&self) -> &[usize] {
-        &self.0.stride
+        &self.stride
    }

    pub fn rank(&self) -> usize {