Add the batcher. (#293)

2025-06-20 12:06:35 +00:00 · 2023-08-01 09:16:10 +01:00
parent fa98ca0c35
commit e1e8127f15
3 changed files with 111 additions and 18 deletions
--- a/candle-nn/src/dataset.rs
+++ b/candle-nn/src/dataset.rs
@ -0,0 +1,96 @@
+use candle::{Result, Tensor};
+
+/// Groups the items produced by an inner source into fixed-size batches.
+///
+/// The batching behaviour is configured with the builder-style methods
+/// `batch_size` and `return_last_incomplete_batch`.
+pub struct Batcher<I> {
+    /// The wrapped source of items to be batched.
+    inner: I,
+    /// Maximum number of items pulled from `inner` for a single batch.
+    batch_size: usize,
+    /// Whether a trailing batch holding fewer than `batch_size` items is returned
+    /// (`true`) or dropped (`false`) once `inner` is exhausted.
+    return_last_incomplete_batch: bool,
+}
+
+impl<I> Batcher<I> {
+    /// Wraps `inner` using the default configuration: a batch size of 16 and no
+    /// trailing incomplete batch.
+    fn new(inner: I) -> Self {
+        let batch_size = 16;
+        let return_last_incomplete_batch = false;
+        Self {
+            inner,
+            batch_size,
+            return_last_incomplete_batch,
+        }
+    }
+
+    /// Builder-style setter: returns the batcher with its batch size replaced by
+    /// `batch_size`.
+    pub fn batch_size(self, batch_size: usize) -> Self {
+        Self { batch_size, ..self }
+    }
+
+    /// Builder-style setter: when `r` is `true`, a final batch with fewer than
+    /// `batch_size` items is returned instead of being discarded.
+    pub fn return_last_incomplete_batch(self, r: bool) -> Self {
+        Self {
+            return_last_incomplete_batch: r,
+            ..self
+        }
+    }
+}
+
+/// Marker wrapper around an iterator that yields one `Tensor` per item.
+///
+/// Wrapping the iterator in a dedicated type lets `Batcher` carry an `Iterator`
+/// implementation specific to single-tensor items.
+pub struct Iter1<I>
+where
+    I: Iterator<Item = Tensor>,
+{
+    /// The wrapped iterator of tensors.
+    inner: I,
+}
+
+/// Marker wrapper around an iterator that yields a `(Tensor, Tensor)` pair per item.
+///
+/// Wrapping the iterator in a dedicated type lets `Batcher` carry an `Iterator`
+/// implementation specific to tensor-pair items.
+pub struct Iter2<I>
+where
+    I: Iterator<Item = (Tensor, Tensor)>,
+{
+    /// The wrapped iterator of tensor pairs.
+    inner: I,
+}
+
+impl<I> Batcher<Iter1<I>>
+where
+    I: Iterator<Item = Tensor>,
+{
+    /// Creates a batcher over an iterator of single tensors, with the default
+    /// configuration supplied by `Batcher::new`.
+    pub fn new1(inner: I) -> Self {
+        let wrapped = Iter1 { inner };
+        Self::new(wrapped)
+    }
+}
+
+impl<I> Batcher<Iter2<I>>
+where
+    I: Iterator<Item = (Tensor, Tensor)>,
+{
+    /// Creates a batcher over an iterator of tensor pairs, with the default
+    /// configuration supplied by `Batcher::new`.
+    pub fn new2(inner: I) -> Self {
+        let wrapped = Iter2 { inner };
+        Self::new(wrapped)
+    }
+}
+
+/// Yields batches built by passing up to `batch_size` consecutive tensors from the
+/// wrapped iterator to `Tensor::stack` with dimension argument 0.
+///
+/// Each item is the `Result` returned by `Tensor::stack`, so a stacking failure is
+/// handed to the caller as `Some(Err(_))`.
+///
+/// When the wrapped iterator runs out in the middle of a batch, the partial batch
+/// is returned only if `return_last_incomplete_batch` is set; otherwise the
+/// leftover items are dropped. Once nothing is left, `next` returns `None`.
+impl<I: Iterator<Item = Tensor>> Iterator for Batcher<Iter1<I>> {
+    type Item = Result<Tensor>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut items = Vec::with_capacity(self.batch_size);
+        for _i in 0..self.batch_size {
+            // We have two levels of inner here so that we can have two implementations of the
+            // Iterator trait that are different for Iter1 and Iter2. If Rust gets better
+            // specialization at some point we can get rid of this.
+            match self.inner.inner.next() {
+                Some(item) => items.push(item),
+                None => {
+                    // Only emit a trailing partial batch when it actually holds items.
+                    // Without the emptiness check, an exhausted source would make every
+                    // later call stack an empty slice and return `Some(..)`, so the
+                    // iterator would never terminate.
+                    if self.return_last_incomplete_batch && !items.is_empty() {
+                        break;
+                    }
+                    return None;
+                }
+            }
+        }
+        Some(Tensor::stack(&items, 0))
+    }
+}
+
+/// Yields batches of tensor pairs: the first and second elements of up to
+/// `batch_size` consecutive items are collected separately and each collection is
+/// passed to `Tensor::stack` with dimension argument 0.
+///
+/// Each item is a `Result`; if either stacking call fails the error is handed to
+/// the caller as `Some(Err(_))`, with the error from the first elements taking
+/// precedence.
+///
+/// When the wrapped iterator runs out in the middle of a batch, the partial batch
+/// is returned only if `return_last_incomplete_batch` is set; otherwise the
+/// leftover items are dropped. Once nothing is left, `next` returns `None`.
+impl<I: Iterator<Item = (Tensor, Tensor)>> Iterator for Batcher<Iter2<I>> {
+    type Item = Result<(Tensor, Tensor)>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut xs = Vec::with_capacity(self.batch_size);
+        let mut ys = Vec::with_capacity(self.batch_size);
+        for _i in 0..self.batch_size {
+            match self.inner.inner.next() {
+                Some((x, y)) => {
+                    xs.push(x);
+                    ys.push(y);
+                }
+                None => {
+                    // `xs` and `ys` always have the same length, so checking one is
+                    // enough. Only emit a trailing partial batch when it holds items;
+                    // without this check an exhausted source would make every later
+                    // call stack empty slices and return `Some(..)`, so the iterator
+                    // would never terminate.
+                    if self.return_last_incomplete_batch && !xs.is_empty() {
+                        break;
+                    }
+                    return None;
+                }
+            }
+        }
+        let xs = Tensor::stack(&xs, 0);
+        let ys = Tensor::stack(&ys, 0);
+        // Combine the two results, surfacing the `xs` error first if both failed.
+        Some(xs.and_then(|xs| ys.map(|ys| (xs, ys))))
+    }
+}
--- a/candle-nn/src/lib.rs
+++ b/candle-nn/src/lib.rs
@ -2,6 +2,7 @@
 // error type if needed or add some specialized cases on the candle-core side.
 pub mod activation;
 pub mod conv;
+pub mod dataset;
 pub mod embedding;
 pub mod init;
 pub mod layer_norm;