From d1d89bac1f17c4cdd64582535bd1c6d7a5df845f Mon Sep 17 00:00:00 2001
From: drbh <drbh@users.noreply.github.com>
Date: Sun, 5 Nov 2023 04:55:49 -0500
Subject: [PATCH] feat: download cifar dataset parquet files (#1259)

---
 candle-datasets/src/vision/cifar.rs | 69 +++++++++++++++++++++++++++++-
 1 file changed, 68 insertions(+), 1 deletion(-)

diff --git a/candle-datasets/src/vision/cifar.rs b/candle-datasets/src/vision/cifar.rs
index 0683c4d2..4b403a2e 100644
--- a/candle-datasets/src/vision/cifar.rs
+++ b/candle-datasets/src/vision/cifar.rs
@@ -4,7 +4,9 @@
 //!
 //! The binary version of the dataset is used.
 use crate::vision::Dataset;
-use candle::{DType, Device, Result, Tensor};
+use candle::{DType, Device, Error, Result, Tensor};
+use hf_hub::{api::sync::Api, Repo, RepoType};
+use parquet::file::reader::{FileReader, SerializedFileReader};
 use std::fs::File;
 use std::io::{BufReader, Read};
 
@@ -60,3 +62,68 @@ pub fn load_dir<T: AsRef<std::path::Path>>(dir: T) -> Result<Dataset> {
         labels: 10,
     })
 }
+
+/// Reads one CIFAR-10 parquet split into an image tensor and a label tensor.
+///
+/// Images come back as f32 scaled to `[0, 1]` with shape `(samples, 3, 32, 32)`;
+/// labels are u8 with shape `(samples,)`.
+fn load_parquet(parquet: SerializedFileReader<File>) -> Result<(Tensor, Tensor)> {
+    let samples = parquet.metadata().file_metadata().num_rows() as usize;
+    // Each RGB image is 3 * 32 * 32 = 3_072 bytes.
+    let mut buffer_images: Vec<u8> = Vec::with_capacity(samples * 3_072);
+    let mut buffer_labels: Vec<u8> = Vec::with_capacity(samples);
+    for row in parquet.into_iter().flatten() {
+        for (_name, field) in row.get_column_iter() {
+            // The image column is a group holding the encoded image bytes.
+            if let parquet::record::Field::Group(subrow) = field {
+                for (_name, field) in subrow.get_column_iter() {
+                    if let parquet::record::Field::Bytes(value) = field {
+                        let image = image::load_from_memory(value.data())
+                            .map_err(|e| Error::Msg(format!("image error: {e}")))?;
+                        buffer_images.extend(image.to_rgb8().as_raw());
+                    }
+                }
+            } else if let parquet::record::Field::Long(label) = field {
+                buffer_labels.push(*label as u8);
+            }
+        }
+    }
+    let images = (Tensor::from_vec(buffer_images, (samples, 3, 32, 32), &Device::Cpu)?
+        // Scale in f32: dividing a u8 tensor by 255. would truncate to 0/1.
+        .to_dtype(DType::F32)?
+        / 255.)?;
+    let labels = Tensor::from_vec(buffer_labels, (samples,), &Device::Cpu)?;
+    Ok((images, labels))
+}
+
+/// Downloads the CIFAR-10 parquet files from the Hugging Face hub and loads
+/// them as a [`Dataset`].
+pub fn load() -> Result<Dataset> {
+    let api = Api::new().map_err(|e| Error::Msg(format!("Api error: {e}")))?;
+    let dataset_id = "cifar10".to_string();
+    let repo = Repo::with_revision(
+        dataset_id,
+        RepoType::Dataset,
+        "refs/convert/parquet".to_string(),
+    );
+    let repo = api.repo(repo);
+    let test_parquet_filename = repo
+        .get("plain_text/test/0000.parquet")
+        .map_err(|e| Error::Msg(format!("Api error: {e}")))?;
+    let train_parquet_filename = repo
+        .get("plain_text/train/0000.parquet")
+        .map_err(|e| Error::Msg(format!("Api error: {e}")))?;
+    let test_parquet = SerializedFileReader::new(std::fs::File::open(test_parquet_filename)?)
+        .map_err(|e| Error::Msg(format!("Parquet error: {e}")))?;
+    let train_parquet = SerializedFileReader::new(std::fs::File::open(train_parquet_filename)?)
+        .map_err(|e| Error::Msg(format!("Parquet error: {e}")))?;
+    let (test_images, test_labels) = load_parquet(test_parquet)?;
+    let (train_images, train_labels) = load_parquet(train_parquet)?;
+    Ok(crate::vision::Dataset {
+        train_images,
+        train_labels,
+        test_images,
+        test_labels,
+        labels: 10,
+    })
+}