mirror of
https://github.com/huggingface/candle.git
synced 2025-06-15 02:16:37 +00:00
Training:
- Removed a lot of surface (SerializedFileReader ownership is really painful). - Moved example + vision to hf.co version. - Removed feature gate.
This commit is contained in:
@ -19,7 +19,5 @@ memmap2 = { workspace = true }
|
||||
tokenizers = { workspace = true, features = ["onig"] }
|
||||
rand = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
parquet = { version = "45.0.0", optional = true }
|
||||
|
||||
[features]
|
||||
hub = ["dep:parquet"]
|
||||
parquet = { version = "45.0.0"}
|
||||
image = { workspace = true }
|
||||
|
@ -3,6 +3,8 @@
|
||||
//! The files can be obtained from the following link:
|
||||
//! <http://yann.lecun.com/exdb/mnist/>
|
||||
use candle::{DType, Device, Result, Tensor};
|
||||
use hf_hub::{api::sync::Api, Repo, RepoType};
|
||||
use parquet::file::reader::{FileReader, SerializedFileReader};
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, Read};
|
||||
|
||||
@ -63,3 +65,54 @@ pub fn load_dir<T: AsRef<std::path::Path>>(dir: T) -> Result<crate::vision::Data
|
||||
labels: 10,
|
||||
})
|
||||
}
|
||||
|
||||
fn load_parquet(parquet: SerializedFileReader<std::fs::File>) -> Result<(Tensor, Tensor)> {
|
||||
let samples = parquet.metadata().file_metadata().num_rows() as usize;
|
||||
let mut buffer_images: Vec<u8> = Vec::with_capacity(samples * 784);
|
||||
let mut buffer_labels: Vec<u8> = Vec::with_capacity(samples);
|
||||
for row in parquet.into_iter().flatten() {
|
||||
for (_name, field) in row.get_column_iter() {
|
||||
if let parquet::record::Field::Group(subrow) = field {
|
||||
for (_name, field) in subrow.get_column_iter() {
|
||||
if let parquet::record::Field::Bytes(value) = field {
|
||||
let image = image::load_from_memory(value.data()).unwrap();
|
||||
buffer_images.extend(image.to_luma8().as_raw());
|
||||
}
|
||||
}
|
||||
} else if let parquet::record::Field::Long(label) = field {
|
||||
buffer_labels.push(*label as u8);
|
||||
}
|
||||
}
|
||||
}
|
||||
let images = (Tensor::from_vec(buffer_images, (samples, 784), &Device::Cpu)?
|
||||
.to_dtype(DType::F32)?
|
||||
/ 255.)?;
|
||||
let labels = Tensor::from_vec(buffer_labels, (samples,), &Device::Cpu)?;
|
||||
Ok((images, labels))
|
||||
}
|
||||
|
||||
pub fn load() -> Result<crate::vision::Dataset> {
|
||||
let api = Api::new().unwrap();
|
||||
let dataset_id = "mnist".to_string();
|
||||
let repo = Repo::with_revision(
|
||||
dataset_id,
|
||||
RepoType::Dataset,
|
||||
"refs/convert/parquet".to_string(),
|
||||
);
|
||||
let repo = api.repo(repo);
|
||||
let test_parquet_filename = repo.get("mnist/mnist-test.parquet").unwrap();
|
||||
let train_parquet_filename = repo.get("mnist/mnist-train.parquet").unwrap();
|
||||
let test_parquet =
|
||||
SerializedFileReader::new(std::fs::File::open(test_parquet_filename)?).unwrap();
|
||||
let train_parquet =
|
||||
SerializedFileReader::new(std::fs::File::open(train_parquet_filename)?).unwrap();
|
||||
let (test_images, test_labels) = load_parquet(test_parquet)?;
|
||||
let (train_images, train_labels) = load_parquet(train_parquet)?;
|
||||
Ok(crate::vision::Dataset {
|
||||
train_images,
|
||||
train_labels,
|
||||
test_images,
|
||||
test_labels,
|
||||
labels: 10,
|
||||
})
|
||||
}
|
||||
|
Reference in New Issue
Block a user