onnx: fix pad, unsqueeze (#2317)

* onnx: fix pad, unsqueeze

Both implementations have off-by-one errors:
- Pad 'reflect' cycle for e.g. `dim==3` is `[0,1,2,1]`, which has a length of
  4 (i.e. `dim*2 - 2`), not 5 (the current code uses `dim*2 - 1`)
- Unsqueeze(-1) for a tensor with `dim==3` should resolve to 3 (i.e. `dim+index+1`),
  not 2 (i.e. the current `dim+index`)

In addition, Pad calculates the starting padding incorrectly.
If we want to pad 2 elements at the start, and the cycle of indices
has length 6, then we should skip 4 elements, but currently
we skip 2. A more visual representation of what's going on is below:

```
pad_start: 2
data:      [a,b,c,d]
indices:   [0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 2, 1, 0, ..] // zigzag between 0..4
actual:    skip [ c  d| c  b  a  b]
expected:  ~  skip  ~ [ c  b| a  b  c  d]
```

The values between `[` and `|` are padding and the values between
`|` and `]` in the example should match the original data being padded.

* Fix clippy lints.

---------

Co-authored-by: Laurent <laurent.mazare@gmail.com>
This commit is contained in:
shua
2024-07-23 23:10:57 +02:00
committed by GitHub
parent ebc9aa60bc
commit 6056fd5c90
19 changed files with 93 additions and 26 deletions

View File

@ -16,7 +16,7 @@ use candle_transformers::models::beit;
/// Loads an image from disk using the image crate, this returns a tensor with shape /// Loads an image from disk using the image crate, this returns a tensor with shape
/// (3, 384, 384). Beit special normalization is applied. /// (3, 384, 384). Beit special normalization is applied.
pub fn load_image384_beit_norm<P: AsRef<std::path::Path>>(p: P) -> Result<Tensor> { pub fn load_image384_beit_norm<P: AsRef<std::path::Path>>(p: P) -> Result<Tensor> {
let img = image::io::Reader::open(p)? let img = image::ImageReader::open(p)?
.decode() .decode()
.map_err(candle::Error::wrap)? .map_err(candle::Error::wrap)?
.resize_to_fill(384, 384, image::imageops::FilterType::Triangle); .resize_to_fill(384, 384, image::imageops::FilterType::Triangle);

View File

@ -55,7 +55,7 @@ const SEP_TOKEN_ID: u32 = 102;
/// Loads an image from disk using the image crate, this returns a tensor with shape /// Loads an image from disk using the image crate, this returns a tensor with shape
/// (3, 384, 384). OpenAI normalization is applied. /// (3, 384, 384). OpenAI normalization is applied.
pub fn load_image<P: AsRef<std::path::Path>>(p: P) -> Result<Tensor> { pub fn load_image<P: AsRef<std::path::Path>>(p: P) -> Result<Tensor> {
let img = image::io::Reader::open(p)? let img = image::ImageReader::open(p)?
.decode() .decode()
.map_err(candle::Error::wrap)? .map_err(candle::Error::wrap)?
.resize_to_fill(384, 384, image::imageops::FilterType::Triangle); .resize_to_fill(384, 384, image::imageops::FilterType::Triangle);

View File

@ -33,7 +33,7 @@ struct Args {
} }
fn load_image<T: AsRef<std::path::Path>>(path: T, image_size: usize) -> anyhow::Result<Tensor> { fn load_image<T: AsRef<std::path::Path>>(path: T, image_size: usize) -> anyhow::Result<Tensor> {
let img = image::io::Reader::open(path)?.decode()?; let img = image::ImageReader::open(path)?.decode()?;
let (height, width) = (image_size, image_size); let (height, width) = (image_size, image_size);
let img = img.resize_to_fill( let img = img.resize_to_fill(
width as u32, width as u32,

View File

@ -16,7 +16,7 @@ use candle_transformers::models::eva2;
/// Loads an image from disk using the image crate, this returns a tensor with shape /// Loads an image from disk using the image crate, this returns a tensor with shape
/// (3, 448, 448). OpenAI normalization is applied. /// (3, 448, 448). OpenAI normalization is applied.
pub fn load_image448_openai_norm<P: AsRef<std::path::Path>>(p: P) -> Result<Tensor> { pub fn load_image448_openai_norm<P: AsRef<std::path::Path>>(p: P) -> Result<Tensor> {
let img = image::io::Reader::open(p)? let img = image::ImageReader::open(p)?
.decode() .decode()
.map_err(candle::Error::wrap)? .map_err(candle::Error::wrap)?
.resize_to_fill(448, 448, image::imageops::FilterType::Triangle); .resize_to_fill(448, 448, image::imageops::FilterType::Triangle);

View File

@ -57,7 +57,7 @@ fn load_image<T: AsRef<std::path::Path>>(
llava_config: &LLaVAConfig, llava_config: &LLaVAConfig,
dtype: DType, dtype: DType,
) -> Result<((u32, u32), Tensor)> { ) -> Result<((u32, u32), Tensor)> {
let img = image::io::Reader::open(path)?.decode()?; let img = image::ImageReader::open(path)?.decode()?;
let img_tensor = process_image(&img, processor, llava_config)?; let img_tensor = process_image(&img, processor, llava_config)?;
Ok(((img.width(), img.height()), img_tensor.to_dtype(dtype)?)) Ok(((img.width(), img.height()), img_tensor.to_dtype(dtype)?))
} }

View File

@ -208,7 +208,7 @@ struct Args {
/// Loads an image from disk using the image crate, this returns a tensor with shape /// Loads an image from disk using the image crate, this returns a tensor with shape
/// (3, 378, 378). /// (3, 378, 378).
pub fn load_image<P: AsRef<std::path::Path>>(p: P) -> candle::Result<Tensor> { pub fn load_image<P: AsRef<std::path::Path>>(p: P) -> candle::Result<Tensor> {
let img = image::io::Reader::open(p)? let img = image::ImageReader::open(p)?
.decode() .decode()
.map_err(candle::Error::wrap)? .map_err(candle::Error::wrap)?
.resize_to_fill(378, 378, image::imageops::FilterType::Triangle); // Adjusted to 378x378 .resize_to_fill(378, 378, image::imageops::FilterType::Triangle); // Adjusted to 378x378

View File

@ -139,7 +139,7 @@ pub fn main() -> anyhow::Result<()> {
let (_one, h, w) = mask.dims3()?; let (_one, h, w) = mask.dims3()?;
let mask = mask.expand((3, h, w))?; let mask = mask.expand((3, h, w))?;
let mut img = image::io::Reader::open(&args.image)? let mut img = image::ImageReader::open(&args.image)?
.decode() .decode()
.map_err(candle::Error::wrap)?; .map_err(candle::Error::wrap)?;
let mask_pixels = mask.permute((1, 2, 0))?.flatten_all()?.to_vec1::<u8>()?; let mask_pixels = mask.permute((1, 2, 0))?.flatten_all()?.to_vec1::<u8>()?;

View File

@ -380,7 +380,7 @@ fn text_embeddings(
} }
fn image_preprocess<T: AsRef<std::path::Path>>(path: T) -> anyhow::Result<Tensor> { fn image_preprocess<T: AsRef<std::path::Path>>(path: T) -> anyhow::Result<Tensor> {
let img = image::io::Reader::open(path)?.decode()?; let img = image::ImageReader::open(path)?.decode()?;
let (height, width) = (img.height() as usize, img.width() as usize); let (height, width) = (img.height() as usize, img.width() as usize);
let height = height - height % 32; let height = height - height % 32;
let width = width - width % 32; let width = width - width % 32;

View File

@ -145,7 +145,7 @@ impl ViTImageProcessor {
pub fn load_images(&self, image_path: Vec<&str>) -> Result<Vec<image::DynamicImage>> { pub fn load_images(&self, image_path: Vec<&str>) -> Result<Vec<image::DynamicImage>> {
let mut images: Vec<image::DynamicImage> = Vec::new(); let mut images: Vec<image::DynamicImage> = Vec::new();
for path in image_path { for path in image_path {
let img = image::io::Reader::open(path)?.decode().unwrap(); let img = image::ImageReader::open(path)?.decode().unwrap();
images.push(img); images.push(img);
} }

View File

@ -159,7 +159,7 @@ pub fn main() -> Result<()> {
let net_width = darknet.width()?; let net_width = darknet.width()?;
let net_height = darknet.height()?; let net_height = darknet.height()?;
let original_image = image::io::Reader::open(&image_name)? let original_image = image::ImageReader::open(&image_name)?
.decode() .decode()
.map_err(candle::Error::wrap)?; .map_err(candle::Error::wrap)?;
let image = { let image = {

View File

@ -390,7 +390,7 @@ pub fn run<T: Task>(args: Args) -> anyhow::Result<()> {
for image_name in args.images.iter() { for image_name in args.images.iter() {
println!("processing {image_name}"); println!("processing {image_name}");
let mut image_name = std::path::PathBuf::from(image_name); let mut image_name = std::path::PathBuf::from(image_name);
let original_image = image::io::Reader::open(&image_name)? let original_image = image::ImageReader::open(&image_name)?
.decode() .decode()
.map_err(candle::Error::wrap)?; .map_err(candle::Error::wrap)?;
let (width, height) = { let (width, height) = {

View File

@ -3,7 +3,7 @@ use candle::{Device, Result, Tensor};
/// Loads an image from disk using the image crate at the requested resolution. /// Loads an image from disk using the image crate at the requested resolution.
// This returns a tensor with shape (3, res, res). imagenet normalization is applied. // This returns a tensor with shape (3, res, res). imagenet normalization is applied.
pub fn load_image<P: AsRef<std::path::Path>>(p: P, res: u32) -> Result<Tensor> { pub fn load_image<P: AsRef<std::path::Path>>(p: P, res: u32) -> Result<Tensor> {
let img = image::io::Reader::open(p)? let img = image::ImageReader::open(p)?
.decode() .decode()
.map_err(candle::Error::wrap)? .map_err(candle::Error::wrap)?
.resize_to_fill(res, res, image::imageops::FilterType::Triangle); .resize_to_fill(res, res, image::imageops::FilterType::Triangle);

View File

@ -34,7 +34,7 @@ pub fn load_image<P: AsRef<std::path::Path>>(
p: P, p: P,
resize_longest: Option<usize>, resize_longest: Option<usize>,
) -> Result<(Tensor, usize, usize)> { ) -> Result<(Tensor, usize, usize)> {
let img = image::io::Reader::open(p)? let img = image::ImageReader::open(p)?
.decode() .decode()
.map_err(candle::Error::wrap)?; .map_err(candle::Error::wrap)?;
let (initial_h, initial_w) = (img.height() as usize, img.width() as usize); let (initial_h, initial_w) = (img.height() as usize, img.width() as usize);
@ -65,7 +65,7 @@ pub fn load_image_and_resize<P: AsRef<std::path::Path>>(
width: usize, width: usize,
height: usize, height: usize,
) -> Result<Tensor> { ) -> Result<Tensor> {
let img = image::io::Reader::open(p)? let img = image::ImageReader::open(p)?
.decode() .decode()
.map_err(candle::Error::wrap)? .map_err(candle::Error::wrap)?
.resize_to_fill( .resize_to_fill(

View File

@ -570,6 +570,11 @@ fn simple_eval_(
.map(|&i| { .map(|&i| {
if i == xs.rank() as i64 { if i == xs.rank() as i64 {
Ok(xs.rank()) Ok(xs.rank())
} else if i < 0 {
// normalize_axis doesn't work correctly here
// because we actually want normalized with respect
// to the final size, not the current (off by one)
Ok(xs.rank() - (-i as usize) + 1)
} else { } else {
xs.normalize_axis(i) xs.normalize_axis(i)
} }
@ -1040,8 +1045,8 @@ fn simple_eval_(
std::iter::repeat((min..max).chain((min + 1..=max).rev())).flatten() std::iter::repeat((min..max).chain((min + 1..=max).rev())).flatten()
} }
let idx = if dim > 1 { let idx = if dim > 1 {
let cycle_len = dim * 2 - 1; let cycle_len = dim * 2 - 2;
let skip = (pads_pre[i] as usize) % cycle_len; let skip = cycle_len - ((pads_pre[i] as usize) % cycle_len);
let idx = zigzag(0, (dim - 1) as i64) let idx = zigzag(0, (dim - 1) as i64)
.skip(skip) .skip(skip)
.take((pads_pre[i] as usize) + dim + (pads_post[i] as usize)); .take((pads_pre[i] as usize) + dim + (pads_post[i] as usize));

View File

@ -977,7 +977,59 @@ fn test_constant_of_shape() -> Result<()> {
} }
// "Unsqueeze" // "Unsqueeze"
// #[test] #[test]
// Checks that an "Unsqueeze" node given the axis value -1 appends a trailing
// dimension of size 1 to a 2x2 input, leaving the element values untouched.
fn test_unsqueeze() -> Result<()> {
// Graph with a single "Unsqueeze" node reading INPUT_X and INPUT_Y and
// writing OUTPUT_Z; no initializers or attributes are set.
let manual_graph = create_model_proto_with_graph(Some(GraphProto {
node: vec![NodeProto {
op_type: "Unsqueeze".to_string(),
domain: "".to_string(),
attribute: vec![],
input: vec![INPUT_X.to_string(), INPUT_Y.to_string()],
output: vec![OUTPUT_Z.to_string()],
name: "".to_string(),
doc_string: "".to_string(),
}],
name: "".to_string(),
initializer: vec![],
input: vec![],
output: vec![ValueInfoProto {
name: OUTPUT_Z.to_string(),
doc_string: "".to_string(),
r#type: None,
}],
value_info: vec![ValueInfoProto {
name: INPUT_X.to_string(),
doc_string: "".to_string(),
r#type: None,
}],
doc_string: "".to_string(),
sparse_initializer: vec![],
quantization_annotation: vec![],
}));
// Input data: a 2x2 f32 tensor on the CPU.
let x = Tensor::from_vec(
vec![
1.0f32, 2.0f32, //
3.0f32, 4.0f32, //
],
&[2, 2],
&Device::Cpu,
)?;
// Second input: a one-element i64 tensor holding -1.
// NOTE(review): presumably interpreted as the list of axes to insert — confirm against the evaluator.
let y = Tensor::from_vec(vec![-1i64], &[1], &Device::Cpu)?;
// `x` is cloned because it is compared against the output further down.
let inputs = HashMap::from_iter([(INPUT_X.to_string(), x.clone()), (INPUT_Y.to_string(), y)]);
let eval = candle_onnx::simple_eval(&manual_graph, inputs)?;
// Exactly one output is expected from the evaluation.
assert_eq!(eval.len(), 1);
let z = eval.get(OUTPUT_Z).expect("Output 'z' not found");
// The new unit dimension must come last: (2, 2) -> (2, 2, 1).
assert_eq!(z.dims(), &[2, 2, 1]);
// Flattened contents must match the input element for element.
assert_eq!(
z.flatten_all()?.to_vec1::<f32>()?,
x.flatten_all()?.to_vec1::<f32>()?
);
Ok(())
}
// "Clip" // "Clip"
// #[test] // #[test]
@ -3268,13 +3320,23 @@ fn test_if() -> Result<()> {
#[test] #[test]
fn test_pad() -> Result<()> { fn test_pad() -> Result<()> {
let data = Tensor::from_vec(vec![1.0, 1.2, 2.3, 3.4, 4.5, 5.7], (3, 2), &Device::Cpu)?; let data = Tensor::from_vec(
let pads = Tensor::from_vec(vec![0i64, 2, 0, 0], (4,), &Device::Cpu)?; vec![
1.0, 2.0, 3.0, //
4.0, 5.0, 6.0, //
],
(2, 3),
&Device::Cpu,
)?;
let pads = Tensor::from_vec(vec![0i64, 1, 0, 0], (4,), &Device::Cpu)?;
let mode = "reflect"; let mode = "reflect";
let expected = Tensor::from_vec( let expected = Tensor::from_vec(
vec![1.0, 1.2, 1.0, 1.2, 2.3, 3.4, 2.3, 3.4, 4.5, 5.7, 4.5, 5.7], vec![
(3, 4), 2.0, 1.0, 2.0, 3.0, //
5.0, 4.0, 5.0, 6.0, //
],
(2, 4),
&Device::Cpu, &Device::Cpu,
)?; )?;

View File

@ -124,7 +124,7 @@ impl Model {
impl Model { impl Model {
fn load_image(&self, image: Vec<u8>) -> Result<Tensor, JsError> { fn load_image(&self, image: Vec<u8>) -> Result<Tensor, JsError> {
let device = &Device::Cpu; let device = &Device::Cpu;
let img = image::io::Reader::new(std::io::Cursor::new(image)) let img = image::ImageReader::new(std::io::Cursor::new(image))
.with_guessed_format()? .with_guessed_format()?
.decode() .decode()
.map_err(|e| JsError::new(&e.to_string()))? .map_err(|e| JsError::new(&e.to_string()))?

View File

@ -195,7 +195,7 @@ impl Model {
} }
impl Model { impl Model {
fn load_image(&self, image: Vec<u8>) -> Result<Tensor, JsError> { fn load_image(&self, image: Vec<u8>) -> Result<Tensor, JsError> {
let img = image::io::Reader::new(std::io::Cursor::new(image)) let img = image::ImageReader::new(std::io::Cursor::new(image))
.with_guessed_format()? .with_guessed_format()?
.decode() .decode()
.map_err(|e| JsError::new(&e.to_string()))? .map_err(|e| JsError::new(&e.to_string()))?

View File

@ -38,7 +38,7 @@ impl Model {
pub fn set_image_embeddings(&mut self, image_data: Vec<u8>) -> Result<(), JsError> { pub fn set_image_embeddings(&mut self, image_data: Vec<u8>) -> Result<(), JsError> {
sam::console_log!("image data: {}", image_data.len()); sam::console_log!("image data: {}", image_data.len());
let image_data = std::io::Cursor::new(image_data); let image_data = std::io::Cursor::new(image_data);
let image = image::io::Reader::new(image_data) let image = image::ImageReader::new(image_data)
.with_guessed_format()? .with_guessed_format()?
.decode() .decode()
.map_err(candle::Error::wrap)?; .map_err(candle::Error::wrap)?;

View File

@ -48,7 +48,7 @@ impl Model {
) -> Result<Vec<Vec<Bbox>>> { ) -> Result<Vec<Vec<Bbox>>> {
console_log!("image data: {}", image_data.len()); console_log!("image data: {}", image_data.len());
let image_data = std::io::Cursor::new(image_data); let image_data = std::io::Cursor::new(image_data);
let original_image = image::io::Reader::new(image_data) let original_image = image::ImageReader::new(image_data)
.with_guessed_format()? .with_guessed_format()?
.decode() .decode()
.map_err(candle::Error::wrap)?; .map_err(candle::Error::wrap)?;
@ -127,7 +127,7 @@ impl ModelPose {
) -> Result<Vec<Bbox>> { ) -> Result<Vec<Bbox>> {
console_log!("image data: {}", image_data.len()); console_log!("image data: {}", image_data.len());
let image_data = std::io::Cursor::new(image_data); let image_data = std::io::Cursor::new(image_data);
let original_image = image::io::Reader::new(image_data) let original_image = image::ImageReader::new(image_data)
.with_guessed_format()? .with_guessed_format()?
.decode() .decode()
.map_err(candle::Error::wrap)?; .map_err(candle::Error::wrap)?;