diff --git a/candle-core/src/cuda_backend/mod.rs b/candle-core/src/cuda_backend/mod.rs index 3690e0dc..6a9e73f8 100644 --- a/candle-core/src/cuda_backend/mod.rs +++ b/candle-core/src/cuda_backend/mod.rs @@ -99,7 +99,7 @@ pub trait WrapErr { impl> WrapErr for std::result::Result { fn w(self) -> std::result::Result { - self.map_err(|e| crate::Error::Cuda(Box::new(e.into()))) + self.map_err(|e| crate::Error::Cuda(Box::new(e.into())).bt()) } } @@ -1761,6 +1761,11 @@ impl BackendStorage for CudaStorage { let dev = &self.device; let d1 = d1 as u32; let d2 = d2 as u32; + // Nothing to copy so we exit early to avoid launching a kernel and some potential invalid + // argument with a null pointer. + if d1 == 0 || d2 == 0 { + return Ok(()); + } let dst_s = dst_s as u32; let src_s = src_s as u32; let (src, dst, kname) = match (&self.slice, &mut dst.slice) { diff --git a/candle-kernels/src/cuda_utils.cuh b/candle-kernels/src/cuda_utils.cuh index b0a85249..2673b8aa 100644 --- a/candle-kernels/src/cuda_utils.cuh +++ b/candle-kernels/src/cuda_utils.cuh @@ -14,7 +14,7 @@ __device__ bool is_contiguous( size_t acc = 1; for (unsigned int d = 0; d < num_dims; d++) { unsigned int dim_idx = num_dims - 1 - d; - if (acc != strides[dim_idx]) { + if (dims[dim_idx] > 1 && acc != strides[dim_idx]) { return false; } acc *= dims[dim_idx]; diff --git a/candle-nn/src/rnn.rs b/candle-nn/src/rnn.rs index 07795eda..dbfa639b 100644 --- a/candle-nn/src/rnn.rs +++ b/candle-nn/src/rnn.rs @@ -31,7 +31,7 @@ pub trait RNN { let (_b_size, seq_len, _features) = input.dims3()?; let mut output = Vec::with_capacity(seq_len); for seq_index in 0..seq_len { - let input = input.i((.., seq_index, ..))?; + let input = input.i((.., seq_index, ..))?.contiguous()?; let state = if seq_index == 0 { self.step(&input, init_state)? } else { diff --git a/candle-transformers/src/models/segment_anything/prompt_encoder.rs b/candle-transformers/src/models/segment_anything/prompt_encoder.rs index 16e8a4e8..258fb5aa 100644 --- a/candle-transformers/src/models/segment_anything/prompt_encoder.rs +++ b/candle-transformers/src/models/segment_anything/prompt_encoder.rs @@ -218,7 +218,8 @@ impl PromptEncoder { (Some(se_points), None) => se_points, (None, Some(se_boxes)) => se_boxes, (None, None) => { - Tensor::zeros((1, 0, self.embed_dim), DType::F32, &candle::Device::Cpu)? + let dev = self.no_mask_embed.embeddings().device(); + Tensor::zeros((1, 0, self.embed_dim), DType::F32, dev)? } };