diff --git a/candle-core/src/cuda_backend/mod.rs b/candle-core/src/cuda_backend/mod.rs
index 3690e0dc..6a9e73f8 100644
--- a/candle-core/src/cuda_backend/mod.rs
+++ b/candle-core/src/cuda_backend/mod.rs
@@ -99,7 +99,7 @@ pub trait WrapErr<O> {
 
 impl<O, E: Into<CudaError>> WrapErr<O> for std::result::Result<O, E> {
     fn w(self) -> std::result::Result<O, crate::Error> {
-        self.map_err(|e| crate::Error::Cuda(Box::new(e.into())))
+        self.map_err(|e| crate::Error::Cuda(Box::new(e.into())).bt())
     }
 }
 
@@ -1761,6 +1761,11 @@ impl BackendStorage for CudaStorage {
         let dev = &self.device;
         let d1 = d1 as u32;
         let d2 = d2 as u32;
+        // Nothing to copy so we exit early to avoid launching a kernel and some potential invalid
+        // argument with a null pointer.
+        if d1 == 0 || d2 == 0 {
+            return Ok(());
+        }
         let dst_s = dst_s as u32;
         let src_s = src_s as u32;
         let (src, dst, kname) = match (&self.slice, &mut dst.slice) {
diff --git a/candle-kernels/src/cuda_utils.cuh b/candle-kernels/src/cuda_utils.cuh
index b0a85249..2673b8aa 100644
--- a/candle-kernels/src/cuda_utils.cuh
+++ b/candle-kernels/src/cuda_utils.cuh
@@ -14,7 +14,7 @@ __device__ bool is_contiguous(
     size_t acc = 1;
     for (unsigned int d = 0; d < num_dims; d++) {
         unsigned int dim_idx = num_dims - 1 - d;
-        if (acc != strides[dim_idx]) {
+        if (dims[dim_idx] > 1 && acc != strides[dim_idx]) {
             return false;
         }
         acc *= dims[dim_idx];
diff --git a/candle-nn/src/rnn.rs b/candle-nn/src/rnn.rs
index 07795eda..dbfa639b 100644
--- a/candle-nn/src/rnn.rs
+++ b/candle-nn/src/rnn.rs
@@ -31,7 +31,7 @@ pub trait RNN {
         let (_b_size, seq_len, _features) = input.dims3()?;
         let mut output = Vec::with_capacity(seq_len);
         for seq_index in 0..seq_len {
-            let input = input.i((.., seq_index, ..))?;
+            let input = input.i((.., seq_index, ..))?.contiguous()?;
             let state = if seq_index == 0 {
                 self.step(&input, init_state)?
             } else {
diff --git a/candle-transformers/src/models/segment_anything/prompt_encoder.rs b/candle-transformers/src/models/segment_anything/prompt_encoder.rs
index 16e8a4e8..258fb5aa 100644
--- a/candle-transformers/src/models/segment_anything/prompt_encoder.rs
+++ b/candle-transformers/src/models/segment_anything/prompt_encoder.rs
@@ -218,7 +218,8 @@ impl PromptEncoder {
             (Some(se_points), None) => se_points,
             (None, Some(se_boxes)) => se_boxes,
             (None, None) => {
-                Tensor::zeros((1, 0, self.embed_dim), DType::F32, &candle::Device::Cpu)?
+                let dev = self.no_mask_embed.embeddings().device();
+                Tensor::zeros((1, 0, self.embed_dim), DType::F32, dev)?
             }
         };