diff --git a/candle-core/src/quantized/cuda.rs b/candle-core/src/quantized/cuda.rs
index f59fce48..525cf8c8 100644
--- a/candle-core/src/quantized/cuda.rs
+++ b/candle-core/src/quantized/cuda.rs
@@ -34,10 +34,13 @@ fn ceil_div(p: usize, q: usize) -> usize {
 }
 
 fn pad(p: usize, q: usize) -> usize {
+    ceil_div(p, q) * q
+}
+
+fn pad_for_alloc(p: usize) -> usize {
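-    // Overallocate by q rather than just padding by q as this should pad the last row
-    // and we don't have enough information here to know how many elements to add :(
+    // Overallocate by MATRIX_ROW_PADDING rather than rounding up to a multiple of it, as this
+    // should pad the last row and we lack the information here to know how many elements to add :(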
-    // ceil_div(p, q) * q
-    p + q
+    p + MATRIX_ROW_PADDING
 }
 
 fn quantize_q8_1(
@@ -442,7 +445,7 @@ impl QCudaStorage {
             }
             _ => crate::bail!("only f32 can be quantized"),
         };
-        let src_len = pad(src.len(), MATRIX_ROW_PADDING);
+        let src_len = pad_for_alloc(src.len());
         let src = crate::Storage::Cpu(crate::CpuStorage::F32(src));
         let mut qcpu_storage = crate::Device::Cpu.qzeros(src_len, self.dtype)?;
         qcpu_storage.quantize(&src)?;