diff --git a/kernels/src/reduce.cu b/kernels/src/reduce.cu
index 0214ca88..1d6ee436 100644
--- a/kernels/src/reduce.cu
+++ b/kernels/src/reduce.cu
@@ -1,3 +1,4 @@
+// TODO: Use a proper distributed reduction rather than atomicAdd.
 #include "cuda_utils.cuh"
 #include <stdint.h>
 
@@ -23,7 +24,7 @@ extern "C" __global__ void FN_NAME( \
                 size_t post = dst_index % stride; \
                 dst_index = (pre / sum_dims_l[nd]) * stride + post; \
             } \
-            out[dst_index] += inp[i]; \
+            atomicAdd(out + dst_index, inp[i]); \
         } \
     } \
     else { \
@@ -36,7 +37,7 @@
                 size_t post = dst_index % stride; \
                 dst_index = (pre / sum_dims_l[nd]) * stride + post; \
             } \
-            out[dst_index] += inp[strided_i]; \
+            atomicAdd(out + dst_index, inp[strided_i]); \
         } \
     } \
 } \
diff --git a/src/cuda_backend.rs b/src/cuda_backend.rs
index 85f55568..2c96cc6b 100644
--- a/src/cuda_backend.rs
+++ b/src/cuda_backend.rs
@@ -314,14 +314,7 @@ impl CudaStorage {
             .iter()
             .map(|&d| src_dims[d + 1..].iter().product::<usize>())
             .collect();
-        // let cfg = LaunchConfig::for_num_elems(el as u32);
-        // TODO: Hack to run the computation on a single thread, replace with a proper distributed
-        // algorithm.
-        let cfg = LaunchConfig {
-            grid_dim: (1, 1, 1),
-            block_dim: (1, 1, 1),
-            shared_mem_bytes: 0,
-        };
+        let cfg = LaunchConfig::for_num_elems(el as u32);
         let dev = self.device();
         let ds = dev.htod_copy([src_dims, stride, &sum_dims_l, &sum_dims_s].concat())?;
         let slice = match &self.slice {