diff --git a/kernels/src/reduce.cu b/kernels/src/reduce.cu
index 0214ca88..1d6ee436 100644
--- a/kernels/src/reduce.cu
+++ b/kernels/src/reduce.cu
@@ -1,3 +1,4 @@
+// TODO: Use a proper distributed reduction rather than atomicAdd.
 #include "cuda_utils.cuh"
 #include <stdint.h>
 
@@ -23,7 +24,7 @@ extern "C" __global__ void FN_NAME( \
                 size_t post = dst_index % stride; \
                 dst_index = (pre / sum_dims_l[nd]) * stride + post; \
             } \
-            out[dst_index] += inp[i]; \
+            atomicAdd(out + dst_index, inp[i]); \
         } \
     } \
     else { \
@@ -36,7 +37,7 @@
                 size_t post = dst_index % stride; \
                 dst_index = (pre / sum_dims_l[nd]) * stride + post; \
             } \
-            out[dst_index] += inp[strided_i]; \
+            atomicAdd(out + dst_index, inp[strided_i]); \
         } \
     } \
 } \
diff --git a/src/cuda_backend.rs b/src/cuda_backend.rs
index 85f55568..2c96cc6b 100644
--- a/src/cuda_backend.rs
+++ b/src/cuda_backend.rs
@@ -314,14 +314,7 @@ impl CudaStorage {
             .iter()
             .map(|&d| src_dims[d + 1..].iter().product::<usize>())
             .collect();
-        // let cfg = LaunchConfig::for_num_elems(el as u32);
-        // TODO: Hack to run the computation on a single thread, replace with a proper distributed
-        // algorithm.
-        let cfg = LaunchConfig {
-            grid_dim: (1, 1, 1),
-            block_dim: (1, 1, 1),
-            shared_mem_bytes: 0,
-        };
+        let cfg = LaunchConfig::for_num_elems(el as u32);
         let dev = self.device();
         let ds = dev.htod_copy([src_dims, stride, &sum_dims_l, &sum_dims_s].concat())?;
         let slice = match &self.slice {