Use atomicAdd as a quick workaround for some CUDA synchronisation issue.

This commit is contained in:
laurent
2023-06-26 16:31:24 +01:00
parent f2ac5547fc
commit 3761f02aa8
2 changed files with 4 additions and 10 deletions

View File

@ -1,3 +1,4 @@
// TODO: Use a proper distributed reduction rather than atomicAdd.
#include "cuda_utils.cuh" #include "cuda_utils.cuh"
#include<stdint.h> #include<stdint.h>
@ -23,7 +24,7 @@ extern "C" __global__ void FN_NAME( \
size_t post = dst_index % stride; \ size_t post = dst_index % stride; \
dst_index = (pre / sum_dims_l[nd]) * stride + post; \ dst_index = (pre / sum_dims_l[nd]) * stride + post; \
} \ } \
out[dst_index] += inp[i]; \ atomicAdd(out + dst_index, inp[i]); \
} \ } \
} \ } \
else { \ else { \
@ -36,7 +37,7 @@ extern "C" __global__ void FN_NAME( \
size_t post = dst_index % stride; \ size_t post = dst_index % stride; \
dst_index = (pre / sum_dims_l[nd]) * stride + post; \ dst_index = (pre / sum_dims_l[nd]) * stride + post; \
} \ } \
out[dst_index] += inp[strided_i]; \ atomicAdd(out + dst_index, inp[strided_i]); \
} \ } \
} \ } \
} \ } \

View File

@ -314,14 +314,7 @@ impl CudaStorage {
.iter() .iter()
.map(|&d| src_dims[d + 1..].iter().product::<usize>()) .map(|&d| src_dims[d + 1..].iter().product::<usize>())
.collect(); .collect();
// let cfg = LaunchConfig::for_num_elems(el as u32); let cfg = LaunchConfig::for_num_elems(el as u32);
// TODO: Hack to run the computation on a single thread, replace with a proper distributed
// algorithm.
let cfg = LaunchConfig {
grid_dim: (1, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
let dev = self.device(); let dev = self.device();
let ds = dev.htod_copy([src_dims, stride, &sum_dims_l, &sum_dims_s].concat())?; let ds = dev.htod_copy([src_dims, stride, &sum_dims_l, &sum_dims_s].concat())?;
let slice = match &self.slice { let slice = match &self.slice {