mirror of
https://github.com/huggingface/candle.git
synced 2025-06-17 11:08:52 +00:00
Use atomicAdd as a quick workaround for some CUDA synchronisation issue.
This commit is contained in:
@ -1,3 +1,4 @@
|
|||||||
|
// TODO: Use a proper distributed reduction rather than atomicAdd.
|
||||||
#include "cuda_utils.cuh"
|
#include "cuda_utils.cuh"
|
||||||
#include<stdint.h>
|
#include<stdint.h>
|
||||||
|
|
||||||
@ -23,7 +24,7 @@ extern "C" __global__ void FN_NAME( \
|
|||||||
size_t post = dst_index % stride; \
|
size_t post = dst_index % stride; \
|
||||||
dst_index = (pre / sum_dims_l[nd]) * stride + post; \
|
dst_index = (pre / sum_dims_l[nd]) * stride + post; \
|
||||||
} \
|
} \
|
||||||
out[dst_index] += inp[i]; \
|
atomicAdd(out + dst_index, inp[i]); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
else { \
|
else { \
|
||||||
@ -36,7 +37,7 @@ extern "C" __global__ void FN_NAME( \
|
|||||||
size_t post = dst_index % stride; \
|
size_t post = dst_index % stride; \
|
||||||
dst_index = (pre / sum_dims_l[nd]) * stride + post; \
|
dst_index = (pre / sum_dims_l[nd]) * stride + post; \
|
||||||
} \
|
} \
|
||||||
out[dst_index] += inp[strided_i]; \
|
atomicAdd(out + dst_index, inp[strided_i]); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
|
@ -314,14 +314,7 @@ impl CudaStorage {
|
|||||||
.iter()
|
.iter()
|
||||||
.map(|&d| src_dims[d + 1..].iter().product::<usize>())
|
.map(|&d| src_dims[d + 1..].iter().product::<usize>())
|
||||||
.collect();
|
.collect();
|
||||||
// let cfg = LaunchConfig::for_num_elems(el as u32);
|
let cfg = LaunchConfig::for_num_elems(el as u32);
|
||||||
// TODO: Hack to run the computation on a single thread, replace with a proper distributed
|
|
||||||
// algorithm.
|
|
||||||
let cfg = LaunchConfig {
|
|
||||||
grid_dim: (1, 1, 1),
|
|
||||||
block_dim: (1, 1, 1),
|
|
||||||
shared_mem_bytes: 0,
|
|
||||||
};
|
|
||||||
let dev = self.device();
|
let dev = self.device();
|
||||||
let ds = dev.htod_copy([src_dims, stride, &sum_dims_l, &sum_dims_s].concat())?;
|
let ds = dev.htod_copy([src_dims, stride, &sum_dims_l, &sum_dims_s].concat())?;
|
||||||
let slice = match &self.slice {
|
let slice = match &self.slice {
|
||||||
|
Reference in New Issue
Block a user