Indexing with max-value results in zero/no-op. (#2940)

* Indexing with max-value results in zero/no-op.

* Add some testing.

* Also adapt the metal kernels.

* Another test.

* Fix.
This commit is contained in:
Laurent Mazare
2025-05-03 11:36:31 +02:00
committed by GitHub
parent 1fdfb58de5
commit e27b4700ad
5 changed files with 200 additions and 66 deletions

View File

@ -3,6 +3,28 @@
#include "cuda_utils.cuh"
#include<stdint.h>
template <typename T>
__host__ __device__
constexpr T max_value();
template <>
__host__ __device__
constexpr int64_t max_value<int64_t>() {
return 0x7FFFFFFFFFFFFFFFLL;
}
template <>
__host__ __device__
constexpr uint32_t max_value<uint32_t>() {
return 0xFFFFFFFFu;
}
template <>
__host__ __device__
constexpr uint8_t max_value<uint8_t>() {
return 0xFFu;
}
template<typename T, typename I>
__device__ void index_select(
const size_t numel,
@ -23,10 +45,14 @@ __device__ void index_select(
unsigned int left_i = dst_i / (ids_dim_size * right_size);
unsigned int id_i = dst_i / right_size % ids_dim_size;
unsigned int right_i = dst_i % right_size;
assert(ids[id_i] < src_dim_size);
unsigned int src_i = left_i * (src_dim_size * right_size) + ids[id_i] * right_size + right_i;
unsigned strided_i = b ? src_i : get_strided_index(src_i, num_dims, dims, strides);
out[dst_i] = inp[strided_i];
if (ids[id_i] == max_value<I>()) {
out[dst_i] = static_cast<T>(0);
} else {
assert(ids[id_i] < src_dim_size);
unsigned int src_i = left_i * (src_dim_size * right_size) + ids[id_i] * right_size + right_i;
unsigned strided_i = b ? src_i : get_strided_index(src_i, num_dims, dims, strides);
out[dst_i] = inp[strided_i];
}
}
}
@ -57,11 +83,15 @@ __device__ void gather(
) {
for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) {
size_t post = i % right_size;
size_t idx = ids[i];
assert(idx < src_dim_size);
size_t pre = i / (right_size * ids_dim_size);
size_t src_i = (pre * src_dim_size + idx) * right_size + post;
out[i] = inp[src_i];
const I idx = ids[i];
if (ids[i] == max_value<I>()) {
out[i] = static_cast<T>(0);
} else {
assert(idx < src_dim_size);
size_t pre = i / (right_size * ids_dim_size);
size_t src_i = (pre * src_dim_size + idx) * right_size + post;
out[i] = inp[src_i];
}
}
}
@ -93,11 +123,13 @@ __device__ void index_add(
const size_t pre = i / right_size;
const size_t post = i % right_size;
for (unsigned int j = 0; j < ids_dim_size; ++j) {
const size_t idx = ids[j];
assert(idx < dst_dim_size);
const I idx = ids[j];
const size_t src_i = (pre * ids_dim_size + j) * right_size + post;
const size_t dst_i = (pre * dst_dim_size + idx) * right_size + post;
out[dst_i] += inp[src_i];
if (idx < max_value<I>()) {
assert(idx < dst_dim_size);
const size_t dst_i = (pre * dst_dim_size + idx) * right_size + post;
out[dst_i] += inp[src_i];
}
}
}
}
@ -130,10 +162,12 @@ __device__ void scatter(
const size_t post = i % right_size;
for (unsigned int j = 0; j < src_dim_size; ++j) {
const size_t src_i = (pre * src_dim_size + j) * right_size + post;
const size_t idx = ids[src_i];
assert(idx < dst_dim_size);
const size_t dst_i = (pre * dst_dim_size + idx) * right_size + post;
out[dst_i] = inp[src_i];
const I idx = ids[src_i];
if (idx < max_value<I>()) {
assert(idx < dst_dim_size);
const size_t dst_i = (pre * dst_dim_size + idx) * right_size + post;
out[dst_i] = inp[src_i];
}
}
}
}
@ -154,10 +188,12 @@ __device__ void scatter_add(
const size_t post = i % right_size;
for (unsigned int j = 0; j < src_dim_size; ++j) {
const size_t src_i = (pre * src_dim_size + j) * right_size + post;
const size_t idx = ids[src_i];
assert(idx < dst_dim_size);
const size_t dst_i = (pre * dst_dim_size + idx) * right_size + post;
out[dst_i] += inp[src_i];
const I idx = ids[src_i];
if (idx < max_value<I>()) {
assert(idx < dst_dim_size);
const size_t dst_i = (pre * dst_dim_size + idx) * right_size + post;
out[dst_i] += inp[src_i];
}
}
}
}