Indexing cuda (#235)

* Allow using uint8_t for indexing. * Revert the default cuda feature. * Add a cuda-kernel for index-select. * Add a test for gather.
2025-06-19 03:54:56 +00:00 · 2023-07-24 20:22:47 +01:00
parent b50f932e7c
commit 581b104f97
3 changed files with 277 additions and 31 deletions
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@ -316,10 +316,7 @@ fn cmp(device: &Device) -> Result<()> {
    Ok(())
 }

-#[test]
-fn index_select() -> Result<()> {
-    // TODO: Test on cuda once the kernel is available.
-    let device = &Device::Cpu;
+fn index_select(device: &Device) -> Result<()> {
    let ids = Tensor::new(&[0u32, 2u32, 1u32], device)?;
    let t = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
    assert_eq!(
@ -349,6 +346,38 @@ fn index_select() -> Result<()> {
    Ok(())
 }

+fn gather(device: &Device) -> Result<()> {
+    let ids = Tensor::new(&[[0u32], [2u32], [1u32], [0u32]], device)?;
+    let t = Tensor::arange(0f32, 12f32, device)?.reshape((4, 3))?;
+    assert_eq!(
+        t.to_vec2::<f32>()?,
+        &[
+            [0.0, 1.0, 2.0],
+            [3.0, 4.0, 5.0],
+            [6.0, 7.0, 8.0],
+            [9.0, 10.0, 11.0]
+        ]
+    );
+    let hs = t.gather(&ids, 1)?;
+    assert_eq!(hs.to_vec2::<f32>()?, &[[0.0], [5.0], [7.0], [9.0]]);
+    let ids = Tensor::new(
+        &[[0u32, 0u32], [2u32, 0u32], [1u32, 1u32], [0u32, 2u32]],
+        device,
+    )?;
+    let hs = t.gather(&ids, 1)?;
+    assert_eq!(
+        hs.to_vec2::<f32>()?,
+        &[[0.0, 0.0], [5.0, 3.0], [7.0, 7.0], [9.0, 11.0]]
+    );
+    let ids = Tensor::new(&[[0u32, 2u32, 0u32]], device)?;
+    let hs = t.gather(&ids, 0)?;
+    assert_eq!(hs.to_vec2::<f32>()?, &[[0.0, 7.0, 2.0]]);
+    let ids = Tensor::new(&[[0u32, 2u32, 0u32], [0u32, 1u32, 1u32]], device)?;
+    let hs = t.gather(&ids, 0)?;
+    assert_eq!(hs.to_vec2::<f32>()?, &[[0.0, 7.0, 2.0], [0.0, 4.0, 5.0]]);
+    Ok(())
+}
+
 fn matmul(device: &Device) -> Result<()> {
    let data = vec![1.0f32, 2.0, 3.0, 4.0];
    let a = Tensor::from_slice(&data, (2, 2), device)?;
@ -513,3 +542,5 @@ test_device!(embeddings, embeddings_cpu, embeddings_gpu);
 test_device!(cmp, cmp_cpu, cmp_gpu);
 test_device!(matmul, matmul_cpu, matmul_gpu);
 test_device!(broadcasting, broadcasting_cpu, broadcasting_gpu);
+test_device!(index_select, index_select_cpu, index_select_gpu);
+test_device!(gather, gather_cpu, gather_gpu);