diff --git a/candle-core/src/backprop.rs b/candle-core/src/backprop.rs
index 997afb43..06c8cdd1 100644
--- a/candle-core/src/backprop.rs
+++ b/candle-core/src/backprop.rs
@@ -164,13 +164,18 @@ impl Tensor {
                     }
                     Op::Binary(lhs, rhs, BinaryOp::Minimum)
                     | Op::Binary(lhs, rhs, BinaryOp::Maximum) => {
-                        let lhs_grad = node.eq(lhs)?.to_dtype(grad.dtype())?.mul(&grad)?;
+                        let mask_lhs = node.eq(lhs)?.to_dtype(grad.dtype())?;
+                        let mask_rhs = node.eq(rhs)?.to_dtype(grad.dtype())?;
+
+                        // If both masks are 1 on the same point, we want to scale the
+                        // gradient by 0.5 rather than 1.
+                        let lhs_grad = mask_lhs.mul(&grad)?.div(&(&mask_rhs + 1.)?)?;
                         let lhs_sum_grad = grads.or_insert(lhs)?;
                         *lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
-                        let rhs_grad = node.eq(rhs)?.to_dtype(grad.dtype())?.mul(&grad)?;
+                        let rhs_grad = mask_rhs.mul(&grad)?.div(&(&mask_lhs + 1.)?)?;
                         let rhs_sum_grad = grads.or_insert(rhs)?;
-                        *rhs_sum_grad = rhs_sum_grad.sub(&rhs_grad)?;
+                        *rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
                     }
                     Op::WhereCond(pred, t, f) => {
                         let zeros = grad.zeros_like()?;
diff --git a/candle-core/tests/grad_tests.rs b/candle-core/tests/grad_tests.rs
index 6657c918..c44a7ea7 100644
--- a/candle-core/tests/grad_tests.rs
+++ b/candle-core/tests/grad_tests.rs
@@ -177,8 +177,35 @@ fn unary_grad(device: &Device) -> Result<()> {
     Ok(())
 }
 
+fn binary_grad(device: &Device) -> Result<()> {
+    let x = Var::new(&[3f32, 1., -4., -1.], device)?;
+    let x = x.as_tensor();
+    // leaky relu
+    let y = x.maximum(&(x * 0.1)?)?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(x.to_vec1::<f32>()?, [3., 1., -4., -1.]);
+    assert_eq!(y.to_vec1::<f32>()?, [3., 1., -0.4, -0.1]);
+    assert_eq!(grad_x.to_vec1::<f32>()?, [1., 1., 0.1, 0.1]);
+
+    let y = x.minimum(&(x * 0.1)?)?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(y.to_vec1::<f32>()?, [0.3, 0.1, -4., -1.]);
+    assert_eq!(grad_x.to_vec1::<f32>()?, [0.1, 0.1, 1., 1.]);
+
+    // This one is easy to mess up: we want the gradient to be one as it is the identity function.
+    let y = x.minimum(x)?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(x).context("no grad for x")?;
+    assert_eq!(y.to_vec1::<f32>()?, [3., 1., -4., -1.]);
+    assert_eq!(grad_x.to_vec1::<f32>()?, [1., 1., 1., 1.]);
+    Ok(())
+}
+
 test_device!(simple_grad, simple_grad_cpu, simple_grad_gpu);
 test_device!(sum_grad, sum_grad_cpu, sum_grad_gpu);
 test_device!(matmul_grad, matmul_grad_cpu, matmul_grad_gpu);
 test_device!(grad_descent, grad_descent_cpu, grad_descent_gpu);
 test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu);
+test_device!(binary_grad, binary_grad_cpu, binary_grad_gpu);
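
Note on the tie handling above: the eq masks for lhs and rhs can both be 1 when the two operands are equal (as in x.minimum(x)), and multiplying each mask by the incoming gradient would then count it twice. Dividing each side by (other_mask + 1) splits the gradient 0.5/0.5 in the tied case, so the contributions still sum to the full gradient. A minimal standalone sketch of that rule (plain scalar Rust, no candle dependency; the minmax_grads helper is invented here purely for illustration):

fn minmax_grads(node: f64, lhs: f64, rhs: f64, grad: f64) -> (f64, f64) {
    // 1.0 where the selected value came from that operand, 0.0 elsewhere.
    let mask_lhs = if node == lhs { 1.0 } else { 0.0 };
    let mask_rhs = if node == rhs { 1.0 } else { 0.0 };
    // Divide by (other_mask + 1): a tie scales each side by 0.5 instead of 1.
    let lhs_grad = mask_lhs * grad / (mask_rhs + 1.0);
    let rhs_grad = mask_rhs * grad / (mask_lhs + 1.0);
    (lhs_grad, rhs_grad)
}

fn main() {
    // max(3.0, 0.3) = 3.0: only the lhs receives the gradient.
    assert_eq!(minmax_grads(3.0, 3.0, 0.3, 1.0), (1.0, 0.0));
    // min(x, x) = x: both masks are 1, each side gets 0.5, the total stays 1.
    assert_eq!(minmax_grads(-4.0, -4.0, -4.0, 1.0), (0.5, 0.5));
}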