Softmax numerical stability. (#267)

* Softmax numerical stability.

* Fix the flash-attn test.
Author: Laurent Mazare
Date: 2023-07-28 13:13:01 +01:00
Committed by: GitHub
Parent: 68eab38de6
Commit: 3eb2bc6d07

28 changed files with 117 additions and 188 deletions
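The diff below removes the old `Tensor::softmax`, which exponentiated the raw inputs directly. For `f32` inputs much above ~88, `exp` overflows to infinity, and the subsequent normalization turns the affected outputs into NaNs. The standard remedy, and presumably what the replacement implementation relies on, is to subtract the per-slice maximum before exponentiating; this leaves the result unchanged since `softmax(x - c) == softmax(x)` for any constant `c`. A minimal sketch of the trick over a plain `f32` slice, independent of the candle API (the helper name `softmax_stable` is just for illustration):

```rust
/// Numerically stable softmax over a 1-d slice: subtracting the maximum
/// keeps every exponent <= 0, so `exp` can never overflow.
fn softmax_stable(xs: &[f32]) -> Vec<f32> {
    let max = xs.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = xs.iter().map(|&x| (x - max).exp()).collect();
    let sum: f32 = exps.iter().sum();
    exps.iter().map(|e| e / sum).collect()
}

fn main() {
    // The naive formulation would overflow here: exp(1000f32) is infinite.
    let probs = softmax_stable(&[1000.0, 1001.0, 1002.0]);
    println!("{probs:?}"); // roughly [0.090, 0.245, 0.665]
}
```

On a tensor, the same subtraction is applied per slice along the softmax dimension, mirroring the `sum_keepdim` / `broadcast_div` pattern visible in the old CUDA branch below, with a max-reduction and a broadcast subtraction placed in front of the `exp`.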

@@ -553,40 +553,6 @@ impl Tensor {
        }
    }
    /// Applies the softmax function to the input tensor, rescaling the elements so that
    /// those on a slice of fixed index on dimension `dim` are between 0 and 1 and sum to 1.
    ///
    /// ```rust
    /// use candle::{Tensor, Device};
    /// let a = Tensor::new(&[[0f32, 1., 0., 1.], [-2., 2., 3., -3.]], &Device::Cpu)?;
    /// let a = a.softmax(1)?;
    /// assert_eq!(
    ///     a.to_vec2::<f32>()?,
    ///     &[
    ///         [0.13447072, 0.3655293, 0.13447072, 0.3655293],
    ///         [0.004892866, 0.26714143, 0.7261657, 0.0017999847],
    ///     ]);
    /// # Ok::<(), candle::Error>(())
    /// ```
    pub fn softmax<D: Dim>(&self, dim: D) -> Result<Self> {
        let dim = dim.to_index(self.shape(), "softmax")?;
        // TODO: unify the two branches.
        if self.device().is_cuda() {
            // We do not have a cuda kernel for divide_by_sum_over_dim so split
            // the operation.
            let exp = self.exp()?;
            let sum_exp = exp.sum_keepdim(dim)?;
            exp.broadcast_div(&sum_exp)
        } else {
            let shape = self.shape();
            let mut storage = self.storage().unary_impl::<crate::op::Exp>(self.layout())?;
            // The resulting storage is contiguous.
            storage.divide_by_sum_over_dim(shape, dim)?;
            let op = BackpropOp::new1(self, |arg| Op::Softmax(arg, dim));
            Ok(from_storage(storage, shape.clone(), op, false))
        }
    }
    fn squeeze_dims(self, dims: &[usize]) -> Result<Self> {
        match dims {
            [] => Ok(self),