diff --git a/candle-nn/src/lib.rs b/candle-nn/src/lib.rs
index 8ab51070..e9552e83 100644
--- a/candle-nn/src/lib.rs
+++ b/candle-nn/src/lib.rs
@@ -23,6 +23,7 @@ pub use group_norm::{group_norm, GroupNorm};
 pub use init::Init;
 pub use layer_norm::{layer_norm, rms_norm, LayerNorm, LayerNormConfig, RmsNorm};
 pub use linear::{linear, linear_no_bias, Linear};
+pub use ops::Dropout;
 pub use optim::{AdamW, ParamsAdamW, SGD};
 pub use rnn::{lstm, LSTM, RNN};
 pub use var_builder::VarBuilder;
diff --git a/candle-nn/src/ops.rs b/candle-nn/src/ops.rs
index 397674f3..63f73dfe 100644
--- a/candle-nn/src/ops.rs
+++ b/candle-nn/src/ops.rs
@@ -42,3 +42,38 @@ pub fn sigmoid(xs: &Tensor) -> Result<Tensor> {
     // TODO: Should we have a specialized op for this?
     (xs.neg()?.exp()? + 1.0)?.recip()
 }
+
+pub fn dropout(xs: &Tensor, drop_p: f32) -> Result<Tensor> {
+    // This implementation is inefficient as it stores the full mask for the backward pass.
+    // Instead we could just store the seed and have a specialized kernel that would both
+    // generate the random mask and apply it.
+    // Another easier optimization would be to be able to generate a boolean mask using just a bit
+    // of entropy per element rather than generating a full float per element.
+    if !(0. ..1.).contains(&drop_p) {
+        candle::bail!("dropout probability has to be in [0, 1), got {drop_p}")
+    }
+    let rand = Tensor::rand(0f32, 1f32, xs.shape(), xs.device())?;
+    let scale = 1.0 / (1.0 - drop_p as f64);
+    let drop_p = Tensor::new(drop_p, xs.device())?.broadcast_as(xs.shape())?;
+    let mask = (rand.ge(&drop_p)? * scale)?.to_dtype(xs.dtype())?;
+    xs * mask
+}
+
+#[derive(Debug)]
+pub struct Dropout {
+    drop_p: f32,
+}
+
+impl Dropout {
+    pub fn new(drop_p: f32) -> Dropout {
+        Self { drop_p }
+    }
+
+    pub fn forward(&self, xs: &Tensor, train: bool) -> Result<Tensor> {
+        if train {
+            dropout(xs, self.drop_p)
+        } else {
+            Ok(xs.clone())
+        }
+    }
+}
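
For reviewers, a minimal usage sketch of the new API, not part of the diff; it assumes the core crate is imported as `candle`, a CPU device, and an f32 input tensor:

use candle::{Device, Result, Tensor};
use candle_nn::{ops, Dropout};

fn main() -> Result<()> {
    let device = Device::Cpu;
    let xs = Tensor::rand(0f32, 1f32, (2, 4), &device)?;

    // Functional form: zeroes roughly 20% of the elements and rescales the
    // survivors by 1 / (1 - 0.2) so the expected activation is unchanged.
    let ys = ops::dropout(&xs, 0.2)?;
    println!("{ys}");

    // Module form: dropout is applied only when `train` is true,
    // otherwise the input is returned unchanged.
    let drop = Dropout::new(0.2);
    let train_out = drop.forward(&xs, true)?;
    let eval_out = drop.forward(&xs, false)?;
    println!("{train_out}\n{eval_out}");
    Ok(())
}

The `train` flag mirrors the usual train/eval split: callers are expected to pass `false` at inference time so the layer becomes a no-op.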