Tracing for the phi model (#936)

* Add some tracing bits to mixformers.

* Add the missing file.

* Add the conv2d layer to with-tracing (see the wrapper sketch after the changed-files summary below).

* Improve the tracing usage.
Laurent Mazare
2023-09-23 09:19:34 +01:00
committed by GitHub
parent cda1786eed
commit b54acfa3d0
9 changed files with 140 additions and 100 deletions
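
The commit message mentions adding a conv2d layer to the shared with_tracing module. As a rough sketch of what such a wrapper plausibly looks like, following the span-per-layer pattern visible in the code this commit removes from the T5 model below (the names and exact signature here are inferred from that pattern, not confirmed by the diff shown):

use candle::{Module, Result, Tensor};
use candle_nn::{Conv2dConfig, VarBuilder};

// Wraps candle_nn::Conv2d so every forward pass is recorded as a TRACE span.
#[derive(Debug)]
pub struct Conv2d {
    inner: candle_nn::Conv2d,
    span: tracing::Span,
}

impl Module for Conv2d {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        // Entering the span attributes the time spent in the inner conv2d
        // to a "conv2d" node in the trace.
        let _enter = self.span.enter();
        self.inner.forward(xs)
    }
}

// Constructor mirroring candle_nn::conv2d, plus the tracing span.
pub fn conv2d(
    in_channels: usize,
    out_channels: usize,
    kernel_size: usize,
    cfg: Conv2dConfig,
    vb: VarBuilder,
) -> Result<Conv2d> {
    let span = tracing::span!(tracing::Level::TRACE, "conv2d");
    let inner = candle_nn::conv2d(in_channels, out_channels, kernel_size, cfg, vb)?;
    Ok(Conv2d { inner, span })
}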


@@ -1,57 +1,12 @@
 // T5 Text Model
 // https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py
+use crate::models::with_tracing::{linear_no_bias, Embedding, Linear};
 use candle::{DType, Device, Module, Result, Tensor, D};
 use candle_nn::{Activation, VarBuilder};
 use serde::Deserialize;
 use std::sync::Arc;
-
-#[derive(Debug)]
-struct Embedding {
-    inner: candle_nn::Embedding,
-    span: tracing::Span,
-}
-
-impl Embedding {
-    fn new(d1: usize, d2: usize, vb: VarBuilder) -> Result<Self> {
-        let inner = candle_nn::embedding(d1, d2, vb)?;
-        let span = tracing::span!(tracing::Level::TRACE, "embedding");
-        Ok(Self { inner, span })
-    }
-
-    fn embeddings(&self) -> &Tensor {
-        self.inner.embeddings()
-    }
-}
-
-impl Module for Embedding {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        let _enter = self.span.enter();
-        self.inner.forward(xs)
-    }
-}
-
-#[derive(Debug)]
-struct Linear {
-    inner: candle_nn::Linear,
-    span: tracing::Span,
-}
-
-impl Linear {
-    fn new(d1: usize, d2: usize, vb: VarBuilder) -> Result<Self> {
-        let inner = candle_nn::linear_no_bias(d1, d2, vb)?;
-        let span = tracing::span!(tracing::Level::TRACE, "linear");
-        Ok(Self { inner, span })
-    }
-}
-
-impl Module for Linear {
-    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        let _enter = self.span.enter();
-        self.inner.forward(xs)
-    }
-}
 
 fn default_relative_attention_max_distance() -> usize {
     128
 }
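
These TRACE spans are inert until a subscriber is installed, so the instrumentation costs little in normal runs. A minimal sketch of collecting them into a Chrome-compatible trace file, assuming the tracing-subscriber and tracing-chrome crates (the pattern candle's examples typically use behind a --tracing flag):

use tracing_chrome::ChromeLayerBuilder;
use tracing_subscriber::prelude::*;

fn main() {
    // The guard flushes a trace-*.json file when dropped at the end of main;
    // open that file in chrome://tracing or Perfetto to inspect the spans.
    let (chrome_layer, _guard) = ChromeLayerBuilder::new().build();
    tracing_subscriber::registry().with(chrome_layer).init();

    // ... build the model and run forward passes here; each traced
    // embedding/linear forward() shows up as a span in the timeline.
}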
@@ -205,8 +160,8 @@ struct T5DenseActDense
 impl T5DenseActDense {
     fn load(vb: VarBuilder, cfg: &Config) -> Result<Self> {
-        let wi = Linear::new(cfg.d_model, cfg.d_ff, vb.pp("wi"))?;
-        let wo = Linear::new(cfg.d_ff, cfg.d_model, vb.pp("wo"))?;
+        let wi = linear_no_bias(cfg.d_model, cfg.d_ff, vb.pp("wi"))?;
+        let wo = linear_no_bias(cfg.d_ff, cfg.d_model, vb.pp("wo"))?;
         Ok(Self {
             wi,
             wo,
@@ -237,9 +192,9 @@ struct T5DenseGatedActDense
 impl T5DenseGatedActDense {
     fn load(vb: VarBuilder, cfg: &Config) -> Result<Self> {
-        let wi_0 = Linear::new(cfg.d_model, cfg.d_ff, vb.pp("wi_0"))?;
-        let wi_1 = Linear::new(cfg.d_model, cfg.d_ff, vb.pp("wi_1"))?;
-        let wo = Linear::new(cfg.d_ff, cfg.d_model, vb.pp("wo"))?;
+        let wi_0 = linear_no_bias(cfg.d_model, cfg.d_ff, vb.pp("wi_0"))?;
+        let wi_1 = linear_no_bias(cfg.d_model, cfg.d_ff, vb.pp("wi_1"))?;
+        let wo = linear_no_bias(cfg.d_ff, cfg.d_model, vb.pp("wo"))?;
         Ok(Self {
             wi_0,
             wi_1,
@@ -334,10 +289,10 @@ impl T5Attention
         cfg: &Config,
     ) -> Result<Self> {
         let inner_dim = cfg.num_heads * cfg.d_kv;
-        let q = Linear::new(cfg.d_model, inner_dim, vb.pp("q"))?;
-        let k = Linear::new(cfg.d_model, inner_dim, vb.pp("k"))?;
-        let v = Linear::new(cfg.d_model, inner_dim, vb.pp("v"))?;
-        let o = Linear::new(inner_dim, cfg.d_model, vb.pp("o"))?;
+        let q = linear_no_bias(cfg.d_model, inner_dim, vb.pp("q"))?;
+        let k = linear_no_bias(cfg.d_model, inner_dim, vb.pp("k"))?;
+        let v = linear_no_bias(cfg.d_model, inner_dim, vb.pp("v"))?;
+        let o = linear_no_bias(inner_dim, cfg.d_model, vb.pp("o"))?;
         let relative_attention_bias = if has_relative_attention_bias {
             let emb = Embedding::new(
                 cfg.relative_attention_num_buckets,
@@ -772,7 +727,11 @@ impl T5ForConditionalGeneration
         let lm_head = if tie_word_embeddings {
             None
         } else {
-            Some(Linear::new(cfg.d_model, cfg.vocab_size, vb.pp("lm_head"))?)
+            Some(linear_no_bias(
+                cfg.d_model,
+                cfg.vocab_size,
+                vb.pp("lm_head"),
+            )?)
         };
         Ok(Self {
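
The call-site change from Linear::new(..) to linear_no_bias(..) suggests the shared module exposes free constructor functions rather than inherent new methods. A sketch of that helper, inferred almost verbatim from the local wrapper deleted at the top of this file (the actual with_tracing.rs may differ in visibility or details):

use candle::{Module, Result, Tensor};
use candle_nn::VarBuilder;

#[derive(Debug)]
pub struct Linear {
    inner: candle_nn::Linear,
    span: tracing::Span,
}

// Free-function constructor: builds the bias-free linear layer and attaches
// the span under which its forward passes will be recorded.
pub fn linear_no_bias(d1: usize, d2: usize, vb: VarBuilder) -> Result<Linear> {
    let inner = candle_nn::linear_no_bias(d1, d2, vb)?;
    let span = tracing::span!(tracing::Level::TRACE, "linear");
    Ok(Linear { inner, span })
}

impl Module for Linear {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        let _enter = self.span.enter();
        self.inner.forward(xs)
    }
}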