#![allow(dead_code)]
#![allow(unused)]

#[cfg(feature = "mkl")]
extern crate intel_mkl_src;

use clap::Parser;

use candle::backend::BackendStorage;
use candle::cpu_backend;
use candle::{CpuStorage, CustomOp1, DType, Device, Error, Layout, Result, Shape, Tensor};

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Run on CPU rather than on GPU.
    #[arg(long)]
    cpu: bool,
}

/// A layer-normalization op implemented through the `CustomOp1` trait.
struct LayerNorm;

impl CustomOp1 for LayerNorm {
    fn name(&self) -> &'static str {
        "layer-norm"
    }

    fn cpu_fwd(&self, s: &CpuStorage, l: &Layout) -> Result<(CpuStorage, Shape)> {
        let s = s.as_slice::<f32>()?;
        // The op only supports contiguous inputs; take the contiguous range.
        let _s = match l.contiguous_offsets() {
            None => Err(Error::Wrapped("input has to be contiguous".into()))?,
            Some((o1, o2)) => &s[o1..o2],
        };
        // See `layer_norm_rows_sketch` at the bottom of the file for the math
        // this still has to compute.
        todo!()
    }

    #[cfg(feature = "cuda")]
    fn cuda_fwd(
        &self,
        s: &candle::CudaStorage,
        l: &Layout,
    ) -> Result<(candle::CudaStorage, Shape)> {
        let device = s.device().clone();
        let s = s.as_cuda_slice::<f32>()?;
        let s = match l.contiguous_offsets() {
            None => Err(Error::Wrapped("input has to be contiguous".into()))?,
            Some((o1, o2)) => s, // TODO: slice with o1 and o2
        };
        let s: std::result::Result<_, candle::cuda_backend::CudaError> =
            s.try_clone().map_err(|v| v.into());
        let s = s?;
        let s = candle::CudaStorage::wrap_cuda_slice(s, device);
        Ok((s, l.shape().clone()))
    }
}

fn main() -> anyhow::Result<()> {
    let args = Args::parse();
    let device = candle_examples::device(args.cpu)?;
    let t = Tensor::arange(0f32, 14f32, &device)?.reshape((2, 7))?;
    println!("{t}");
    let t = t.custom_op1(LayerNorm)?;
    println!("{t}");
    Ok(())
}
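
// --- Hedged sketch (not part of the original example) -----------------------
// `cpu_fwd` above stops at `todo!()`. The helper below is a minimal sketch of
// the math it would still need: a per-row layer norm over the last dimension
// of a contiguous f32 buffer. The function name, the hard-coded epsilon, and
// the absence of a learned scale/bias are illustrative assumptions, not part
// of candle's API or of the original example.
fn layer_norm_rows_sketch(src: &[f32], dim: usize) -> Vec<f32> {
    let eps = 1e-5f32; // assumed epsilon
    let mut dst = Vec::with_capacity(src.len());
    for row in src.chunks(dim) {
        // Mean and (biased) variance of the row.
        let mean = row.iter().sum::<f32>() / dim as f32;
        let var = row.iter().map(|v| (v - mean) * (v - mean)).sum::<f32>() / dim as f32;
        let inv_std = 1.0 / (var + eps).sqrt();
        // Normalize each element: (x - mean) / sqrt(var + eps).
        dst.extend(row.iter().map(|v| (v - mean) * inv_std));
    }
    dst
}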