mirror of
https://github.com/huggingface/candle.git
synced 2025-06-18 03:28:50 +00:00

* place `ug` behind not wasm32 attr so that wasm32 can compile * mv `ug` to conditional target dep assuming every non-wasm32 user wants this
486 lines
16 KiB
Rust
486 lines
16 KiB
Rust
use crate::op::{BackpropOp, Op};
|
|
use crate::tensor::from_storage;
|
|
use crate::{CpuStorage, CudaStorage, Layout, MetalStorage, Result, Shape, Tensor};
|
|
use std::sync::Arc;
|
|
|
|
/// Unary ops that can be defined in user-land.
|
|
pub trait CustomOp1 {
|
|
// Box<dyn> does not support const yet, so use a function to get the name.
|
|
fn name(&self) -> &'static str;
|
|
|
|
/// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cpu_fwd(&self, storage: &CpuStorage, layout: &Layout) -> Result<(CpuStorage, Shape)>;
|
|
|
|
/// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cuda_fwd(&self, _storage: &CudaStorage, _layout: &Layout) -> Result<(CudaStorage, Shape)> {
|
|
Err(crate::Error::Cuda(
|
|
format!("no cuda implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
|
|
/// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn metal_fwd(
|
|
&self,
|
|
_storage: &MetalStorage,
|
|
_layout: &Layout,
|
|
) -> Result<(MetalStorage, Shape)> {
|
|
Err(crate::Error::Metal(
|
|
format!("no metal implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
|
|
/// This function takes as argument the argument `arg` used in the forward pass, the result
|
|
/// produced by the forward operation `res` and the gradient of the result `grad_res`.
|
|
/// The function should return the gradient of the argument.
|
|
fn bwd(&self, _arg: &Tensor, _res: &Tensor, _grad_res: &Tensor) -> Result<Option<Tensor>> {
|
|
Err(crate::Error::BackwardNotSupported { op: self.name() })
|
|
}
|
|
}
|
|
|
|
pub trait CustomOp2 {
|
|
fn name(&self) -> &'static str;
|
|
|
|
/// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cpu_fwd(
|
|
&self,
|
|
s1: &CpuStorage,
|
|
l1: &Layout,
|
|
s2: &CpuStorage,
|
|
l2: &Layout,
|
|
) -> Result<(CpuStorage, Shape)>;
|
|
|
|
/// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cuda_fwd(
|
|
&self,
|
|
_: &CudaStorage,
|
|
_: &Layout,
|
|
_: &CudaStorage,
|
|
_: &Layout,
|
|
) -> Result<(CudaStorage, Shape)> {
|
|
Err(crate::Error::Cuda(
|
|
format!("no cuda implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
|
|
/// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn metal_fwd(
|
|
&self,
|
|
_: &MetalStorage,
|
|
_: &Layout,
|
|
_: &MetalStorage,
|
|
_: &Layout,
|
|
) -> Result<(MetalStorage, Shape)> {
|
|
Err(crate::Error::Metal(
|
|
format!("no metal implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
|
|
fn bwd(
|
|
&self,
|
|
_arg1: &Tensor,
|
|
_arg2: &Tensor,
|
|
_res: &Tensor,
|
|
_grad_res: &Tensor,
|
|
) -> Result<(Option<Tensor>, Option<Tensor>)> {
|
|
Err(crate::Error::BackwardNotSupported { op: self.name() })
|
|
}
|
|
}
|
|
|
|
pub trait CustomOp3 {
|
|
fn name(&self) -> &'static str;
|
|
|
|
/// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cpu_fwd(
|
|
&self,
|
|
s1: &CpuStorage,
|
|
l1: &Layout,
|
|
s2: &CpuStorage,
|
|
l2: &Layout,
|
|
s3: &CpuStorage,
|
|
l3: &Layout,
|
|
) -> Result<(CpuStorage, Shape)>;
|
|
|
|
/// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cuda_fwd(
|
|
&self,
|
|
_: &CudaStorage,
|
|
_: &Layout,
|
|
_: &CudaStorage,
|
|
_: &Layout,
|
|
_: &CudaStorage,
|
|
_: &Layout,
|
|
) -> Result<(CudaStorage, Shape)> {
|
|
Err(crate::Error::Cuda(
|
|
format!("no cuda implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
|
|
/// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn metal_fwd(
|
|
&self,
|
|
_: &MetalStorage,
|
|
_: &Layout,
|
|
_: &MetalStorage,
|
|
_: &Layout,
|
|
_: &MetalStorage,
|
|
_: &Layout,
|
|
) -> Result<(MetalStorage, Shape)> {
|
|
Err(crate::Error::Metal(
|
|
format!("no metal implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
|
|
fn bwd(
|
|
&self,
|
|
_arg1: &Tensor,
|
|
_arg2: &Tensor,
|
|
_arg3: &Tensor,
|
|
_res: &Tensor,
|
|
_grad_res: &Tensor,
|
|
) -> Result<(Option<Tensor>, Option<Tensor>, Option<Tensor>)> {
|
|
Err(crate::Error::BackwardNotSupported { op: self.name() })
|
|
}
|
|
}
|
|
|
|
impl Tensor {
|
|
/// Applies a unary custom op without backward support
|
|
pub fn apply_op1_no_bwd<C: CustomOp1>(&self, c: &C) -> Result<Self> {
|
|
let (storage, shape) = self.storage().apply_op1(self.layout(), c)?;
|
|
Ok(from_storage(storage, shape, BackpropOp::none(), false))
|
|
}
|
|
|
|
/// Applies a binary custom op without backward support
|
|
pub fn apply_op2_no_bwd<C: CustomOp2>(&self, rhs: &Self, c: &C) -> Result<Self> {
|
|
let (storage, shape) =
|
|
self.storage()
|
|
.apply_op2(self.layout(), &rhs.storage(), rhs.layout(), c)?;
|
|
Ok(from_storage(storage, shape, BackpropOp::none(), false))
|
|
}
|
|
|
|
/// Applies a ternary custom op without backward support
|
|
pub fn apply_op3_no_bwd<C: CustomOp3>(&self, t2: &Self, t3: &Self, c: &C) -> Result<Self> {
|
|
let (storage, shape) = self.storage().apply_op3(
|
|
self.layout(),
|
|
&t2.storage(),
|
|
t2.layout(),
|
|
&t3.storage(),
|
|
t3.layout(),
|
|
c,
|
|
)?;
|
|
Ok(from_storage(storage, shape, BackpropOp::none(), false))
|
|
}
|
|
|
|
/// Applies a unary custom op.
|
|
pub fn apply_op1_arc(&self, c: Arc<Box<dyn CustomOp1 + Send + Sync>>) -> Result<Self> {
|
|
let (storage, shape) = self
|
|
.storage()
|
|
.apply_op1(self.layout(), c.as_ref().as_ref())?;
|
|
let op = BackpropOp::new1(self, |s| Op::CustomOp1(s, c.clone()));
|
|
Ok(from_storage(storage, shape, op, false))
|
|
}
|
|
|
|
pub fn apply_op1<C: 'static + CustomOp1 + Send + Sync>(&self, c: C) -> Result<Self> {
|
|
self.apply_op1_arc(Arc::new(Box::new(c)))
|
|
}
|
|
|
|
/// Applies a binary custom op.
|
|
pub fn apply_op2_arc(
|
|
&self,
|
|
rhs: &Self,
|
|
c: Arc<Box<dyn CustomOp2 + Send + Sync>>,
|
|
) -> Result<Self> {
|
|
let (storage, shape) = self.storage().apply_op2(
|
|
self.layout(),
|
|
&rhs.storage(),
|
|
rhs.layout(),
|
|
c.as_ref().as_ref(),
|
|
)?;
|
|
let op = BackpropOp::new2(self, rhs, |t1, t2| Op::CustomOp2(t1, t2, c.clone()));
|
|
Ok(from_storage(storage, shape, op, false))
|
|
}
|
|
|
|
pub fn apply_op2<C: 'static + CustomOp2 + Send + Sync>(&self, r: &Self, c: C) -> Result<Self> {
|
|
self.apply_op2_arc(r, Arc::new(Box::new(c)))
|
|
}
|
|
|
|
/// Applies a ternary custom op.
|
|
pub fn apply_op3_arc(
|
|
&self,
|
|
t2: &Self,
|
|
t3: &Self,
|
|
c: Arc<Box<dyn CustomOp3 + Send + Sync>>,
|
|
) -> Result<Self> {
|
|
let (storage, shape) = self.storage().apply_op3(
|
|
self.layout(),
|
|
&t2.storage(),
|
|
t2.layout(),
|
|
&t3.storage(),
|
|
t3.layout(),
|
|
c.as_ref().as_ref(),
|
|
)?;
|
|
let op = BackpropOp::new3(self, t2, t3, |t1, t2, t3| {
|
|
Op::CustomOp3(t1, t2, t3, c.clone())
|
|
});
|
|
Ok(from_storage(storage, shape, op, false))
|
|
}
|
|
|
|
pub fn apply_op3<C: 'static + CustomOp3 + Send + Sync>(
|
|
&self,
|
|
t2: &Self,
|
|
t3: &Self,
|
|
c: C,
|
|
) -> Result<Self> {
|
|
self.apply_op3_arc(t2, t3, Arc::new(Box::new(c)))
|
|
}
|
|
}
|
|
|
|
// In place ops.
|
|
|
|
/// Unary ops that can be defined in user-land.
|
|
/// These ops work in place and as such back-prop is unsupported.
|
|
pub trait InplaceOp1 {
|
|
// Box<dyn> does not support const yet, so use a function to get the name.
|
|
fn name(&self) -> &'static str;
|
|
|
|
/// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cpu_fwd(&self, storage: &mut CpuStorage, layout: &Layout) -> Result<()>;
|
|
|
|
/// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cuda_fwd(&self, _storage: &mut CudaStorage, _layout: &Layout) -> Result<()> {
|
|
Err(crate::Error::Cuda(
|
|
format!("no cuda implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
|
|
/// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn metal_fwd(&self, _storage: &mut MetalStorage, _layout: &Layout) -> Result<()> {
|
|
Err(crate::Error::Metal(
|
|
format!("no metal implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
}
|
|
|
|
pub trait InplaceOp2 {
|
|
fn name(&self) -> &'static str;
|
|
|
|
/// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cpu_fwd(&self, s1: &mut CpuStorage, l1: &Layout, s2: &CpuStorage, l2: &Layout)
|
|
-> Result<()>;
|
|
|
|
/// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cuda_fwd(&self, _: &mut CudaStorage, _: &Layout, _: &CudaStorage, _: &Layout) -> Result<()> {
|
|
Err(crate::Error::Cuda(
|
|
format!("no cuda implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
|
|
/// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn metal_fwd(
|
|
&self,
|
|
_: &mut MetalStorage,
|
|
_: &Layout,
|
|
_: &MetalStorage,
|
|
_: &Layout,
|
|
) -> Result<()> {
|
|
Err(crate::Error::Metal(
|
|
format!("no metal implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
}
|
|
|
|
pub trait InplaceOp3 {
|
|
fn name(&self) -> &'static str;
|
|
|
|
/// The forward pass, as run on a cpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cpu_fwd(
|
|
&self,
|
|
s1: &mut CpuStorage,
|
|
l1: &Layout,
|
|
s2: &CpuStorage,
|
|
l2: &Layout,
|
|
s3: &CpuStorage,
|
|
l3: &Layout,
|
|
) -> Result<()>;
|
|
|
|
/// The forward pass, as run on a gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn cuda_fwd(
|
|
&self,
|
|
_: &mut CudaStorage,
|
|
_: &Layout,
|
|
_: &CudaStorage,
|
|
_: &Layout,
|
|
_: &CudaStorage,
|
|
_: &Layout,
|
|
) -> Result<()> {
|
|
Err(crate::Error::Cuda(
|
|
format!("no cuda implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
|
|
/// The forward pass, as run on a metal gpu device. Note that the storage can use arbitrary strides,
|
|
/// offsets etc so the associated layout should be used to access it.
|
|
fn metal_fwd(
|
|
&self,
|
|
_: &mut MetalStorage,
|
|
_: &Layout,
|
|
_: &MetalStorage,
|
|
_: &Layout,
|
|
_: &MetalStorage,
|
|
_: &Layout,
|
|
) -> Result<()> {
|
|
Err(crate::Error::Metal(
|
|
format!("no metal implementation for {}", self.name()).into(),
|
|
))
|
|
}
|
|
}
|
|
|
|
impl Tensor {
|
|
/// Applies a unary custom op in place.
|
|
pub fn inplace_op1<C: InplaceOp1>(&self, c: &C) -> Result<()> {
|
|
self.storage_mut().inplace_op1(self.layout(), c)
|
|
}
|
|
|
|
/// Applies a unary custom op in place (for the first tensor).
|
|
pub fn inplace_op2<C: InplaceOp2>(&self, rhs: &Self, c: &C) -> Result<()> {
|
|
self.storage_mut()
|
|
.inplace_op2(self.layout(), &rhs.storage(), rhs.layout(), c)
|
|
}
|
|
|
|
/// Applies a ternary custom op in place (for the first tensor).
|
|
pub fn inplace_op3<C: InplaceOp3>(&self, t2: &Self, t3: &Self, c: &C) -> Result<()> {
|
|
self.storage_mut().inplace_op3(
|
|
self.layout(),
|
|
&t2.storage(),
|
|
t2.layout(),
|
|
&t3.storage(),
|
|
t3.layout(),
|
|
c,
|
|
)
|
|
}
|
|
}
|
|
|
|
pub struct UgIOp1 {
|
|
name: &'static str,
|
|
#[cfg(feature = "cuda")]
|
|
func: cudarc::driver::CudaFunction,
|
|
#[cfg(feature = "metal")]
|
|
func: metal::ComputePipelineState,
|
|
}
|
|
|
|
impl UgIOp1 {
|
|
#[allow(unused)]
|
|
#[cfg(not(target_arch = "wasm32"))]
|
|
pub fn new(
|
|
name: &'static str,
|
|
kernel: ug::lang::ssa::Kernel,
|
|
device: &crate::Device,
|
|
) -> Result<Self> {
|
|
#[cfg(feature = "cuda")]
|
|
{
|
|
let device = device.as_cuda_device()?;
|
|
let func = device.compile(name, kernel)?;
|
|
Ok(Self { name, func })
|
|
}
|
|
#[cfg(feature = "metal")]
|
|
{
|
|
let device = device.as_metal_device()?;
|
|
let func = device.compile(name, kernel)?;
|
|
Ok(Self { name, func })
|
|
}
|
|
#[cfg(not(any(feature = "cuda", feature = "metal")))]
|
|
{
|
|
Ok(Self { name })
|
|
}
|
|
}
|
|
}
|
|
|
|
impl InplaceOp1 for UgIOp1 {
|
|
fn name(&self) -> &'static str {
|
|
self.name
|
|
}
|
|
|
|
fn cpu_fwd(&self, _: &mut CpuStorage, _: &Layout) -> Result<()> {
|
|
crate::bail!("ug ops are only supported on metal/cuda at the moment")
|
|
}
|
|
|
|
#[cfg(feature = "metal")]
|
|
fn metal_fwd(&self, sto: &mut MetalStorage, layout: &Layout) -> Result<()> {
|
|
use crate::backend::BackendStorage;
|
|
use candle_metal_kernels::utils::EncoderProvider;
|
|
|
|
let elem_count = layout.shape().elem_count();
|
|
if sto.dtype() != crate::DType::F32 {
|
|
// TODO: support more dtypes.
|
|
crate::bail!("input is not a f32 tensor")
|
|
}
|
|
let device = sto.device();
|
|
println!("here");
|
|
let command_buffer = device.command_buffer()?;
|
|
let command_buffer = &command_buffer;
|
|
let encoder = command_buffer.encoder();
|
|
let encoder = encoder.as_ref();
|
|
encoder.set_compute_pipeline_state(&self.func);
|
|
let (g, b) = if elem_count % 32 == 0 {
|
|
(elem_count / 32, 32)
|
|
} else {
|
|
(elem_count, 1)
|
|
};
|
|
let grid_dims = metal::MTLSize {
|
|
width: g as u64,
|
|
height: 1,
|
|
depth: 1,
|
|
};
|
|
let group_dims = candle_metal_kernels::utils::get_block_dims(b as u64, 1, 1);
|
|
candle_metal_kernels::utils::set_param(encoder, 0, (sto.buffer(), 0usize));
|
|
|
|
encoder.use_resource(sto.buffer(), metal::MTLResourceUsage::Write);
|
|
encoder.dispatch_threads(grid_dims, group_dims);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[cfg(feature = "cuda")]
|
|
fn cuda_fwd(&self, sto: &mut CudaStorage, layout: &Layout) -> Result<()> {
|
|
use crate::cuda_backend::WrapErr;
|
|
use cudarc::driver::LaunchAsync;
|
|
|
|
let elem_count = layout.shape().elem_count();
|
|
// TODO: support more dtypes.
|
|
let sto = sto.as_cuda_slice::<f32>()?;
|
|
let sto = match layout.contiguous_offsets() {
|
|
None => crate::bail!("input has to be contiguous"),
|
|
Some((o1, o2)) => sto.slice(o1..o2),
|
|
};
|
|
let params = (&sto,);
|
|
let (g, b) = if elem_count % 32 == 0 {
|
|
(elem_count / 32, 32)
|
|
} else {
|
|
(elem_count, 1)
|
|
};
|
|
let cfg = cudarc::driver::LaunchConfig {
|
|
grid_dim: (g as u32, 1, 1),
|
|
block_dim: (b as u32, 1, 1),
|
|
shared_mem_bytes: 0,
|
|
};
|
|
unsafe { self.func.clone().launch(cfg, params) }.w()?;
|
|
Ok(())
|
|
}
|
|
}
|