Refactor the hierarchy.

This commit is contained in:
Nicolas Patry
2023-06-27 11:57:27 +02:00
parent 6c4a960b15
commit d7f729fb8f
41 changed files with 35 additions and 33 deletions

32
candle-core/Cargo.toml Normal file
View File

@ -0,0 +1,32 @@
[package]
name = "candle"
version = "0.1.0"
edition = "2021"
description = "Minimalist ML framework."
repository = "https://github.com/LaurentMazare/candle"
keywords = ["blas", "tensor", "machine-learning"]
categories = ["science"]
license = "MIT/Apache-2.0"
readme = "README.md"
[dependencies]
safetensors = "0.3.1"
thiserror = "1"
cudarc = { version = "0.9.9", optional = true, features = ["f16"] }
candle-kernels = { path = "../candle-kernels", optional = true }
gemm = "0.15.4"
zip = { version = "0.6.6", default-features=false }
byteorder = "1.4.3"
half = { version = "2.3.1", features = ["num-traits"] }
num-traits = "0.2.15"
[dev-dependencies]
anyhow = "1"
clap = { version = "4.2.4", features = ["derive"] }
rand = "0.8.5"
tokenizers = { version = "0.13.3", default-features=false, features=["onig"] }
[features]
default = ["cuda"]
cuda = ["dep:cudarc", "dep:candle-kernels"]

201
candle-core/LICENSE Normal file
View File

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

12
candle-core/Makefile Normal file
View File

@ -0,0 +1,12 @@
clean-ptx:
find target -name "*.ptx" -type f -delete
echo "" > kernels/src/lib.rs
touch kernels/build.rs
clean:
cargo clean
test:
cargo test
all: test

2
candle-core/README.md Normal file
View File

@ -0,0 +1,2 @@
# candle
Minimalist ML framework for Rust

View File

@ -0,0 +1,21 @@
use anyhow::Result;
use candle::{Device, Tensor};
fn main() -> Result<()> {
let data = &[[3f32, 1., 4., 1., 5.], [2., 7., 1., 8., 2.]];
let t1 = Tensor::new(data, &Device::Cpu)?;
let data2 = &[[5f32, 5., 5., 5., 5.], [2., 7., 1., 8., 2.]];
let t2 = Tensor::new(data2, &Device::Cpu)?;
assert_eq!(
Tensor::cat(&[&t1.t()?, &t2.t()?], 1)?
.t()?
.to_vec2::<f32>()?,
[
[3.0, 1.0, 4.0, 1.0, 5.0],
[2.0, 7.0, 1.0, 8.0, 2.0],
[5.0, 5.0, 5.0, 5.0, 5.0],
[2.0, 7.0, 1.0, 8.0, 2.0]
]
);
Ok(())
}

View File

@ -0,0 +1,31 @@
use anyhow::Result;
use candle::{Device, Tensor};
fn main() -> Result<()> {
let device = Device::new_cuda(0)?;
let ids = Tensor::new(&[0u32, 2u32, 1u32], &device)?;
let t = Tensor::new(&[[0f32, 1f32], [2f32, 3f32], [4f32, 5f32]], &device)?;
let hs = Tensor::embedding(&ids, &t)?;
println!("> {:?}", hs.to_vec2::<f32>());
let x = Tensor::new(&[3f32, 1., 4., 1., 5.], &device)?;
println!("{:?}", x.to_vec1::<f32>()?);
let y = Tensor::new(&[2f32, 7., 1., 8., 2.], &device)?;
let z = (y + x * 3.)?;
println!("{:?}", z.to_vec1::<f32>()?);
println!("{:?}", z.sqrt()?.to_vec1::<f32>()?);
let x = Tensor::new(&[[11f32, 22.], [33., 44.], [55., 66.], [77., 78.]], &device)?;
let y = Tensor::new(&[[1f32, 2., 3.], [4., 5., 6.]], &device)?;
println!("{:?}", y.to_vec2::<f32>()?);
let z = x.matmul(&y)?;
println!("{:?}", z.to_vec2::<f32>()?);
let x = Tensor::new(
&[[11f32, 22.], [33., 44.], [55., 66.], [77., 78.]],
&Device::Cpu,
)?;
let y = Tensor::new(&[[1f32, 2., 3.], [4., 5., 6.]], &Device::Cpu)?;
println!("{:?}", y.to_vec2::<f32>()?);
let z = x.matmul(&y)?;
println!("{:?}", z.to_vec2::<f32>()?);
Ok(())
}

View File

@ -0,0 +1,68 @@
# Adapted from https://github.com/Lightning-AI/lit-llama/blob/main/scripts/convert_checkpoint.py
import sys
import torch
import numpy as np
from typing import Dict
from pathlib import Path
def tr(v):
return np.ascontiguousarray(np.transpose(v))
def convert_state_dict(state_dict: Dict[str, torch.Tensor], dtype: torch.dtype = torch.float32) -> Dict[str, torch.Tensor]:
print("start conv")
def get_and_remove(key, transpose=False):
v = state_dict[key].to(dtype).numpy()
if transpose:
v = tr(v)
del state_dict[key]
return v
converted = {}
converted["transformer.wte.weight"] = get_and_remove("tok_embeddings.weight")
converted["lm_head.weight"] = get_and_remove("output.weight", transpose=True)
converted["transformer.ln_f.scale"] = get_and_remove("norm.weight")
for layer_idx in sorted(set([k.split(".")[1] for k in state_dict if k.startswith("layers")])):
print(layer_idx)
# attention
# the wq, wk, wv from the FB model are stacked in our model as c_attn
converted[f"transformer.h.{layer_idx}.attn.c_attn.weight"] = tr(np.concatenate(
(
get_and_remove(f"layers.{layer_idx}.attention.wq.weight"),
get_and_remove(f"layers.{layer_idx}.attention.wk.weight"),
get_and_remove(f"layers.{layer_idx}.attention.wv.weight"),
)
))
converted[f"transformer.h.{layer_idx}.attn.c_proj.weight"] = tr(get_and_remove(
f"layers.{layer_idx}.attention.wo.weight"
))
# mlp
converted[f"transformer.h.{layer_idx}.mlp.c_fc1.weight"] = get_and_remove(
f"layers.{layer_idx}.feed_forward.w1.weight", transpose=True,
)
converted[f"transformer.h.{layer_idx}.mlp.c_proj.weight"] = get_and_remove(
f"layers.{layer_idx}.feed_forward.w2.weight", transpose=True,
)
converted[f"transformer.h.{layer_idx}.mlp.c_fc2.weight"] = get_and_remove(
f"layers.{layer_idx}.feed_forward.w3.weight", transpose=True,
)
# rms norm
converted[f"transformer.h.{layer_idx}.rms_1.scale"] = get_and_remove(f"layers.{layer_idx}.attention_norm.weight")
converted[f"transformer.h.{layer_idx}.rms_2.scale"] = get_and_remove(f"layers.{layer_idx}.ffn_norm.weight")
return converted
def convert_weights(llama_ckpt, *, output_npz: Path = Path("llama.npz"), dtype: str = "float32") -> None:
dt = getattr(torch, dtype, None)
if not isinstance(dt, torch.dtype):
raise ValueError(f"{dtype} is not a valid dtype.")
checkpoint = torch.load(llama_ckpt, map_location="cpu")
converted = convert_state_dict(checkpoint, dtype=dt)
del checkpoint
np.savez(output_npz, **converted)
if __name__ == "__main__":
if len(sys.argv) != 2:
raise ValueError(f"usage: convert_checkpoint.py ..../LLaMA/7B/consolidated.00.pth")
convert_weights(sys.argv[1])

View File

@ -0,0 +1,485 @@
// An implementation of LLaMA https://github.com/facebookresearch/llama
//
// This is based on nanoGPT in a similar way to:
// https://github.com/Lightning-AI/lit-llama/blob/main/lit_llama/model.py
//
// The tokenizer config can be retrieved from:
// https://huggingface.co/hf-internal-testing/llama-tokenizer/raw/main/tokenizer.json
//
// In order to convert the llama weights to a .npz file, run:
// python examples/llama/convert_checkpoint.py ..../LLaMA/7B/consolidated.00.pth
// TODO: This does not use a batch dimension. If adding it back, be cautious about the
// transposition operations.
use anyhow::{Error as E, Result};
use clap::Parser;
use candle::{DType, Device, Tensor};
mod var_store;
use var_store::VarBuilder;
const CONTEXT_SIZE: usize = 512;
const START_PROMPT: &str = r"
EDWARD:
I wonder how our princely father 'scaped,
Or whether he be 'scaped away or no
From Clifford's and Northumberland's pursuit:
Had he been ta'en, we should have heard the news;
Had he been slain, we should have heard the news;
Or had he 'scaped, methinks we should have heard
The happy tidings of his good escape.
How fares my brother? why is he so sad?
RICHARD:
I cannot joy, until I be resolved
Where our right valiant father is become.
I saw him in the battle range about;
And watch'd him how he singled Clifford forth.
Methought he bore him in the thickest troop
As doth a lion in a herd of neat;
Or as a bear, encompass'd round with dogs,
Who having pinch'd a few and made them cry,
The rest stand all aloof, and bark at him.
So fared our father with his enemies;
So fled his enemies my warlike father:
Methinks, 'tis prize enough to be his son.
See how the morning opes her golden gates,
And takes her farewell of the glorious sun!
How well resembles it the prime of youth,
Trimm'd like a younker prancing to his love!
EDWARD:
Dazzle mine eyes, or do I see three suns?
RICHARD:
Three glorious suns, each one a perfect sun;
Not separated with the racking clouds,
But sever'd in a pale clear-shining sky.
See, see! they join, embrace, and seem to kiss,
As if they vow'd some league inviolable:
Now are they but one lamp, one light, one sun.
In this the heaven figures some event.
EDWARD:
'Tis wondrous strange, the like yet never heard of.
I think it cites us, brother, to the field,
That we, the sons of brave Plantagenet,
Each one already blazing by our meeds,
Should notwithstanding join our lights together
And over-shine the earth as this the world.
Whate'er it bodes, henceforward will I bear
Upon my target three fair-shining suns.
";
#[allow(dead_code)]
struct Config {
block_size: usize,
vocab_size: usize,
n_layer: usize,
n_head: usize,
n_embd: usize,
}
#[allow(dead_code)]
impl Config {
fn config_7b() -> Self {
Self {
block_size: 4096,
vocab_size: 32000,
n_layer: 32,
n_head: 32,
n_embd: 4096,
}
}
fn config_13b() -> Self {
Self {
block_size: 4096,
vocab_size: 32000,
n_layer: 40,
n_head: 40,
n_embd: 5120,
}
}
fn config_30b() -> Self {
Self {
block_size: 4096,
vocab_size: 32000,
n_layer: 60,
n_head: 52,
n_embd: 6656,
}
}
fn config_65b() -> Self {
Self {
block_size: 4096,
vocab_size: 32000,
n_layer: 80,
n_head: 64,
n_embd: 8192,
}
}
}
struct Embedding {
embeddings: Tensor,
}
impl Embedding {
fn new(mut vb: VarBuilder, vocab_size: usize, n_embd: usize) -> Result<Self> {
let embeddings = vb.var("weight", (vocab_size, n_embd))?;
Ok(Self { embeddings })
}
fn forward(&self, indexes: &Tensor) -> Result<Tensor> {
Ok(Tensor::embedding(
indexes,
&self.embeddings.to_dtype(DType::F32)?,
)?)
}
}
struct Linear {
ws: Tensor,
bs: Option<Tensor>,
}
impl Linear {
#[allow(dead_code)]
fn new(mut vb: VarBuilder, in_size: usize, out_size: usize) -> Result<Self> {
let ws = vb.var("weight", (in_size, out_size))?;
let bs = vb.var("bias", out_size)?;
Ok(Self { ws, bs: Some(bs) })
}
fn new_no_bias(mut vb: VarBuilder, in_size: usize, out_size: usize) -> Result<Self> {
let ws = vb.var("weight", (in_size, out_size))?;
Ok(Self { ws, bs: None })
}
fn forward(&self, x: &Tensor) -> Result<Tensor> {
let x = x.matmul(&self.ws.to_dtype(DType::F32)?)?;
let y = match &self.bs {
None => x,
Some(bs) => x.broadcast_add(&bs.to_dtype(DType::F32)?)?,
};
Ok(y)
}
}
struct RmsNorm {
scale: Tensor,
size: usize,
}
impl RmsNorm {
fn new(mut vb: VarBuilder, size: usize) -> Result<Self> {
let scale = vb.var("scale", &[size])?;
Ok(Self { scale, size })
}
fn forward(&self, x: &Tensor) -> Result<Tensor> {
let (seq_len, hidden_size) = x.shape().r2()?;
let norm_x = ((x * x)?.sum(&[1])? / hidden_size as f64)?;
let norm_x = norm_x.broadcast_as((seq_len, hidden_size))?;
let x_normed = (x / (norm_x + 1e-5)?.sqrt()?)?;
let scale = self
.scale
.to_dtype(DType::F32)?
.broadcast_as((seq_len, self.size))?;
Ok((scale * x_normed)?)
}
}
struct Mlp {
c_fc1: Linear,
c_fc2: Linear,
c_proj: Linear,
}
fn silu(xs: &Tensor) -> Result<Tensor> {
Ok((xs / (xs.neg()?.exp()? + 1.0)?)?)
}
impl Mlp {
fn new(vb: VarBuilder, n_embd: usize) -> Result<Self> {
let n_hidden = 8 * n_embd / 3;
let n_hidden = (n_hidden - 1) / 256 * 256 + 256;
let c_fc1 = Linear::new_no_bias(&vb / "c_fc1", n_embd, n_hidden)?;
let c_fc2 = Linear::new_no_bias(&vb / "c_fc2", n_embd, n_hidden)?;
let c_proj = Linear::new_no_bias(&vb / "c_proj", n_hidden, n_embd)?;
Ok(Self {
c_fc1,
c_fc2,
c_proj,
})
}
fn forward(&self, x: &Tensor) -> Result<Tensor> {
let x = (silu(&self.c_fc1.forward(x)?)? * self.c_fc2.forward(x)?)?;
self.c_proj.forward(&x)
}
}
fn masked_fill(on_false: &Tensor, mask: &Tensor, on_true: f32) -> Result<Tensor> {
let shape = mask.shape();
let on_true = Tensor::new(on_true, &on_false.device())?.broadcast_as(shape.dims())?;
let m = mask.where_cond(&on_true, on_false)?;
Ok(m)
}
struct CausalSelfAttention {
c_attn: Linear,
c_proj: Linear,
n_head: usize,
n_embd: usize,
}
impl CausalSelfAttention {
fn new(vb: VarBuilder, n_head: usize, n_embd: usize) -> Result<Self> {
let c_attn = Linear::new_no_bias(&vb / "c_attn", n_embd, 3 * n_embd)?;
let c_proj = Linear::new_no_bias(&vb / "c_proj", n_embd, n_embd)?;
Ok(Self {
c_attn,
c_proj,
n_head,
n_embd,
})
}
fn apply_rotary_emb(&self, x: &Tensor, freqs_cis: &Tensor) -> Result<Tensor> {
let mut dims = x.dims().to_vec();
let v = dims.pop().unwrap();
dims.push(v / 2);
dims.push(2);
let x = x.reshape(dims)?;
let rank = x.rank();
let re_x = x.narrow(rank - 1, 0, 1)?;
let im_x = x.narrow(rank - 1, 1, 1)?;
let re_f = freqs_cis
.narrow(rank - 1, 0, 1)?
.broadcast_as(re_x.shape())?;
let im_f = freqs_cis
.narrow(rank - 1, 1, 1)?
.broadcast_as(im_x.shape())?;
let re = ((&re_x * &re_f)? - (&im_x * &im_f)?)?;
let im = ((&re_x * &im_f)? + (&im_x * &re_f)?)?;
let rope = Tensor::cat(&[&re, &im], rank - 1)?;
// TODO: Add the flatten op.
let mut dims = rope.dims().to_vec();
let v1 = dims.pop().unwrap();
let v2 = dims.pop().unwrap();
dims.push(v1 * v2);
let rope = rope.reshape(dims)?;
Ok(rope)
}
fn forward(&self, x: &Tensor, freqs_cis: &Tensor) -> Result<Tensor> {
let (t, c) = x.shape().r2()?;
let qkv = self.c_attn.forward(x)?;
let n_embd = self.n_embd;
let q = qkv.narrow(1, 0, n_embd)?;
let k = qkv.narrow(1, n_embd, n_embd)?;
let v = qkv.narrow(1, 2 * n_embd, n_embd)?;
let target_dim = [t, self.n_head, c / self.n_head];
let k = k.reshape(target_dim.as_slice())?.transpose(0, 1)?;
let q = q.reshape(target_dim.as_slice())?.transpose(0, 1)?;
let v = v.reshape(target_dim.as_slice())?.transpose(0, 1)?;
let q = self.apply_rotary_emb(&q, freqs_cis)?;
let k = self.apply_rotary_emb(&k, freqs_cis)?;
let k_shape = k.shape();
let att = (q.matmul(&k.t()?)? / (*k_shape.dims().last().unwrap() as f64).sqrt())?;
let device = x.device();
// TODO: If we support bool or u8 tensors, this would be better.
let mask: Vec<_> = (0..t)
.flat_map(|i| (0..t).map(move |j| u32::from(j > i)))
.collect();
// Once lower_triangle is available, use the following:
//let mask = Tensor::new(1u32, &device)?
// .broadcast_as(&[t, t])?
// .lower_triangle()?
let mask = Tensor::from_slice(&mask, (t, t), &device)?.broadcast_as(att.shape())?;
let att = masked_fill(&att, &mask, f32::NEG_INFINITY)?;
let att = att.softmax(att.rank() - 1)?;
// Convert to contiguous as matmul doesn't support strided vs for now.
let y = att.matmul(&v.contiguous()?)?;
let y = y.transpose(0, 1)?.reshape(&[t, c])?;
let y = self.c_proj.forward(&y)?;
Ok(y)
}
}
struct Block {
rms_1: RmsNorm,
attn: CausalSelfAttention,
rms_2: RmsNorm,
mlp: Mlp,
}
impl Block {
fn new(vb: VarBuilder, config: &Config) -> Result<Self> {
let rms_1 = RmsNorm::new(&vb / "rms_1", config.n_embd)?;
let attn = CausalSelfAttention::new(&vb / "attn", config.n_head, config.n_embd)?;
let rms_2 = RmsNorm::new(&vb / "rms_2", config.n_embd)?;
let mlp = Mlp::new(&vb / "mlp", config.n_embd)?;
Ok(Self {
rms_1,
attn,
rms_2,
mlp,
})
}
fn forward(&self, x: &Tensor, freqs_cis: &Tensor) -> Result<Tensor> {
let x = (self.attn.forward(&self.rms_1.forward(x)?, freqs_cis)? + x)?;
let x = (self.mlp.forward(&self.rms_2.forward(&x)?)? + x)?;
Ok(x)
}
}
struct Llama {
wte: Embedding,
blocks: Vec<Block>,
ln_f: RmsNorm,
lm_head: Linear,
}
impl Llama {
fn new(vb: VarBuilder, config: &Config) -> Result<Self> {
let lm_head = Linear::new_no_bias(&vb / "lm_head", config.n_embd, config.vocab_size)?;
let wte = Embedding::new(
&vb / "transformer" / "wte",
config.vocab_size,
config.n_embd,
)?;
let blocks = (0..config.n_layer)
.map(|i| Block::new(&vb / "transformer" / "h" / i, config))
.collect::<Result<Vec<_>>>()?;
let ln_f = RmsNorm::new(&vb / "transformer" / "ln_f", config.n_embd)?;
Ok(Self {
wte,
blocks,
ln_f,
lm_head,
})
}
fn forward(&self, x: &Tensor, freqs_cis: &Tensor) -> Result<Tensor> {
// TODO: Support for mini-batches? (i.e. r2)
let t = x.shape().r1()?;
let mut x = self.wte.forward(x)?;
for block in self.blocks.iter() {
x = block.forward(&x, freqs_cis)?;
}
let x = self.ln_f.forward(&x)?;
let x = x.narrow(0, t - 1, 1)?;
let logits = self.lm_head.forward(&x)?;
let (b, vocab_size) = logits.shape().r2()?;
assert_eq!(b, 1);
Ok(logits.reshape(vocab_size)?)
}
}
fn precompute_freqs_cis(config: &Config, device: &Device) -> Result<Tensor> {
let seq_len = CONTEXT_SIZE;
let n_elem = config.n_embd / config.n_head;
let theta: Vec<_> = (0..n_elem)
.step_by(2)
.map(|i| 1f32 / 10000f32.powf(i as f32 / n_elem as f32))
.collect();
let arange: Vec<_> = (0..seq_len).map(|c| c as f32).collect();
let theta = Tensor::new(theta.as_slice(), device)?;
let arange = Tensor::new(arange.as_slice(), device)?;
let idx_theta = arange
.reshape((arange.elem_count(), 1))?
.matmul(&theta.reshape((1, theta.elem_count()))?)?;
let shape = [1, seq_len, n_elem / 2, 1];
let idx_theta_cos = idx_theta.cos()?.reshape(&shape)?;
let idx_theta_sin = idx_theta.sin()?.reshape(&shape)?;
let last_dim = idx_theta_cos.rank() - 1;
Ok(Tensor::cat(&[&idx_theta_cos, &idx_theta_sin], last_dim)?)
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Run on CPU rather than on GPU.
#[arg(long)]
cpu: bool,
/// The temperature used to generate samples.
#[arg(long, default_value_t = 1.0)]
temperature: f64,
/// The length of the sample to generate (in tokens).
#[arg(long, default_value_t = 100)]
sample_len: usize,
}
fn main() -> Result<()> {
use rand::prelude::*;
use tokenizers::Tokenizer;
let args = Args::parse();
let device = if args.cpu {
Device::Cpu
} else {
Device::new_cuda(0)?
};
println!("loading tokenizer config");
let tokenizer = Tokenizer::from_file("llama-tokenizer.json").map_err(E::msg)?;
let mut tokens = tokenizer
.encode(START_PROMPT, true)
.map_err(E::msg)?
.get_ids()
.to_vec();
let weight_path = std::path::Path::new("llama.npz");
let weights = if weight_path.exists() {
println!("loading weights from {weight_path:?}");
let start_load = std::time::Instant::now();
let tensors = Tensor::read_npz(weight_path)?;
println!("loaded weights in {:?}", start_load.elapsed());
let tensors: std::collections::HashMap<String, Tensor> = tensors.into_iter().collect();
Some(tensors)
} else {
println!("cannot find {weight_path:?}, using zero weights");
None
};
let vb = VarBuilder::new::<f32>(&device, weights);
println!("building the model");
let config = Config::config_7b();
let llama = Llama::new(vb, &config)?;
println!("pre-computing the positional embeddings");
let freqs_cis = precompute_freqs_cis(&config, &device)?;
println!("starting the inference loop");
let mut new_tokens = vec![];
let mut rng = thread_rng();
for index in 0..args.sample_len {
let ctxt = &tokens[tokens.len().saturating_sub(CONTEXT_SIZE)..];
let input = Tensor::new(ctxt, &device)?;
let logits = llama.forward(&input, &freqs_cis)?;
let prs = (&logits / args.temperature)?.softmax(logits.rank() - 1)?;
let logits_v: Vec<f32> = prs.to_vec1()?;
let distr = rand::distributions::WeightedIndex::new(&logits_v)?;
let next_token = distr.sample(&mut rng) as u32;
tokens.push(next_token);
new_tokens.push(next_token);
println!(
"{} token: {} '{}'",
index + 1,
next_token,
tokenizer.decode(vec![next_token], true).map_err(E::msg)?
);
}
println!(
"----\n{}\n----",
tokenizer.decode(new_tokens, true).map_err(E::msg)?
);
Ok(())
}

View File

@ -0,0 +1,92 @@
use candle::{DType, Device, Result, Shape, Tensor, WithDType};
use std::collections::HashMap;
use std::sync::Arc;
#[allow(dead_code)]
#[derive(Clone)]
struct NamedVar {
path: String,
dtype: DType,
shape: Shape,
}
#[derive(Clone)]
pub struct VarBuilder {
path: Vec<String>,
vars: std::rc::Rc<std::cell::RefCell<Vec<NamedVar>>>,
default_dtype: DType,
default_device: Device,
tensors: Arc<Option<HashMap<String, Tensor>>>,
}
#[allow(dead_code)]
pub struct VarStore {
vars: Vec<NamedVar>,
}
impl VarBuilder {
pub fn new<B: WithDType>(device: &Device, tensors: Option<HashMap<String, Tensor>>) -> Self {
let vars = std::rc::Rc::new(std::cell::RefCell::new(vec![]));
Self {
path: vec![],
vars,
default_dtype: B::DTYPE,
tensors: Arc::new(tensors),
default_device: device.clone(),
}
}
pub fn len(&self) -> usize {
self.vars.borrow().len()
}
pub fn var<S: Into<Shape>>(&mut self, s: &str, shape: S) -> Result<Tensor> {
let shape = shape.into();
let path = format!("{}.{s}", self.path.join("."));
let mut vars = self.vars.borrow_mut();
let parameter = match self.tensors.as_ref() {
None => Tensor::zeros(&shape, self.default_dtype, &self.default_device)?,
Some(tensors) => match tensors.get(&path) {
Some(tensor) => tensor.to_device(&self.default_device)?,
None => panic!("cannot find tensor for {path}"),
},
};
vars.push(NamedVar {
path,
dtype: self.default_dtype,
shape,
});
Ok(parameter)
}
pub fn into_store(self) -> VarStore {
let vars = self.vars.borrow();
VarStore {
vars: vars.to_vec(),
}
}
}
impl<S: ToString> std::ops::Div<S> for &VarBuilder {
type Output = VarBuilder;
fn div(self, rhs: S) -> VarBuilder {
let mut path = self.path.clone();
path.push(rhs.to_string());
VarBuilder {
path,
vars: self.vars.clone(),
default_dtype: self.default_dtype,
default_device: self.default_device.clone(),
tensors: self.tensors.clone(),
}
}
}
impl<S: ToString> std::ops::Div<S> for VarBuilder {
type Output = VarBuilder;
fn div(self, rhs: S) -> VarBuilder {
&self / rhs
}
}

274
candle-core/src/backprop.rs Normal file
View File

@ -0,0 +1,274 @@
use crate::{op::Op, Error, Result, Tensor, TensorId};
use std::collections::HashMap;
impl Tensor {
/// Return all the nodes that lead to this value in a topologically sorted vec, the first
/// elements having dependencies on the latter ones, e.g. the first element if any is the
/// argument.
/// This assumes that the op graph is a DAG.
fn sorted_nodes(&self) -> Vec<&Tensor> {
// The vec of sorted nodes is passed as an owned value rather than a mutable reference
// to get around some lifetime limitations.
fn walk<'a>(
node: &'a Tensor,
nodes: Vec<&'a Tensor>,
already_seen: &mut HashMap<TensorId, bool>,
) -> (bool, Vec<&'a Tensor>) {
if let Some(&tg) = already_seen.get(&node.id()) {
return (tg, nodes);
}
let mut track_grad = false;
let mut nodes = if node.is_variable() {
// Do not call recursively on the "leaf" nodes.
track_grad = true;
nodes
} else if let Some(op) = node.op() {
match op {
Op::WhereCond(t1, t2, t3) => {
let (tg, nodes) = walk(t1, nodes, already_seen);
track_grad |= tg;
let (tg, nodes) = walk(t2, nodes, already_seen);
track_grad |= tg;
let (tg, nodes) = walk(t3, nodes, already_seen);
track_grad |= tg;
nodes
}
Op::Add(lhs, rhs)
| Op::Mul(lhs, rhs)
| Op::Sub(lhs, rhs)
| Op::Div(lhs, rhs)
| Op::Embedding(lhs, rhs)
| Op::Matmul(lhs, rhs) => {
let (tg, nodes) = walk(lhs, nodes, already_seen);
track_grad |= tg;
let (tg, nodes) = walk(rhs, nodes, already_seen);
track_grad |= tg;
nodes
}
Op::Cat(args, _) => args.iter().fold(nodes, |nodes, arg| {
let (tg, nodes) = walk(arg, nodes, already_seen);
track_grad |= tg;
nodes
}),
Op::Affine { arg, mul, .. } => {
if *mul == 0. {
nodes
} else {
let (tg, nodes) = walk(arg, nodes, already_seen);
track_grad |= tg;
nodes
}
}
Op::Reshape(node)
| Op::Broadcast(node)
| Op::Sum(node, _)
| Op::ToDType(node)
| Op::ToDevice(node)
| Op::Transpose(node, _, _)
| Op::Narrow(node, _, _, _)
| Op::Softmax(node, _)
| Op::Sqr(node)
| Op::Sqrt(node)
| Op::Gelu(node)
| Op::Exp(node)
| Op::Log(node)
| Op::Sin(node)
| Op::Cos(node)
| Op::Abs(node)
| Op::Neg(node) => {
let (tg, nodes) = walk(node, nodes, already_seen);
track_grad |= tg;
nodes
}
}
} else {
nodes
};
already_seen.insert(node.id(), track_grad);
if track_grad {
nodes.push(node);
}
(track_grad, nodes)
}
let (_tg, mut nodes) = walk(self, vec![], &mut HashMap::new());
nodes.reverse();
nodes
}
pub fn backward(&self) -> Result<GradStore> {
let sorted_nodes = self.sorted_nodes();
let mut grads = GradStore::new();
grads.insert(self, self.ones_like()?);
for node in sorted_nodes.iter() {
if node.is_variable() {
continue;
}
let grad = grads.remove(node).unwrap();
// TODO: We should perform all these operations in place (or at least not track the
// whole graph).
// The only drawback would be if we wanted to support grad of grad but this is out of
// scope.
if let Some(op) = node.op() {
match op {
Op::Add(lhs, rhs) => {
let lhs_sum_grad = grads.or_insert(lhs)?;
*lhs_sum_grad = lhs_sum_grad.add(&grad)?;
let rhs_sum_grad = grads.or_insert(rhs)?;
*rhs_sum_grad = rhs_sum_grad.add(&grad)?;
}
Op::Sub(lhs, rhs) => {
let lhs_sum_grad = grads.or_insert(lhs)?;
*lhs_sum_grad = lhs_sum_grad.add(&grad)?;
let rhs_sum_grad = grads.or_insert(rhs)?;
*rhs_sum_grad = rhs_sum_grad.sub(&grad)?;
}
Op::Mul(lhs, rhs) => {
let lhs_grad = grad.mul(rhs)?;
let lhs_sum_grad = grads.or_insert(lhs)?;
*lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
let rhs_grad = grad.mul(lhs)?;
let rhs_sum_grad = grads.or_insert(rhs)?;
*rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
}
Op::Div(lhs, rhs) => {
let lhs_grad = grad.div(rhs)?;
let lhs_sum_grad = grads.or_insert(lhs)?;
*lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
let rhs_grad = grad.mul(lhs)?.div(&rhs.sqr()?)?;
let rhs_sum_grad = grads.or_insert(rhs)?;
*rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
}
Op::WhereCond(_pred, _t, _f) => {
return Err(Error::BackwardNotSupported { op: "where_cond" })
}
Op::Embedding(_lhs, _rhs) => {
return Err(Error::BackwardNotSupported { op: "embedding" })
}
Op::Matmul(lhs, rhs) => {
// Skipping checks, the op went ok, we can skip
// the matmul size checks for now.
let lhs_grad = grad.matmul(&rhs.t()?)?;
let lhs_sum_grad = grads.or_insert(lhs)?;
*lhs_sum_grad = lhs_sum_grad.add(&lhs_grad)?;
let rhs_grad = lhs.t()?.matmul(&grad)?;
let rhs_sum_grad = grads.or_insert(rhs)?;
*rhs_sum_grad = rhs_sum_grad.add(&rhs_grad)?;
}
Op::Cat(args, dim) => {
let mut start_idx = 0;
for arg in args {
let len = arg.dims()[*dim];
let arg_grad = grad.narrow(*dim, start_idx, len)?;
let sum_grad = grads.or_insert(arg)?;
*sum_grad = sum_grad.add(&arg_grad)?;
start_idx += len;
}
}
Op::Broadcast(_arg) => {
return Err(Error::BackwardNotSupported { op: "broadcast" })
}
Op::Sum(_arg, _sum_dims) => {
return Err(Error::BackwardNotSupported { op: "sum" })
}
Op::ToDType(arg) => {
let sum_grad = grads.or_insert(arg)?;
*sum_grad = sum_grad.add(&grad.to_dtype(node.dtype())?)?
}
Op::Affine { arg, mul, .. } => {
let arg_grad = grad.affine(*mul, 0.)?;
let sum_grad = grads.or_insert(arg)?;
*sum_grad = sum_grad.add(&arg_grad)?
}
Op::Log(arg) => {
let sum_grad = grads.or_insert(arg)?;
*sum_grad = sum_grad.add(&(&grad * *node)?)?
}
Op::Sin(arg) => {
let sum_grad = grads.or_insert(arg)?;
*sum_grad = sum_grad.add(&(&grad * arg.cos())?)?
}
Op::Cos(arg) => {
let sum_grad = grads.or_insert(arg)?;
*sum_grad = sum_grad.sub(&(&grad * arg.sin())?)?
}
Op::Abs(_args) => return Err(Error::BackwardNotSupported { op: "abs" }),
Op::Exp(arg) => {
let sum_grad = grads.or_insert(arg)?;
*sum_grad = sum_grad.add(&(&grad / arg)?)?
}
Op::Neg(arg) => {
let sum_grad = grads.or_insert(arg)?;
*sum_grad = sum_grad.sub(&grad)?
}
Op::Narrow(_arg, _, _, _) => {
return Err(Error::BackwardNotSupported { op: "narrow" })
}
Op::Softmax(_arg, _) => {
return Err(Error::BackwardNotSupported { op: "softmax" })
}
Op::Reshape(_arg) => return Err(Error::BackwardNotSupported { op: "reshape" }),
Op::Gelu(_) => return Err(Error::BackwardNotSupported { op: "gelu" }),
Op::Sqr(arg) => {
let arg_grad = arg.mul(&grad)?.affine(2., 0.)?;
let sum_grad = grads.or_insert(arg)?;
*sum_grad = sum_grad.add(&arg_grad)?
}
Op::Sqrt(arg) => {
let arg_grad = grad.div(arg)?.affine(0.5, 0.)?;
let sum_grad = grads.or_insert(arg)?;
*sum_grad = sum_grad.add(&arg_grad)?
}
Op::ToDevice(arg) => {
let sum_grad = grads.or_insert(arg)?;
let arg_grad = grad.to_device(&sum_grad.device())?;
*sum_grad = sum_grad.add(&arg_grad)?
}
Op::Transpose(arg, dim1, dim2) => {
let arg_grad = grad.transpose(*dim1, *dim2)?;
let sum_grad = grads.or_insert(arg)?;
*sum_grad = sum_grad.add(&arg_grad)?
}
};
}
}
Ok(grads)
}
}
pub struct GradStore(HashMap<TensorId, Tensor>);
impl GradStore {
fn new() -> Self {
GradStore(HashMap::new())
}
pub fn get_id(&self, id: TensorId) -> Option<&Tensor> {
self.0.get(&id)
}
pub fn get(&self, tensor: &Tensor) -> Option<&Tensor> {
self.0.get(&tensor.id())
}
pub fn remove(&mut self, tensor: &Tensor) -> Option<Tensor> {
self.0.remove(&tensor.id())
}
pub fn insert(&mut self, tensor: &Tensor, grad: Tensor) -> Option<Tensor> {
self.0.insert(tensor.id(), grad)
}
fn or_insert(&mut self, tensor: &Tensor) -> Result<&mut Tensor> {
use std::collections::hash_map::Entry;
let grad = match self.0.entry(tensor.id()) {
Entry::Occupied(entry) => entry.into_mut(),
Entry::Vacant(entry) => {
let grad = tensor.zeros_like()?;
entry.insert(grad)
}
};
Ok(grad)
}
}

View File

@ -0,0 +1,886 @@
use crate::op::{BinaryOp, UnaryOp};
use crate::{DType, Error, Result, Shape, StridedIndex};
use gemm::{gemm, Parallelism};
use half::{bf16, f16};
// TODO: Think about whether we would be better off with a dtype and
// a buffer as an owned slice of bytes.
// TODO: Maybe we should not implement [Clone] here and instead have an explicit allocator +
// intercept the oom errors to avoid panicking and provide a proper error.
#[derive(Debug, Clone)]
pub enum CpuStorage {
U32(Vec<u32>),
BF16(Vec<bf16>),
F16(Vec<f16>),
F32(Vec<f32>),
F64(Vec<f64>),
}
fn wcond<T: Copy>(
pred: &[u32],
shape: &Shape,
stride: &[usize],
t: &[T],
stride_t: &[usize],
f: &[T],
stride_f: &[usize],
) -> Vec<T> {
if shape.is_contiguous(stride) && shape.is_contiguous(stride_t) && shape.is_contiguous(stride_f)
{
let elem_count = shape.elem_count();
let pred = &pred[..elem_count];
let t = &t[..elem_count];
let f = &f[..elem_count];
pred.iter()
.zip(t.iter().zip(f.iter()))
.map(|(&p, (&t, &f))| if p > 0 { t } else { f })
.collect::<Vec<_>>()
} else {
let dims = shape.dims();
let it_p = StridedIndex::new(dims, stride);
let it_t = StridedIndex::new(dims, stride_t);
let it_f = StridedIndex::new(dims, stride_f);
it_p.zip(it_t.zip(it_f))
.map(|(i_p, (i_t, i_f))| if pred[i_p] > 0 { t[i_t] } else { f[i_f] })
.collect::<Vec<_>>()
}
}
macro_rules! map1 {
($v: expr, $fn: ident, $( $args:expr ),*) => {{
let v = match $v {
CpuStorage::BF16(__s) => CpuStorage::BF16($fn::<bf16>(__s, $($args),*)?),
CpuStorage::F16(__s) => CpuStorage::F16($fn::<f16>(__s, $($args),*)?),
CpuStorage::F32(__s) => CpuStorage::F32($fn::<f32>(__s, $($args),*)?),
CpuStorage::F64(__s) => CpuStorage::F64($fn::<f64>(__s, $($args),*)?),
CpuStorage::U32(__s) => CpuStorage::U32($fn::<u32>(__s, $($args),*)?),
};
Ok(v)
}};
}
fn sum_impl1<T: Copy + num_traits::NumAssign>(
src: &[T],
dst_shape: &Shape,
src_dims: &[usize],
stride: &[usize],
to_dst_index: impl Fn(usize) -> usize,
) -> Result<Vec<T>> {
let mut dst = vec![T::zero(); dst_shape.elem_count()];
for (unstr_index, src_index) in StridedIndex::new(src_dims, stride).enumerate() {
dst[to_dst_index(unstr_index)] += src[src_index];
}
Ok(dst)
}
fn unary_map<T: Copy, U: Copy, F: FnMut(T) -> U>(
vs: &[T],
shape: &Shape,
stride: &[usize],
mut f: F,
) -> Vec<U> {
if shape.is_contiguous(stride) {
vs[..shape.elem_count()].iter().map(|&v| f(v)).collect()
} else {
StridedIndex::new(shape.dims(), stride)
.map(|i| f(vs[i]))
.collect()
}
}
// This function maps over two strided index sequences.
fn binary_map<T: Copy, F: FnMut(T, T) -> T>(
shape: &Shape,
lhs_stride: &[usize],
rhs_stride: &[usize],
lhs: &[T],
rhs: &[T],
mut f: F,
) -> Vec<T> {
let dims = shape.dims();
if shape.is_contiguous(lhs_stride) && shape.is_contiguous(rhs_stride) {
(0..shape.elem_count()).map(|i| f(lhs[i], rhs[i])).collect()
} else {
let lhs_index = StridedIndex::new(dims, lhs_stride);
let rhs_index = StridedIndex::new(dims, rhs_stride);
lhs_index
.zip(rhs_index)
.map(|(lhs_i, rhs_i)| f(lhs[lhs_i], rhs[rhs_i]))
.collect()
}
}
fn take_impl1<T: Copy>(
vs: &[T],
ids: &[u32],
shape: &Shape,
stride: &[usize],
vocab_size: usize,
hidden_size: usize,
) -> Result<Vec<T>> {
let mut values = Vec::with_capacity(shape.elem_count() * hidden_size);
for index in StridedIndex::new(shape.dims(), stride) {
let index = ids[index].try_into()?;
if index >= vocab_size {
return Err(Error::InvalidIndex {
index,
vocab_size,
op: "take",
});
} else {
values.extend(&vs[hidden_size * index..hidden_size * (index + 1)]);
}
}
Ok(values)
}
fn copy_strided_src_<T: Copy + std::fmt::Display>(
src: &[T],
dst: &mut [T],
dst_offset: usize,
src_shape: &Shape,
src_stride: &[usize],
src_offset: usize,
) {
let src = &src[src_offset..];
if src_shape.is_contiguous(src_stride) {
let elem_to_copy = (dst.len() - dst_offset).min(src.len());
dst[dst_offset..dst_offset + elem_to_copy].copy_from_slice(&src[..elem_to_copy])
} else {
let src_indexes = StridedIndex::new(src_shape.dims(), src_stride);
for (dst_index, src_index) in src_indexes.enumerate() {
let dst_index = dst_index + dst_offset;
if dst_index >= dst.len() {
break;
}
dst[dst_index] = src[src_index]
}
}
}
impl CpuStorage {
pub fn dtype(&self) -> DType {
match self {
Self::U32(_) => DType::U32,
Self::BF16(_) => DType::BF16,
Self::F16(_) => DType::F16,
Self::F32(_) => DType::F32,
Self::F64(_) => DType::F64,
}
}
pub fn as_slice<D: crate::WithDType>(&self) -> Result<&[D]> {
D::cpu_storage_as_slice(self)
}
pub fn as_mut_slice<D: crate::WithDType>(&mut self) -> Result<&mut [D]> {
D::cpu_storage_as_mut_slice(self)
}
pub(crate) fn to_dtype(&self, shape: &Shape, stride: &[usize], dtype: DType) -> Result<Self> {
// TODO: find a way around the quadratic number of cases below.
match (self, dtype) {
(Self::U32(storage), DType::BF16) => {
let data = unary_map(storage, shape, stride, |v| bf16::from_f32(v as f32));
Ok(Self::BF16(data))
}
(Self::BF16(storage), DType::BF16) => {
let data = unary_map(storage, shape, stride, |v| v);
Ok(Self::BF16(data))
}
(Self::F16(storage), DType::BF16) => {
let data = unary_map(storage, shape, stride, |v| bf16::from_f32(v.to_f32()));
Ok(Self::BF16(data))
}
(Self::F32(storage), DType::BF16) => {
let data = unary_map(storage, shape, stride, bf16::from_f32);
Ok(Self::BF16(data))
}
(Self::F64(storage), DType::BF16) => {
let data = unary_map(storage, shape, stride, bf16::from_f64);
Ok(Self::BF16(data))
}
(Self::U32(storage), DType::F16) => {
let data = unary_map(storage, shape, stride, |v| f16::from_f32(v as f32));
Ok(Self::F16(data))
}
(Self::BF16(storage), DType::F16) => {
let data = unary_map(storage, shape, stride, |v| f16::from_f32(v.to_f32()));
Ok(Self::F16(data))
}
(Self::F16(storage), DType::F16) => {
let data = unary_map(storage, shape, stride, |v| v);
Ok(Self::F16(data))
}
(Self::F32(storage), DType::F16) => {
let data = unary_map(storage, shape, stride, f16::from_f32);
Ok(Self::F16(data))
}
(Self::F64(storage), DType::F16) => {
let data = unary_map(storage, shape, stride, f16::from_f64);
Ok(Self::F16(data))
}
(Self::U32(storage), DType::F32) => {
let data = unary_map(storage, shape, stride, |v| v as f32);
Ok(Self::F32(data))
}
(Self::BF16(storage), DType::F32) => {
let data = unary_map(storage, shape, stride, |v| v.to_f32());
Ok(Self::F32(data))
}
(Self::F16(storage), DType::F32) => {
let data = unary_map(storage, shape, stride, |v| v.to_f32());
Ok(Self::F32(data))
}
(Self::F32(storage), DType::F32) => {
let data = unary_map(storage, shape, stride, |v| v);
Ok(Self::F32(data))
}
(Self::F64(storage), DType::F32) => {
let data = unary_map(storage, shape, stride, |v| v as f32);
Ok(Self::F32(data))
}
(Self::U32(storage), DType::U32) => {
let data = unary_map(storage, shape, stride, |v| v);
Ok(Self::U32(data))
}
(Self::BF16(storage), DType::U32) => {
let data = unary_map(storage, shape, stride, |v| v.to_f32() as u32);
Ok(Self::U32(data))
}
(Self::F16(storage), DType::U32) => {
let data = unary_map(storage, shape, stride, |v| v.to_f32() as u32);
Ok(Self::U32(data))
}
(Self::F32(storage), DType::U32) => {
let data = unary_map(storage, shape, stride, |v| v as u32);
Ok(Self::U32(data))
}
(Self::F64(storage), DType::U32) => {
let data = unary_map(storage, shape, stride, |v| v as u32);
Ok(Self::U32(data))
}
(Self::U32(storage), DType::F64) => {
let data = unary_map(storage, shape, stride, |v| v as f64);
Ok(Self::F64(data))
}
(Self::BF16(storage), DType::F64) => {
let data = unary_map(storage, shape, stride, |v| v.to_f64());
Ok(Self::F64(data))
}
(Self::F16(storage), DType::F64) => {
let data = unary_map(storage, shape, stride, |v| v.to_f64());
Ok(Self::F64(data))
}
(Self::F32(storage), DType::F64) => {
let data = unary_map(storage, shape, stride, |v| v as f64);
Ok(Self::F64(data))
}
(Self::F64(storage), DType::F64) => {
let data = unary_map(storage, shape, stride, |v| v);
Ok(Self::F64(data))
}
}
}
pub(crate) fn sum(&self, shape: &Shape, stride: &[usize], sum_dims: &[usize]) -> Result<Self> {
let src_dims = shape.dims();
let mut dst_dims = src_dims.to_vec();
for &sum_dim in sum_dims.iter() {
dst_dims[sum_dim] = 1;
}
let dst_shape = Shape::from(dst_dims);
let mut sum_dims = sum_dims.to_vec();
// Sort the sum_dims as they have to be processed from left to right when converting the
// indexes.
sum_dims.sort();
let sum_dims_and_stride: Vec<_> = sum_dims
.iter()
.map(|&d| (src_dims[d], src_dims[d + 1..].iter().product::<usize>()))
.collect();
let to_dst_index = |unstr_index: usize| {
// TODO: Optimize, the following does lots of slow division.
let mut dst_index = unstr_index;
// Set the sum_dims indexes to 0.
for &(dim, stride) in sum_dims_and_stride.iter() {
// The compiler is able to optimize the following in a single divmod op.
let (pre, post) = (dst_index / stride, dst_index % stride);
dst_index = (pre / dim) * stride + post;
}
dst_index
};
// TODO: Maybe provide an implementation with higher precision accumulators?
map1!(self, sum_impl1, &dst_shape, src_dims, stride, to_dst_index)
}
pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) -> Result<()> {
// [self] stores data in a contiguous way.
let dims = shape.dims();
let elem_per_slice = dims[dim];
let prod_pre_dim = dims[..dim].iter().product();
let prod_post_dim = dims[dim + 1..].iter().product();
match self {
Self::BF16(storage) => {
for pre_idx in 0..prod_pre_dim {
for post_idx in 0..prod_post_dim {
let mut sum = 0f64;
let mut idx = pre_idx * prod_post_dim * elem_per_slice + post_idx;
for _ in 0..elem_per_slice {
sum += storage[idx].to_f64();
idx += prod_post_dim
}
let sum = bf16::from_f64(sum);
let mut idx = pre_idx * prod_post_dim * elem_per_slice + post_idx;
for _ in 0..elem_per_slice {
storage[idx] /= sum;
idx += prod_post_dim
}
}
}
}
Self::F16(storage) => {
for pre_idx in 0..prod_pre_dim {
for post_idx in 0..prod_post_dim {
let mut sum = 0f64;
let mut idx = pre_idx * prod_post_dim * elem_per_slice + post_idx;
for _ in 0..elem_per_slice {
sum += storage[idx].to_f64();
idx += prod_post_dim
}
let sum = f16::from_f64(sum);
let mut idx = pre_idx * prod_post_dim * elem_per_slice + post_idx;
for _ in 0..elem_per_slice {
storage[idx] /= sum;
idx += prod_post_dim
}
}
}
}
Self::F32(storage) => {
for pre_idx in 0..prod_pre_dim {
for post_idx in 0..prod_post_dim {
let mut sum = 0f64;
let mut idx = pre_idx * prod_post_dim * elem_per_slice + post_idx;
for _ in 0..elem_per_slice {
sum += storage[idx] as f64;
idx += prod_post_dim
}
let sum = sum as f32;
let mut idx = pre_idx * prod_post_dim * elem_per_slice + post_idx;
for _ in 0..elem_per_slice {
storage[idx] /= sum;
idx += prod_post_dim
}
}
}
}
Self::F64(storage) => {
for pre_idx in 0..prod_pre_dim {
for post_idx in 0..prod_post_dim {
let mut sum = 0f64;
let mut idx = pre_idx * prod_post_dim * elem_per_slice + post_idx;
for _ in 0..elem_per_slice {
sum += storage[idx];
idx += prod_post_dim
}
let mut idx = pre_idx * prod_post_dim * elem_per_slice + post_idx;
for _ in 0..elem_per_slice {
storage[idx] /= sum;
idx += prod_post_dim
}
}
}
}
Self::U32(_) => {}
}
Ok(())
}
pub(crate) fn affine_impl(
&self,
shape: &Shape,
stride: &[usize],
mul: f64,
add: f64,
) -> Result<Self> {
match self {
Self::U32(storage) => {
let mul = mul as u32;
let add = add as u32;
let data = unary_map(storage, shape, stride, |v| v * mul + add);
Ok(Self::U32(data))
}
Self::BF16(storage) => {
let mul = bf16::from_f64(mul);
let add = bf16::from_f64(add);
let data = unary_map(storage, shape, stride, |v| v * mul + add);
Ok(Self::BF16(data))
}
Self::F16(storage) => {
let mul = f16::from_f64(mul);
let add = f16::from_f64(add);
let data = unary_map(storage, shape, stride, |v| v * mul + add);
Ok(Self::F16(data))
}
Self::F32(storage) => {
let mul = mul as f32;
let add = add as f32;
let data = unary_map(storage, shape, stride, |v| v * mul + add);
Ok(Self::F32(data))
}
Self::F64(storage) => {
let data = unary_map(storage, shape, stride, |v| v * mul + add);
Ok(Self::F64(data))
}
}
}
pub(crate) fn unary_impl<B: UnaryOp>(&self, shape: &Shape, stride: &[usize]) -> Result<Self> {
match self {
Self::BF16(storage) => {
let data = unary_map(storage, shape, stride, B::bf16);
Ok(Self::BF16(data))
}
Self::F16(storage) => {
let data = unary_map(storage, shape, stride, B::f16);
Ok(Self::F16(data))
}
Self::F32(storage) => {
let data = unary_map(storage, shape, stride, B::f32);
Ok(Self::F32(data))
}
Self::F64(storage) => {
let data = unary_map(storage, shape, stride, B::f64);
Ok(Self::F64(data))
}
Self::U32(storage) => {
let data = unary_map(storage, shape, stride, B::u32);
Ok(Self::U32(data))
}
}
}
pub(crate) fn binary_impl<B: BinaryOp>(
&self,
rhs: &Self,
shape: &Shape,
lhs_stride: &[usize],
rhs_stride: &[usize],
) -> Result<Self> {
match (self, rhs) {
(Self::BF16(lhs), Self::BF16(rhs)) => {
let data = binary_map(shape, lhs_stride, rhs_stride, lhs, rhs, B::bf16);
Ok(Self::BF16(data))
}
(Self::F16(lhs), Self::F16(rhs)) => {
let data = binary_map(shape, lhs_stride, rhs_stride, lhs, rhs, B::f16);
Ok(Self::F16(data))
}
(Self::F32(lhs), Self::F32(rhs)) => {
let data = binary_map(shape, lhs_stride, rhs_stride, lhs, rhs, B::f32);
Ok(Self::F32(data))
}
(Self::F64(lhs), Self::F64(rhs)) => {
let data = binary_map(shape, lhs_stride, rhs_stride, lhs, rhs, B::f64);
Ok(Self::F64(data))
}
(Self::U32(lhs), Self::U32(rhs)) => {
let data = binary_map(shape, lhs_stride, rhs_stride, lhs, rhs, B::u32);
Ok(Self::U32(data))
}
_ => {
// This should be covered by the dtype check above.
Err(Error::DTypeMismatchBinaryOp {
lhs: self.dtype(),
rhs: rhs.dtype(),
op: B::NAME,
})
}
}
}
pub(crate) fn copy_strided_src(
&self,
dst: &mut Self,
dst_offset: usize,
src_shape: &Shape,
src_stride: &[usize],
src_offset: usize,
) -> Result<()> {
if src_shape.rank() != src_stride.len() {
panic!("incoherent shape and strides {src_shape:?} {src_stride:?}")
}
match (self, dst) {
(Self::U32(src), Self::U32(dst)) => {
copy_strided_src_(src, dst, dst_offset, src_shape, src_stride, src_offset)
}
(Self::BF16(src), Self::BF16(dst)) => {
copy_strided_src_(src, dst, dst_offset, src_shape, src_stride, src_offset)
}
(Self::F16(src), Self::F16(dst)) => {
copy_strided_src_(src, dst, dst_offset, src_shape, src_stride, src_offset)
}
(Self::F32(src), Self::F32(dst)) => {
copy_strided_src_(src, dst, dst_offset, src_shape, src_stride, src_offset)
}
(Self::F64(src), Self::F64(dst)) => {
copy_strided_src_(src, dst, dst_offset, src_shape, src_stride, src_offset)
}
(_, dst) => {
// This should be covered by the dtype check above.
return Err(Error::DTypeMismatchBinaryOp {
lhs: self.dtype(),
rhs: dst.dtype(),
op: "copy_strided",
});
}
}
Ok(())
}
pub(crate) fn where_cond(
&self,
shape: &Shape,
stride: &[usize],
t: &Self,
stride_t: &[usize],
f: &Self,
stride_f: &[usize],
) -> Result<Self> {
// TODO: Support types that could be casted to a boolean.
let pred = self.as_slice::<u32>()?;
match (t, f) {
(Self::BF16(t), Self::BF16(f)) => {
let data = wcond(pred, shape, stride, t, stride_t, f, stride_f);
Ok(Self::BF16(data))
}
(Self::F16(t), Self::F16(f)) => {
let data = wcond(pred, shape, stride, t, stride_t, f, stride_f);
Ok(Self::F16(data))
}
(Self::F32(t), Self::F32(f)) => {
let data = wcond(pred, shape, stride, t, stride_t, f, stride_f);
Ok(Self::F32(data))
}
(Self::F64(t), Self::F64(f)) => {
let data = wcond(pred, shape, stride, t, stride_t, f, stride_f);
Ok(Self::F64(data))
}
(Self::U32(t), Self::U32(f)) => {
let data = wcond(pred, shape, stride, t, stride_t, f, stride_f);
Ok(Self::U32(data))
}
_ => Err(Error::DTypeMismatchBinaryOp {
lhs: t.dtype(),
rhs: f.dtype(),
op: "where_cond",
}),
}
}
pub(crate) fn embedding_impl(
&self,
shape: &Shape,
stride: &[usize],
vs: &Self,
hidden_size: usize,
vocab_size: usize,
) -> Result<Self> {
let ids = self.as_slice::<u32>()?;
map1!(vs, take_impl1, ids, shape, stride, vocab_size, hidden_size)
}
pub(crate) fn matmul_impl(
&self,
rhs: &Self,
(b, m, n, k): (usize, usize, usize, usize),
lhs_stride: &[usize],
rhs_stride: &[usize],
) -> Result<Self> {
let a_skip: usize = m * k;
let b_skip: usize = n * k;
let c_skip: usize = m * n;
let rank = lhs_stride.len();
let lhs_cs = lhs_stride[rank - 1];
let lhs_rs = lhs_stride[rank - 2];
let rhs_cs = rhs_stride[rank - 1];
let rhs_rs = rhs_stride[rank - 2];
if lhs_stride.len() > 2 {
let lhs_batch_stride = &lhs_stride[..rank - 2];
let rhs_batch_stride = &rhs_stride[..rank - 2];
if lhs_batch_stride != [a_skip] || rhs_batch_stride != [b_skip] {
// Temporary error before we support abitrary striding.
return Err(Error::UnexpectedStriding);
}
}
let dst_shape: Shape = (m, n).into();
let dst_strides = dst_shape.stride_contiguous();
let dst_rs = dst_strides[0];
let dst_cs = dst_strides[1];
match (self, rhs) {
(CpuStorage::F16(lhs), CpuStorage::F16(rhs)) => {
let mut dst = vec![f16::ZERO; b * m * n];
for step in 0..b {
let lhs_p = &lhs[step * a_skip..];
let rhs_p = &rhs[step * b_skip..];
let dst_p = &mut dst[step * c_skip..];
unsafe {
gemm(
// m: usize,
m,
// n: usize,
n,
// k: usize,
k,
// dst: *mut T,
dst_p.as_mut_ptr(),
// dst_cs: isize,
dst_cs as isize,
// dst_rs: isize,
dst_rs as isize,
// read_dst: bool,
false,
// lhs: *const T,
lhs_p.as_ptr(),
// lhs_cs: isize,
lhs_cs as isize,
// lhs_rs: isize,
lhs_rs as isize,
// rhs: *const T,
rhs_p.as_ptr(),
// rhs_cs: isize,
rhs_cs as isize,
// rhs_rs: isize,
rhs_rs as isize,
// alpha: T,
f16::ONE,
// beta: T,
f16::ONE,
// conj_dst: bool,
false,
// conj_lhs: bool,
false,
// conj_rhs: bool,
true,
// parallelism: Parallelism
Parallelism::None,
)
}
}
Ok(Self::F16(dst))
}
(CpuStorage::F32(lhs), CpuStorage::F32(rhs)) => {
let mut dst = vec![0f32; b * m * n];
for step in 0..b {
let lhs_p = &lhs[step * a_skip..];
let rhs_p = &rhs[step * b_skip..];
let dst_p = &mut dst[step * c_skip..];
unsafe {
gemm(
// m: usize,
m,
// n: usize,
n,
// k: usize,
k,
// dst: *mut T,
dst_p.as_mut_ptr(),
// dst_cs: isize,
dst_cs as isize,
// dst_rs: isize,
dst_rs as isize,
// read_dst: bool,
false,
// lhs: *const T,
lhs_p.as_ptr(),
// lhs_cs: isize,
lhs_cs as isize,
// lhs_rs: isize,
lhs_rs as isize,
// rhs: *const T,
rhs_p.as_ptr(),
// rhs_cs: isize,
rhs_cs as isize,
// rhs_rs: isize,
rhs_rs as isize,
// alpha: T,
1f32,
// beta: T,
1f32,
// conj_dst: bool,
false,
// conj_lhs: bool,
false,
// conj_rhs: bool,
true,
// parallelism: Parallelism
Parallelism::None,
)
}
}
Ok(Self::F32(dst))
}
(CpuStorage::F64(lhs), CpuStorage::F64(rhs)) => {
let mut dst = vec![0f64; b * m * n];
for step in 0..b {
let lhs_p = &lhs[step * a_skip..];
let rhs_p = &rhs[step * b_skip..];
let dst_p = &mut dst[step * c_skip..];
unsafe {
gemm(
// m: usize,
m,
// n: usize,
n,
// k: usize,
k,
// dst: *mut T,
dst_p.as_mut_ptr(),
// dst_cs: isize,
dst_cs as isize,
// dst_rs: isize,
dst_rs as isize,
// read_dst: bool,
false,
// lhs: *const T,
lhs_p.as_ptr(),
// lhs_cs: isize,
lhs_cs as isize,
// lhs_rs: isize,
lhs_rs as isize,
// rhs: *const T,
rhs_p.as_ptr(),
// rhs_cs: isize,
rhs_cs as isize,
// rhs_rs: isize,
rhs_rs as isize,
// alpha: T,
1f64,
// beta: T,
1f64,
// conj_dst: bool,
false,
// conj_lhs: bool,
false,
// conj_rhs: bool,
true,
// parallelism: Parallelism
Parallelism::None,
)
}
}
Ok(Self::F64(dst))
}
_ => {
// This should be covered by the dtype check above.
Err(Error::DTypeMismatchBinaryOp {
lhs: self.dtype(),
rhs: rhs.dtype(),
op: "matmul",
})
}
}
}
pub(crate) fn ones_impl(shape: &Shape, dtype: DType) -> Self {
let elem_count = shape.elem_count();
match dtype {
DType::U32 => {
let data = vec![1u32; elem_count];
Self::U32(data)
}
DType::BF16 => {
let data = vec![bf16::ONE; elem_count];
Self::BF16(data)
}
DType::F16 => {
let data = vec![f16::ONE; elem_count];
Self::F16(data)
}
DType::F32 => {
let data = vec![1f32; elem_count];
Self::F32(data)
}
DType::F64 => {
let data = vec![1f64; elem_count];
Self::F64(data)
}
}
}
pub(crate) fn zeros_impl(shape: &Shape, dtype: DType) -> Self {
let elem_count = shape.elem_count();
match dtype {
DType::U32 => {
let data = vec![0u32; elem_count];
Self::U32(data)
}
DType::BF16 => {
let data = vec![bf16::ZERO; elem_count];
Self::BF16(data)
}
DType::F16 => {
let data = vec![f16::ZERO; elem_count];
Self::F16(data)
}
DType::F32 => {
let data = vec![0f32; elem_count];
Self::F32(data)
}
DType::F64 => {
let data = vec![0f64; elem_count];
Self::F64(data)
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{Device, Tensor};
#[test]
fn simple_matmul() -> Result<()> {
let data = vec![1.0f32, 2.0, 3.0, 4.0];
let a = Tensor::from_slice(&data, (2, 2), &Device::Cpu)?;
let data = vec![1.0f32, 2.0, 3.0, 4.0];
let b = Tensor::from_slice(&data, (2, 2), &Device::Cpu)?;
let c = a.matmul(&b)?;
assert_eq!(c.to_vec2::<f32>()?, &[&[7.0f32, 10.0], &[15.0, 22.0]]);
let data = vec![1.0f32, 2.0];
let a = Tensor::from_slice(&data, (2, 1), &Device::Cpu)?;
let data = vec![3.0f32, 4.0];
let b = Tensor::from_slice(&data, (1, 2), &Device::Cpu)?;
let c = a.matmul(&b)?;
assert_eq!(c.to_vec2::<f32>()?, &[&[3.0, 4.0], &[6.0, 8.0]]);
let data: Vec<_> = (0..6).map(|i| i as f32).collect();
let a = Tensor::from_slice(&data, (2, 3), &Device::Cpu)?;
let data: Vec<_> = (0..6).map(|i| (i + 2) as f32).collect();
let b = Tensor::from_slice(&data, (3, 2), &Device::Cpu)?;
let c = a.matmul(&b)?;
assert_eq!(c.to_vec2::<f32>()?, &[&[16., 19.], &[52., 64.]]);
let data: Vec<_> = (0..12).map(|i| i as f32).collect();
let a = Tensor::from_slice(&data, (2, 2, 3), &Device::Cpu)?;
let data: Vec<_> = (0..12).map(|i| (i + 2) as f32).collect();
let b = Tensor::from_slice(&data, (2, 3, 2), &Device::Cpu)?;
let c = a.matmul(&b)?;
assert_eq!(
c.to_vec3::<f32>()?,
&[&[&[16., 19.], &[52., 64.]], &[&[214., 235.], &[304., 334.]]]
);
Ok(())
}
}

View File

@ -0,0 +1,978 @@
use crate::{CpuStorage, DType, Shape};
use candle_kernels as kernels;
use cudarc::cublas::{Gemm, GemmConfig, StridedBatchedConfig};
use cudarc::driver::{CudaFunction, CudaSlice, LaunchAsync, LaunchConfig};
use half::{bf16, f16};
use std::sync::Arc;
/// cudarc related errors
#[derive(thiserror::Error, Debug)]
pub enum CudaError {
#[error(transparent)]
Cuda(#[from] cudarc::driver::DriverError),
#[error(transparent)]
Compiler(#[from] cudarc::nvrtc::CompileError),
#[error(transparent)]
Cublas(#[from] cudarc::cublas::result::CublasError),
#[error("{op} only supports contiguous tensors")]
RequiresContiguous { op: &'static str },
#[error("missing kernel '{module_name}'")]
MissingKernel { module_name: String },
#[error("internal error '{0}'")]
InternalError(&'static str),
#[error("matmul is only supported for contiguous tensors lstride: {lhs_stride:?} rstride: {rhs_stride:?} mnk: {mnk:?}")]
MatMulNonContiguous {
lhs_stride: Vec<usize>,
rhs_stride: Vec<usize>,
mnk: (usize, usize, usize),
},
#[error("{msg}, expected: {expected:?}, got: {got:?}")]
UnexpectedDType {
msg: &'static str,
expected: DType,
got: DType,
},
#[error("{cuda} when loading {module_name}")]
Load {
cuda: cudarc::driver::DriverError,
module_name: String,
},
}
type Result<T> = std::result::Result<T, CudaError>;
/// Unique identifier for cuda devices.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub(crate) struct DeviceId(usize);
impl DeviceId {
fn new() -> Self {
// https://users.rust-lang.org/t/idiomatic-rust-way-to-generate-unique-id/33805
use std::sync::atomic;
static COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::new(1);
Self(COUNTER.fetch_add(1, atomic::Ordering::Relaxed))
}
}
#[derive(Debug, Clone)]
pub struct CudaDevice {
id: DeviceId,
device: Arc<cudarc::driver::CudaDevice>,
#[allow(dead_code)]
blas: Arc<cudarc::cublas::CudaBlas>,
}
impl std::ops::Deref for CudaDevice {
type Target = Arc<cudarc::driver::CudaDevice>;
fn deref(&self) -> &Self::Target {
&self.device
}
}
impl CudaDevice {
pub(crate) fn new(ordinal: usize) -> Result<Self> {
let device = cudarc::driver::CudaDevice::new(ordinal)?;
let blas = cudarc::cublas::CudaBlas::new(device.clone())?;
Ok(Self {
id: DeviceId::new(),
device,
blas: Arc::new(blas),
})
}
pub(crate) fn same_id(&self, rhs: &Self) -> bool {
self.id == rhs.id
}
pub(crate) fn ordinal(&self) -> usize {
self.device.ordinal()
}
pub(crate) fn zeros_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
let elem_count = shape.elem_count();
let slice = match dtype {
DType::U32 => {
let data = self.alloc_zeros::<u32>(elem_count)?;
CudaStorageSlice::U32(data)
}
DType::BF16 => {
let data = self.alloc_zeros::<bf16>(elem_count)?;
CudaStorageSlice::BF16(data)
}
DType::F16 => {
let data = self.alloc_zeros::<f16>(elem_count)?;
CudaStorageSlice::F16(data)
}
DType::F32 => {
let data = self.alloc_zeros::<f32>(elem_count)?;
CudaStorageSlice::F32(data)
}
DType::F64 => {
let data = self.alloc_zeros::<f64>(elem_count)?;
CudaStorageSlice::F64(data)
}
};
Ok(CudaStorage {
slice,
device: self.clone(),
})
}
pub(crate) fn const_impl(&self, v: f64, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
let elem_count = shape.elem_count();
let cfg = LaunchConfig::for_num_elems(elem_count as u32);
let slice = match dtype {
DType::U32 => {
// SAFETY: Set later by running the fill kernel.
let data = unsafe { self.alloc::<u32>(elem_count) }?;
let func = self.get_or_load_func("fill_u32", kernels::FILL)?;
let params = (&data, v as u32, elem_count);
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::U32(data)
}
DType::BF16 => {
// SAFETY: Set later by running the fill kernel.
let data = unsafe { self.alloc::<bf16>(elem_count) }?;
let func = self.get_or_load_func("fill_bf16", kernels::FILL)?;
let params = (&data, bf16::from_f64(v), elem_count);
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::BF16(data)
}
DType::F16 => {
// SAFETY: Set later by running the fill kernel.
let data = unsafe { self.alloc::<f16>(elem_count) }?;
let func = self.get_or_load_func("fill_f16", kernels::FILL)?;
let params = (&data, f16::from_f64(v), elem_count);
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F16(data)
}
DType::F32 => {
// SAFETY: Set later by running the fill kernel.
let data = unsafe { self.alloc::<f32>(elem_count) }?;
let func = self.get_or_load_func("fill_f32", kernels::FILL)?;
let params = (&data, v as f32, elem_count);
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F32(data)
}
DType::F64 => {
// SAFETY: Set later by running the fill kernel.
let data = unsafe { self.alloc::<f64>(elem_count) }?;
let func = self.get_or_load_func("fill_f64", kernels::FILL)?;
let params = (&data, v, elem_count);
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F64(data)
}
};
Ok(CudaStorage {
slice,
device: self.clone(),
})
}
pub(crate) fn ones_impl(&self, shape: &Shape, dtype: DType) -> Result<CudaStorage> {
self.const_impl(1., shape, dtype)
}
pub(crate) fn cuda_from_cpu_storage(&self, storage: &CpuStorage) -> Result<CudaStorage> {
let slice = match storage {
CpuStorage::U32(storage) => {
let data = self.htod_sync_copy(storage)?;
CudaStorageSlice::U32(data)
}
CpuStorage::BF16(storage) => {
let data = self.htod_sync_copy(storage)?;
CudaStorageSlice::BF16(data)
}
CpuStorage::F16(storage) => {
let data = self.htod_sync_copy(storage)?;
CudaStorageSlice::F16(data)
}
CpuStorage::F32(storage) => {
let data = self.htod_sync_copy(storage)?;
CudaStorageSlice::F32(data)
}
CpuStorage::F64(storage) => {
let data = self.htod_sync_copy(storage)?;
CudaStorageSlice::F64(data)
}
};
Ok(CudaStorage {
slice,
device: self.clone(),
})
}
fn get_or_load_func(&self, module_name: &str, ptx: &'static str) -> Result<CudaFunction> {
if !self.has_func(module_name, module_name) {
// Leaking the string here is a bit sad but we need a &'static str and this is only
// done once per kernel name.
let static_module_name = Box::leak(module_name.to_string().into_boxed_str());
self.load_ptx(ptx.into(), module_name, &[static_module_name])
.map_err(|cuda| CudaError::Load {
cuda,
module_name: module_name.to_string(),
})?;
}
self.get_func(module_name, module_name)
// Clippy recommends this `ok_or` rather than `ok_or_else` so hopefully the compiler is
// able to only build the error value if needed.
.ok_or(CudaError::MissingKernel {
module_name: module_name.to_string(),
})
}
}
#[derive(Debug)]
enum CudaStorageSlice {
U32(CudaSlice<u32>),
BF16(CudaSlice<bf16>),
F16(CudaSlice<f16>),
F32(CudaSlice<f32>),
F64(CudaSlice<f64>),
}
#[derive(Debug)]
pub struct CudaStorage {
slice: CudaStorageSlice,
device: CudaDevice,
}
fn gemm_config<T>(
alpha: T,
beta: T,
(b, m, n, k): (usize, usize, usize, usize),
lhs_stride: &[usize],
rhs_stride: &[usize],
) -> Result<StridedBatchedConfig<T>> {
// https://docs.nvidia.com/cuda/cublas/index.html#cublas-t-gemm
use cudarc::cublas::sys::cublasOperation_t;
let rhs_m1 = rhs_stride[rhs_stride.len() - 1];
let rhs_m2 = rhs_stride[rhs_stride.len() - 2];
let lhs_m1 = lhs_stride[lhs_stride.len() - 1];
let lhs_m2 = lhs_stride[lhs_stride.len() - 2];
// The a tensor has dims batching, k, n (rhs)
let (lda, transa) = if rhs_m1 == 1 && rhs_m2 == n {
(n as i32, cublasOperation_t::CUBLAS_OP_N)
} else if rhs_m1 == k && rhs_m2 == 1 {
(k as i32, cublasOperation_t::CUBLAS_OP_T)
} else {
Err(CudaError::MatMulNonContiguous {
lhs_stride: lhs_stride.to_vec(),
rhs_stride: rhs_stride.to_vec(),
mnk: (m, n, k),
})?
};
// The b tensor has dims batching, m, k (lhs)
let (ldb, transb) = if lhs_m1 == 1 && lhs_m2 == k {
(k as i32, cublasOperation_t::CUBLAS_OP_N)
} else if lhs_m1 == m && lhs_m2 == 1 {
(m as i32, cublasOperation_t::CUBLAS_OP_T)
} else {
Err(CudaError::MatMulNonContiguous {
lhs_stride: lhs_stride.to_vec(),
rhs_stride: rhs_stride.to_vec(),
mnk: (m, n, k),
})?
};
// The setup below was copied from:
// https://github.com/lebedov/scikit-cuda/blob/7e7300474286019c917a6c8a4bca59405c64fbce/tests/test_cublas.py#L531
let gemm = GemmConfig {
alpha,
beta,
m: n as i32,
n: m as i32,
k: k as i32,
lda,
ldb,
ldc: n as i32,
transa,
transb,
};
Ok(StridedBatchedConfig {
batch_size: b as i32,
gemm,
stride_a: (m * k) as i64,
stride_b: (n * k) as i64,
stride_c: (m * n) as i64,
})
}
impl CudaStorage {
pub fn try_clone(&self) -> Result<Self> {
let slice = match &self.slice {
CudaStorageSlice::U32(slice) => CudaStorageSlice::U32(slice.try_clone()?),
CudaStorageSlice::BF16(slice) => CudaStorageSlice::BF16(slice.try_clone()?),
CudaStorageSlice::F16(slice) => CudaStorageSlice::F16(slice.try_clone()?),
CudaStorageSlice::F32(slice) => CudaStorageSlice::F32(slice.try_clone()?),
CudaStorageSlice::F64(slice) => CudaStorageSlice::F64(slice.try_clone()?),
};
let device = self.device.clone();
Ok(Self { slice, device })
}
pub fn dtype(&self) -> DType {
match self.slice {
CudaStorageSlice::U32(_) => DType::U32,
CudaStorageSlice::BF16(_) => DType::BF16,
CudaStorageSlice::F16(_) => DType::F16,
CudaStorageSlice::F32(_) => DType::F32,
CudaStorageSlice::F64(_) => DType::F64,
}
}
pub fn device(&self) -> &CudaDevice {
&self.device
}
pub(crate) fn to_dtype(&self, shape: &Shape, stride: &[usize], dtype: DType) -> Result<Self> {
use cudarc::driver::DevicePtr;
let dims = shape.dims();
let el = shape.elem_count();
let cfg = LaunchConfig::for_num_elems(el as u32);
let dev = self.device();
let ds = dev.htod_copy([dims, stride].concat())?;
let inp = match &self.slice {
CudaStorageSlice::U32(inp) => inp.device_ptr(),
CudaStorageSlice::BF16(inp) => inp.device_ptr(),
CudaStorageSlice::F16(inp) => inp.device_ptr(),
CudaStorageSlice::F32(inp) => inp.device_ptr(),
CudaStorageSlice::F64(inp) => inp.device_ptr(),
};
let kernel_name = format!("cast_{}_{}", self.dtype().as_str(), dtype.as_str());
let func = dev.get_or_load_func(&kernel_name, kernels::CAST)?;
let slice = match dtype {
DType::U32 => {
let out = unsafe { dev.alloc::<u32>(el) }?;
let params = (el, dims.len(), &ds, *inp, &out);
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::U32(out)
}
DType::BF16 => {
let out = unsafe { dev.alloc::<bf16>(el) }?;
let params = (el, dims.len(), &ds, *inp, &out);
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::BF16(out)
}
DType::F16 => {
let out = unsafe { dev.alloc::<f16>(el) }?;
let params = (el, dims.len(), &ds, *inp, &out);
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F16(out)
}
DType::F32 => {
let out = unsafe { dev.alloc::<f32>(el) }?;
let params = (el, dims.len(), &ds, *inp, &out);
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F32(out)
}
DType::F64 => {
let out = unsafe { dev.alloc::<f64>(el) }?;
let params = (el, dims.len(), &ds, *inp, &out);
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F64(out)
}
};
Ok(Self {
slice,
device: dev.clone(),
})
}
pub(crate) fn affine_impl(
&self,
shape: &Shape,
stride: &[usize],
mul: f64,
add: f64,
) -> Result<Self> {
let dims = shape.dims();
let el_count = shape.elem_count();
let cfg = LaunchConfig::for_num_elems(el_count as u32);
let dev = self.device();
let ds = dev.htod_copy([dims, stride].concat())?;
let slice = match &self.slice {
CudaStorageSlice::U32(arg) => {
let func = dev.get_or_load_func("affine_u32", kernels::AFFINE)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<u32>(el_count) }?;
let params = (el_count, dims.len(), &ds, arg, &out, mul as u32, add as u32);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::U32(out)
}
CudaStorageSlice::BF16(arg) => {
let func = dev.get_or_load_func("affine_bf16", kernels::AFFINE)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<bf16>(el_count) }?;
let params = (
el_count,
dims.len(),
&ds,
arg,
&out,
bf16::from_f64(mul),
bf16::from_f64(add),
);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::BF16(out)
}
CudaStorageSlice::F16(arg) => {
let func = dev.get_or_load_func("affine_f16", kernels::AFFINE)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f16>(el_count) }?;
let params = (
el_count,
dims.len(),
&ds,
arg,
&out,
f16::from_f64(mul),
f16::from_f64(add),
);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F16(out)
}
CudaStorageSlice::F32(arg) => {
let func = dev.get_or_load_func("affine_f32", kernels::AFFINE)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f32>(el_count) }?;
let params = (el_count, dims.len(), &ds, arg, &out, mul as f32, add as f32);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F32(out)
}
CudaStorageSlice::F64(arg) => {
let func = dev.get_or_load_func("affine_f64", kernels::AFFINE)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f64>(el_count) }?;
let params = (el_count, dims.len(), &ds, arg, &out, mul, add);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F64(out)
}
};
let device = dev.clone();
Ok(Self { slice, device })
}
pub(crate) fn sum(&self, shape: &Shape, stride: &[usize], sum_dims: &[usize]) -> Result<Self> {
let src_dims = shape.dims();
let el = shape.elem_count();
let mut dst_el = el;
for &sum_dim in sum_dims.iter() {
dst_el /= src_dims[sum_dim];
}
let mut sum_dims = sum_dims.to_vec();
// Sort the sum_dims as they have to be processed from left to right when converting the
// indexes.
sum_dims.sort();
let sum_dims_l: Vec<usize> = sum_dims.iter().map(|&d| src_dims[d]).collect();
let sum_dims_s: Vec<usize> = sum_dims
.iter()
.map(|&d| src_dims[d + 1..].iter().product::<usize>())
.collect();
let cfg = LaunchConfig::for_num_elems(el as u32);
let dev = self.device();
let ds = dev.htod_copy([src_dims, stride, &sum_dims_l, &sum_dims_s].concat())?;
let slice = match &self.slice {
CudaStorageSlice::U32(arg) => {
let func = dev.get_or_load_func("sum_u32", kernels::REDUCE)?;
let out = dev.alloc_zeros::<u32>(dst_el)?;
let params = (el, src_dims.len(), sum_dims.len(), &ds, arg, &out);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::U32(out)
}
CudaStorageSlice::BF16(arg) => {
let func = dev.get_or_load_func("sum_bf16", kernels::REDUCE)?;
let out = dev.alloc_zeros::<bf16>(dst_el)?;
let params = (el, src_dims.len(), sum_dims.len(), &ds, arg, &out);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::BF16(out)
}
CudaStorageSlice::F16(arg) => {
let func = dev.get_or_load_func("sum_f16", kernels::REDUCE)?;
let out = dev.alloc_zeros::<f16>(dst_el)?;
let params = (el, src_dims.len(), sum_dims.len(), &ds, arg, &out);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F16(out)
}
CudaStorageSlice::F32(arg) => {
let func = dev.get_or_load_func("sum_f32", kernels::REDUCE)?;
let out = dev.alloc_zeros::<f32>(dst_el)?;
let params = (el, src_dims.len(), sum_dims.len(), &ds, arg, &out);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F32(out)
}
CudaStorageSlice::F64(arg) => {
let func = dev.get_or_load_func("sum_f64", kernels::REDUCE)?;
let out = dev.alloc_zeros::<f64>(dst_el)?;
let params = (el, src_dims.len(), sum_dims.len(), &ds, arg, &out);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F64(out)
}
};
let device = dev.clone();
Ok(Self { slice, device })
}
pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()> {
Err(CudaError::InternalError(
"TODO: implement divide_by_sum_over_dim",
))
}
pub(crate) fn unary_impl<U: crate::op::UnaryOp>(
&self,
shape: &Shape,
stride: &[usize],
) -> Result<Self> {
let dims = shape.dims();
let el_count = shape.elem_count();
let cfg = LaunchConfig::for_num_elems(el_count as u32);
let dev = &self.device;
let ds = dev.htod_copy([dims, stride].concat())?;
let slice = match &self.slice {
CudaStorageSlice::U32(_arg) => {
todo!("No unary kernels for u32");
}
CudaStorageSlice::BF16(arg) => {
let func = dev.get_or_load_func(U::KERNEL_BF16, kernels::UNARY)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<bf16>(el_count) }?;
let params = (el_count, dims.len(), &ds, arg, &out);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::BF16(out)
}
CudaStorageSlice::F16(arg) => {
let func = dev.get_or_load_func(U::KERNEL_F16, kernels::UNARY)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f16>(el_count) }?;
let params = (el_count, dims.len(), &ds, arg, &out);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F16(out)
}
CudaStorageSlice::F32(arg) => {
let func = dev.get_or_load_func(U::KERNEL_F32, kernels::UNARY)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f32>(el_count) }?;
let params = (el_count, dims.len(), &ds, arg, &out);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F32(out)
}
CudaStorageSlice::F64(arg) => {
let func = dev.get_or_load_func(U::KERNEL_F64, kernels::UNARY)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f64>(el_count) }?;
let params = (el_count, dims.len(), &ds, arg, &out);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F64(out)
}
};
let device = dev.clone();
Ok(Self { slice, device })
}
pub(crate) fn binary_impl<B: crate::op::BinaryOp>(
&self,
rhs: &Self,
shape: &Shape,
lhs_stride: &[usize],
rhs_stride: &[usize],
) -> Result<Self> {
let dims = shape.dims();
let elem_count = shape.elem_count();
let cfg = LaunchConfig::for_num_elems(elem_count as u32);
let dev = self.device();
let dims_and_strides = dev.htod_copy([dims, lhs_stride, rhs_stride].concat())?;
let slice = match (&self.slice, &rhs.slice) {
(CudaStorageSlice::BF16(lhs), CudaStorageSlice::BF16(rhs)) => {
let func = dev.get_or_load_func(B::KERNEL_BF16, kernels::BINARY)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<bf16>(elem_count) }?;
let params = (elem_count, dims.len(), &dims_and_strides, lhs, rhs, &out);
// SAFETY: ffi
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::BF16(out)
}
(CudaStorageSlice::F16(lhs), CudaStorageSlice::F16(rhs)) => {
let func = dev.get_or_load_func(B::KERNEL_F16, kernels::BINARY)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f16>(elem_count) }?;
let params = (elem_count, dims.len(), &dims_and_strides, lhs, rhs, &out);
// SAFETY: ffi
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F16(out)
}
(CudaStorageSlice::F32(lhs), CudaStorageSlice::F32(rhs)) => {
let func = dev.get_or_load_func(B::KERNEL_F32, kernels::BINARY)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f32>(elem_count) }?;
let params = (elem_count, dims.len(), &dims_and_strides, lhs, rhs, &out);
// SAFETY: ffi
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F32(out)
}
(CudaStorageSlice::F64(lhs), CudaStorageSlice::F64(rhs)) => {
// SAFETY: Set later by running the kernel.
let func = dev.get_or_load_func(B::KERNEL_F64, kernels::BINARY)?;
let out = unsafe { dev.alloc::<f64>(elem_count) }?;
let params = (elem_count, dims.len(), &dims_and_strides, lhs, rhs, &out);
// SAFETY: ffi
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F64(out)
}
(CudaStorageSlice::U32(lhs), CudaStorageSlice::U32(rhs)) => {
// SAFETY: Set later by running the kernel.
let func = dev.get_or_load_func(B::KERNEL_U32, kernels::BINARY)?;
let out = unsafe { dev.alloc::<u32>(elem_count) }?;
let params = (elem_count, dims.len(), &dims_and_strides, lhs, rhs, &out);
// SAFETY: ffi
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::U32(out)
}
// The dtypes should have been checked at this point so this is an internal error.
_ => return Err(CudaError::InternalError("dtype mismatch in binary op")),
};
let device = dev.clone();
Ok(Self { slice, device })
}
pub(crate) fn to_cpu_storage(&self) -> Result<CpuStorage> {
match &self.slice {
CudaStorageSlice::U32(slice) => {
let dev = slice.device();
let cpu_storage = dev.dtoh_sync_copy(slice)?;
Ok(CpuStorage::U32(cpu_storage))
}
CudaStorageSlice::BF16(slice) => {
let dev = slice.device();
let cpu_storage = dev.dtoh_sync_copy(slice)?;
Ok(CpuStorage::BF16(cpu_storage))
}
CudaStorageSlice::F16(slice) => {
let dev = slice.device();
let cpu_storage = dev.dtoh_sync_copy(slice)?;
Ok(CpuStorage::F16(cpu_storage))
}
CudaStorageSlice::F32(slice) => {
let dev = slice.device();
let cpu_storage = dev.dtoh_sync_copy(slice)?;
Ok(CpuStorage::F32(cpu_storage))
}
CudaStorageSlice::F64(slice) => {
let dev = slice.device();
let cpu_storage = dev.dtoh_sync_copy(slice)?;
Ok(CpuStorage::F64(cpu_storage))
}
}
}
pub(crate) fn where_cond(
&self,
shape: &Shape,
stride: &[usize],
t: &Self,
stride_t: &[usize],
f: &Self,
stride_f: &[usize],
) -> Result<Self> {
let ids = match &self.slice {
CudaStorageSlice::U32(slice) => slice,
_ => Err(CudaError::UnexpectedDType {
msg: "where conditions should be u32",
expected: DType::U32,
got: self.dtype(),
})?,
};
let dims = shape.dims();
let el = shape.elem_count();
let cfg = LaunchConfig::for_num_elems(el as u32);
let dev = self.device();
let ds = dev.htod_copy([dims, stride, stride_t, stride_f].concat())?;
let slice = match (&t.slice, &f.slice) {
(CudaStorageSlice::BF16(t), CudaStorageSlice::BF16(f)) => {
let func = dev.get_or_load_func("where_bf16", kernels::TERNARY)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<bf16>(el) }?;
let params = (el, dims.len(), &ds, ids, t, f, &out);
// SAFETY: ffi
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::BF16(out)
}
(CudaStorageSlice::F16(t), CudaStorageSlice::F16(f)) => {
let func = dev.get_or_load_func("where_f16", kernels::TERNARY)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f16>(el) }?;
let params = (el, dims.len(), &ds, ids, t, f, &out);
// SAFETY: ffi
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F16(out)
}
(CudaStorageSlice::F32(t), CudaStorageSlice::F32(f)) => {
let func = dev.get_or_load_func("where_f32", kernels::TERNARY)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f32>(el) }?;
let params = (el, dims.len(), &ds, ids, t, f, &out);
// SAFETY: ffi
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F32(out)
}
(CudaStorageSlice::F64(t), CudaStorageSlice::F64(f)) => {
// SAFETY: Set later by running the kernel.
let func = dev.get_or_load_func("where_f64", kernels::TERNARY)?;
let out = unsafe { dev.alloc::<f64>(el) }?;
let params = (el, dims.len(), &ds, ids, t, f, &out);
// SAFETY: ffi
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F64(out)
}
(CudaStorageSlice::U32(t), CudaStorageSlice::U32(f)) => {
// SAFETY: Set later by running the kernel.
let func = dev.get_or_load_func("where_u32", kernels::TERNARY)?;
let out = unsafe { dev.alloc::<u32>(el) }?;
let params = (el, dims.len(), &ds, ids, t, f, &out);
// SAFETY: ffi
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::U32(out)
}
// The dtypes should have been checked at this point so this is an internal error.
_ => return Err(CudaError::InternalError("dtype mismatch in binary op")),
};
let device = dev.clone();
Ok(Self { slice, device })
}
pub(crate) fn embedding_impl(
&self,
shape: &Shape,
stride: &[usize],
rhs: &Self,
h_size: usize, // hidden size
v_size: usize, // vocab size
) -> Result<Self> {
let ids = match &self.slice {
CudaStorageSlice::U32(slice) => slice,
_ => Err(CudaError::UnexpectedDType {
msg: "embedding ids should be u32",
expected: DType::U32,
got: self.dtype(),
})?,
};
let dims = shape.dims();
let el = shape.elem_count();
let cfg = LaunchConfig::for_num_elems(el as u32);
let dev = self.device();
let ds = dev.htod_copy([dims, stride].concat())?;
let slice = match &rhs.slice {
// The kernels below assume that rhs is contiguous.
CudaStorageSlice::U32(arg) => {
let func = dev.get_or_load_func("emb_u32", kernels::EMBEDDINGS)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<u32>(el * h_size) }?;
let params = (el, dims.len(), &ds, ids, arg, &out, h_size, v_size);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::U32(out)
}
CudaStorageSlice::BF16(arg) => {
let func = dev.get_or_load_func("emb_bf16", kernels::EMBEDDINGS)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<bf16>(el * h_size) }?;
let params = (el, dims.len(), &ds, ids, arg, &out, h_size, v_size);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::BF16(out)
}
CudaStorageSlice::F16(arg) => {
let func = dev.get_or_load_func("emb_f16", kernels::EMBEDDINGS)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f16>(el * h_size) }?;
let params = (el, dims.len(), &ds, ids, arg, &out, h_size, v_size);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F16(out)
}
CudaStorageSlice::F32(arg) => {
let func = dev.get_or_load_func("emb_f32", kernels::EMBEDDINGS)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f32>(el * h_size) }?;
let params = (el, dims.len(), &ds, ids, arg, &out, h_size, v_size);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F32(out)
}
CudaStorageSlice::F64(arg) => {
let func = dev.get_or_load_func("emb_f64", kernels::EMBEDDINGS)?;
// SAFETY: Set later by running the kernel.
let out = unsafe { dev.alloc::<f64>(el * h_size) }?;
let params = (el, dims.len(), &ds, ids, arg, &out, h_size, v_size);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
CudaStorageSlice::F64(out)
}
};
let device = dev.clone();
Ok(Self { slice, device })
}
pub(crate) fn matmul_impl(
&self,
rhs: &Self,
(b, m, n, k): (usize, usize, usize, usize),
lhs_stride: &[usize],
rhs_stride: &[usize],
) -> Result<Self> {
let elem_count = b * m * n;
let dev = &self.device;
let slice = match (&self.slice, &rhs.slice) {
(CudaStorageSlice::BF16(_lhs), CudaStorageSlice::BF16(_rhs)) => {
todo!("bf16")
}
(CudaStorageSlice::F16(lhs), CudaStorageSlice::F16(rhs)) => {
let cfg = gemm_config(f16::ONE, f16::ZERO, (b, m, n, k), lhs_stride, rhs_stride)?;
let mut out = unsafe { dev.alloc::<f16>(elem_count) }?;
unsafe {
self.device
.blas
.gemm_strided_batched(cfg, rhs, lhs, &mut out)
}?;
CudaStorageSlice::F16(out)
}
(CudaStorageSlice::F32(lhs), CudaStorageSlice::F32(rhs)) => {
let cfg = gemm_config(1., 0., (b, m, n, k), lhs_stride, rhs_stride)?;
let mut out = unsafe { dev.alloc::<f32>(elem_count) }?;
unsafe {
self.device
.blas
.gemm_strided_batched(cfg, rhs, lhs, &mut out)
}?;
CudaStorageSlice::F32(out)
}
(CudaStorageSlice::F64(lhs), CudaStorageSlice::F64(rhs)) => {
let cfg = gemm_config(1., 0., (b, m, n, k), lhs_stride, rhs_stride)?;
let mut out = unsafe { dev.alloc::<f64>(elem_count) }?;
unsafe {
self.device
.blas
.gemm_strided_batched(cfg, rhs, lhs, &mut out)
}?;
CudaStorageSlice::F64(out)
}
_ => return Err(CudaError::InternalError("dtype mismatch in matmul op")),
};
let device = dev.clone();
Ok(Self { slice, device })
}
pub(crate) fn copy_strided_src(
&self,
dst: &mut Self,
dst_offset: usize,
src_shape: &Shape,
src_stride: &[usize],
src_offset: usize,
) -> Result<()> {
if src_shape.rank() != src_stride.len() {
panic!("incoherent shape and strides {src_shape:?} {src_stride:?}")
}
let dims = src_shape.dims();
let el_count = src_shape.elem_count();
let cfg = LaunchConfig::for_num_elems(el_count as u32);
let dev = &self.device;
let ds = dev.htod_copy([dims, src_stride].concat())?;
match (&self.slice, &mut dst.slice) {
(CudaStorageSlice::BF16(src), CudaStorageSlice::BF16(dst)) => {
let src = src.slice(src_offset..);
let mut dst = dst.slice_mut(dst_offset..);
if src_shape.is_contiguous(src_stride) {
dev.dtod_copy(&src, &mut dst)?
} else {
let func = dev.get_or_load_func("ucopy_bf16", kernels::UNARY)?;
// SAFETY: Set later by running the kernel.
let params = (el_count, dims.len(), &ds, &src, &mut dst);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?
}
}
(CudaStorageSlice::F16(src), CudaStorageSlice::F16(dst)) => {
let src = src.slice(src_offset..);
let mut dst = dst.slice_mut(dst_offset..);
if src_shape.is_contiguous(src_stride) {
dev.dtod_copy(&src, &mut dst)?
} else {
let func = dev.get_or_load_func("ucopy_f16", kernels::UNARY)?;
// SAFETY: Set later by running the kernel.
let params = (el_count, dims.len(), &ds, &src, &mut dst);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?
}
}
(CudaStorageSlice::F32(src), CudaStorageSlice::F32(dst)) => {
let src = src.slice(src_offset..);
let mut dst = dst.slice_mut(dst_offset..);
if src_shape.is_contiguous(src_stride) {
dev.dtod_copy(&src, &mut dst)?
} else {
let func = dev.get_or_load_func("ucopy_f32", kernels::UNARY)?;
// SAFETY: Set later by running the kernel.
let params = (el_count, dims.len(), &ds, &src, &mut dst);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?
}
}
(CudaStorageSlice::U32(src), CudaStorageSlice::U32(dst)) => {
let src = src.slice(src_offset..);
let mut dst = dst.slice_mut(dst_offset..);
if src_shape.is_contiguous(src_stride) {
dev.dtod_copy(&src, &mut dst)?
} else {
let func = dev.get_or_load_func("ucopy_u32", kernels::UNARY)?;
// SAFETY: Set later by running the kernel.
let params = (el_count, dims.len(), &ds, &src, &mut dst);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?
}
}
(CudaStorageSlice::F64(src), CudaStorageSlice::F64(dst)) => {
let src = src.slice(src_offset..);
let mut dst = dst.slice_mut(dst_offset..);
if src_shape.is_contiguous(src_stride) {
dev.dtod_copy(&src, &mut dst)?
} else {
let func = dev.get_or_load_func("ucopy_64", kernels::UNARY)?;
// SAFETY: Set later by running the kernel.
let params = (el_count, dims.len(), &ds, &src, &mut dst);
// SAFETY: ffi.
unsafe { func.launch(cfg, params) }?;
}
}
_ => {
return Err(CudaError::InternalError(
"dtype mismatch in copy_strided op",
))
}
}
Ok(())
}
}

159
candle-core/src/device.rs Normal file
View File

@ -0,0 +1,159 @@
use crate::{CpuStorage, DType, Result, Shape, Storage, WithDType};
/// A `DeviceLocation` represents a physical device whereas multiple `Device`
/// can live on the same location (typically for cuda devices).
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum DeviceLocation {
Cpu,
Cuda { gpu_id: usize },
}
#[derive(Debug, Clone)]
pub enum Device {
Cpu,
Cuda(crate::CudaDevice),
}
// TODO: Should we back the cpu implementation using the NdArray crate or similar?
pub trait NdArray {
fn shape(&self) -> Result<Shape>;
fn to_cpu_storage(&self) -> CpuStorage;
}
impl<S: WithDType> NdArray for S {
fn shape(&self) -> Result<Shape> {
Ok(Shape::from(()))
}
fn to_cpu_storage(&self) -> CpuStorage {
S::to_cpu_storage(&[*self])
}
}
impl<S: WithDType, const N: usize> NdArray for &[S; N] {
fn shape(&self) -> Result<Shape> {
Ok(Shape::from(self.len()))
}
fn to_cpu_storage(&self) -> CpuStorage {
S::to_cpu_storage(self.as_slice())
}
}
impl<S: WithDType> NdArray for &[S] {
fn shape(&self) -> Result<Shape> {
Ok(Shape::from(self.len()))
}
fn to_cpu_storage(&self) -> CpuStorage {
S::to_cpu_storage(self)
}
}
impl<S: WithDType, const N: usize, const M: usize> NdArray for &[[S; N]; M] {
fn shape(&self) -> Result<Shape> {
Ok(Shape::from((M, N)))
}
fn to_cpu_storage(&self) -> CpuStorage {
S::to_cpu_storage_owned(self.concat())
}
}
impl<S: WithDType, const N1: usize, const N2: usize, const N3: usize> NdArray
for &[[[S; N3]; N2]; N1]
{
fn shape(&self) -> Result<Shape> {
Ok(Shape::from((N1, N2, N3)))
}
fn to_cpu_storage(&self) -> CpuStorage {
let mut vec = Vec::new();
vec.reserve(N1 * N2 * N3);
for i1 in 0..N1 {
for i2 in 0..N2 {
vec.extend(self[i1][i2])
}
}
S::to_cpu_storage_owned(vec)
}
}
impl Device {
pub fn new_cuda(ordinal: usize) -> Result<Self> {
Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?))
}
pub fn same_id(&self, rhs: &Self) -> bool {
match (self, rhs) {
(Self::Cpu, Self::Cpu) => true,
(Self::Cuda(lhs), Self::Cuda(rhs)) => lhs.same_id(rhs),
_ => false,
}
}
pub fn location(&self) -> DeviceLocation {
match self {
Self::Cpu => DeviceLocation::Cpu,
Self::Cuda(device) => DeviceLocation::Cuda {
gpu_id: device.ordinal(),
},
}
}
pub fn is_cuda(&self) -> bool {
match self {
Self::Cpu => false,
Self::Cuda(_) => true,
}
}
pub(crate) fn ones(&self, shape: &Shape, dtype: DType) -> Result<Storage> {
match self {
Device::Cpu => {
let storage = CpuStorage::ones_impl(shape, dtype);
Ok(Storage::Cpu(storage))
}
Device::Cuda(device) => {
let storage = device.ones_impl(shape, dtype)?;
Ok(Storage::Cuda(storage))
}
}
}
pub(crate) fn zeros(&self, shape: &Shape, dtype: DType) -> Result<Storage> {
match self {
Device::Cpu => {
let storage = CpuStorage::zeros_impl(shape, dtype);
Ok(Storage::Cpu(storage))
}
Device::Cuda(device) => {
let storage = device.zeros_impl(shape, dtype)?;
Ok(Storage::Cuda(storage))
}
}
}
pub(crate) fn storage<A: NdArray>(&self, array: A) -> Result<Storage> {
match self {
Device::Cpu => Ok(Storage::Cpu(array.to_cpu_storage())),
Device::Cuda(device) => {
let storage = array.to_cpu_storage();
let storage = device.cuda_from_cpu_storage(&storage)?;
Ok(Storage::Cuda(storage))
}
}
}
pub(crate) fn storage_owned<S: WithDType>(&self, data: Vec<S>) -> Result<Storage> {
match self {
Device::Cpu => Ok(Storage::Cpu(S::to_cpu_storage_owned(data))),
Device::Cuda(device) => {
let storage = S::to_cpu_storage_owned(data);
let storage = device.cuda_from_cpu_storage(&storage)?;
Ok(Storage::Cuda(storage))
}
}
}
}

96
candle-core/src/dtype.rs Normal file
View File

@ -0,0 +1,96 @@
use crate::{CpuStorage, Error, Result};
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum DType {
U32,
BF16,
F16,
F32,
F64,
}
impl DType {
pub fn as_str(&self) -> &'static str {
match self {
Self::U32 => "u32",
Self::BF16 => "bf16",
Self::F16 => "f16",
Self::F32 => "f32",
Self::F64 => "f64",
}
}
pub fn size_in_bytes(&self) -> usize {
match self {
Self::U32 => 4,
Self::BF16 => 2,
Self::F16 => 2,
Self::F32 => 4,
Self::F64 => 8,
}
}
}
pub trait WithDType: Sized + Copy {
const DTYPE: DType;
fn to_cpu_storage_owned(data: Vec<Self>) -> CpuStorage;
fn to_cpu_storage(data: &[Self]) -> CpuStorage {
Self::to_cpu_storage_owned(data.to_vec())
}
fn cpu_storage_as_slice(s: &CpuStorage) -> Result<&[Self]>;
fn cpu_storage_as_mut_slice(s: &mut CpuStorage) -> Result<&mut [Self]>;
fn cpu_storage_data(s: CpuStorage) -> Result<Vec<Self>>;
}
macro_rules! with_dtype {
($ty:ty, $dtype:ident) => {
impl WithDType for $ty {
const DTYPE: DType = DType::$dtype;
fn to_cpu_storage_owned(data: Vec<Self>) -> CpuStorage {
CpuStorage::$dtype(data)
}
fn cpu_storage_data(s: CpuStorage) -> Result<Vec<Self>> {
match s {
CpuStorage::$dtype(data) => Ok(data),
_ => Err(Error::UnexpectedDType {
expected: DType::$dtype,
got: s.dtype(),
msg: "unexpected dtype",
}),
}
}
fn cpu_storage_as_slice(s: &CpuStorage) -> Result<&[Self]> {
match s {
CpuStorage::$dtype(data) => Ok(data),
_ => Err(Error::UnexpectedDType {
expected: DType::$dtype,
got: s.dtype(),
msg: "unexpected dtype",
}),
}
}
fn cpu_storage_as_mut_slice(s: &mut CpuStorage) -> Result<&mut [Self]> {
match s {
CpuStorage::$dtype(data) => Ok(data),
_ => Err(Error::UnexpectedDType {
expected: DType::$dtype,
got: s.dtype(),
msg: "unexpected dtype",
}),
}
}
}
};
}
with_dtype!(u32, U32);
with_dtype!(half::f16, F16);
with_dtype!(half::bf16, BF16);
with_dtype!(f32, F32);
with_dtype!(f64, F64);

View File

@ -0,0 +1,136 @@
#![allow(dead_code)]
use crate::{CpuStorage, DType, Error, Result, Shape};
#[derive(thiserror::Error, Debug)]
pub enum DummyError {}
pub type CudaError = DummyError;
#[derive(Debug, Clone)]
pub struct CudaDevice;
macro_rules! fail {
() => {
unimplemented!("cuda support has not been enabled")
};
}
impl CudaDevice {
pub(crate) fn new(_: usize) -> Result<Self> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn same_id(&self, _: &Self) -> bool {
true
}
pub(crate) fn ordinal(&self) -> usize {
fail!()
}
pub(crate) fn zeros_impl(&self, _shape: &Shape, _dtype: DType) -> Result<CudaStorage> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn ones_impl(&self, _shape: &Shape, _dtype: DType) -> Result<CudaStorage> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn cuda_from_cpu_storage(&self, _: &CpuStorage) -> Result<CudaStorage> {
Err(Error::NotCompiledWithCudaSupport)
}
}
#[derive(Debug)]
pub struct CudaStorage;
impl CudaStorage {
pub fn try_clone(&self) -> Result<Self> {
Err(Error::NotCompiledWithCudaSupport)
}
pub fn dtype(&self) -> DType {
fail!()
}
pub fn device(&self) -> &CudaDevice {
fail!()
}
pub(crate) fn to_cpu_storage(&self) -> Result<CpuStorage> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn affine_impl(&self, _: &Shape, _: &[usize], _: f64, _: f64) -> Result<Self> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn sum(&self, _: &Shape, _: &[usize], _: &[usize]) -> Result<Self> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn divide_by_sum_over_dim(&mut self, _: &Shape, _: usize) -> Result<()> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn to_dtype(&self, _: &Shape, _: &[usize], _: DType) -> Result<Self> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn unary_impl<B: crate::op::UnaryOp>(&self, _: &Shape, _: &[usize]) -> Result<Self> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn binary_impl<B: crate::op::BinaryOp>(
&self,
_: &Self,
_: &Shape,
_: &[usize],
_: &[usize],
) -> Result<Self> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn where_cond(
&self,
_: &Shape,
_: &[usize],
_: &Self,
_: &[usize],
_: &Self,
_: &[usize],
) -> Result<Self> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn embedding_impl(
&self,
_: &Shape,
_: &[usize],
_: &Self,
_: usize,
_: usize,
) -> Result<Self> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn matmul_impl(
&self,
_: &Self,
_: (usize, usize, usize, usize),
_: &[usize],
_: &[usize],
) -> Result<Self> {
Err(Error::NotCompiledWithCudaSupport)
}
pub(crate) fn copy_strided_src(
&self,
_: &mut Self,
_: usize,
_: &Shape,
_: &[usize],
_: usize,
) -> Result<()> {
Err(Error::NotCompiledWithCudaSupport)
}
}

102
candle-core/src/error.rs Normal file
View File

@ -0,0 +1,102 @@
use crate::{DType, DeviceLocation, Shape};
/// Main library error type.
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("{msg}, expected: {expected:?}, got: {got:?}")]
UnexpectedDType {
msg: &'static str,
expected: DType,
got: DType,
},
#[error("{op} only supports contiguous tensors")]
RequiresContiguous { op: &'static str },
#[error("{op} expects at least one tensor")]
OpRequiresAtLeastOneTensor { op: &'static str },
#[error("backward is not supported for {op}")]
BackwardNotSupported { op: &'static str },
#[error("{op} invalid index {index} with vocab {vocab_size}")]
InvalidIndex {
op: &'static str,
index: usize,
vocab_size: usize,
},
#[error("the candle crate has not been built with cuda support")]
NotCompiledWithCudaSupport,
#[error(
"Shape mismatch, got buffer of size {buffer_size} which is compatible with shape {shape:?}"
)]
ShapeMismatch { buffer_size: usize, shape: Shape },
#[error("shape mismatch in {op}, lhs: {lhs:?}, rhs: {rhs:?}")]
ShapeMismatchBinaryOp {
lhs: Shape,
rhs: Shape,
op: &'static str,
},
#[error("shape mismatch in cat for dim {dim}, shape for arg 1: {first_shape:?} shape for arg {n}: {nth_shape:?}")]
ShapeMismatchCat {
dim: usize,
first_shape: Shape,
n: usize,
nth_shape: Shape,
},
#[error("device mismatch in {op}, lhs: {lhs:?}, rhs: {rhs:?}")]
DeviceMismatchBinaryOp {
lhs: DeviceLocation,
rhs: DeviceLocation,
op: &'static str,
},
#[error("dtype mismatch in {op}, lhs: {lhs:?}, rhs: {rhs:?}")]
DTypeMismatchBinaryOp {
lhs: DType,
rhs: DType,
op: &'static str,
},
#[error("unexpected rank, expected: {expected}, got: {got} ({shape:?})")]
UnexpectedNumberOfDims {
expected: usize,
got: usize,
shape: Shape,
},
// TODO this is temporary when we support arbitrary matmul
#[error("temporary error where matmul doesn't support arbitrary striding")]
UnexpectedStriding,
#[error(transparent)]
Cuda(#[from] crate::CudaError),
#[error(transparent)]
TryFromIntError(#[from] core::num::TryFromIntError),
#[error("npy/npz error {0}")]
Npy(String),
/// Zip file format error.
#[error(transparent)]
Zip(#[from] zip::result::ZipError),
/// Integer parse error.
#[error(transparent)]
ParseInt(#[from] std::num::ParseIntError),
/// I/O error.
#[error(transparent)]
Io(#[from] std::io::Error),
#[error("cannot broadcast {src_shape:?} to {dst_shape:?}")]
BroadcastIncompatibleShapes { src_shape: Shape, dst_shape: Shape },
}
pub type Result<T> = std::result::Result<T, Error>;

29
candle-core/src/lib.rs Normal file
View File

@ -0,0 +1,29 @@
mod backprop;
mod cpu_backend;
#[cfg(feature = "cuda")]
mod cuda_backend;
mod device;
mod dtype;
mod dummy_cuda_backend;
mod error;
mod npy;
mod op;
mod shape;
mod storage;
mod strided_index;
mod tensor;
pub use cpu_backend::CpuStorage;
pub use device::{Device, DeviceLocation};
pub use dtype::{DType, WithDType};
pub use error::{Error, Result};
pub use shape::Shape;
pub use storage::Storage;
use strided_index::StridedIndex;
pub use tensor::{Tensor, TensorId};
#[cfg(feature = "cuda")]
pub use cuda_backend::{CudaDevice, CudaError, CudaStorage};
#[cfg(not(feature = "cuda"))]
pub use dummy_cuda_backend::{CudaDevice, CudaError, CudaStorage};

401
candle-core/src/npy.rs Normal file
View File

@ -0,0 +1,401 @@
//! Numpy support for literals.
//!
//! The spec for the npy format can be found in
//! [npy-format](https://docs.scipy.org/doc/numpy-1.14.2/neps/npy-format.html).
//! The functions from this module can be used to read literals from npy/npz files
//! or write literals to these files. A npy file contains a single literal (unnamed)
//! whereas a npz file can contain multiple named literals. npz files are also compressed.
//!
//! These two formats are easy to use in Python using the numpy library.
//!
//! ```python
//! import numpy as np
//! x = np.arange(10)
//!
//! # Write a npy file.
//! np.save("test.npy", x)
//!
//! # Read a value from the npy file.
//! x = np.load("test.npy")
//!
//! # Write multiple values to a npz file.
//! values = { "x": x, "x_plus_one": x + 1 }
//! np.savez("test.npz", **values)
//!
//! # Load multiple values from a npz file.
//! values = np.loadz("test.npz")
//! ```
use crate::{DType, Device, Error, Result, Shape, Tensor};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use half::{bf16, f16, slice::HalfFloatSliceExt};
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Read, Write};
use std::path::Path;
const NPY_MAGIC_STRING: &[u8] = b"\x93NUMPY";
const NPY_SUFFIX: &str = ".npy";
fn read_header<R: Read>(reader: &mut R) -> Result<String> {
let mut magic_string = vec![0u8; NPY_MAGIC_STRING.len()];
reader.read_exact(&mut magic_string)?;
if magic_string != NPY_MAGIC_STRING {
return Err(Error::Npy("magic string mismatch".to_string()));
}
let mut version = [0u8; 2];
reader.read_exact(&mut version)?;
let header_len_len = match version[0] {
1 => 2,
2 => 4,
otherwise => return Err(Error::Npy(format!("unsupported version {otherwise}"))),
};
let mut header_len = vec![0u8; header_len_len];
reader.read_exact(&mut header_len)?;
let header_len = header_len
.iter()
.rev()
.fold(0_usize, |acc, &v| 256 * acc + v as usize);
let mut header = vec![0u8; header_len];
reader.read_exact(&mut header)?;
Ok(String::from_utf8_lossy(&header).to_string())
}
#[derive(Debug, PartialEq)]
struct Header {
descr: DType,
fortran_order: bool,
shape: Vec<usize>,
}
impl Header {
fn shape(&self) -> Shape {
Shape::from(self.shape.as_slice())
}
fn to_string(&self) -> Result<String> {
let fortran_order = if self.fortran_order { "True" } else { "False" };
let mut shape = self
.shape
.iter()
.map(|x| x.to_string())
.collect::<Vec<_>>()
.join(",");
let descr = match self.descr {
DType::BF16 => Err(Error::Npy("bf16 is not supported".into()))?,
DType::F16 => "f2",
DType::F32 => "f4",
DType::F64 => "f8",
DType::U32 => "u4",
};
if !shape.is_empty() {
shape.push(',')
}
Ok(format!(
"{{'descr': '<{descr}', 'fortran_order': {fortran_order}, 'shape': ({shape}), }}"
))
}
// Hacky parser for the npy header, a typical example would be:
// {'descr': '<f8', 'fortran_order': False, 'shape': (128,), }
fn parse(header: &str) -> Result<Header> {
let header =
header.trim_matches(|c: char| c == '{' || c == '}' || c == ',' || c.is_whitespace());
let mut parts: Vec<String> = vec![];
let mut start_index = 0usize;
let mut cnt_parenthesis = 0i64;
for (index, c) in header.chars().enumerate() {
match c {
'(' => cnt_parenthesis += 1,
')' => cnt_parenthesis -= 1,
',' => {
if cnt_parenthesis == 0 {
parts.push(header[start_index..index].to_owned());
start_index = index + 1;
}
}
_ => {}
}
}
parts.push(header[start_index..].to_owned());
let mut part_map: HashMap<String, String> = HashMap::new();
for part in parts.iter() {
let part = part.trim();
if !part.is_empty() {
match part.split(':').collect::<Vec<_>>().as_slice() {
[key, value] => {
let key = key.trim_matches(|c: char| c == '\'' || c.is_whitespace());
let value = value.trim_matches(|c: char| c == '\'' || c.is_whitespace());
let _ = part_map.insert(key.to_owned(), value.to_owned());
}
_ => return Err(Error::Npy(format!("unable to parse header {header}"))),
}
}
}
let fortran_order = match part_map.get("fortran_order") {
None => false,
Some(fortran_order) => match fortran_order.as_ref() {
"False" => false,
"True" => true,
_ => return Err(Error::Npy(format!("unknown fortran_order {fortran_order}"))),
},
};
let descr = match part_map.get("descr") {
None => return Err(Error::Npy("no descr in header".to_string())),
Some(descr) => {
if descr.is_empty() {
return Err(Error::Npy("empty descr".to_string()));
}
if descr.starts_with('>') {
return Err(Error::Npy(format!("little-endian descr {descr}")));
}
// the only supported types in tensor are:
// float64, float32, float16,
// complex64, complex128,
// int64, int32, int16, int8,
// uint8, and bool.
match descr.trim_matches(|c: char| c == '=' || c == '<' || c == '|') {
"e" | "f2" => DType::F16,
"f" | "f4" => DType::F32,
"d" | "f8" => DType::F64,
// "i" | "i4" => DType::S32,
// "q" | "i8" => DType::S64,
// "h" | "i2" => DType::S16,
// "b" | "i1" => DType::S8,
// "B" | "u1" => DType::U8,
"I" | "u4" => DType::U32,
// "?" | "b1" => DType::Pred,
// "F" | "F4" => DType::C64,
// "D" | "F8" => DType::C128,
descr => return Err(Error::Npy(format!("unrecognized descr {descr}"))),
}
}
};
let shape = match part_map.get("shape") {
None => return Err(Error::Npy("no shape in header".to_string())),
Some(shape) => {
let shape = shape.trim_matches(|c: char| c == '(' || c == ')' || c == ',');
if shape.is_empty() {
vec![]
} else {
shape
.split(',')
.map(|v| v.trim().parse::<usize>())
.collect::<std::result::Result<Vec<_>, _>>()?
}
}
};
Ok(Header {
descr,
fortran_order,
shape,
})
}
}
impl Tensor {
// TODO: Add the possibility to read directly to a device?
fn from_reader<R: std::io::Read>(shape: Shape, dtype: DType, reader: &mut R) -> Result<Self> {
let elem_count = shape.elem_count();
match dtype {
DType::BF16 => {
let mut data_t = vec![bf16::ZERO; elem_count];
reader.read_u16_into::<LittleEndian>(data_t.reinterpret_cast_mut())?;
Tensor::from_vec(data_t, shape, &Device::Cpu)
}
DType::F16 => {
let mut data_t = vec![f16::ZERO; elem_count];
reader.read_u16_into::<LittleEndian>(data_t.reinterpret_cast_mut())?;
Tensor::from_vec(data_t, shape, &Device::Cpu)
}
DType::F32 => {
let mut data_t = vec![0f32; elem_count];
reader.read_f32_into::<LittleEndian>(&mut data_t)?;
Tensor::from_vec(data_t, shape, &Device::Cpu)
}
DType::F64 => {
let mut data_t = vec![0f64; elem_count];
reader.read_f64_into::<LittleEndian>(&mut data_t)?;
Tensor::from_vec(data_t, shape, &Device::Cpu)
}
DType::U32 => {
let mut data_t = vec![0u32; elem_count];
reader.read_u32_into::<LittleEndian>(&mut data_t)?;
Tensor::from_vec(data_t, shape, &Device::Cpu)
}
}
}
/// Reads a npy file and return the stored multi-dimensional array as a literal.
pub fn read_npy<T: AsRef<Path>>(path: T) -> Result<Self> {
let mut reader = File::open(path.as_ref())?;
let header = read_header(&mut reader)?;
let header = Header::parse(&header)?;
if header.fortran_order {
return Err(Error::Npy("fortran order not supported".to_string()));
}
let mut data: Vec<u8> = vec![];
reader.read_to_end(&mut data)?;
Self::from_reader(header.shape(), header.descr, &mut reader)
}
/// Reads a npz file and returns the stored multi-dimensional arrays together with their names.
pub fn read_npz<T: AsRef<Path>>(path: T) -> Result<Vec<(String, Self)>> {
let zip_reader = BufReader::new(File::open(path.as_ref())?);
let mut zip = zip::ZipArchive::new(zip_reader)?;
let mut result = vec![];
for i in 0..zip.len() {
let mut reader = zip.by_index(i).unwrap();
let name = {
let name = reader.name();
name.strip_suffix(NPY_SUFFIX).unwrap_or(name).to_owned()
};
let header = read_header(&mut reader)?;
let header = Header::parse(&header)?;
if header.fortran_order {
return Err(Error::Npy("fortran order not supported".to_string()));
}
let s = Self::from_reader(header.shape(), header.descr, &mut reader)?;
result.push((name, s))
}
Ok(result)
}
/// Reads a npz file and returns the stored multi-dimensional arrays for some specified names.
pub fn read_npz_by_name<T: AsRef<Path>>(path: T, names: &[&str]) -> Result<Vec<Self>> {
let zip_reader = BufReader::new(File::open(path.as_ref())?);
let mut zip = zip::ZipArchive::new(zip_reader)?;
let mut result = vec![];
for name in names.iter() {
let mut reader = match zip.by_name(&format!("{name}{NPY_SUFFIX}")) {
Ok(reader) => reader,
Err(_) => Err(Error::Npy(format!(
"no array for {name} in {:?}",
path.as_ref()
)))?,
};
let header = read_header(&mut reader)?;
let header = Header::parse(&header)?;
if header.fortran_order {
return Err(Error::Npy("fortran order not supported".to_string()));
}
let s = Self::from_reader(header.shape(), header.descr, &mut reader)?;
result.push(s)
}
Ok(result)
}
fn write<T: Write>(&self, f: &mut T) -> Result<()> {
f.write_all(NPY_MAGIC_STRING)?;
f.write_all(&[1u8, 0u8])?;
let header = Header {
descr: self.dtype(),
fortran_order: false,
shape: self.dims().to_vec(),
};
let mut header = header.to_string()?;
let pad = 16 - (NPY_MAGIC_STRING.len() + 5 + header.len()) % 16;
for _ in 0..pad % 16 {
header.push(' ')
}
header.push('\n');
f.write_all(&[(header.len() % 256) as u8, (header.len() / 256) as u8])?;
f.write_all(header.as_bytes())?;
let elem_count = self.elem_count();
match self.dtype() {
DType::BF16 => {
let vs = self.reshape(elem_count)?.to_vec1::<bf16>()?;
for &v in vs.reinterpret_cast() {
f.write_u16::<LittleEndian>(v)?
}
}
DType::F16 => {
let vs = self.reshape(elem_count)?.to_vec1::<f16>()?;
for &v in vs.reinterpret_cast() {
f.write_u16::<LittleEndian>(v)?
}
}
DType::F32 => {
// TODO: Avoid using a buffer when data is already on the CPU.
for v in self.reshape(elem_count)?.to_vec1::<f32>()? {
f.write_f32::<LittleEndian>(v)?
}
}
DType::F64 => {
for v in self.reshape(elem_count)?.to_vec1::<f64>()? {
f.write_f64::<LittleEndian>(v)?
}
}
DType::U32 => {
for v in self.reshape(elem_count)?.to_vec1::<u32>()? {
f.write_u32::<LittleEndian>(v)?
}
}
}
Ok(())
}
/// Writes a multi-dimensional array in the npy format.
pub fn write_npy<T: AsRef<Path>>(&self, path: T) -> Result<()> {
let mut f = File::create(path.as_ref())?;
self.write(&mut f)
}
/// Writes multiple multi-dimensional arrays using the npz format.
pub fn write_npz<S: AsRef<str>, T: AsRef<Tensor>, P: AsRef<Path>>(
ts: &[(S, T)],
path: P,
) -> Result<()> {
let mut zip = zip::ZipWriter::new(File::create(path.as_ref())?);
let options =
zip::write::FileOptions::default().compression_method(zip::CompressionMethod::Stored);
for (name, tensor) in ts.iter() {
zip.start_file(format!("{}.npy", name.as_ref()), options)?;
tensor.as_ref().write(&mut zip)?
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::Header;
#[test]
fn parse() {
let h = "{'descr': '<f8', 'fortran_order': False, 'shape': (128,), }";
assert_eq!(
Header::parse(h).unwrap(),
Header {
descr: crate::DType::F64,
fortran_order: false,
shape: vec![128]
}
);
let h = "{'descr': '<f4', 'fortran_order': True, 'shape': (256,1,128), }";
let h = Header::parse(h).unwrap();
assert_eq!(
h,
Header {
descr: crate::DType::F32,
fortran_order: true,
shape: vec![256, 1, 128]
}
);
assert_eq!(
h.to_string().unwrap(),
"{'descr': '<f4', 'fortran_order': True, 'shape': (256,1,128,), }"
);
let h = Header {
descr: crate::DType::U32,
fortran_order: false,
shape: vec![],
};
assert_eq!(
h.to_string().unwrap(),
"{'descr': '<u4', 'fortran_order': False, 'shape': (), }"
);
}
}

197
candle-core/src/op.rs Normal file
View File

@ -0,0 +1,197 @@
use crate::Tensor;
use half::{bf16, f16};
use num_traits::float::Float;
#[derive(Clone)]
pub(crate) enum Op {
Add(Tensor, Tensor),
Mul(Tensor, Tensor),
Sub(Tensor, Tensor),
Div(Tensor, Tensor),
Matmul(Tensor, Tensor),
Embedding(Tensor, Tensor),
WhereCond(Tensor, Tensor, Tensor),
Cat(Vec<Tensor>, usize),
#[allow(dead_code)] // add is currently unused.
Affine {
arg: Tensor,
mul: f64,
add: f64,
},
Sum(Tensor, Vec<usize>),
ToDType(Tensor),
Broadcast(Tensor),
Exp(Tensor),
Log(Tensor),
Sin(Tensor),
Cos(Tensor),
Abs(Tensor),
Narrow(Tensor, usize, usize, usize),
Neg(Tensor),
Reshape(Tensor),
Softmax(Tensor, usize),
Sqr(Tensor),
Sqrt(Tensor),
ToDevice(Tensor),
Transpose(Tensor, usize, usize),
Gelu(Tensor),
// TODO: Support for custom ops.
}
pub(crate) trait UnaryOp {
const NAME: &'static str;
const KERNEL_BF16: &'static str;
const KERNEL_F16: &'static str;
const KERNEL_F32: &'static str;
const KERNEL_F64: &'static str;
const KERNEL_U32: &'static str;
fn bf16(v1: bf16) -> bf16;
fn f16(v1: f16) -> f16;
fn f32(v1: f32) -> f32;
fn f64(v1: f64) -> f64;
fn u32(v1: u32) -> u32;
}
pub(crate) trait BinaryOp {
const NAME: &'static str;
const KERNEL_BF16: &'static str;
const KERNEL_F16: &'static str;
const KERNEL_F32: &'static str;
const KERNEL_F64: &'static str;
const KERNEL_U32: &'static str;
fn bf16(v1: bf16, v2: bf16) -> bf16;
fn f16(v1: f16, v2: f16) -> f16;
fn f32(v1: f32, v2: f32) -> f32;
fn f64(v1: f64, v2: f64) -> f64;
fn u32(v1: u32, v2: u32) -> u32;
}
pub(crate) struct Add;
pub(crate) struct Div;
pub(crate) struct Mul;
pub(crate) struct Sub;
pub(crate) struct Exp;
pub(crate) struct Log;
pub(crate) struct Sin;
pub(crate) struct Cos;
pub(crate) struct Abs;
pub(crate) struct Neg;
pub(crate) struct Sqr;
pub(crate) struct Sqrt;
pub(crate) struct Gelu;
macro_rules! bin_op {
($op:ident, $name: literal, $e: expr) => {
impl BinaryOp for $op {
const NAME: &'static str = $name;
const KERNEL_BF16: &'static str = concat!("b", $name, "_bf16");
const KERNEL_F16: &'static str = concat!("b", $name, "_f16");
const KERNEL_F32: &'static str = concat!("b", $name, "_f32");
const KERNEL_F64: &'static str = concat!("b", $name, "_f64");
const KERNEL_U32: &'static str = concat!("b", $name, "_u32");
fn bf16(v1: bf16, v2: bf16) -> bf16 {
$e(v1, v2)
}
fn f16(v1: f16, v2: f16) -> f16 {
$e(v1, v2)
}
fn f32(v1: f32, v2: f32) -> f32 {
$e(v1, v2)
}
fn f64(v1: f64, v2: f64) -> f64 {
$e(v1, v2)
}
fn u32(v1: u32, v2: u32) -> u32 {
$e(v1, v2)
}
}
};
}
bin_op!(Add, "add", |v1, v2| v1 + v2);
bin_op!(Sub, "sub", |v1, v2| v1 - v2);
bin_op!(Mul, "mul", |v1, v2| v1 * v2);
bin_op!(Div, "div", |v1, v2| v1 / v2);
macro_rules! unary_op {
($op: ident, $name: literal, $a: ident, $e: expr) => {
impl UnaryOp for $op {
const NAME: &'static str = $name;
const KERNEL_BF16: &'static str = concat!("u", $name, "_bf16");
const KERNEL_F16: &'static str = concat!("u", $name, "_f16");
const KERNEL_F32: &'static str = concat!("u", $name, "_f32");
const KERNEL_F64: &'static str = concat!("u", $name, "_f64");
const KERNEL_U32: &'static str = concat!("u", $name, "_u32");
fn bf16($a: bf16) -> bf16 {
$e
}
fn f16($a: f16) -> f16 {
$e
}
fn f32($a: f32) -> f32 {
$e
}
fn f64($a: f64) -> f64 {
$e
}
fn u32(_: u32) -> u32 {
todo!("no unary function for u32")
}
}
};
}
unary_op!(Exp, "exp", v, v.exp());
unary_op!(Log, "log", v, v.ln());
unary_op!(Sin, "sin", v, v.sin());
unary_op!(Cos, "cos", v, v.cos());
unary_op!(Abs, "abs", v, v.abs());
unary_op!(Neg, "neg", v, -v);
unary_op!(Sqr, "sqr", v, v * v);
unary_op!(Sqrt, "sqrt", v, v.sqrt());
/// `gelu` operation
/// <https://en.wikipedia.org/wiki/Activation_function#Comparison_of_activation_functions>
impl UnaryOp for Gelu {
const NAME: &'static str = "gelu";
fn bf16(v: bf16) -> bf16 {
bf16::from_f32_const(0.5)
* v
* (bf16::ONE
+ bf16::tanh(
(bf16::from_f32_const(2.0) / bf16::PI).sqrt()
* v
* (bf16::ONE + bf16::from_f32_const(0.044715) * v * v),
))
}
fn f16(v: f16) -> f16 {
f16::from_f32_const(0.5)
* v
* (f16::ONE
+ f16::tanh(
(f16::from_f32_const(2.0) / f16::PI).sqrt()
* v
* (f16::ONE + f16::from_f32_const(0.044715) * v * v),
))
}
fn f32(v: f32) -> f32 {
0.5 * v
* (1.0
+ f32::tanh((2.0f32 / std::f32::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)))
}
fn f64(v: f64) -> f64 {
0.5 * v
* (1.0
+ f64::tanh((2.0f64 / std::f64::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)))
}
fn u32(_: u32) -> u32 {
0
}
const KERNEL_BF16: &'static str = "gelu_bf16";
const KERNEL_F16: &'static str = "gelu_f16";
const KERNEL_F32: &'static str = "gelu_f32";
const KERNEL_F64: &'static str = "gelu_f64";
const KERNEL_U32: &'static str = "gelu_u32";
}

199
candle-core/src/shape.rs Normal file
View File

@ -0,0 +1,199 @@
use crate::{Error, Result};
#[derive(Clone, PartialEq, Eq)]
pub struct Shape(Vec<usize>);
impl std::fmt::Debug for Shape {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", &self.dims())
}
}
impl<const C: usize> From<&[usize; C]> for Shape {
fn from(dims: &[usize; C]) -> Self {
Self(dims.to_vec())
}
}
impl From<&[usize]> for Shape {
fn from(dims: &[usize]) -> Self {
Self(dims.to_vec())
}
}
impl From<&Shape> for Shape {
fn from(shape: &Shape) -> Self {
Self(shape.0.to_vec())
}
}
impl From<()> for Shape {
fn from(_: ()) -> Self {
Self(vec![])
}
}
impl From<usize> for Shape {
fn from(d1: usize) -> Self {
Self(vec![d1])
}
}
impl From<(usize, usize)> for Shape {
fn from(d12: (usize, usize)) -> Self {
Self(vec![d12.0, d12.1])
}
}
impl From<(usize, usize, usize)> for Shape {
fn from(d123: (usize, usize, usize)) -> Self {
Self(vec![d123.0, d123.1, d123.2])
}
}
impl From<(usize, usize, usize, usize)> for Shape {
fn from(d1234: (usize, usize, usize, usize)) -> Self {
Self(vec![d1234.0, d1234.1, d1234.2, d1234.3])
}
}
impl From<(usize, usize, usize, usize, usize)> for Shape {
fn from(d12345: (usize, usize, usize, usize, usize)) -> Self {
Self(vec![d12345.0, d12345.1, d12345.2, d12345.3, d12345.4])
}
}
impl From<Vec<usize>> for Shape {
fn from(dims: Vec<usize>) -> Self {
Self(dims)
}
}
macro_rules! extract_dims {
($fn_name:ident, $cnt:tt, $dims:expr, $out_type:ty) => {
pub fn $fn_name(&self) -> Result<$out_type> {
if self.0.len() != $cnt {
Err(Error::UnexpectedNumberOfDims {
expected: $cnt,
got: self.0.len(),
shape: self.clone(),
})
} else {
Ok($dims(&self.0))
}
}
};
}
impl Shape {
pub fn from_dims(dims: &[usize]) -> Self {
Self(dims.to_vec())
}
pub fn rank(&self) -> usize {
self.0.len()
}
pub fn into_dims(self) -> Vec<usize> {
self.0
}
pub fn dims(&self) -> &[usize] {
&self.0
}
pub fn elem_count(&self) -> usize {
self.0.iter().product()
}
extract_dims!(r0, 0, |_: &Vec<usize>| (), ());
extract_dims!(r1, 1, |d: &[usize]| d[0], usize);
extract_dims!(r2, 2, |d: &[usize]| (d[0], d[1]), (usize, usize));
extract_dims!(
r3,
3,
|d: &[usize]| (d[0], d[1], d[2]),
(usize, usize, usize)
);
extract_dims!(
r4,
4,
|d: &[usize]| (d[0], d[1], d[2], d[3]),
(usize, usize, usize, usize)
);
extract_dims!(
r5,
5,
|d: &[usize]| (d[0], d[1], d[2], d[3], d[4]),
(usize, usize, usize, usize, usize)
);
/// The strides given in number of elements for a contiguous n-dimensional
/// arrays using this shape.
pub(crate) fn stride_contiguous(&self) -> Vec<usize> {
let mut stride: Vec<_> = self
.0
.iter()
.rev()
.scan(1, |prod, u| {
let prod_pre_mult = *prod;
*prod *= u;
Some(prod_pre_mult)
})
.collect();
stride.reverse();
stride
}
/// Returns true if the strides are C contiguous (aka row major).
pub fn is_contiguous(&self, stride: &[usize]) -> bool {
if self.0.len() != stride.len() {
return false;
}
let mut acc = 1;
for (&stride, &dim) in stride.iter().zip(self.0.iter()).rev() {
if stride != acc {
return false;
}
acc *= dim;
}
true
}
/// Returns true if the strides are Fortran contiguous (aka column major).
pub fn is_fortran_contiguous(&self, stride: &[usize]) -> bool {
if self.0.len() != stride.len() {
return false;
}
let mut acc = 1;
for (&stride, &dim) in stride.iter().zip(self.0.iter()) {
if stride != acc {
return false;
}
acc *= dim;
}
true
}
pub fn extend(mut self, additional_dims: &[usize]) -> Self {
self.0.extend(additional_dims);
self
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn stride() {
let shape = Shape::from(());
assert_eq!(shape.stride_contiguous(), Vec::<usize>::new());
let shape = Shape::from(42);
assert_eq!(shape.stride_contiguous(), [1]);
let shape = Shape::from((42, 1337));
assert_eq!(shape.stride_contiguous(), [1337, 1]);
let shape = Shape::from((299, 792, 458));
assert_eq!(shape.stride_contiguous(), [458 * 792, 458, 1]);
}
}

261
candle-core/src/storage.rs Normal file
View File

@ -0,0 +1,261 @@
use crate::{op, CpuStorage, CudaStorage, DType, Device, Error, Result, Shape};
// We do not want to implement Clone on Storage as cloning may fail because of
// out of memory. Instead try_clone should be used.
#[derive(Debug)]
pub enum Storage {
Cpu(CpuStorage),
Cuda(CudaStorage),
}
impl Storage {
pub fn try_clone(&self) -> Result<Self> {
match self {
Self::Cpu(storage) => Ok(Self::Cpu(storage.clone())),
Self::Cuda(storage) => {
let storage = storage.try_clone()?;
Ok(Self::Cuda(storage))
}
}
}
pub fn device(&self) -> Device {
match self {
Self::Cpu(_) => Device::Cpu,
Self::Cuda(storage) => Device::Cuda(storage.device().clone()),
}
}
pub fn dtype(&self) -> DType {
match self {
Self::Cpu(storage) => storage.dtype(),
Self::Cuda(storage) => storage.dtype(),
}
}
pub(crate) fn same_device(&self, rhs: &Self, op: &'static str) -> Result<()> {
let lhs = self.device().location();
let rhs = rhs.device().location();
if lhs != rhs {
Err(Error::DeviceMismatchBinaryOp { lhs, rhs, op })
} else {
Ok(())
}
}
pub(crate) fn same_dtype(&self, rhs: &Self, op: &'static str) -> Result<()> {
let lhs = self.dtype();
let rhs = rhs.dtype();
if lhs != rhs {
Err(Error::DTypeMismatchBinaryOp { lhs, rhs, op })
} else {
Ok(())
}
}
pub(crate) fn affine_impl(
&self,
shape: &Shape,
stride: &[usize],
mul: f64,
add: f64,
) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.affine_impl(shape, stride, mul, add)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.affine_impl(shape, stride, mul, add)?;
Ok(Self::Cuda(storage))
}
}
}
pub(crate) fn sum(&self, shape: &Shape, stride: &[usize], s: &[usize]) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.sum(shape, stride, s)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.sum(shape, stride, s)?;
Ok(Self::Cuda(storage))
}
}
}
pub(crate) fn divide_by_sum_over_dim(&mut self, shape: &Shape, dim: usize) -> Result<()> {
match self {
Storage::Cpu(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
Self::Cuda(storage) => storage.divide_by_sum_over_dim(shape, dim)?,
}
Ok(())
}
pub(crate) fn to_dtype(&self, shape: &Shape, stride: &[usize], dtype: DType) -> Result<Self> {
match self {
Storage::Cpu(storage) => {
let storage = storage.to_dtype(shape, stride, dtype)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.to_dtype(shape, stride, dtype)?;
Ok(Self::Cuda(storage))
}
}
}
pub(crate) fn unary_impl<B: op::UnaryOp>(
&self,
shape: &Shape,
stride: &[usize],
) -> Result<Self> {
// TODO: Different code path for the contiguous case?
match self {
Storage::Cpu(storage) => {
let storage = storage.unary_impl::<B>(shape, stride)?;
Ok(Self::Cpu(storage))
}
Self::Cuda(storage) => {
let storage = storage.unary_impl::<B>(shape, stride)?;
Ok(Self::Cuda(storage))
}
}
}
pub(crate) fn binary_impl<B: op::BinaryOp>(
&self,
rhs: &Self,
shape: &Shape,
lhs_stride: &[usize],
rhs_stride: &[usize],
) -> Result<Self> {
self.same_device(rhs, B::NAME)?;
self.same_dtype(rhs, B::NAME)?;
match (self, rhs) {
(Storage::Cpu(lhs), Storage::Cpu(rhs)) => {
let storage = lhs.binary_impl::<B>(rhs, shape, lhs_stride, rhs_stride)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(lhs), Self::Cuda(rhs)) => {
let storage = lhs.binary_impl::<B>(rhs, shape, lhs_stride, rhs_stride)?;
Ok(Self::Cuda(storage))
}
(lhs, rhs) => {
// Should not happen because of the same device check above but we're defensive
// anyway.
Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: B::NAME,
})
}
}
}
pub(crate) fn where_cond(
&self,
shape: &Shape,
stride: &[usize],
t: &Self,
stride_t: &[usize],
f: &Self,
stride_f: &[usize],
) -> Result<Self> {
self.same_device(t, "where")?;
self.same_device(f, "where")?;
t.same_dtype(f, "where")?;
match (self, t, f) {
(Storage::Cpu(cond), Storage::Cpu(t), Storage::Cpu(f)) => {
let storage = cond.where_cond(shape, stride, t, stride_t, f, stride_f)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(cond), Self::Cuda(t), Self::Cuda(f)) => {
let storage = cond.where_cond(shape, stride, t, stride_t, f, stride_f)?;
Ok(Self::Cuda(storage))
}
(_, lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "embedding",
}),
}
}
pub(crate) fn embedding_impl(
&self,
shape: &Shape,
stride: &[usize],
rhs: &Self,
hidden_size: usize,
vocab_size: usize,
) -> Result<Self> {
self.same_device(rhs, "embedding")?;
match (self, rhs) {
(Storage::Cpu(lhs), Storage::Cpu(rhs)) => {
let storage = lhs.embedding_impl(shape, stride, rhs, hidden_size, vocab_size)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(lhs), Self::Cuda(rhs)) => {
let storage = lhs.embedding_impl(shape, stride, rhs, hidden_size, vocab_size)?;
Ok(Self::Cuda(storage))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "embedding",
}),
}
}
pub(crate) fn matmul_impl(
&self,
rhs: &Self,
bmnk: (usize, usize, usize, usize),
lhs_stride: &[usize],
rhs_stride: &[usize],
) -> Result<Self> {
self.same_device(rhs, "matmul")?;
self.same_dtype(rhs, "matmul")?;
match (self, rhs) {
(Self::Cpu(lhs), Self::Cpu(rhs)) => {
let storage = lhs.matmul_impl(rhs, bmnk, lhs_stride, rhs_stride)?;
Ok(Self::Cpu(storage))
}
(Self::Cuda(lhs), Self::Cuda(rhs)) => {
let storage = lhs.matmul_impl(rhs, bmnk, lhs_stride, rhs_stride)?;
Ok(Self::Cuda(storage))
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "matmul",
}),
}
}
// self, the source can be strided whereas dst is contiguous.
pub(crate) fn copy_strided_src(
&self,
dst: &mut Self,
dst_offset: usize,
src_shape: &Shape,
src_stride: &[usize],
src_offset: usize,
) -> Result<()> {
match (self, dst) {
(Self::Cpu(src), Self::Cpu(dst)) => {
src.copy_strided_src(dst, dst_offset, src_shape, src_stride, src_offset)
}
(Self::Cuda(src), Self::Cuda(dst)) => {
Ok(src.copy_strided_src(dst, dst_offset, src_shape, src_stride, src_offset)?)
}
(lhs, rhs) => Err(Error::DeviceMismatchBinaryOp {
lhs: lhs.device().location(),
rhs: rhs.device().location(),
op: "copy",
}),
}
}
}

View File

@ -0,0 +1,61 @@
/// An iterator over offset position for items of an N-dimensional arrays stored in a
/// flat buffer using some potential strides.
#[derive(Debug)]
pub(crate) struct StridedIndex<'a> {
next_storage_index: Option<usize>,
multi_index: Vec<usize>,
dims: &'a [usize],
stride: &'a [usize],
}
impl<'a> StridedIndex<'a> {
pub(crate) fn new(dims: &'a [usize], stride: &'a [usize]) -> Self {
let elem_count: usize = dims.iter().product();
let next_storage_index = if elem_count == 0 {
None
} else {
// This applies to the scalar case.
Some(0)
};
StridedIndex {
next_storage_index,
multi_index: vec![0; dims.len()],
dims,
stride,
}
}
}
impl<'a> Iterator for StridedIndex<'a> {
type Item = usize;
fn next(&mut self) -> Option<Self::Item> {
let storage_index = match self.next_storage_index {
None => return None,
Some(storage_index) => storage_index,
};
let mut updated = false;
for (multi_i, max_i) in self.multi_index.iter_mut().zip(self.dims.iter()).rev() {
let next_i = *multi_i + 1;
if next_i < *max_i {
*multi_i = next_i;
updated = true;
break;
} else {
*multi_i = 0
}
}
self.next_storage_index = if updated {
let next_storage_index = self
.multi_index
.iter()
.zip(self.stride.iter())
.map(|(&x, &y)| x * y)
.sum();
Some(next_storage_index)
} else {
None
};
Some(storage_index)
}
}

1062
candle-core/src/tensor.rs Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,40 @@
use anyhow::{Context, Result};
use candle::{Device, Shape, Tensor};
#[test]
fn simple_grad() -> Result<()> {
let x = Tensor::var(&[3f32, 1., 4.], &Device::Cpu)?;
let y = (((&x * &x)? + &x * 5f64)? + 4f64)?;
let grads = y.backward()?;
let grad_x = grads.get(&x).context("no grad for x")?;
assert_eq!(x.to_vec1::<f32>()?, [3., 1., 4.]);
// y = x^2 + 5.x + 4
assert_eq!(y.to_vec1::<f32>()?, [28., 10., 40.]);
// dy/dx = 2.x + 5
assert_eq!(grad_x.to_vec1::<f32>()?, [11., 7., 13.]);
Ok(())
}
#[test]
fn matmul_grad() -> Result<()> {
let data: Vec<_> = (0..12).map(|i| i as f32).collect();
let x = Tensor::var_from_slice(&data, (2, 2, 3), &Device::Cpu)?;
let data: Vec<_> = (0..12).map(|i| i as f32).collect();
let y = Tensor::var_from_slice(&data, (2, 3, 2), &Device::Cpu)?;
let c = x.matmul(&y)?;
let grads = c.backward()?;
let grad_x = grads.get(&x).context("no grad for x")?;
let grad_y = grads.get(&y).context("no grad for y")?;
assert_eq!(grad_x.shape(), &Shape::from((2, 2, 3)));
assert_eq!(grad_y.shape(), &Shape::from((2, 3, 2)));
assert_eq!(
&*grad_x.storage_data::<f32>()?,
&[1., 5., 9., 1., 5., 9., 13., 17., 21., 13., 17., 21.]
);
assert_eq!(
&*grad_y.storage_data::<f32>()?,
&[3., 3., 5., 5., 7., 7., 15., 15., 17., 17., 19., 19.]
);
Ok(())
}

View File

@ -0,0 +1,228 @@
// TODO: Also test the cuda backend.
use candle::{DType, Device, Result, Tensor};
#[test]
fn zeros() -> Result<()> {
let tensor = Tensor::zeros((5, 2), DType::F32, &Device::Cpu)?;
let (dim1, dim2) = tensor.shape().r2()?;
assert_eq!(dim1, 5);
assert_eq!(dim2, 2);
Ok(())
}
#[test]
fn add_mul() -> Result<()> {
let tensor = Tensor::new(&[3f32, 1., 4.], &Device::Cpu)?;
let dim1 = tensor.shape().r1()?;
assert_eq!(dim1, 3);
let content: Vec<f32> = tensor.to_vec1()?;
assert_eq!(content, [3., 1., 4.]);
let tensor = Tensor::add(&tensor, &tensor)?;
let content: Vec<f32> = tensor.to_vec1()?;
assert_eq!(content, [6., 2., 8.]);
let tensor = Tensor::mul(&tensor, &tensor)?;
let content: Vec<f32> = tensor.to_vec1()?;
assert_eq!(content, [36., 4., 64.]);
Ok(())
}
#[test]
fn tensor_2d() -> Result<()> {
let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
let tensor = Tensor::new(data, &Device::Cpu)?;
let dims = tensor.shape().r2()?;
assert_eq!(dims, (2, 5));
let content: Vec<Vec<f32>> = tensor.to_vec2()?;
assert_eq!(content, data);
Ok(())
}
#[test]
fn binary_op() -> Result<()> {
let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
let tensor = Tensor::new(data, &Device::Cpu)?;
let data2 = &[[5f32, 5., 5., 5., 5.], [2., 1., 7., 8., 2.]];
let tensor2 = Tensor::new(data2, &Device::Cpu)?;
let tensor = (&tensor + (&tensor * &tensor)? / (&tensor + &tensor2))?;
let dims = tensor.shape().r2()?;
assert_eq!(dims, (2, 5));
let content: Vec<Vec<f32>> = tensor.to_vec2()?;
assert_eq!(content[0], [4.125, 1.1666666, 5.7777777, 1.1666666, 7.5]);
assert_eq!(content[1], [3.0, 1.5, 10.5, 12.0, 3.0]);
let tensor = (&tensor - &tensor)?;
let content: Vec<Vec<f32>> = tensor.to_vec2()?;
assert_eq!(content[0], [0., 0., 0., 0., 0.]);
Ok(())
}
#[test]
fn tensor_2d_transpose() -> Result<()> {
let data = &[[3f32, 1., 4., 1., 5.], [2., 1., 7., 8., 2.]];
let tensor = Tensor::new(data, &Device::Cpu)?.t()?;
let dims = tensor.shape().r2()?;
assert_eq!(dims, (5, 2));
assert_eq!(
tensor.to_vec2::<f32>()?,
&[[3f32, 2.], [1., 1.], [4., 7.], [1., 8.], [5., 2.]]
);
assert_eq!(tensor.t()?.to_vec2::<f32>()?, data);
assert_eq!(tensor.contiguous()?.t()?.to_vec2::<f32>()?, data);
assert_eq!(((tensor + 1.)?.t()? - 1.)?.to_vec2::<f32>()?, data);
Ok(())
}
#[test]
fn softmax() -> Result<()> {
let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
let tensor = Tensor::new(data, &Device::Cpu)?;
let t0 = tensor.log()?.softmax(0)?;
let t1 = tensor.log()?.softmax(1)?;
let t2 = tensor.log()?.softmax(2)?;
assert_eq!(
t0.to_vec3::<f32>()?,
&[
// 3/5, 1/2, 4/11
[[0.6, 0.5, 0.36363637], [0.11111111, 0.71428573, 0.5294118]],
// 2/5, 1/2, 7/11
[[0.4, 0.5, 0.63636357], [0.8888889, 0.2857143, 0.47058824]]
]
);
assert_eq!(
t1.to_vec3::<f32>()?,
&[
// 3/4, 1/6, 4/13
[[0.75, 0.16666667, 0.30769232], [0.25, 0.8333333, 0.6923077]],
// 2/10, 1/3, 7/15
[[0.2, 0.33333334, 0.46666664], [0.8, 0.6666667, 0.53333336]]
]
);
assert_eq!(
t2.to_vec3::<f32>()?,
&[
// (3, 1, 4) / 8, (1, 5, 9) / 15
[[0.375, 0.125, 0.5], [0.06666667, 0.33333334, 0.6]],
// (2, 1, 7) / 10, (8, 2, 8) / 18
[[0.2, 0.1, 0.6999999], [0.44444445, 0.11111111, 0.44444445]]
]
);
Ok(())
}
#[test]
fn sum() -> Result<()> {
let data = &[[[3u32, 1, 4], [1, 5, 9]], [[2, 1, 7], [8, 2, 8]]];
let tensor = Tensor::new(data, &Device::Cpu)?;
assert_eq!(
tensor.sum(&[2])?.to_vec3::<u32>()?,
&[[[8], [15]], [[10], [18]]]
);
assert_eq!(
tensor.sum(&[0])?.to_vec3::<u32>()?,
&[[[5, 2, 11], [9, 7, 17]]],
);
assert_eq!(tensor.sum(&[0, 2, 1])?.to_vec3::<u32>()?, &[[[51]]],);
assert_eq!(
tensor.t()?.sum(&[1])?.t()?.to_vec3::<u32>()?,
&[[[8], [15]], [[10], [18]]]
);
assert_eq!(
tensor.sum(&[2, 1])?.to_vec3::<u32>()?,
&[[[8 + 15]], [[10 + 18]]]
);
Ok(())
}
#[test]
fn narrow() -> Result<()> {
let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
let tensor = Tensor::new(data, &Device::Cpu)?;
assert_eq!(
tensor.narrow(2, 1, 2)?.to_vec3::<f32>()?,
&[[[1.0, 4.0], [5.0, 9.0]], [[1.0, 7.0], [2.0, 8.0]]],
);
assert_eq!(
tensor.narrow(1, 1, 1)?.to_vec3::<f32>()?,
&[[[1.0, 5.0, 9.0]], [[8.0, 2.0, 8.0]]],
);
assert_eq!(
tensor.narrow(0, 0, 1)?.to_vec3::<f32>()?,
&[[[3.0, 1.0, 4.0], [1.0, 5.0, 9.0]]],
);
assert_eq!(
tensor.narrow(0, 1, 1)?.to_vec3::<f32>()?,
&[[[2.0, 1.0, 7.0], [8.0, 2.0, 8.0]]],
);
// The following has been checked against PyTorch via:
// import torch
// t = torch.tensor([[[3., 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]])
// t.transpose(-1, -2).narrow(1, 1, 2)
assert_eq!(
tensor.t()?.narrow(1, 1, 2)?.to_vec3::<f32>()?,
&[[[1.0, 5.0], [4.0, 9.0]], [[1.0, 2.0], [7.0, 8.0]]],
);
Ok(())
}
#[test]
fn broadcast() -> Result<()> {
let data = &[3f32, 1., 4.];
let tensor = Tensor::new(data, &Device::Cpu)?;
assert_eq!(
tensor.broadcast_left((3, 1))?.to_vec3::<f32>()?,
&[[[3.0, 1.0, 4.0]], [[3.0, 1.0, 4.0]], [[3.0, 1.0, 4.0]]]
);
Ok(())
}
#[test]
fn cat() -> Result<()> {
// 1D
let t1 = Tensor::new(&[3f32, 1., 4.], &Device::Cpu)?;
let t2 = Tensor::new(&[1f32, 5., 9., 2.], &Device::Cpu)?;
let t3 = Tensor::new(&[6f32, 5., 3., 5., 8., 9.], &Device::Cpu)?;
assert_eq!(Tensor::cat(&[&t1], 0)?.to_vec1::<f32>()?, [3f32, 1., 4.],);
assert_eq!(
Tensor::cat(&[&t1, &t2], 0)?.to_vec1::<f32>()?,
[3f32, 1., 4., 1., 5., 9., 2.],
);
assert_eq!(
Tensor::cat(&[&t1, &t2, &t3], 0)?.to_vec1::<f32>()?,
[3f32, 1., 4., 1., 5., 9., 2., 6., 5., 3., 5., 8., 9.],
);
// 2D
let data = &[[3f32, 1., 4., 1., 5.], [2., 7., 1., 8., 2.]];
let t1 = Tensor::new(data, &Device::Cpu)?;
let data2 = &[[5f32, 5., 5., 5., 5.], [2., 7., 1., 8., 2.]];
let t2 = Tensor::new(data2, &Device::Cpu)?;
assert_eq!(
Tensor::cat(&[&t1, &t2], 0)?.to_vec2::<f32>()?,
[
[3.0, 1.0, 4.0, 1.0, 5.0],
[2.0, 7.0, 1.0, 8.0, 2.0],
[5.0, 5.0, 5.0, 5.0, 5.0],
[2.0, 7.0, 1.0, 8.0, 2.0]
]
);
// TODO: This is not the expected answer, to be fixed!
assert_eq!(
Tensor::cat(&[&t1.t()?, &t2.t()?], 1)?
.t()?
.to_vec2::<f32>()?,
[
[3.0, 1.0, 4.0, 1.0, 5.0],
[2.0, 7.0, 1.0, 8.0, 2.0],
[5.0, 5.0, 5.0, 5.0, 5.0],
[2.0, 7.0, 1.0, 8.0, 2.0]
]
);
// TODO: This is not the expected answer, to be fixed!
assert_eq!(
Tensor::cat(&[&t1, &t2], 1)?.to_vec2::<f32>()?,
[
[3.0, 1.0, 4.0, 1.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0],
[2.0, 7.0, 1.0, 8.0, 2.0, 2.0, 7.0, 1.0, 8.0, 2.0]
]
);
Ok(())
}