Files
candle/inference/hub.html
2025-04-13 14:47:49 +00:00

345 lines
16 KiB
HTML

<!DOCTYPE HTML>
<html lang="en" class="light sidebar-visible" dir="ltr">
<head>
<!-- Book generated using mdBook -->
<meta charset="UTF-8">
<title>Using the hub - Candle Documentation</title>
<!-- Custom HTML head -->
<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#ffffff">
<link rel="icon" href="../favicon.svg">
<link rel="shortcut icon" href="../favicon.png">
<link rel="stylesheet" href="../css/variables.css">
<link rel="stylesheet" href="../css/general.css">
<link rel="stylesheet" href="../css/chrome.css">
<link rel="stylesheet" href="../css/print.css" media="print">
<!-- Fonts -->
<link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
<link rel="stylesheet" href="../fonts/fonts.css">
<!-- Highlight.js Stylesheets -->
<link rel="stylesheet" id="highlight-css" href="../highlight.css">
<link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
<link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">
<!-- Custom theme stylesheets -->
<!-- Provide site root and default themes to javascript -->
<script>
const path_to_root = "../";
const default_light_theme = "light";
const default_dark_theme = "navy";
</script>
<!-- Start loading toc.js asap -->
<script src="../toc.js"></script>
</head>
<body>
<div id="body-container">
<!-- Work around some values being stored in localStorage wrapped in quotes -->
<script>
try {
let theme = localStorage.getItem('mdbook-theme');
let sidebar = localStorage.getItem('mdbook-sidebar');
if (theme.startsWith('"') && theme.endsWith('"')) {
localStorage.setItem('mdbook-theme', theme.slice(1, theme.length - 1));
}
if (sidebar.startsWith('"') && sidebar.endsWith('"')) {
localStorage.setItem('mdbook-sidebar', sidebar.slice(1, sidebar.length - 1));
}
} catch (e) { }
</script>
<!-- Set the theme before any content is loaded, prevents flash -->
<script>
const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches ? default_dark_theme : default_light_theme;
let theme;
try { theme = localStorage.getItem('mdbook-theme'); } catch(e) { }
if (theme === null || theme === undefined) { theme = default_theme; }
const html = document.documentElement;
html.classList.remove('light')
html.classList.add(theme);
html.classList.add("js");
</script>
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
<!-- Hide / unhide sidebar before it is displayed -->
<script>
let sidebar = null;
const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
if (document.body.clientWidth >= 1080) {
try { sidebar = localStorage.getItem('mdbook-sidebar'); } catch(e) { }
sidebar = sidebar || 'visible';
} else {
sidebar = 'hidden';
}
sidebar_toggle.checked = sidebar === 'visible';
html.classList.remove('sidebar-visible');
html.classList.add("sidebar-" + sidebar);
</script>
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
<!-- populated by js -->
<mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
<noscript>
<iframe class="sidebar-iframe-outer" src="../toc.html"></iframe>
</noscript>
<div id="sidebar-resize-handle" class="sidebar-resize-handle">
<div class="sidebar-resize-indicator"></div>
</div>
</nav>
<div id="page-wrapper" class="page-wrapper">
<div class="page">
<div id="menu-bar-hover-placeholder"></div>
<div id="menu-bar" class="menu-bar sticky">
<div class="left-buttons">
<label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
<i class="fa fa-bars"></i>
</label>
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
<i class="fa fa-paint-brush"></i>
</button>
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
<li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
</ul>
<button id="search-toggle" class="icon-button" type="button" title="Search. (Shortkey: s)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="S" aria-controls="searchbar">
<i class="fa fa-search"></i>
</button>
</div>
<h1 class="menu-title">Candle Documentation</h1>
<div class="right-buttons">
<a href="../print.html" title="Print this book" aria-label="Print this book">
<i id="print-button" class="fa fa-print"></i>
</a>
</div>
</div>
<div id="search-wrapper" class="hidden">
<form id="searchbar-outer" class="searchbar-outer">
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
</form>
<div id="searchresults-outer" class="searchresults-outer hidden">
<div id="searchresults-header" class="searchresults-header"></div>
<ul id="searchresults">
</ul>
</div>
</div>
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
<script>
document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebar === 'visible');
document.getElementById('sidebar').setAttribute('aria-hidden', sidebar !== 'visible');
Array.from(document.querySelectorAll('#sidebar a')).forEach(function(link) {
link.setAttribute('tabIndex', sidebar === 'visible' ? 0 : -1);
});
</script>
<div id="content" class="content">
<main>
<h1 id="using-the-hub"><a class="header" href="#using-the-hub">Using the hub</a></h1>
<p>Install the <a href="https://github.com/huggingface/hf-hub"><code>hf-hub</code></a> crate:</p>
<pre><code class="language-bash">cargo add hf-hub
</code></pre>
<p>Then let's start by downloading the <a href="https://huggingface.co/bert-base-uncased/tree/main">model file</a>.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span><span class="boring">extern crate candle_core;
</span><span class="boring">extern crate hf_hub;
</span>use hf_hub::api::sync::Api;
use candle_core::Device;
let api = Api::new().unwrap();
let repo = api.model("bert-base-uncased".to_string());
let weights = repo.get("model.safetensors").unwrap();
let weights = candle_core::safetensors::load(weights, &amp;Device::Cpu);
<span class="boring">}</span></code></pre></pre>
<p>We now have access to all the <a href="https://huggingface.co/bert-base-uncased?show_tensors=true">tensors</a> within the file.</p>
<p>You can check all the names of the tensors <a href="https://huggingface.co/bert-base-uncased?show_tensors=true">here</a></p>
<h2 id="using-async"><a class="header" href="#using-async">Using async</a></h2>
<p><code>hf-hub</code> comes with an async API.</p>
<pre><code class="language-bash">cargo add hf-hub --features tokio
</code></pre>
<pre><code class="language-rust ignore"><span class="boring">This is tested directly in examples crate because it needs external dependencies unfortunately:
</span><span class="boring">See [this](https://github.com/rust-lang/mdBook/issues/706)
</span>use candle::Device;
use hf_hub::api::tokio::Api;
let api = Api::new().unwrap();
let repo = api.model("bert-base-uncased".to_string());
let weights_filename = repo.get("model.safetensors").await.unwrap();
let weights = candle::safetensors::load(weights_filename, &amp;Device::Cpu).unwrap();</code></pre>
<h2 id="using-in-a-real-model"><a class="header" href="#using-in-a-real-model">Using in a real model.</a></h2>
<p>Now that we have our weights, we can use them in our bert architecture:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span><span class="boring">extern crate candle_core;
</span><span class="boring">extern crate candle_nn;
</span><span class="boring">extern crate hf_hub;
</span><span class="boring">use hf_hub::api::sync::Api;
</span><span class="boring">
</span><span class="boring">let api = Api::new().unwrap();
</span><span class="boring">let repo = api.model("bert-base-uncased".to_string());
</span><span class="boring">
</span><span class="boring">let weights = repo.get("model.safetensors").unwrap();
</span>use candle_core::{Device, Tensor, DType};
use candle_nn::{Linear, Module};
let weights = candle_core::safetensors::load(weights, &amp;Device::Cpu).unwrap();
let weight = weights.get("bert.encoder.layer.0.attention.self.query.weight").unwrap();
let bias = weights.get("bert.encoder.layer.0.attention.self.query.bias").unwrap();
let linear = Linear::new(weight.clone(), Some(bias.clone()));
let input_ids = Tensor::zeros((3, 768), DType::F32, &amp;Device::Cpu).unwrap();
let output = linear.forward(&amp;input_ids).unwrap();
<span class="boring">}</span></code></pre></pre>
<p>For a full reference, you can check out the full <a href="https://github.com/LaurentMazare/candle/tree/main/candle-examples/examples/bert">bert</a> example.</p>
<h2 id="memory-mapping"><a class="header" href="#memory-mapping">Memory mapping</a></h2>
<p>For more efficient loading, instead of reading the file, you could use <a href="https://docs.rs/memmap2/latest/memmap2/"><code>memmap2</code></a></p>
<p><strong>Note</strong>: Be careful about memory mapping it seems to cause issues on <a href="https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/5893">Windows, WSL</a>
and will definitely be slower on network mounted disk, because it will issue more read calls.</p>
<pre><code class="language-rust ignore">use candle::Device;
use hf_hub::api::sync::Api;
use memmap2::Mmap;
use std::fs;
let api = Api::new().unwrap();
let repo = api.model("bert-base-uncased".to_string());
let weights_filename = repo.get("model.safetensors").unwrap();
let file = fs::File::open(weights_filename).unwrap();
let mmap = unsafe { Mmap::map(&amp;file).unwrap() };
let weights = candle::safetensors::load_buffer(&amp;mmap[..], &amp;Device::Cpu).unwrap();</code></pre>
<p><strong>Note</strong>: This operation is <strong>unsafe</strong>. <a href="https://docs.rs/memmap2/latest/memmap2/struct.Mmap.html#safety">See the safety notice</a>.
In practice model files should never be modified, and the mmaps should be mostly READONLY anyway, so the caveat most likely does not apply, but always keep it in mind.</p>
<h2 id="tensor-parallel-sharding"><a class="header" href="#tensor-parallel-sharding">Tensor Parallel Sharding</a></h2>
<p>When using multiple GPUs to use in Tensor Parallel in order to get good latency, you can load only the part of the Tensor you need.</p>
<p>For that you need to use <a href="https://crates.io/crates/safetensors"><code>safetensors</code></a> directly.</p>
<pre><code class="language-bash">cargo add safetensors
</code></pre>
<pre><code class="language-rust ignore">use candle::{DType, Device, Tensor};
use hf_hub::api::sync::Api;
use memmap2::Mmap;
use safetensors::slice::IndexOp;
use safetensors::SafeTensors;
use std::fs;
let api = Api::new().unwrap();
let repo = api.model("bert-base-uncased".to_string());
let weights_filename = repo.get("model.safetensors").unwrap();
let file = fs::File::open(weights_filename).unwrap();
let mmap = unsafe { Mmap::map(&amp;file).unwrap() };
// Use safetensors directly
let tensors = SafeTensors::deserialize(&amp;mmap[..]).unwrap();
let view = tensors
.tensor("bert.encoder.layer.0.attention.self.query.weight")
.unwrap();
// We're going to load shard with rank 1, within a world_size of 4
// We're going to split along dimension 0 doing VIEW[start..stop, :]
let rank = 1;
let world_size = 4;
let dim = 0;
let dtype = view.dtype();
let mut tp_shape = view.shape().to_vec();
let size = tp_shape[0];
if size % world_size != 0 {
panic!("The dimension is not divisible by `world_size`");
}
let block_size = size / world_size;
let start = rank * block_size;
let stop = (rank + 1) * block_size;
// Everything is expressed in tensor dimension
// bytes offsets is handled automatically for safetensors.
let iterator = view.slice(start..stop).unwrap();
tp_shape[dim] = block_size;
// Convert safetensors Dtype to candle DType
let dtype: DType = dtype.try_into().unwrap();
// TODO: Implement from_buffer_iterator so we can skip the extra CPU alloc.
let raw: Vec&lt;u8&gt; = iterator.into_iter().flatten().cloned().collect();
let tp_tensor = Tensor::from_raw_buffer(&amp;raw, dtype, &amp;tp_shape, &amp;Device::Cpu).unwrap();</code></pre>
</main>
<nav class="nav-wrapper" aria-label="Page navigation">
<!-- Mobile navigation buttons -->
<a rel="prev" href="../inference/inference.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<i class="fa fa-angle-left"></i>
</a>
<a rel="next prefetch" href="../error_manage.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<i class="fa fa-angle-right"></i>
</a>
<div style="clear: both"></div>
</nav>
</div>
</div>
<nav class="nav-wide-wrapper" aria-label="Page navigation">
<a rel="prev" href="../inference/inference.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<i class="fa fa-angle-left"></i>
</a>
<a rel="next prefetch" href="../error_manage.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<i class="fa fa-angle-right"></i>
</a>
</nav>
</div>
<script>
window.playground_copyable = true;
</script>
<script src="../elasticlunr.min.js"></script>
<script src="../mark.min.js"></script>
<script src="../searcher.js"></script>
<script src="../clipboard.min.js"></script>
<script src="../highlight.js"></script>
<script src="../book.js"></script>
<!-- Custom JS scripts -->
</div>
</body>
</html>