Quantized GGUF style (#1523)

* Metal quantized modifications proposal.

  - Add a device param, wherever needed.
  - Create new QMetal storage thing that implements QuantizedType.
  - Update everywhere needed.

  Fix Python.
  Fixing examples.
  Fix: fmt + clippy + stub.
  Moving everything around.
  Only missing the actual implementations.
  Fixing everything + adding dequantized kernels.
  More work.
  Fixing matmul.
  Fmt + Clippy.
  Some clippy fixes.
  Working state.

  - Q2K Metal -> Bugged (also present in GGML).
  - Q4K CPU -> Bugged (present previously, new test catches it).
  - Q5K CPU -> Bugged (present previously).
  - Q8_1 Both -> Never really implemented, it seems.
  - Q8K Metal -> Never implemented in Metal.

  Fixing Q2K bug (present in ggml).

* Cleanup.
* Fix the rebase.
* Removing the fences speeds everything up and *is* correct this time...
* Cleanup the fence.
* After rebase.
* Bad code removal.
* Rebase after phi2 merge + fix replit default to CPU.
* Making the CI happy.
* More happy tests.

---------

Co-authored-by: Nicolas Patry <nicolas@Nicolass-MacBook-Pro.local>
2025-06-21 12:20:46 +00:00 · 2024-01-17 10:27:58 +01:00
parent 5270224f40
commit 403680f17d
31 changed files with 6446 additions and 515 deletions
--- a/candle-core/src/quantized/gguf_file.rs
+++ b/candle-core/src/quantized/gguf_file.rs
@ -3,7 +3,7 @@
 //! Spec: https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md

 use super::{GgmlDType, QTensor};
-use crate::Result;
+use crate::{Device, Result};
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use std::collections::HashMap;

@ -59,19 +59,25 @@ impl TensorInfo {
        &self,
        reader: &mut R,
        tensor_data_offset: u64,
+        device: &Device,
    ) -> Result<QTensor> {
        let tensor_elems = self.shape.elem_count();
-        let blck_size = self.ggml_dtype.blck_size();
-        if tensor_elems % blck_size != 0 {
+        let block_size = self.ggml_dtype.block_size();
+        if tensor_elems % block_size != 0 {
            crate::bail!(
-            "the number of elements {tensor_elems} is not divisible by the block size {blck_size}"
+            "the number of elements {tensor_elems} is not divisible by the block size {block_size}"
        )
        }
-        let size_in_bytes = tensor_elems / blck_size * self.ggml_dtype.type_size();
+        let size_in_bytes = tensor_elems / block_size * self.ggml_dtype.type_size();
        let mut raw_data = vec![0u8; size_in_bytes];
        reader.seek(std::io::SeekFrom::Start(tensor_data_offset + self.offset))?;
        reader.read_exact(&mut raw_data)?;
-        super::ggml_file::qtensor_from_ggml(self.ggml_dtype, &raw_data, self.shape.dims().to_vec())
+        super::ggml_file::qtensor_from_ggml(
+            self.ggml_dtype,
+            &raw_data,
+            self.shape.dims().to_vec(),
+            device,
+        )
    }
 }

@ -460,12 +466,13 @@ impl Content {
        &self,
        reader: &mut R,
        name: &str,
+        device: &Device,
    ) -> Result<QTensor> {
        let tensor_info = match self.tensor_infos.get(name) {
            Some(tensor_info) => tensor_info,
            None => crate::bail!("cannot find tensor info for {name}"),
        };
-        tensor_info.read(reader, self.tensor_data_offset)
+        tensor_info.read(reader, self.tensor_data_offset, device)
    }
 }

@ -517,10 +524,9 @@ pub fn write<W: std::io::Seek + std::io::Write>(
                "internal error, unexpected current position {tensor_start_pos} {offset} {pos}"
            )
        }
-        let data_ptr = tensor.as_ptr();
-        let size_in_bytes = tensor.storage_size_in_bytes();
-        let data = unsafe { std::slice::from_raw_parts(data_ptr, size_in_bytes) };
-        w.write_all(data)?;
+        let data = tensor.data()?;
+        let size_in_bytes = data.len();
+        w.write_all(&data)?;
        let padding = 31 - (31 + size_in_bytes) % 32;
        w.write_all(&vec![0u8; padding])?;
    }