From ffb8d633245e57098d829bfa1b266417f49b8467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Sat, 17 May 2025 03:41:24 +0000 Subject: [PATCH] Use HF Papers --- candle-examples/examples/based/README.md | 2 +- candle-examples/examples/beit/README.md | 2 +- candle-examples/examples/colpali/README.md | 2 +- candle-examples/examples/convmixer/README.md | 2 +- candle-examples/examples/convnext/README.md | 4 ++-- candle-examples/examples/dinov2reg4/README.md | 2 +- candle-examples/examples/dinov2reg4/main.rs | 2 +- candle-examples/examples/efficientnet/main.rs | 2 +- candle-examples/examples/efficientvit/README.md | 2 +- candle-examples/examples/eva2/README.md | 2 +- candle-examples/examples/fastvit/README.md | 2 +- candle-examples/examples/gte-qwen/README.md | 2 +- candle-examples/examples/hiera/README.md | 2 +- candle-examples/examples/mamba/README.md | 2 +- candle-examples/examples/mobileclip/README.md | 2 +- candle-examples/examples/mobilenetv4/README.md | 2 +- candle-examples/examples/mobileone/README.md | 2 +- candle-examples/examples/musicgen/README.md | 2 +- candle-examples/examples/olmo/README.md | 2 +- candle-examples/examples/onnx/README.md | 2 +- candle-examples/examples/quantized-t5/README.md | 2 +- candle-examples/examples/repvgg/README.md | 2 +- candle-examples/examples/resnet/README.md | 2 +- candle-examples/examples/stable-diffusion-3/README.md | 2 +- candle-examples/examples/stable-diffusion-3/sampling.rs | 2 +- candle-examples/examples/starcoder2/README.md | 2 +- candle-examples/examples/stella-en-v5/README.md | 2 +- candle-examples/examples/t5/README.md | 2 +- candle-examples/examples/wuerstchen/README.md | 2 +- candle-metal-kernels/src/reduce.metal | 2 +- candle-nn/src/batch_norm.rs | 2 +- candle-nn/src/layer_norm.rs | 2 +- candle-onnx/src/onnx.proto3 | 4 ++-- candle-pyo3/py_src/candle/nn/normalization.py | 2 +- candle-transformers/src/models/based.rs | 2 +- candle-transformers/src/models/beit.rs | 2 +- candle-transformers/src/models/bert.rs | 2 +- candle-transformers/src/models/bigcode.rs | 2 +- candle-transformers/src/models/blip.rs | 2 +- candle-transformers/src/models/blip_text.rs | 4 ++-- candle-transformers/src/models/chinese_clip/text_model.rs | 4 ++-- candle-transformers/src/models/codegeex4_9b.rs | 2 +- candle-transformers/src/models/convmixer.rs | 2 +- candle-transformers/src/models/convnext.rs | 4 ++-- candle-transformers/src/models/dinov2reg4.rs | 2 +- candle-transformers/src/models/distilbert.rs | 2 +- candle-transformers/src/models/efficientnet.rs | 2 +- candle-transformers/src/models/efficientvit.rs | 2 +- candle-transformers/src/models/encodec.rs | 2 +- candle-transformers/src/models/eva2.rs | 2 +- candle-transformers/src/models/fastvit.rs | 2 +- candle-transformers/src/models/hiera.rs | 2 +- candle-transformers/src/models/llama.rs | 2 +- candle-transformers/src/models/llama2_c.rs | 2 +- candle-transformers/src/models/llama2_c_weights.rs | 2 +- candle-transformers/src/models/llava/mod.rs | 2 +- candle-transformers/src/models/mamba.rs | 4 ++-- candle-transformers/src/models/mixformer.rs | 4 ++-- candle-transformers/src/models/mmdit/mod.rs | 4 ++-- candle-transformers/src/models/mmdit/model.rs | 2 +- candle-transformers/src/models/mobileclip.rs | 4 ++-- candle-transformers/src/models/mobilenetv4.rs | 2 +- candle-transformers/src/models/mobileone.rs | 2 +- candle-transformers/src/models/modernbert.rs | 2 +- candle-transformers/src/models/nvembed_v2/mod.rs | 2 +- candle-transformers/src/models/olmo2.rs | 2 +- 
candle-transformers/src/models/openclip/mod.rs | 2 +- candle-transformers/src/models/paligemma.rs | 4 ++-- candle-transformers/src/models/quantized_blip.rs | 2 +- candle-transformers/src/models/quantized_blip_text.rs | 2 +- candle-transformers/src/models/quantized_llama.rs | 2 +- candle-transformers/src/models/quantized_llama2_c.rs | 2 +- candle-transformers/src/models/quantized_mistral.rs | 2 +- candle-transformers/src/models/quantized_phi.rs | 2 +- .../src/models/quantized_recurrent_gemma.rs | 2 +- candle-transformers/src/models/quantized_t5.rs | 2 +- candle-transformers/src/models/qwen2_moe.rs | 2 +- candle-transformers/src/models/recurrent_gemma.rs | 2 +- candle-transformers/src/models/repvgg.rs | 2 +- candle-transformers/src/models/resnet.rs | 2 +- candle-transformers/src/models/segformer.rs | 2 +- candle-transformers/src/models/segment_anything/mod.rs | 2 +- candle-transformers/src/models/snac.rs | 2 +- candle-transformers/src/models/stable_diffusion/ddim.rs | 2 +- candle-transformers/src/models/stable_diffusion/ddpm.rs | 8 ++++---- candle-transformers/src/models/stable_diffusion/resnet.rs | 2 +- .../src/models/stable_diffusion/schedulers.rs | 2 +- candle-transformers/src/models/stable_diffusion/uni_pc.rs | 2 +- candle-transformers/src/models/starcoder2.rs | 2 +- candle-transformers/src/models/stella_en_v5.rs | 4 ++-- candle-transformers/src/models/t5.rs | 4 ++-- candle-transformers/src/models/trocr.rs | 2 +- candle-transformers/src/models/vgg.rs | 2 +- candle-transformers/src/models/vit.rs | 2 +- candle-transformers/src/models/yi.rs | 2 +- candle-transformers/src/object_detection.rs | 4 ++-- candle-wasm-examples/phi/index.html | 2 +- 97 files changed, 113 insertions(+), 113 deletions(-) diff --git a/candle-examples/examples/based/README.md b/candle-examples/examples/based/README.md index 16bfddb6..6e0c2473 100644 --- a/candle-examples/examples/based/README.md +++ b/candle-examples/examples/based/README.md @@ -4,7 +4,7 @@ Experimental, not instruction-tuned small LLM from the Hazy Research group, comb [Blogpost](https://hazyresearch.stanford.edu/blog/2024-03-03-based) -[Simple linear attention language models balance the recall-throughput tradeoff](https://arxiv.org/abs/2402.18668) +[Simple linear attention language models balance the recall-throughput tradeoff](https://huggingface.co/papers/2402.18668) ## Running an example diff --git a/candle-examples/examples/beit/README.md b/candle-examples/examples/beit/README.md index 23af1e32..3f333406 100644 --- a/candle-examples/examples/beit/README.md +++ b/candle-examples/examples/beit/README.md @@ -1,6 +1,6 @@ # candle-beit -[Beit](https://arxiv.org/abs/2106.08254) is a computer vision model. +[Beit](https://huggingface.co/papers/2106.08254) is a computer vision model. In this example, it is used as an ImageNet classifier: the model returns the probability for the image to belong to each of the 1000 ImageNet categories. 
diff --git a/candle-examples/examples/colpali/README.md b/candle-examples/examples/colpali/README.md index e6a55798..013a46e8 100644 --- a/candle-examples/examples/colpali/README.md +++ b/candle-examples/examples/colpali/README.md @@ -3,7 +3,7 @@ [HuggingFace Model Card](https://huggingface.co/vidore/colpali-v1.2-merged) ``` -wget https://arxiv.org/pdf/1706.03762.pdf +wget https://arxiv.org/pdf/1706.03762.pdf # paper page: https://huggingface.co/papers/1706.03762 cargo run --features cuda,pdf2image --release --example colpali -- --prompt "What is Positional Encoding" --pdf "1706.03762.pdf" ``` diff --git a/candle-examples/examples/convmixer/README.md b/candle-examples/examples/convmixer/README.md index 3981e3d9..ca1b353e 100644 --- a/candle-examples/examples/convmixer/README.md +++ b/candle-examples/examples/convmixer/README.md @@ -2,7 +2,7 @@ A lightweight CNN architecture that processes image patches similar to a vision transformer, with separate spatial and channel convolutions. -ConvMixer from [Patches Are All You Need?](https://arxiv.org/pdf/2201.09792) and [ConvMixer](https://github.com/locuslab/convmixer). +ConvMixer from [Patches Are All You Need?](https://huggingface.co/papers/2201.09792) and [ConvMixer](https://github.com/locuslab/convmixer). ## Running an example diff --git a/candle-examples/examples/convnext/README.md b/candle-examples/examples/convnext/README.md index d532d7a4..aa30adf9 100644 --- a/candle-examples/examples/convnext/README.md +++ b/candle-examples/examples/convnext/README.md @@ -1,7 +1,7 @@ # candle-convnext -[A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) and -[ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808). +[A ConvNet for the 2020s](https://huggingface.co/papers/2201.03545) and +[ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://huggingface.co/papers/2301.00808). This candle implementation uses a pre-trained ConvNeXt network for inference. The classification head has been trained on the ImageNet dataset and returns the diff --git a/candle-examples/examples/dinov2reg4/README.md b/candle-examples/examples/dinov2reg4/README.md index ac86ca69..e1e7a053 100644 --- a/candle-examples/examples/dinov2reg4/README.md +++ b/candle-examples/examples/dinov2reg4/README.md @@ -1,6 +1,6 @@ # candle-dinov2-reg4 -[DINOv2-reg4](https://arxiv.org/abs/2309.16588) is the lastest version of DINOv2 with registers. +[DINOv2-reg4](https://huggingface.co/papers/2309.16588) is the latest version of DINOv2 with registers. In this example, it is used as an plant species classifier: the model returns the probability for the image to belong to each of the 7806 PlantCLEF2024 categories. diff --git a/candle-examples/examples/dinov2reg4/main.rs b/candle-examples/examples/dinov2reg4/main.rs index 15270517..7901a5d9 100644 --- a/candle-examples/examples/dinov2reg4/main.rs +++ b/candle-examples/examples/dinov2reg4/main.rs @@ -1,5 +1,5 @@ //! DINOv2 reg4 finetuned on PlantCLEF 2024 -//! https://arxiv.org/abs/2309.16588 +//! https://huggingface.co/papers/2309.16588 //! https://huggingface.co/spaces/BVRA/PlantCLEF2024 //! https://zenodo.org/records/10848263 diff --git a/candle-examples/examples/efficientnet/main.rs b/candle-examples/examples/efficientnet/main.rs index a8f17cca..26398c2b 100644 --- a/candle-examples/examples/efficientnet/main.rs +++ b/candle-examples/examples/efficientnet/main.rs @@ -1,6 +1,6 @@ //! EfficientNet implementation. //! -//! https://arxiv.org/abs/1905.11946 +//!
https://huggingface.co/papers/1905.11946 #[cfg(feature = "mkl")] extern crate intel_mkl_src; diff --git a/candle-examples/examples/efficientvit/README.md b/candle-examples/examples/efficientvit/README.md index 7a989a25..5ff5c073 100644 --- a/candle-examples/examples/efficientvit/README.md +++ b/candle-examples/examples/efficientvit/README.md @@ -1,6 +1,6 @@ # candle-efficientvit -[EfficientViT: Memory Efficient Vision Transformer with Cascaded Group Attention](https://arxiv.org/abs/2305.07027). +[EfficientViT: Memory Efficient Vision Transformer with Cascaded Group Attention](https://huggingface.co/papers/2305.07027). This candle implementation uses a pre-trained EfficientViT (from Microsoft Research Asia) network for inference. The classification head has been trained on the ImageNet dataset and returns the probabilities for the top-5 classes. diff --git a/candle-examples/examples/eva2/README.md b/candle-examples/examples/eva2/README.md index 10c91b89..c2efbc4f 100644 --- a/candle-examples/examples/eva2/README.md +++ b/candle-examples/examples/eva2/README.md @@ -1,6 +1,6 @@ # candle-eva2 -[EVA-02](https://arxiv.org/abs/2303.11331) is a computer vision model. +[EVA-02](https://huggingface.co/papers/2303.11331) is a computer vision model. In this example, it is used as an ImageNet classifier: the model returns the probability for the image to belong to each of the 1000 ImageNet categories. diff --git a/candle-examples/examples/fastvit/README.md b/candle-examples/examples/fastvit/README.md index 467e1032..c28cd048 100644 --- a/candle-examples/examples/fastvit/README.md +++ b/candle-examples/examples/fastvit/README.md @@ -1,6 +1,6 @@ # candle-fastvit -[FastViT: A Fast Hybrid Vision Transformer using Structural Reparameterization](https://arxiv.org/abs/2303.14189). +[FastViT: A Fast Hybrid Vision Transformer using Structural Reparameterization](https://huggingface.co/papers/2303.14189). This candle implementation uses a pre-trained FastViT network for inference. The classification head has been trained on the ImageNet dataset and returns the probabilities for the top-5 classes. diff --git a/candle-examples/examples/gte-qwen/README.md b/candle-examples/examples/gte-qwen/README.md index c35599e7..d0d8d587 100644 --- a/candle-examples/examples/gte-qwen/README.md +++ b/candle-examples/examples/gte-qwen/README.md @@ -3,7 +3,7 @@ gte-Qwen1.5-7B-instruct is a variant of the GTE embedding model family. - [Model card](https://huggingface.co/Alibaba-NLP/gte-Qwen1.5-7B-instruct) on the HuggingFace Hub. -- [Technical report](https://arxiv.org/abs/2308.03281) *Towards General Text Embeddings with Multi-stage Contrastive Learning* +- [Technical report](https://huggingface.co/papers/2308.03281) *Towards General Text Embeddings with Multi-stage Contrastive Learning* ## Running the example diff --git a/candle-examples/examples/hiera/README.md b/candle-examples/examples/hiera/README.md index 763ce1a5..c8d33f99 100644 --- a/candle-examples/examples/hiera/README.md +++ b/candle-examples/examples/hiera/README.md @@ -1,6 +1,6 @@ # hiera -[Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) +[Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://huggingface.co/papers/2306.00989) This candle implementation uses pre-trained Hiera models from timm for inference. The classification head has been trained on the ImageNet dataset and returns the probabilities for the top-5 classes. 
diff --git a/candle-examples/examples/mamba/README.md b/candle-examples/examples/mamba/README.md index 2470ab7f..3843b979 100644 --- a/candle-examples/examples/mamba/README.md +++ b/candle-examples/examples/mamba/README.md @@ -5,7 +5,7 @@ the transformer architecture. It leverages State Space Models (SSMs) with the goal of being computationally efficient on long sequences. The implementation is based on [mamba.rs](https://github.com/LaurentMazare/mamba.rs). -- [1]. [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752). +- [1]. [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://huggingface.co/papers/2312.00752). Compared to the mamba-minimal example, this version is far more efficient but would only work for inference. diff --git a/candle-examples/examples/mobileclip/README.md b/candle-examples/examples/mobileclip/README.md index a3869b25..acef25da 100644 --- a/candle-examples/examples/mobileclip/README.md +++ b/candle-examples/examples/mobileclip/README.md @@ -2,7 +2,7 @@ MobileCLIP is family of efficient CLIP-like models using FastViT-based image encoders. -See [MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced Training](https://arxiv.org/abs/2311.17049) +See [MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced Training](https://huggingface.co/papers/2311.17049) ## Running on an example on cpu diff --git a/candle-examples/examples/mobilenetv4/README.md b/candle-examples/examples/mobilenetv4/README.md index c8356466..59f3e961 100644 --- a/candle-examples/examples/mobilenetv4/README.md +++ b/candle-examples/examples/mobilenetv4/README.md @@ -1,6 +1,6 @@ # candle-mobilenetv4 -[MobileNetV4 - Universal Models for the Mobile Ecosystem](https://arxiv.org/abs/2404.10518) +[MobileNetV4 - Universal Models for the Mobile Ecosystem](https://huggingface.co/papers/2404.10518) This candle implementation uses pre-trained MobileNetV4 models from timm for inference. The classification head has been trained on the ImageNet dataset and returns the probabilities for the top-5 classes. diff --git a/candle-examples/examples/mobileone/README.md b/candle-examples/examples/mobileone/README.md index b5e88b6f..3657d01f 100644 --- a/candle-examples/examples/mobileone/README.md +++ b/candle-examples/examples/mobileone/README.md @@ -1,6 +1,6 @@ # candle-mobileone -[MobileOne: An Improved One millisecond Mobile Backbone](https://arxiv.org/abs/2206.04040). +[MobileOne: An Improved One millisecond Mobile Backbone](https://huggingface.co/papers/2206.04040). This candle implementation uses a pre-trained MobileOne network for inference. The classification head has been trained on the ImageNet dataset and returns the diff --git a/candle-examples/examples/musicgen/README.md b/candle-examples/examples/musicgen/README.md index 8db388b1..efadd5f3 100644 --- a/candle-examples/examples/musicgen/README.md +++ b/candle-examples/examples/musicgen/README.md @@ -1,6 +1,6 @@ # candle-musicgen -Candle implementation of musicgen from [Simple and Controllable Music Generation](https://arxiv.org/pdf/2306.05284). +Candle implementation of musicgen from [Simple and Controllable Music Generation](https://huggingface.co/papers/2306.05284). 
## Running an example diff --git a/candle-examples/examples/olmo/README.md b/candle-examples/examples/olmo/README.md index 7ceab841..604b526f 100644 --- a/candle-examples/examples/olmo/README.md +++ b/candle-examples/examples/olmo/README.md @@ -3,7 +3,7 @@ OLMo is a series of Open Language Models designed to enable the science of language models. - **Project Page:** https://allenai.org/olmo -- **Papers:** [OLMo](https://arxiv.org/abs/2402.00838) [OLMo 2](https://arxiv.org/abs/2501.00656) +- **Papers:** [OLMo](https://huggingface.co/papers/2402.00838) [OLMo 2](https://huggingface.co/papers/2501.00656) - **Technical blog post:** https://blog.allenai.org/olmo-open-language-model-87ccfc95f580 - **W&B Logs:** https://wandb.ai/ai2-llm/OLMo-1B/reports/OLMo-1B--Vmlldzo2NzY1Njk1 diff --git a/candle-examples/examples/onnx/README.md b/candle-examples/examples/onnx/README.md index d6ca4d37..a040dcc4 100644 --- a/candle-examples/examples/onnx/README.md +++ b/candle-examples/examples/onnx/README.md @@ -2,7 +2,7 @@ This example demonstrates how to run [ONNX](https://github.com/onnx/onnx) based models in Candle. -It contains small variants of two models, [SqueezeNet](https://arxiv.org/pdf/1602.07360.pdf) (default) and [EfficientNet](https://arxiv.org/pdf/1905.11946.pdf). +It contains small variants of two models, [SqueezeNet](https://huggingface.co/papers/1602.07360) (default) and [EfficientNet](https://huggingface.co/papers/1905.11946). You can run the examples with following commands: diff --git a/candle-examples/examples/quantized-t5/README.md b/candle-examples/examples/quantized-t5/README.md index d0a68dbd..f5d98abb 100644 --- a/candle-examples/examples/quantized-t5/README.md +++ b/candle-examples/examples/quantized-t5/README.md @@ -51,7 +51,7 @@ cargo run --example quantized-t5 --release -- \ Note that a storm surge is what forecasters consider a hurricane's most dangerous part. ``` -### [MADLAD-400](https://arxiv.org/abs/2309.04662) +### [MADLAD-400](https://huggingface.co/papers/2309.04662) MADLAD-400 is a series of multilingual machine translation T5 models trained on 250 billion tokens covering over 450 languages using publicly available data. These models are competitive with significantly larger models. diff --git a/candle-examples/examples/repvgg/README.md b/candle-examples/examples/repvgg/README.md index d24bcd6d..0ca184b7 100644 --- a/candle-examples/examples/repvgg/README.md +++ b/candle-examples/examples/repvgg/README.md @@ -1,6 +1,6 @@ # candle-repvgg -[RepVGG: Making VGG-style ConvNets Great Again](https://arxiv.org/abs/2101.03697). +[RepVGG: Making VGG-style ConvNets Great Again](https://huggingface.co/papers/2101.03697). This candle implementation uses a pre-trained RepVGG network for inference. The classification head has been trained on the ImageNet dataset and returns the diff --git a/candle-examples/examples/resnet/README.md b/candle-examples/examples/resnet/README.md index 8565a7f3..8fd5d3eb 100644 --- a/candle-examples/examples/resnet/README.md +++ b/candle-examples/examples/resnet/README.md @@ -1,6 +1,6 @@ # candle-resnet -A candle implementation of inference using a pre-trained [ResNet](https://arxiv.org/abs/1512.03385). +A candle implementation of inference using a pre-trained [ResNet](https://huggingface.co/papers/1512.03385). This uses a classification head trained on the ImageNet dataset and returns the probabilities for the top-5 classes. 
diff --git a/candle-examples/examples/stable-diffusion-3/README.md b/candle-examples/examples/stable-diffusion-3/README.md index adae1b56..2689d5c0 100644 --- a/candle-examples/examples/stable-diffusion-3/README.md +++ b/candle-examples/examples/stable-diffusion-3/README.md @@ -7,7 +7,7 @@ Stable Diffusion 3 Medium is a text-to-image model based on Multimodal Diffusion Transformer (MMDiT) architecture. - [huggingface repo](https://huggingface.co/stabilityai/stable-diffusion-3-medium) -- [research paper](https://arxiv.org/pdf/2403.03206) +- [research paper](https://huggingface.co/papers/2403.03206) - [announcement blog post](https://stability.ai/news/stable-diffusion-3-medium) Stable Diffusion 3.5 is a family of text-to-image models with latest improvements: diff --git a/candle-examples/examples/stable-diffusion-3/sampling.rs b/candle-examples/examples/stable-diffusion-3/sampling.rs index 5e234371..1fcc60c9 100644 --- a/candle-examples/examples/stable-diffusion-3/sampling.rs +++ b/candle-examples/examples/stable-diffusion-3/sampling.rs @@ -69,7 +69,7 @@ pub fn euler_sample( } // The "Resolution-dependent shifting of timestep schedules" recommended in the SD3 tech report paper -// https://arxiv.org/pdf/2403.03206 +// https://huggingface.co/papers/2403.03206 // Following the implementation in ComfyUI: // https://github.com/comfyanonymous/ComfyUI/blob/3c60ecd7a83da43d694e26a77ca6b93106891251/ // comfy/model_sampling.py#L181 diff --git a/candle-examples/examples/starcoder2/README.md b/candle-examples/examples/starcoder2/README.md index ccd7a84e..c684a43d 100644 --- a/candle-examples/examples/starcoder2/README.md +++ b/candle-examples/examples/starcoder2/README.md @@ -1,6 +1,6 @@ # candle-starcoder2 -Candle implementation of Star Coder 2 family of code generation model from [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/pdf/2402.19173). +Candle implementation of the StarCoder 2 family of code generation models from [StarCoder 2 and The Stack v2: The Next Generation](https://huggingface.co/papers/2402.19173). ## Running an example diff --git a/candle-examples/examples/stella-en-v5/README.md b/candle-examples/examples/stella-en-v5/README.md index 61c7e4dd..a36b7594 100644 --- a/candle-examples/examples/stella-en-v5/README.md +++ b/candle-examples/examples/stella-en-v5/README.md @@ -16,7 +16,7 @@ $ cargo run --example stella-en-v5 --release -- --query "What are safetensors?" > Tensor[[1, 1024], f32] ``` -Stella_en_1.5B_v5 is trained by [MRL](https://arxiv.org/abs/2205.13147) enabling multiple embedding dimensions. +Stella_en_1.5B_v5 is trained with [MRL](https://huggingface.co/papers/2205.13147), enabling multiple embedding dimensions. The following reproduces the example in the [model card](https://huggingface.co/dunzhang/stella_en_1.5B_v5) for a retrieval task (s2p). The sample queries and docs are hardcoded in the example. diff --git a/candle-examples/examples/t5/README.md b/candle-examples/examples/t5/README.md index 1e824e31..ee3478aa 100644 --- a/candle-examples/examples/t5/README.md +++ b/candle-examples/examples/t5/README.md @@ -13,7 +13,7 @@ $ cargo run --example t5 --release -- --model-id "t5-small" --prompt "translate Variants such as [flan-t5](https://huggingface.co/google/flan-t5-small), [flan-ul2](https://huggingface.co/google/flan-ul2) (with `--revision "refs/pr/25"`), and [Co-EdIT](https://huggingface.co/grammarly/coedit-large) are also supported.
-## Translation with [MADLAD-400](https://arxiv.org/abs/2309.04662) +## Translation with [MADLAD-400](https://huggingface.co/papers/2309.04662) MADLAD-400 is a series of multilingual machine translation T5 models trained on 250 billion tokens covering over 450 languages using publicly available data. These models are competitive with significantly larger models. diff --git a/candle-examples/examples/wuerstchen/README.md b/candle-examples/examples/wuerstchen/README.md index 1b8accd1..b07757ba 100644 --- a/candle-examples/examples/wuerstchen/README.md +++ b/candle-examples/examples/wuerstchen/README.md @@ -8,7 +8,7 @@ The candle implementation reproduces the same structure/files for models and pipelines. Useful resources: - [Official implementation](https://github.com/dome272/Wuerstchen). -- [Arxiv paper](https://arxiv.org/abs/2306.00637). +- [Arxiv paper](https://huggingface.co/papers/2306.00637). - Blog post: [Introducing Würstchen: Fast Diffusion for Image Generation](https://huggingface.co/blog/wuerstchen). ## Getting the weights diff --git a/candle-metal-kernels/src/reduce.metal b/candle-metal-kernels/src/reduce.metal index c134218c..eae77e8e 100644 --- a/candle-metal-kernels/src/reduce.metal +++ b/candle-metal-kernels/src/reduce.metal @@ -858,7 +858,7 @@ struct finalize_softmax { }; // Welford's algorithm approach for an online softmax implementation. -// Same as the Online normalizer calculation for softmax: https://arxiv.org/pdf/1805.02867.pdf +// Same as the Online normalizer calculation for softmax: https://huggingface.co/papers/1805.02867 template METAL_FUNC void softmax( constant uint &src_numel, diff --git a/candle-nn/src/batch_norm.rs b/candle-nn/src/batch_norm.rs index 4c67961d..5b281831 100644 --- a/candle-nn/src/batch_norm.rs +++ b/candle-nn/src/batch_norm.rs @@ -6,7 +6,7 @@ //! Note that this implementation is for inference only, there is no possibility to track the //! running stats. //! -//! [`Batch Normalization`]: https://arxiv.org/abs/1502.03167 +//! [`Batch Normalization`]: https://huggingface.co/papers/1502.03167 use candle::{DType, Result, Tensor, Var}; #[derive(Debug, Clone, Copy, PartialEq)] diff --git a/candle-nn/src/layer_norm.rs b/candle-nn/src/layer_norm.rs index 468fe24d..a5defa9a 100644 --- a/candle-nn/src/layer_norm.rs +++ b/candle-nn/src/layer_norm.rs @@ -27,7 +27,7 @@ //! # Ok(()) } //! ``` //! -//! [`Layer Normalization`]: https://arxiv.org/abs/1607.06450 +//! [`Layer Normalization`]: https://huggingface.co/papers/1607.06450 use candle::{DType, Module, Result, Tensor, D}; #[derive(Debug, Clone, Copy, PartialEq)] diff --git a/candle-onnx/src/onnx.proto3 b/candle-onnx/src/onnx.proto3 index f47006f8..6ebdb3ac 100644 --- a/candle-onnx/src/onnx.proto3 +++ b/candle-onnx/src/onnx.proto3 @@ -512,8 +512,8 @@ message TensorProto { BFLOAT16 = 16; // Non-IEEE floating-point format based on papers - // FP8 Formats for Deep Learning, https://arxiv.org/abs/2209.05433, - // 8-bit Numerical Formats For Deep Neural Networks, https://arxiv.org/pdf/2206.02915.pdf. + // FP8 Formats for Deep Learning, https://huggingface.co/papers/2209.05433, + // 8-bit Numerical Formats For Deep Neural Networks, https://huggingface.co/papers/2206.02915. // Operators supported FP8 are Cast, CastLike, QuantizeLinear, DequantizeLinear. // The computation usually happens inside a block quantize / dequantize // fused by the runtime. 
diff --git a/candle-pyo3/py_src/candle/nn/normalization.py b/candle-pyo3/py_src/candle/nn/normalization.py index 61d29c51..6ba86df1 100644 --- a/candle-pyo3/py_src/candle/nn/normalization.py +++ b/candle-pyo3/py_src/candle/nn/normalization.py @@ -9,7 +9,7 @@ import numbers class LayerNorm(Module): r"""Applies Layer Normalization over a mini-batch of inputs as described in - the paper `Layer Normalization <https://arxiv.org/abs/1607.06450>` + the paper `Layer Normalization <https://huggingface.co/papers/1607.06450>` math:: y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta diff --git a/candle-transformers/src/models/based.rs b/candle-transformers/src/models/based.rs index 1dbd6dc2..5ec4d2e1 100644 --- a/candle-transformers/src/models/based.rs +++ b/candle-transformers/src/models/based.rs @@ -1,7 +1,7 @@ //! Based from the Stanford Hazy Research group. //! //! See "Simple linear attention language models balance the recall-throughput tradeoff", Arora et al. 2024 -//! - Simple linear attention language models balance the recall-throughput tradeoff. [Arxiv](https://arxiv.org/abs/2402.18668) +//! - Simple linear attention language models balance the recall-throughput tradeoff. [Arxiv](https://huggingface.co/papers/2402.18668) //! - [Github Rep](https://github.com/HazyResearch/based) //! - [Blogpost](https://hazyresearch.stanford.edu/blog/2024-03-03-based) diff --git a/candle-transformers/src/models/beit.rs b/candle-transformers/src/models/beit.rs index 2f61d9d6..4ed8be13 100644 --- a/candle-transformers/src/models/beit.rs +++ b/candle-transformers/src/models/beit.rs @@ -1,7 +1,7 @@ //! Based on the BEIT vision-language model. //! //! See "BEIT: BERT Pre-Training of Image Transformers", Bao et al. 2021 -//! - [Arxiv](https://arxiv.org/abs/2106.08254) +//! - [Arxiv](https://huggingface.co/papers/2106.08254) //! - [Github](https://github.com/microsoft/unilm/tree/master/beit) //! diff --git a/candle-transformers/src/models/bert.rs b/candle-transformers/src/models/bert.rs index 06f4c17d..3b06bec5 100644 --- a/candle-transformers/src/models/bert.rs +++ b/candle-transformers/src/models/bert.rs @@ -3,7 +3,7 @@ //! Bert is a general large language model that can be used for various language tasks: //! - Compute sentence embeddings for a prompt. //! - Compute similarities between a set of sentences. -//! - [Arxiv](https://arxiv.org/abs/1810.04805) "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" +//! - [Arxiv](https://huggingface.co/papers/1810.04805) "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" //! - Upstream [Github repo](https://github.com/google-research/bert). //! - See bert in [candle-examples](https://github.com/huggingface/candle/tree/main/candle-examples/) for runnable code //! diff --git a/candle-transformers/src/models/bigcode.rs b/candle-transformers/src/models/bigcode.rs index c5dcb6bc..941616bb 100644 --- a/candle-transformers/src/models/bigcode.rs +++ b/candle-transformers/src/models/bigcode.rs @@ -3,7 +3,7 @@ //! [StarCoder/BigCode](https://huggingface.co/bigcode/starcoderbase-1b) is a LLM //! model specialized to code generation. The initial model was trained on 80 //! programming languages. See "StarCoder: A State-of-the-Art LLM for Code", Mukherjee et al. 2023 -//! - [Arxiv](https://arxiv.org/abs/2305.06161) +//! - [Arxiv](https://huggingface.co/papers/2305.06161) //! - [Github](https://github.com/bigcode-project/starcoder) //! //!
## Running some example diff --git a/candle-transformers/src/models/blip.rs b/candle-transformers/src/models/blip.rs index a391daac..3c875997 100644 --- a/candle-transformers/src/models/blip.rs +++ b/candle-transformers/src/models/blip.rs @@ -5,7 +5,7 @@ //! - ⚡ [Interactive Wasm Example](https://huggingface.co/spaces/radames/Candle-BLIP-Image-Captioning) //! - 💻 [GH Link](https://github.com/salesforce/BLIP) //! - 🤗 [HF Link](https://huggingface.co/Salesforce/blip-image-captioning-base) -//! - 📝 [Paper](https://arxiv.org/abs/2201.12086) +//! - 📝 [Paper](https://huggingface.co/papers/2201.12086) //! use super::blip_text; diff --git a/candle-transformers/src/models/blip_text.rs b/candle-transformers/src/models/blip_text.rs index ad28193b..3fc92bb0 100644 --- a/candle-transformers/src/models/blip_text.rs +++ b/candle-transformers/src/models/blip_text.rs @@ -1,11 +1,11 @@ //! Implementation of BLIP text encoder/decoder. //! -//! - 📝 [Paper](https://arxiv.org/abs/2201.12086). BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation" +//! - 📝 [Paper](https://huggingface.co/papers/2201.12086). "BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation" //! //! - ⚡ [Interactive Wasm Example](https://huggingface.co/spaces/radames/Candle-BLIP-Image-Captioning) //! - 💻 [GH Link](https://github.com/salesforce/BLIP) //! - 🤗 [HF Link](https://huggingface.co/Salesforce/blip-image-captioning-base) -//! - 📝 [Paper](https://arxiv.org/abs/2201.12086) +//! - 📝 [Paper](https://huggingface.co/papers/2201.12086) //! use super::with_tracing::{linear, Embedding, Linear}; use candle::{Module, Result, Tensor, D}; diff --git a/candle-transformers/src/models/chinese_clip/text_model.rs b/candle-transformers/src/models/chinese_clip/text_model.rs index b43c7423..3032fffe 100644 --- a/candle-transformers/src/models/chinese_clip/text_model.rs +++ b/candle-transformers/src/models/chinese_clip/text_model.rs @@ -13,9 +13,9 @@ use super::Activation; /// Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For /// positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to -/// [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). +/// [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). /// For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models -/// with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). +/// with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). #[derive(Clone, Debug)] pub enum PositionEmbeddingType { Absolute, diff --git a/candle-transformers/src/models/codegeex4_9b.rs b/candle-transformers/src/models/codegeex4_9b.rs index 12522eab..441e3391 100644 --- a/candle-transformers/src/models/codegeex4_9b.rs +++ b/candle-transformers/src/models/codegeex4_9b.rs @@ -2,7 +2,7 @@ //! //! A Pre-Trained Model For Code Generation with Multilingual Evaluations on HumanEval-X" //! -//! - 📝 [Arxiv](https://arxiv.org/abs/2303.17568) +//! - 📝 [Arxiv](https://huggingface.co/papers/2303.17568) //! - 💻 [Github](https://github.com/THUDM/CodeGeeX) //!
diff --git a/candle-transformers/src/models/convmixer.rs b/candle-transformers/src/models/convmixer.rs index 7f924794..b8fcf613 100644 --- a/candle-transformers/src/models/convmixer.rs +++ b/candle-transformers/src/models/convmixer.rs @@ -2,7 +2,7 @@ //! //! See "Patches Are All You Need?" by Trockman et al. 2022 //! -//! - 📝 [Arxiv](https://arxiv.org/abs/2201.09792) +//! - 📝 [Arxiv](https://huggingface.co/papers/2201.09792) //! - 💻 [Github](https://github.com/locuslab/convmixer) //! use candle::Result; diff --git a/candle-transformers/src/models/convnext.rs b/candle-transformers/src/models/convnext.rs index 727e1138..647b5256 100644 --- a/candle-transformers/src/models/convnext.rs +++ b/candle-transformers/src/models/convnext.rs @@ -8,8 +8,8 @@ //! - 💻 [ConvNeXt](https://github.com/facebookresearch/ConvNeXt/) //! - 💻 [ConvNeXt-V2](https://github.com/facebookresearch/ConvNeXt-V2/) //! - 💻 [timm](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/convnext.py) -//! - 📝 [Paper](https://arxiv.org/abs/2201.03545) A ConvNet for the 2020s -//! - 📝 [Paper](https://arxiv.org/abs/2301.00808) ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders +//! - 📝 [Paper](https://huggingface.co/papers/2201.03545) A ConvNet for the 2020s +//! - 📝 [Paper](https://huggingface.co/papers/2301.00808) ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders //! use candle::shape::ShapeWithOneHole; diff --git a/candle-transformers/src/models/dinov2reg4.rs b/candle-transformers/src/models/dinov2reg4.rs index 549f2c3c..b173750e 100644 --- a/candle-transformers/src/models/dinov2reg4.rs +++ b/candle-transformers/src/models/dinov2reg4.rs @@ -4,7 +4,7 @@ //! original architecture. This implementation is specifically trained for plant species //! classification on the PlantCLEF2024 dataset with 7,806 classes. //! -//! - [Paper](https://arxiv.org/abs/2309.16588). DINOv2: Learning Robust Visual Features without Supervision +//! - [Paper](https://huggingface.co/papers/2309.16588). DINOv2: Learning Robust Visual Features without Supervision //! - [GH Repo](https://github.com/facebookresearch/dinov2) //! //! # Example diff --git a/candle-transformers/src/models/distilbert.rs b/candle-transformers/src/models/distilbert.rs index 1b15c5f8..661b0b80 100644 --- a/candle-transformers/src/models/distilbert.rs +++ b/candle-transformers/src/models/distilbert.rs @@ -1,7 +1,7 @@ //! Implementation of DistilBert, a distilled version of BERT. //! //! See: -//! - ["DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter"](https://arxiv.org/abs/1910.01108) +//! - ["DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter"](https://huggingface.co/papers/1910.01108) //! use super::with_tracing::{layer_norm, linear, LayerNorm, Linear}; use candle::{DType, Device, Result, Tensor}; diff --git a/candle-transformers/src/models/efficientnet.rs b/candle-transformers/src/models/efficientnet.rs index be695460..60464e0a 100644 --- a/candle-transformers/src/models/efficientnet.rs +++ b/candle-transformers/src/models/efficientnet.rs @@ -1,7 +1,7 @@ //! Implementation of EfficientBert, an efficient variant of BERT for computer vision tasks. //! //! See: -//! - ["EfficientBERT: Progressively Searching Multilayer Perceptron Architectures for BERT"](https://arxiv.org/abs/2201.00462) +//! - ["EfficientBERT: Progressively Searching Multilayer Perceptron Architectures for BERT"](https://huggingface.co/papers/2201.00462) //! 
use candle::{Context, Result, Tensor, D}; use candle_nn as nn; diff --git a/candle-transformers/src/models/efficientvit.rs b/candle-transformers/src/models/efficientvit.rs index 4c231d76..bc59713a 100644 --- a/candle-transformers/src/models/efficientvit.rs +++ b/candle-transformers/src/models/efficientvit.rs @@ -5,7 +5,7 @@ //! to achieve strong performance while maintaining low memory usage. //! //! The model was originally described in the paper: -//! ["EfficientViT: Memory Efficient Vision Transformer with Cascaded Group Attention"](https://arxiv.org/abs/2305.07027) +//! ["EfficientViT: Memory Efficient Vision Transformer with Cascaded Group Attention"](https://huggingface.co/papers/2305.07027) //! //! This implementation is based on the reference implementation from //! [pytorch-image-models](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/efficientvit_msra.py). diff --git a/candle-transformers/src/models/encodec.rs b/candle-transformers/src/models/encodec.rs index 4bea97b9..e8fad495 100644 --- a/candle-transformers/src/models/encodec.rs +++ b/candle-transformers/src/models/encodec.rs @@ -1,6 +1,6 @@ //! EnCodec neural audio codec based on the Encodec implementation. //! -//! See ["High Fidelity Neural Audio Compression"](https://arxiv.org/abs/2210.13438) +//! See ["High Fidelity Neural Audio Compression"](https://huggingface.co/papers/2210.13438) //! //! Based on implementation from [huggingface/transformers](https://github.com/huggingface/transformers/blob/main/src/transformers/models/encodec/modeling_encodec.py) diff --git a/candle-transformers/src/models/eva2.rs b/candle-transformers/src/models/eva2.rs index 9e31f58c..4d042af6 100644 --- a/candle-transformers/src/models/eva2.rs +++ b/candle-transformers/src/models/eva2.rs @@ -4,7 +4,7 @@ //! The model returns the probability for an image to belong to each of the 1000 //! ImageNet categories. //! -//! - [Paper](https://arxiv.org/abs/2303.11331). EVA-02: A Visual Representation for Neon Genesis +//! - [Paper](https://huggingface.co/papers/2303.11331). EVA-02: A Visual Representation for Neon Genesis //! - [Code](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/eva2.py) //! //! # Example diff --git a/candle-transformers/src/models/fastvit.rs b/candle-transformers/src/models/fastvit.rs index 3f8664d9..f6f37091 100644 --- a/candle-transformers/src/models/fastvit.rs +++ b/candle-transformers/src/models/fastvit.rs @@ -1,7 +1,7 @@ //! # FastViT inference implementation based on timm //! //! ## Description -//! See ["FastViT: A Fast Hybrid Vision Transformer using Structural Reparameterization"](https://arxiv.org/pdf/2303.14189) +//! See ["FastViT: A Fast Hybrid Vision Transformer using Structural Reparameterization"](https://huggingface.co/papers/2303.14189) //! //! Implementation based on [timm model](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/fastvit.py) diff --git a/candle-transformers/src/models/hiera.rs b/candle-transformers/src/models/hiera.rs index 98ad8257..2146c833 100644 --- a/candle-transformers/src/models/hiera.rs +++ b/candle-transformers/src/models/hiera.rs @@ -2,7 +2,7 @@ //! //! //! - 💻 [Hiera](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/hiera.py) -//! - 📝 [Paper](https://arxiv.org/abs/2306.00989). Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles +//! - 📝 [Paper](https://huggingface.co/papers/2306.00989). 
Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles use candle::{Result, D}; use candle_nn::{conv2d, layer_norm, linear, ops::softmax, Conv2dConfig, Func, VarBuilder}; diff --git a/candle-transformers/src/models/llama.rs b/candle-transformers/src/models/llama.rs index 4396063f..93582c78 100644 --- a/candle-transformers/src/models/llama.rs +++ b/candle-transformers/src/models/llama.rs @@ -1,6 +1,6 @@ //! Llama inference implementation. //! -//! See ["LLaMA: Open and Efficient Foundation Language Models"](https://arxiv.org/abs/2302.13971) +//! See ["LLaMA: Open and Efficient Foundation Language Models"](https://huggingface.co/papers/2302.13971) //! //! Implementation based on Hugging Face's [transformers](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py) diff --git a/candle-transformers/src/models/llama2_c.rs b/candle-transformers/src/models/llama2_c.rs index 930c8b8a..eb737a88 100644 --- a/candle-transformers/src/models/llama2_c.rs +++ b/candle-transformers/src/models/llama2_c.rs @@ -1,6 +1,6 @@ //! Llama2 inference implementation. //! -//! See ["LLaMA 2: Open Foundation and Fine-Tuned Chat Models"](https://arxiv.org/abs/2307.09288) +//! See ["LLaMA 2: Open Foundation and Fine-Tuned Chat Models"](https://huggingface.co/papers/2307.09288) //! //! - ⚡ [Interactive Wasm Example](https://huggingface.co/spaces/lmz/candle-llama2) //! - 💻 llama2.c [GH Link](https://github.com/karpathy/llama2.c) diff --git a/candle-transformers/src/models/llama2_c_weights.rs b/candle-transformers/src/models/llama2_c_weights.rs index 8149c214..6294d965 100644 --- a/candle-transformers/src/models/llama2_c_weights.rs +++ b/candle-transformers/src/models/llama2_c_weights.rs @@ -1,6 +1,6 @@ //! Llama2 inference implementation. //! -//! See ["LLaMA 2: Open Foundation and Fine-Tuned Chat Models"](https://arxiv.org/abs/2307.09288) +//! See ["LLaMA 2: Open Foundation and Fine-Tuned Chat Models"](https://huggingface.co/papers/2307.09288) //! //! Based on the [llama2.c](https://github.com/karpathy/llama2.c) implementation diff --git a/candle-transformers/src/models/llava/mod.rs b/candle-transformers/src/models/llava/mod.rs index bc855538..e54b6245 100644 --- a/candle-transformers/src/models/llava/mod.rs +++ b/candle-transformers/src/models/llava/mod.rs @@ -4,7 +4,7 @@ //! language model (Llama) for multimodal capabilities. The architecture implements the training-free projection technique. //! //! - 💻[GH Link](https://github.com/haotian-liu/LLaVA/tree/main) -//! - 📝 [Paper](https://arxiv.org/abs/2304.08485)/ Visual Instruction Tuning +//! - 📝 [Paper](https://huggingface.co/papers/2304.08485). Visual Instruction Tuning //! pub mod config; diff --git a/candle-transformers/src/models/mamba.rs b/candle-transformers/src/models/mamba.rs index dfae0af3..4f352404 100644 --- a/candle-transformers/src/models/mamba.rs +++ b/candle-transformers/src/models/mamba.rs @@ -1,6 +1,6 @@ //! Mamba inference implementation. //! -//! See ["Mamba: Linear-Time Sequence Modeling with Selective State Spaces"](https://arxiv.org/abs/2312.00752) +//! See ["Mamba: Linear-Time Sequence Modeling with Selective State Spaces"](https://huggingface.co/papers/2312.00752) //! //! Based on reference implementation from the AlbertMamba project //! A fast implementation of mamba for inference only. @@ -122,7 +122,7 @@ impl MambaBlock { let proj_for_conv = candle_nn::ops::silu(&proj_for_conv)?; // SSM + Selection, we're doing inference here so only need the last step of // the sequence.
- // Algorithm 3.2 on page 6, https://arxiv.org/pdf/2312.00752.pdf + // Algorithm 3.2 on page 6, https://huggingface.co/papers/2312.00752 let x_proj = self.x_proj.forward(&proj_for_conv)?; let delta = x_proj.narrow(D::Minus1, 0, self.dt_rank)?.contiguous()?; diff --git a/candle-transformers/src/models/mixformer.rs b/candle-transformers/src/models/mixformer.rs index 2c2909c3..8dc92c92 100644 --- a/candle-transformers/src/models/mixformer.rs +++ b/candle-transformers/src/models/mixformer.rs @@ -1,14 +1,14 @@ //! MixFormer (Microsoft's Phi Architecture) //! //! See "Textbooks Are All You Need II: phi-1.5 technical report", Lin et al. 2023 -//! - [Arxiv](https://arxiv.org/abs/2309.05463) +//! - [Arxiv](https://huggingface.co/papers/2309.05463) //! - [Github](https://huggingface.co/microsoft/phi-1_5) //! use crate::models::with_tracing::{linear, Embedding as E, Linear}; /// MixFormer model. /// https://huggingface.co/microsoft/phi-1_5 -/// https://arxiv.org/abs/2309.05463 +/// https://huggingface.co/papers/2309.05463 use candle::{DType, Device, IndexOp, Module, Result, Tensor, D}; use candle_nn::{Activation, VarBuilder}; use serde::Deserialize; diff --git a/candle-transformers/src/models/mmdit/mod.rs b/candle-transformers/src/models/mmdit/mod.rs index 88e73e1e..5d12601d 100644 --- a/candle-transformers/src/models/mmdit/mod.rs +++ b/candle-transformers/src/models/mmdit/mod.rs @@ -3,14 +3,14 @@ //! Mix of Multi-scale Dilated and Traditional Convolutions (MMDiT) is an architecture //! introduced for Stable Diffusion 3, with the MMDiT-X variant used in Stable Diffusion 3.5. //! -//! - 📝 [Research Paper](https://arxiv.org/abs/2403.03206) +//! - 📝 [Research Paper](https://huggingface.co/papers/2403.03206) //! - 💻 ComfyUI [reference implementation](https://github.com/comfyanonymous/ComfyUI/blob/78e133d0415784924cd2674e2ee48f3eeca8a2aa/comfy/ldm/modules/diffusionmodules/mmdit.py) //! - 💻 Stability-AI [MMDiT-X implementation](https://github.com/Stability-AI/sd3.5/blob/4e484e05308d83fb77ae6f680028e6c313f9da54/mmditx.py) //! - ⚡ [Interactive Wasm Example](https://huggingface.co/spaces/radames/Candle-BLIP-Image-Captioning) //! - 💻 [GH Link](https://github.com/salesforce/BLIP) //! - 🤗 [HF Link](https://huggingface.co/Salesforce/blip-image-captioning-base) -//! - 📝 [Paper](https://arxiv.org/abs/2201.12086) +//! - 📝 [Paper](https://huggingface.co/papers/2201.12086) //! pub mod blocks; diff --git a/candle-transformers/src/models/mmdit/model.rs b/candle-transformers/src/models/mmdit/model.rs index 21897aa3..f0c4bdc1 100644 --- a/candle-transformers/src/models/mmdit/model.rs +++ b/candle-transformers/src/models/mmdit/model.rs @@ -1,4 +1,4 @@ -// Implement the MMDiT model originally introduced for Stable Diffusion 3 (https://arxiv.org/abs/2403.03206), +// Implement the MMDiT model originally introduced for Stable Diffusion 3 (https://huggingface.co/papers/2403.03206), // as well as the MMDiT-X variant introduced for Stable Diffusion 3.5-medium (https://huggingface.co/stabilityai/stable-diffusion-3.5-medium) // This follows the implementation of the MMDiT model in the ComfyUI repository. // https://github.com/comfyanonymous/ComfyUI/blob/78e133d0415784924cd2674e2ee48f3eeca8a2aa/comfy/ldm/modules/diffusionmodules/mmdit.py#L1 diff --git a/candle-transformers/src/models/mobileclip.rs b/candle-transformers/src/models/mobileclip.rs index f0baf9e1..74a0178e 100644 --- a/candle-transformers/src/models/mobileclip.rs +++ b/candle-transformers/src/models/mobileclip.rs @@ -6,12 +6,12 @@ //! 
- Projection layers to align the feature spaces //! //! See model details at: -//! - [FastViT](https://arxiv.org/abs/2303.14189) +//! - [FastViT](https://huggingface.co/papers/2303.14189) //! - [OpenCLIP](https://github.com/mlfoundations/open_clip) //! //! References: //! - [MobileVLM](https://huggingface.co/mobileVLM) -//! - [MetaCLIP](https://arxiv.org/abs/2309.16671) +//! - [MetaCLIP](https://huggingface.co/papers/2309.16671) //! use super::fastvit; diff --git a/candle-transformers/src/models/mobilenetv4.rs b/candle-transformers/src/models/mobilenetv4.rs index ab1e7080..b673c5f0 100644 --- a/candle-transformers/src/models/mobilenetv4.rs +++ b/candle-transformers/src/models/mobilenetv4.rs @@ -4,7 +4,7 @@ //! //! ## Paper //! -//! ["MobileNetV4 - Universal Models for the Mobile Ecosystem"](https://arxiv.org/abs/2404.10518) +//! ["MobileNetV4 - Universal Models for the Mobile Ecosystem"](https://huggingface.co/papers/2404.10518) //! //! ## References //! diff --git a/candle-transformers/src/models/mobileone.rs b/candle-transformers/src/models/mobileone.rs index e8836745..99ec69c9 100644 --- a/candle-transformers/src/models/mobileone.rs +++ b/candle-transformers/src/models/mobileone.rs @@ -2,7 +2,7 @@ //! //! MobileOne inference implementation based on timm and candle-repvgg //! -//! See ["MobileOne: An Improved One millisecond Mobile Backbone"](https://arxiv.org/abs/2206.04040) +//! See ["MobileOne: An Improved One millisecond Mobile Backbone"](https://huggingface.co/papers/2206.04040) use candle::{DType, Result, Tensor, D}; use candle_nn::{ diff --git a/candle-transformers/src/models/modernbert.rs b/candle-transformers/src/models/modernbert.rs index e9f4e01c..5269c625 100644 --- a/candle-transformers/src/models/modernbert.rs +++ b/candle-transformers/src/models/modernbert.rs @@ -1,7 +1,7 @@ //! ModernBERT //! //! ModernBERT is a modernized bidirectional encoder-only Transformer model. -//! - [Arxiv](https://arxiv.org/abs/2412.13663) "Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference" +//! - [Arxiv](https://huggingface.co/papers/2412.13663) "Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference" //! - Upstream [Github repo](https://github.com/AnswerDotAI/ModernBERT). //! - See modernbert in [candle-examples](https://github.com/huggingface/candle/tree/main/candle-examples/) for runnable code //! diff --git a/candle-transformers/src/models/nvembed_v2/mod.rs b/candle-transformers/src/models/nvembed_v2/mod.rs index 8a8f7007..0a9e15e4 100644 --- a/candle-transformers/src/models/nvembed_v2/mod.rs +++ b/candle-transformers/src/models/nvembed_v2/mod.rs @@ -2,7 +2,7 @@ //! //! NV-Embed-v2 is a text embedding model that combines a Mistral decoder with a latent attention mechanism to produce high-quality text embeddings. //! -//! This implementation is based on the [paper](https://arxiv.org/pdf/2405.17428) and [weights](https://huggingface.co/nvidia/NV-Embed-v2) +//! This implementation is based on the [paper](https://huggingface.co/papers/2405.17428) and [weights](https://huggingface.co/nvidia/NV-Embed-v2) //! //! # Query-Passage Retrieval Example //! ```bash diff --git a/candle-transformers/src/models/olmo2.rs b/candle-transformers/src/models/olmo2.rs index 5567cb67..d3ac3c63 100644 --- a/candle-transformers/src/models/olmo2.rs +++ b/candle-transformers/src/models/olmo2.rs @@ -2,7 +2,7 @@ //! //! See OLMo 2 model details at: //! 
- [Hugging Face Collection](https://huggingface.co/collections/allenai/olmo-2-674117b93ab84e98afc72edc) -//! - [OLMo 2 Paper](https://arxiv.org/abs/2501.00656) +//! - [OLMo 2 Paper](https://huggingface.co/papers/2501.00656) //! //! use candle::{DType, Device, Module, Result, Tensor, D}; diff --git a/candle-transformers/src/models/openclip/mod.rs b/candle-transformers/src/models/openclip/mod.rs index b3864b81..449ffc22 100644 --- a/candle-transformers/src/models/openclip/mod.rs +++ b/candle-transformers/src/models/openclip/mod.rs @@ -4,7 +4,7 @@ //! pairs of images with related texts. //! //! - 💻 [GH Link](https://github.com/mlfoundations/open_clip) -//! - 📝 [Paper](https://arxiv.org/abs/2212.07143) +//! - 📝 [Paper](https://huggingface.co/papers/2212.07143) //! //! ## Overview //! diff --git a/candle-transformers/src/models/paligemma.rs b/candle-transformers/src/models/paligemma.rs index e9928699..426786b4 100644 --- a/candle-transformers/src/models/paligemma.rs +++ b/candle-transformers/src/models/paligemma.rs @@ -1,7 +1,7 @@ //! Multimodal multi-purpose model combining Gemma-based language model with SigLIP image understanding //! //! See PaLiGemma details at: -//! - [Paper](https://arxiv.org/abs/2402.05257) +//! - [Paper](https://huggingface.co/papers/2402.05257) //! - [Google Blog Post](https://blog.research.google/2024/02/paligemma-scaling-language-image.html) //! //! The model is a multimodal combination of: @@ -11,7 +11,7 @@ //! //! References: //! - [HuggingFace Implementation](https://huggingface.co/google/paligemma-3b) -//! - [Paper: PaLI-3 and Beyond: Scaling Language-Image Learning](https://arxiv.org/abs/2402.05257) +//! - [Paper: PaLI-3 and Beyond: Scaling Language-Image Learning](https://huggingface.co/papers/2402.05257) //! use crate::models::{gemma, siglip}; diff --git a/candle-transformers/src/models/quantized_blip.rs b/candle-transformers/src/models/quantized_blip.rs index acba9ba1..ed981a0d 100644 --- a/candle-transformers/src/models/quantized_blip.rs +++ b/candle-transformers/src/models/quantized_blip.rs @@ -10,7 +10,7 @@ //! - Support for 8-bit quantization //! //! References: -//! - [BLIP Paper](https://arxiv.org/abs/2201.12086) +//! - [BLIP Paper](https://huggingface.co/papers/2201.12086) //! - [Hugging Face Implementation](https://huggingface.co/docs/transformers/model_doc/blip) //! diff --git a/candle-transformers/src/models/quantized_blip_text.rs b/candle-transformers/src/models/quantized_blip_text.rs index 61e468e7..21f9c160 100644 --- a/candle-transformers/src/models/quantized_blip_text.rs +++ b/candle-transformers/src/models/quantized_blip_text.rs @@ -11,7 +11,7 @@ //! - Quantized linear transformations //! //! References: -//! - [BLIP Paper](https://arxiv.org/abs/2201.12086) +//! - [BLIP Paper](https://huggingface.co/papers/2201.12086) //! - [Hugging Face Implementation](https://huggingface.co/docs/transformers/model_doc/blip) //! diff --git a/candle-transformers/src/models/quantized_llama.rs b/candle-transformers/src/models/quantized_llama.rs index e171b54f..96f96dd8 100644 --- a/candle-transformers/src/models/quantized_llama.rs +++ b/candle-transformers/src/models/quantized_llama.rs @@ -11,7 +11,7 @@ //! - Configurable model sizes and parameter counts //! //! - 💻 [GH Link](https://github.com/facebookresearch/llama) -//! - 📝 [Paper](https://arxiv.org/abs/2302.13971) +//! - 📝 [Paper](https://huggingface.co/papers/2302.13971) //! //! ![](https://raw.githubusercontent.com/huggingface/candle/main/candle-examples/examples/quantized/assets/aoc.gif) //! 
diff --git a/candle-transformers/src/models/quantized_llama2_c.rs b/candle-transformers/src/models/quantized_llama2_c.rs index 3eb14bb9..920a611e 100644 --- a/candle-transformers/src/models/quantized_llama2_c.rs +++ b/candle-transformers/src/models/quantized_llama2_c.rs @@ -10,7 +10,7 @@ //! - 8-bit quantization of weights //! //! References: -//! - [LLaMA2 Paper](https://arxiv.org/abs/2307.09288) +//! - [LLaMA2 Paper](https://huggingface.co/papers/2307.09288) //! - [LLaMA2 Technical Report](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) //! diff --git a/candle-transformers/src/models/quantized_mistral.rs b/candle-transformers/src/models/quantized_mistral.rs index cdb687d5..d38e70c2 100644 --- a/candle-transformers/src/models/quantized_mistral.rs +++ b/candle-transformers/src/models/quantized_mistral.rs @@ -11,7 +11,7 @@ //! - Support for 8-bit quantization //! //! References: -//! - [Mistral Paper](https://arxiv.org/abs/2310.06825) +//! - [Mistral Paper](https://huggingface.co/papers/2310.06825) //! - [Model Card](https://huggingface.co/mistralai/Mistral-7B-v0.1) //! diff --git a/candle-transformers/src/models/quantized_phi.rs b/candle-transformers/src/models/quantized_phi.rs index b874ad94..7799900f 100644 --- a/candle-transformers/src/models/quantized_phi.rs +++ b/candle-transformers/src/models/quantized_phi.rs @@ -11,7 +11,7 @@ //! - Support for 8-bit quantization //! //! References: -//! - [Phi2 Paper](https://arxiv.org/abs/2309.05463) +//! - [Phi2 Paper](https://huggingface.co/papers/2309.05463) //! - [Model Card](https://huggingface.co/microsoft/phi-2) //! diff --git a/candle-transformers/src/models/quantized_recurrent_gemma.rs b/candle-transformers/src/models/quantized_recurrent_gemma.rs index e40daa1f..c365ffec 100644 --- a/candle-transformers/src/models/quantized_recurrent_gemma.rs +++ b/candle-transformers/src/models/quantized_recurrent_gemma.rs @@ -11,7 +11,7 @@ //! - Support for 8-bit quantization //! //! References: -//! - [Gemma Paper](https://arxiv.org/abs/2401.06751) +//! - [Gemma Paper](https://huggingface.co/papers/2401.06751) //! - [Model Card](https://ai.google.dev/gemma) //! diff --git a/candle-transformers/src/models/quantized_t5.rs b/candle-transformers/src/models/quantized_t5.rs index 4fc9c537..292df9aa 100644 --- a/candle-transformers/src/models/quantized_t5.rs +++ b/candle-transformers/src/models/quantized_t5.rs @@ -11,7 +11,7 @@ //! - Support for 8-bit quantization //! //! References: -//! - 📝 [T5 Paper](https://arxiv.org/abs/1910.10683) +//! - 📝 [T5 Paper](https://huggingface.co/papers/1910.10683) //! - 🤗 [Model Card](https://huggingface.co/t5-base) //! - 🤗 Original model from [T5](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py) diff --git a/candle-transformers/src/models/qwen2_moe.rs b/candle-transformers/src/models/qwen2_moe.rs index 40e02797..2c17fdbb 100644 --- a/candle-transformers/src/models/qwen2_moe.rs +++ b/candle-transformers/src/models/qwen2_moe.rs @@ -12,7 +12,7 @@ //! - Rotary positional embeddings (RoPE) //! //! References: -//! - [Qwen2 Paper](https://arxiv.org/abs/2401.08985) +//! - [Qwen2 Paper](https://huggingface.co/papers/2401.08985) //! - [Model Card](https://huggingface.co/Qwen/Qwen2-7B-beta) //! 
diff --git a/candle-transformers/src/models/recurrent_gemma.rs b/candle-transformers/src/models/recurrent_gemma.rs index d6a029ba..65783850 100644 --- a/candle-transformers/src/models/recurrent_gemma.rs +++ b/candle-transformers/src/models/recurrent_gemma.rs @@ -12,7 +12,7 @@ //! //! References: //! - [Gemma: Open Models Based on Gemini Technology](https://blog.google/technology/developers/gemma-open-models/) -//! - [Recurrent Memory model architecture](https://arxiv.org/abs/2402.00441) +//! - [Recurrent Memory model architecture](https://huggingface.co/papers/2402.00441) //! //! This implementation is based on the python version from huggingface/transformers. //! https://github.com/huggingface/transformers/blob/b109257f4fb8b1166e7c53cc5418632014ed53a5/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py#L2 diff --git a/candle-transformers/src/models/repvgg.rs b/candle-transformers/src/models/repvgg.rs index 6e45c2d6..85cc0e13 100644 --- a/candle-transformers/src/models/repvgg.rs +++ b/candle-transformers/src/models/repvgg.rs @@ -7,7 +7,7 @@ //! - High accuracy with VGG-like plain architecture and training //! //! References: -//! - [RepVGG Paper](https://arxiv.org/abs/2101.03697). RepVGG: Making VGG-style ConvNets Great Again +//! - [RepVGG Paper](https://huggingface.co/papers/2101.03697). RepVGG: Making VGG-style ConvNets Great Again //! - [Official Implementation](https://github.com/DingXiaoH/RepVGG) //! diff --git a/candle-transformers/src/models/resnet.rs b/candle-transformers/src/models/resnet.rs index 31395c8f..f33a698d 100644 --- a/candle-transformers/src/models/resnet.rs +++ b/candle-transformers/src/models/resnet.rs @@ -4,7 +4,7 @@ //! //! ## Reference //! -//! [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) +//! [Deep Residual Learning for Image Recognition](https://huggingface.co/papers/1512.03385) //! He et al. (2015) //! //! This paper introduced ResNet, a deep neural network architecture that utilizes diff --git a/candle-transformers/src/models/segformer.rs b/candle-transformers/src/models/segformer.rs index 6d750df2..ad7490f7 100644 --- a/candle-transformers/src/models/segformer.rs +++ b/candle-transformers/src/models/segformer.rs @@ -10,7 +10,7 @@ //! - Lightweight all-MLP decode head //! //! References: -//! - [SegFormer Paper](https://arxiv.org/abs/2105.15203) +//! - [SegFormer Paper](https://huggingface.co/papers/2105.15203) //! - [Model Card](https://huggingface.co/nvidia/mit-b0) //! diff --git a/candle-transformers/src/models/segment_anything/mod.rs b/candle-transformers/src/models/segment_anything/mod.rs index fe0b0990..6f429f27 100644 --- a/candle-transformers/src/models/segment_anything/mod.rs +++ b/candle-transformers/src/models/segment_anything/mod.rs @@ -8,7 +8,7 @@ //! //! - ⚡ [Interactive Wasm Example](https://huggingface.co/spaces/radames/candle-segment-anything-wasm) //! - 💻 [GH Link](https://github.com/facebookresearch/segment-anything) -//! - 📝 [Paper](https://arxiv.org/abs/2304.02643) +//! - 📝 [Paper](https://huggingface.co/papers/2304.02643) //! - 💡 The default backbone can be replaced by the smaller and faster TinyViT model based on [MobileSAM](https://github.com/ChaoningZhang/MobileSAM). //! //! diff --git a/candle-transformers/src/models/snac.rs b/candle-transformers/src/models/snac.rs index 65fcb97b..a886e09a 100644 --- a/candle-transformers/src/models/snac.rs +++ b/candle-transformers/src/models/snac.rs @@ -4,7 +4,7 @@ //! See: [SNAC](https://github.com/hubertsiuzdak/snac) //! 
/// Multi-Scale Neural Audio Codec (SNAC) compresses audio into discrete codes at a low bitrate.
-/// For more information, read the paper: https://arxiv.org/abs/2410.14411
+/// For more information, read the paper: https://huggingface.co/papers/2410.14411
///
use candle::{DType, Device, IndexOp, Module, Result, Tensor, D};
use candle_nn::{
diff --git a/candle-transformers/src/models/stable_diffusion/ddim.rs b/candle-transformers/src/models/stable_diffusion/ddim.rs
index d8ef5ec9..39354a5f 100644
--- a/candle-transformers/src/models/stable_diffusion/ddim.rs
+++ b/candle-transformers/src/models/stable_diffusion/ddim.rs
@@ -6,7 +6,7 @@
 //! this to non-Markovian guidance.
 //!
 //! Denoising Diffusion Implicit Models, J. Song et al, 2020.
-//! https://arxiv.org/abs/2010.02502
+//! https://huggingface.co/papers/2010.02502
 use super::schedulers::{
     betas_for_alpha_bar, BetaSchedule, PredictionType, Scheduler, SchedulerConfig, TimestepSpacing,
 };
diff --git a/candle-transformers/src/models/stable_diffusion/ddpm.rs b/candle-transformers/src/models/stable_diffusion/ddpm.rs
index 42a0dc7e..9bf5463a 100644
--- a/candle-transformers/src/models/stable_diffusion/ddpm.rs
+++ b/candle-transformers/src/models/stable_diffusion/ddpm.rs
@@ -104,7 +104,7 @@ impl DDPMScheduler {
         };
         let current_beta_t = 1. - alpha_prod_t / alpha_prod_t_prev;

-        // For t > 0, compute predicted variance βt (see formula (6) and (7) from [the pdf](https://arxiv.org/pdf/2006.11239.pdf))
+        // For t > 0, compute predicted variance βt (see formulas (6) and (7) from [the paper](https://huggingface.co/papers/2006.11239))
         // and sample from it to get previous sample
         // x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
         let variance = (1. - alpha_prod_t_prev) / (1. - alpha_prod_t) * current_beta_t;
@@ -112,7 +112,7 @@
         // retrieve variance
         match self.config.variance_type {
             DDPMVarianceType::FixedSmall => variance.max(1e-20),
-            // for rl-diffuser https://arxiv.org/abs/2205.09991
+            // for rl-diffuser https://huggingface.co/papers/2205.09991
             DDPMVarianceType::FixedSmallLog => {
                 let variance = variance.max(1e-20).ln();
                 (variance * 0.5).exp()
@@ -166,12 +166,12 @@
         }

         // 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
-        // See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        // See formula (7) from https://huggingface.co/papers/2006.11239
         let pred_original_sample_coeff = (alpha_prod_t_prev.sqrt() * current_beta_t) / beta_prod_t;
         let current_sample_coeff = current_alpha_t.sqrt() * beta_prod_t_prev / beta_prod_t;

         // 5. Compute predicted previous sample µ_t
-        // See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+        // See formula (7) from https://huggingface.co/papers/2006.11239
         let pred_prev_sample = ((&pred_original_sample * pred_original_sample_coeff)?
             + sample * current_sample_coeff)?;

diff --git a/candle-transformers/src/models/stable_diffusion/resnet.rs b/candle-transformers/src/models/stable_diffusion/resnet.rs
index 8a6490c5..d345d4b3 100644
--- a/candle-transformers/src/models/stable_diffusion/resnet.rs
+++ b/candle-transformers/src/models/stable_diffusion/resnet.rs
@@ -3,7 +3,7 @@
 //! Some Residual Network blocks used in UNet models.
 //!
 //! Deep Residual Learning for Image Recognition, K. He et al., 2015.
-//! - [Paper](https://arxiv.org/abs/1512.03385)
+//! - [Paper](https://huggingface.co/papers/1512.03385)
 //!
use crate::models::with_tracing::{conv2d, Conv2d};
use candle::{Result, Tensor, D};
diff --git a/candle-transformers/src/models/stable_diffusion/schedulers.rs b/candle-transformers/src/models/stable_diffusion/schedulers.rs
index 1ce94ca2..0c123442 100644
--- a/candle-transformers/src/models/stable_diffusion/schedulers.rs
+++ b/candle-transformers/src/models/stable_diffusion/schedulers.rs
@@ -43,7 +43,7 @@ pub enum PredictionType {
 /// Time step spacing for the diffusion process.
 ///
-/// "linspace", "leading", "trailing" corresponds to annotation of Table 2. of the [paper](https://arxiv.org/abs/2305.08891)
+/// "linspace", "leading", and "trailing" correspond to the annotations in Table 2 of the [paper](https://huggingface.co/papers/2305.08891)
 #[derive(Debug, Clone, Copy)]
 pub enum TimestepSpacing {
     Leading,
diff --git a/candle-transformers/src/models/stable_diffusion/uni_pc.rs b/candle-transformers/src/models/stable_diffusion/uni_pc.rs
index c83417f3..548ba6b1 100644
--- a/candle-transformers/src/models/stable_diffusion/uni_pc.rs
+++ b/candle-transformers/src/models/stable_diffusion/uni_pc.rs
@@ -10,7 +10,7 @@
 //!
 //! For more information, see the original publication:
 //! UniPC: A Unified Predictor-Corrector Framework for Fast Sampling of Diffusion Models, W. Zhao et al, 2023.
-//! https://arxiv.org/abs/2302.04867
+//! https://huggingface.co/papers/2302.04867
 //!
 //! This work is based largely on UniPC implementation from the diffusers python package:
 //! https://raw.githubusercontent.com/huggingface/diffusers/e8aacda762e311505ba05ae340af23b149e37af3/src/diffusers/schedulers/scheduling_unipc_multistep.py
diff --git a/candle-transformers/src/models/starcoder2.rs b/candle-transformers/src/models/starcoder2.rs
index 266221e5..544f983c 100644
--- a/candle-transformers/src/models/starcoder2.rs
+++ b/candle-transformers/src/models/starcoder2.rs
@@ -11,7 +11,7 @@
 //! - Support for 8-bit quantization
 //!
 //! References:
-//! - 📝 [StarCoder Paper](https://arxiv.org/abs/2305.06161)
+//! - 📝 [StarCoder Paper](https://huggingface.co/papers/2305.06161)
 //! - 🤗 [Model Card](https://huggingface.co/bigcode/starcoder)
 //!
diff --git a/candle-transformers/src/models/stella_en_v5.rs b/candle-transformers/src/models/stella_en_v5.rs
index 761e44a9..2c527dae 100644
--- a/candle-transformers/src/models/stella_en_v5.rs
+++ b/candle-transformers/src/models/stella_en_v5.rs
@@ -11,7 +11,7 @@
 //! - Rotary positional embeddings (RoPE)
 //!
 //! References:
-//! - [MRL Framework](https://arxiv.org/abs/2205.13147)
+//! - [MRL Framework](https://huggingface.co/papers/2205.13147)
 //! - [Model Card](https://huggingface.co/dunzhang/stella_en_1.5B_v5)
 //!
@@ -56,7 +56,7 @@ pub struct Config {
 }

 // Excerpt from `stella` model card:
-// `Stella_en_1.5B_v5` models have been trained on [MRL](https://arxiv.org/abs/2205.13147) enabling multiple output dimensions
+// `Stella_en_1.5B_v5` models have been trained on [MRL](https://huggingface.co/papers/2205.13147) enabling multiple output dimensions
 // Embed head represents the config for various embedding dims supported
 #[derive(Debug, Default, Clone, PartialEq, serde::Deserialize)]
 pub struct EmbedHead {
diff --git a/candle-transformers/src/models/t5.rs b/candle-transformers/src/models/t5.rs
index 5d23549f..a01d197f 100644
--- a/candle-transformers/src/models/t5.rs
+++ b/candle-transformers/src/models/t5.rs
@@ -14,7 +14,7 @@
 //! - ⚡ [Interactive Wasm Example](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm)
 //!
- 💻[GH Model](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py) //! - 🤗 [HF Link](https://huggingface.co/docs/transformers/model_doc/t5) -//! - 📝 [T5 Paper](https://arxiv.org/abs/1910.10683) +//! - 📝 [T5 Paper](https://huggingface.co/papers/1910.10683) //! //! # Encoder-decoder example: //! @@ -33,7 +33,7 @@ //! # Translation with MADLAD //! //! -//! [MADLAD-400](https://arxiv.org/abs/2309.04662) is a series of multilingual machine translation T5 models trained on 250 billion tokens covering over 450 languages using publicly available data. These models are competitive with significantly larger models. +//! [MADLAD-400](https://huggingface.co/papers/2309.04662) is a series of multilingual machine translation T5 models trained on 250 billion tokens covering over 450 languages using publicly available data. These models are competitive with significantly larger models. //! //! ```bash //! cargo run --example t5 --release -- \ diff --git a/candle-transformers/src/models/trocr.rs b/candle-transformers/src/models/trocr.rs index 88418dd3..fcfb6d1e 100644 --- a/candle-transformers/src/models/trocr.rs +++ b/candle-transformers/src/models/trocr.rs @@ -10,7 +10,7 @@ //! - Layer normalization and self-attention //! //! References: -//! - [Paper](https://arxiv.org/abs/2109.10282) +//! - [Paper](https://huggingface.co/papers/2109.10282) //! - [Model Card](https://huggingface.co/microsoft/trocr-base-handwritten) //! diff --git a/candle-transformers/src/models/vgg.rs b/candle-transformers/src/models/vgg.rs index 57f9ae67..fb565962 100644 --- a/candle-transformers/src/models/vgg.rs +++ b/candle-transformers/src/models/vgg.rs @@ -10,7 +10,7 @@ //! - ReLU activation and dropout //! //! References: -//! - [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556) +//! - [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://huggingface.co/papers/1409.1556) //! use candle::{ModuleT, Result, Tensor}; diff --git a/candle-transformers/src/models/vit.rs b/candle-transformers/src/models/vit.rs index 49ab4630..d0a9755e 100644 --- a/candle-transformers/src/models/vit.rs +++ b/candle-transformers/src/models/vit.rs @@ -11,7 +11,7 @@ //! - Layer normalization //! //! References: -//! - [ViT Paper](https://arxiv.org/abs/2010.11929) +//! - [ViT Paper](https://huggingface.co/papers/2010.11929) //! - [Model Card](https://huggingface.co/google/vit-base-patch16-224) //! diff --git a/candle-transformers/src/models/yi.rs b/candle-transformers/src/models/yi.rs index 8a2fb111..a38bd86c 100644 --- a/candle-transformers/src/models/yi.rs +++ b/candle-transformers/src/models/yi.rs @@ -6,7 +6,7 @@ //! Original code: //! - 💻 [Yi Model](https://huggingface.co/01-ai/Yi-6B) //! - 💻 [Yi Modeling Code](https://huggingface.co/01-ai/Yi-6B/blob/main/modeling_yi.py) -//! - 📝 [Technical Report](https://arxiv.org/abs/2403.04652) Yi: Open Foundation Models by 01.AI +//! - 📝 [Technical Report](https://huggingface.co/papers/2403.04652) Yi: Open Foundation Models by 01.AI //! //! Key characteristics: //! 
- Multi-head attention with rotary positional embeddings diff --git a/candle-transformers/src/object_detection.rs b/candle-transformers/src/object_detection.rs index d1b78cfa..da285cd6 100644 --- a/candle-transformers/src/object_detection.rs +++ b/candle-transformers/src/object_detection.rs @@ -70,7 +70,7 @@ fn update_confidences( for index in (current_index + 1)..len { let iou_val = iou(current_bbox, &bboxes_for_class[index]); if iou_val > iou_threshold { - // Decay calculation from page 4 of: https://arxiv.org/pdf/1704.04503 + // Decay calculation from page 4 of: https://huggingface.co/papers/1704.04503 let decay = (-iou_val * iou_val / sigma).exp(); let updated_confidence = bboxes_for_class[index].confidence * decay; updated_confidences[index] = updated_confidence; @@ -80,7 +80,7 @@ fn update_confidences( } // Sorts the bounding boxes by confidence and applies soft non-maximum suppression. -// This function is based on the algorithm described in https://arxiv.org/pdf/1704.04503 +// This function is based on the algorithm described in https://huggingface.co/papers/1704.04503 pub fn soft_non_maximum_suppression( bboxes: &mut [Vec>], iou_threshold: Option, diff --git a/candle-wasm-examples/phi/index.html b/candle-wasm-examples/phi/index.html index dbef698a..eabd3336 100644 --- a/candle-wasm-examples/phi/index.html +++ b/candle-wasm-examples/phi/index.html @@ -376,7 +376,7 @@ Very polite review:`, billion parameters. Here you can try the quantized versions. Additional prompt examples are available in the