Commit 826cb12

Merge branch 'main' into add_afmoe_model
2 parents 8958684 + 453a246 commit 826cb12

946 files changed, +36588 -61261 lines changed

.circleci/config.yml

Lines changed: 4 additions & 4 deletions
@@ -46,8 +46,8 @@ jobs:
 - run: uv pip install -U -e .
 - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
 - run: mkdir -p test_preparation
-- run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt
-- run: python utils/tests_fetcher.py --filter_tests
+- run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt || true
+- run: python utils/tests_fetcher.py --filter_tests || true
 - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation
 - run: |
     if [ ! -s test_preparation/generated_config.yml ]; then
@@ -98,8 +98,8 @@ jobs:
 - run: uv pip install -U -e .
 - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
 - run: mkdir -p test_preparation
-- run: python utils/tests_fetcher.py --fetch_all | tee tests_fetched_summary.txt
-- run: python utils/tests_fetcher.py --filter_tests
+- run: python utils/tests_fetcher.py --fetch_all | tee tests_fetched_summary.txt || true
+- run: python utils/tests_fetcher.py --filter_tests || true
 - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation
 - run: |
     if [ ! -s test_preparation/generated_config.yml ]; then

CONTRIBUTING.md

Lines changed: 2 additions & 1 deletion
@@ -125,8 +125,9 @@ If you're contributing a **vision-language model** (or any multimodal model that
 All new models should use the modular architecture pattern. Create a `modular_<model_name>.py` file using the modular model converter:
 
 - Use the CLI, [`transformers add-new-model-like`](https://github.com/huggingface/transformers/blob/main/src/transformers/cli/add_new_model_like.py) to generate a modular skeleton and get started
-- All code should be in the modular file if possible. Modeling must be in it, it's better if configuration is in it as well.
+- All code should be in the modular file if possible. Modeling must be in it, it's better if configuration is in it as well. [Modular guide](./modular_transformers#implementing-a-modular-file) shows a quick way to set up a modular file.
 - Reuse existing patterns from similar models as much as possible
+- You can make the model compatible with inference engines such as vLLM or SGLang, and enable zero-effort integration. See specific requirements for model implementation in ["Transformers modeling backend"](./transformers_as_backend#multimodal-models)
 
 To verify your modular file is correct, run:
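For context on the modular pattern this hunk refers to: a modular file subclasses the components of an existing model and lets the converter expand them into standalone modeling/configuration files. A minimal, hypothetical sketch (assuming the new model reuses Llama components; all names here are illustrative, not part of this commit):

# modular_my_model.py -- hypothetical sketch of a modular file
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaModel


class MyModelConfig(LlamaConfig):
    model_type = "my_model"


class MyModelModel(LlamaModel):
    pass


class MyModelForCausalLM(LlamaForCausalLM):
    pass

The modular converter expands a file like this into full `modeling_*.py` and `configuration_*.py` files, so only the deltas from the parent model have to be written by hand.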

Makefile

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ repo-consistency:
 	python utils/check_modular_conversion.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
+	python utils/check_init_weights_data.py
 	python utils/check_inits.py
 	python utils/check_pipeline_typing.py
 	python utils/check_config_docstrings.py

benchmark_v2/framework/benchmark_config.py

Lines changed: 2 additions & 0 deletions
@@ -203,6 +203,8 @@ def adapt_configs(
 config["sequence_length"] = seqlen
 config["num_tokens_to_generate"] = ntok
 config["gpu_monitoring"] = monitor
+# Remove the old name so it gets re-inferred with the updated values
+config.pop("name", None)
 adapted_configs.append(BenchmarkConfig.from_dict(config))
 return adapted_configs
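The intent of the added `config.pop("name", None)` is that a name derived from the old field values should not survive once those fields are mutated. A self-contained sketch of that pattern (the naming logic below is made up for illustration and is not the real `BenchmarkConfig.from_dict`):

def build_config(d: dict) -> dict:
    # Derive a display name from the fields unless one is already present.
    d = dict(d)
    d.setdefault("name", f"b{d['batch_size']}_s{d['sequence_length']}")
    return d

cfg = build_config({"batch_size": 1, "sequence_length": 128})
cfg["sequence_length"] = 256
cfg.pop("name", None)              # drop the stale "b1_s128"
print(build_config(cfg)["name"])   # "b1_s256", re-derived from the updated values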

benchmark_v2/run_benchmarks.py

Lines changed: 6 additions & 6 deletions
@@ -80,16 +80,16 @@
 logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
 logger.info(f"Output directory: {args.output_dir}")
 
-# We cannot compute ITL if we don't have at least two measurements
-if any(n <= 1 for n in args.num_tokens_to_generate):
-    raise ValueError("--num_tokens_to_generate arguments should be larger than 1")
-
 # Error out if one of the arguments is not provided
-if len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 0:
+if any(arg is None for arg in [args.batch_size, args.sequence_length, args.num_tokens_to_generate]):
     raise ValueError(
-        "At least one of the arguments --batch-size, --sequence-length, or --num-tokens-to-generate is required"
+        "All of the arguments --batch-size, --sequence-length, and --num-tokens-to-generate are required"
     )
 
+# We cannot compute ITL if we don't have at least two measurements
+if any(n <= 1 for n in args.num_tokens_to_generate):
+    raise ValueError("--num_tokens_to_generate arguments should be larger than 1")
+
 # Get the configs for the given coverage level
 configs = get_config_by_level(args.level)
 # Adapt the configs to the given arguments
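The reordering checks that all three arguments are present before validating their values; the second check exists because inter-token latency (ITL) is a difference between consecutive token timestamps, so it is undefined when only one token is generated. A tiny standalone sketch of that point (the timestamps are made-up data, not benchmark output):

# ITL = gaps between consecutive token arrival times, so at least two
# generated tokens are needed before there is anything to measure.
token_times = [0.000, 0.031, 0.060, 0.093]  # seconds; illustrative values
itl = [t1 - t0 for t0, t1 in zip(token_times, token_times[1:])]
print(itl)  # three gaps for four tokens; an empty list if only one token arrived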

docker/transformers-pytorch-amd-gpu/Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-FROM rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.7.1
+FROM rocm/pytorch:rocm7.1_ubuntu22.04_py3.10_pytorch_release_2.8.0
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
@@ -34,7 +34,7 @@ RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y
 RUN python3 -m pip uninstall -y kernels
 
 # On ROCm, torchcodec is required to decode audio files and 0.4 or 0.6 fails
-RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
+RUN python3 -m pip install --no-cache-dir "torchcodec==0.7"
 
 # Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
 RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \

docs/source/de/add_new_model.md

Lines changed: 7 additions & 7 deletions
@@ -508,16 +508,16 @@ BERT `_init_weights` Methode:
 def _init_weights(self, module):
     """Initialize the weights"""
     if isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
     elif isinstance(module, nn.Embedding):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.padding_idx is not None:
             module.weight.data[module.padding_idx].zero_()
     elif isinstance(module, nn.LayerNorm):
-        module.bias.data.zero_()
-        module.weight.data.fill_(1.0)
+        module.bias.zero_()
+        module.weight.fill_(1.0)
 ```
 
 Sie können weitere benutzerdefinierte Schemata verwenden, wenn Sie eine spezielle Initialisierung für einige Module benötigen. Zum Beispiel in
@@ -533,9 +533,9 @@ def _init_weights(self, module):
         module.project_hid._is_hf_initialized = True
         module.project_q._is_hf_initialized = True
     elif isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
 ```
 
 Das Flag `_is_hf_initialized` wird intern verwendet, um sicherzustellen, dass wir ein Submodul nur einmal initialisieren. Wenn Sie es auf

docs/source/en/_toctree.yml

Lines changed: 11 additions & 49 deletions
@@ -118,7 +118,7 @@
 - local: tools
   title: Tools
 - local: transformers_as_backend
-  title: Inference server backends
+  title: Transformers as modeling backend
 - local: continuous_batching
   title: Continuous Batching
 title: Inference
@@ -422,8 +422,6 @@
   title: BLOOM
 - local: model_doc/blt
   title: BLT
-- local: model_doc/bort
-  title: BORT
 - local: model_doc/byt5
   title: ByT5
 - local: model_doc/camembert
@@ -478,8 +476,6 @@
   title: Ernie4_5
 - local: model_doc/ernie4_5_moe
   title: Ernie4_5_MoE
-- local: model_doc/ernie_m
-  title: ErnieM
 - local: model_doc/esm
   title: ESM
 - local: model_doc/exaone4
@@ -534,8 +530,6 @@
   title: GPTBigCode
 - local: model_doc/gpt_oss
   title: GptOss
-- local: model_doc/gptsan-japanese
-  title: GPTSAN Japanese
 - local: model_doc/gpt-sw3
   title: GPTSw3
 - local: model_doc/granite
@@ -560,8 +554,6 @@
   title: Jamba
 - local: model_doc/jetmoe
   title: JetMoe
-- local: model_doc/jukebox
-  title: Jukebox
 - local: model_doc/led
   title: LED
 - local: model_doc/lfm2
@@ -596,8 +588,6 @@
   title: MarkupLM
 - local: model_doc/mbart
   title: MBart and MBart-50
-- local: model_doc/mega
-  title: MEGA
 - local: model_doc/megatron-bert
   title: MegatronBERT
 - local: model_doc/megatron_gpt2
@@ -632,8 +622,6 @@
   title: myt5
 - local: model_doc/nemotron
   title: Nemotron
-- local: model_doc/nezha
-  title: NEZHA
 - local: model_doc/nllb
   title: NLLB
 - local: model_doc/nllb-moe
@@ -648,8 +636,6 @@
   title: Olmo3
 - local: model_doc/olmoe
   title: OLMoE
-- local: model_doc/open-llama
-  title: Open-Llama
 - local: model_doc/opt
   title: OPT
 - local: model_doc/pegasus
@@ -670,8 +656,6 @@
   title: PLBart
 - local: model_doc/prophetnet
   title: ProphetNet
-- local: model_doc/qdqbert
-  title: QDQBert
 - local: model_doc/qwen2
   title: Qwen2
 - local: model_doc/qwen2_moe
@@ -684,16 +668,12 @@
   title: Qwen3Next
 - local: model_doc/rag
   title: RAG
-- local: model_doc/realm
-  title: REALM
 - local: model_doc/recurrent_gemma
   title: RecurrentGemma
 - local: model_doc/reformer
   title: Reformer
 - local: model_doc/rembert
   title: RemBERT
-- local: model_doc/retribert
-  title: RetriBERT
 - local: model_doc/roberta
   title: RoBERTa
 - local: model_doc/roberta-prelayernorm
@@ -722,10 +702,6 @@
   title: T5Gemma
 - local: model_doc/t5v1.1
   title: T5v1.1
-- local: model_doc/tapex
-  title: TAPEX
-- local: model_doc/transfo-xl
-  title: Transformer XL
 - local: model_doc/ul2
   title: UL2
 - local: model_doc/umt5
@@ -738,8 +714,6 @@
   title: XGLM
 - local: model_doc/xlm
   title: XLM
-- local: model_doc/xlm-prophetnet
-  title: XLM-ProphetNet
 - local: model_doc/xlm-roberta
   title: XLM-RoBERTa
 - local: model_doc/xlm-roberta-xl
@@ -786,8 +760,6 @@
   title: Depth Anything V2
 - local: model_doc/depth_pro
   title: DepthPro
-- local: model_doc/deta
-  title: DETA
 - local: model_doc/detr
   title: DETR
 - local: model_doc/dinat
@@ -802,8 +774,6 @@
   title: DiT
 - local: model_doc/dpt
   title: DPT
-- local: model_doc/efficientformer
-  title: EfficientFormer
 - local: model_doc/efficientloftr
   title: EfficientLoFTR
 - local: model_doc/efficientnet
@@ -840,8 +810,6 @@
   title: MobileViT
 - local: model_doc/mobilevitv2
   title: MobileViTV2
-- local: model_doc/nat
-  title: NAT
 - local: model_doc/poolformer
   title: PoolFormer
 - local: model_doc/prompt_depth_anything
@@ -860,6 +828,8 @@
   title: RT-DETRv2
 - local: model_doc/sam2
   title: SAM2
+- local: model_doc/sam3_tracker
+  title: Sam3Tracker
 - local: model_doc/segformer
   title: SegFormer
 - local: model_doc/seggpt
@@ -888,12 +858,8 @@
   title: Timm Wrapper
 - local: model_doc/upernet
   title: UperNet
-- local: model_doc/van
-  title: VAN
 - local: model_doc/vit
   title: Vision Transformer (ViT)
-- local: model_doc/vit_hybrid
-  title: ViT Hybrid
 - local: model_doc/vitdet
   title: ViTDet
 - local: model_doc/vit_mae
@@ -932,8 +898,6 @@
   title: Hubert
 - local: model_doc/kyutai_speech_to_text
   title: Kyutai Speech-To-Text
-- local: model_doc/mctct
-  title: MCTCT
 - local: model_doc/mimi
   title: Mimi
 - local: model_doc/mms
@@ -960,8 +924,6 @@
   title: SEW-D
 - local: model_doc/speech_to_text
   title: Speech2Text
-- local: model_doc/speech_to_text_2
-  title: Speech2Text2
 - local: model_doc/speecht5
   title: SpeechT5
 - local: model_doc/unispeech
@@ -994,6 +956,8 @@
 - sections:
 - local: model_doc/sam2_video
   title: SAM2 Video
+- local: model_doc/sam3_tracker_video
+  title: Sam3TrackerVideo
 - local: model_doc/timesformer
   title: TimeSformer
 - local: model_doc/vjepa2
@@ -1068,6 +1032,8 @@
   title: Gemma3n
 - local: model_doc/git
   title: GIT
+- local: model_doc/glm46v
+  title: Glm46V
 - local: model_doc/glm4v
   title: glm4v
 - local: model_doc/glm4v_moe
@@ -1172,6 +1138,10 @@
   title: Qwen3VL
 - local: model_doc/qwen3_vl_moe
   title: Qwen3VLMoe
+- local: model_doc/sam3
+  title: SAM3
+- local: model_doc/sam3_video
+  title: SAM3 Video
 - local: model_doc/shieldgemma2
   title: ShieldGemma2
 - local: model_doc/siglip
@@ -1188,8 +1158,6 @@
   title: TAPAS
 - local: model_doc/trocr
   title: TrOCR
-- local: model_doc/tvlt
-  title: TVLT
 - local: model_doc/tvp
   title: TVP
 - local: model_doc/udop
@@ -1216,8 +1184,6 @@
 - sections:
 - local: model_doc/decision_transformer
   title: Decision Transformer
-- local: model_doc/trajectory_transformer
-  title: Trajectory Transformer
 title: Reinforcement learning models
 - sections:
 - local: model_doc/autoformer
@@ -1233,10 +1199,6 @@
 - local: model_doc/timesfm
   title: TimesFM
 title: Time series models
-- sections:
-- local: model_doc/graphormer
-  title: Graphormer
-title: Graph models
 title: Models
 - sections:
 - local: internal/modeling_utils

docs/source/en/add_new_model.md

Lines changed: 7 additions & 7 deletions
@@ -314,16 +314,16 @@ Random initialization occurs in the `_init_weights` method of `BrandNewLlamaPreT
 def _init_weights(self, module):
     """Initialize the weights"""
     if isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
     elif isinstance(module, nn.Embedding):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.padding_idx is not None:
             module.weight.data[module.padding_idx].zero_()
     elif isinstance(module, nn.LayerNorm):
-        module.bias.data.zero_()
-        module.weight.data.fill_(1.0)
+        module.bias.zero_()
+        module.weight.fill_(1.0)
 ```
 
 The initialization scheme can look different if you need to adapt it to your model. For example, [`Wav2Vec2ForPreTraining`] initializes [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) in its last two linear layers.
@@ -339,9 +339,9 @@ def _init_weights(self, module):
         module.project_hid._is_hf_initialized = True
         module.project_q._is_hf_initialized = True
     elif isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
 ```
 
 ### Convert checkpoints to Transformers
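These documentation hunks (and the matching German ones above) drop the `.data` indirection when initializing weights: the in-place init ops are applied to the `nn.Parameter` itself rather than to its `.data` attribute, which sidesteps autograd's bookkeeping. A standalone illustrative snippet of the same idea, outside Transformers' own init machinery (not code from this commit):

import torch
from torch import nn

linear = nn.Linear(8, 8)

# Initialize the parameters directly; no_grad() makes the in-place ops legal
# on leaf tensors that require grad, without reaching through .data.
with torch.no_grad():
    linear.weight.normal_(mean=0.0, std=0.02)
    if linear.bias is not None:
        linear.bias.zero_()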
