Commit 826cb12

Merge branch 'main' into add_afmoe_model
2 parents 8958684 + 453a246 commit 826cb12

946 files changed, +36588 -61261 lines changed

.circleci/config.yml

Lines changed: 4 additions & 4 deletions
@@ -46,8 +46,8 @@ jobs:
 - run: uv pip install -U -e .
 - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
 - run: mkdir -p test_preparation
-- run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt
-- run: python utils/tests_fetcher.py --filter_tests
+- run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt || true
+- run: python utils/tests_fetcher.py --filter_tests || true
 - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation
 - run: |
     if [ ! -s test_preparation/generated_config.yml ]; then
@@ -98,8 +98,8 @@ jobs:
 - run: uv pip install -U -e .
 - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
 - run: mkdir -p test_preparation
-- run: python utils/tests_fetcher.py --fetch_all | tee tests_fetched_summary.txt
-- run: python utils/tests_fetcher.py --filter_tests
+- run: python utils/tests_fetcher.py --fetch_all | tee tests_fetched_summary.txt || true
+- run: python utils/tests_fetcher.py --filter_tests || true
 - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation
 - run: |
     if [ ! -s test_preparation/generated_config.yml ]; then

CONTRIBUTING.md

Lines changed: 2 additions & 1 deletion
@@ -125,8 +125,9 @@ If you're contributing a **vision-language model** (or any multimodal model that
 All new models should use the modular architecture pattern. Create a `modular_<model_name>.py` file using the modular model converter:
 
 - Use the CLI, [`transformers add-new-model-like`](https://github.com/huggingface/transformers/blob/main/src/transformers/cli/add_new_model_like.py) to generate a modular skeleton and get started
-- All code should be in the modular file if possible. Modeling must be in it, it's better if configuration is in it as well.
+- All code should be in the modular file if possible. Modeling must be in it, it's better if configuration is in it as well. [Modular guide](./modular_transformers#implementing-a-modular-file) shows a quick way to set up a modular file.
 - Reuse existing patterns from similar models as much as possible
+- You can make the model compatible with inference engines such as vLLM or SGLang, and enable zero-effort integration. See specific requirements for model implementation in ["Transformers modeling backend"](./transformers_as_backend#multimodal-models)
 
 To verify your modular file is correct, run:
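For context on the modular pattern this hunk refers to: a modular file subclasses the components of an existing model and lets the converter expand them into standalone modeling/configuration files. A minimal, hypothetical sketch (assuming the new model reuses Llama components; all names here are illustrative, not part of this commit):

# modular_my_model.py -- hypothetical sketch of a modular file
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaModel


class MyModelConfig(LlamaConfig):
    model_type = "my_model"


class MyModelModel(LlamaModel):
    pass


class MyModelForCausalLM(LlamaForCausalLM):
    pass

The modular converter expands a file like this into full `modeling_*.py` and `configuration_*.py` files, so only the deltas from the parent model have to be written by hand.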

Makefile

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ repo-consistency:
 	python utils/check_modular_conversion.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
+	python utils/check_init_weights_data.py
 	python utils/check_inits.py
 	python utils/check_pipeline_typing.py
 	python utils/check_config_docstrings.py

benchmark_v2/framework/benchmark_config.py

Lines changed: 2 additions & 0 deletions
@@ -203,6 +203,8 @@ def adapt_configs(
 config["sequence_length"] = seqlen
 config["num_tokens_to_generate"] = ntok
 config["gpu_monitoring"] = monitor
+# Remove the old name so it gets re-inferred with the updated values
+config.pop("name", None)
 adapted_configs.append(BenchmarkConfig.from_dict(config))
 return adapted_configs
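The intent of the added `config.pop("name", None)` is that a name derived from the old field values should not survive once those fields are mutated. A self-contained sketch of that pattern (the naming logic below is made up for illustration and is not the real `BenchmarkConfig.from_dict`):

def build_config(d: dict) -> dict:
    # Derive a display name from the fields unless one is already present.
    d = dict(d)
    d.setdefault("name", f"b{d['batch_size']}_s{d['sequence_length']}")
    return d

cfg = build_config({"batch_size": 1, "sequence_length": 128})
cfg["sequence_length"] = 256
cfg.pop("name", None)              # drop the stale "b1_s128"
print(build_config(cfg)["name"])   # "b1_s256", re-derived from the updated values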

benchmark_v2/run_benchmarks.py

Lines changed: 6 additions & 6 deletions
@@ -80,16 +80,16 @@
 logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
 logger.info(f"Output directory: {args.output_dir}")
 
-# We cannot compute ITL if we don't have at least two measurements
-if any(n <= 1 for n in args.num_tokens_to_generate):
-    raise ValueError("--num_tokens_to_generate arguments should be larger than 1")
-
 # Error out if one of the arguments is not provided
-if len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 0:
+if any(arg is None for arg in [args.batch_size, args.sequence_length, args.num_tokens_to_generate]):
     raise ValueError(
-        "At least one of the arguments --batch-size, --sequence-length, or --num-tokens-to-generate is required"
+        "All of the arguments --batch-size, --sequence-length, and --num-tokens-to-generate are required"
     )
 
+# We cannot compute ITL if we don't have at least two measurements
+if any(n <= 1 for n in args.num_tokens_to_generate):
+    raise ValueError("--num_tokens_to_generate arguments should be larger than 1")
+
 # Get the configs for the given coverage level
 configs = get_config_by_level(args.level)
 # Adapt the configs to the given arguments
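The reordering checks that all three arguments are present before validating their values; the second check exists because inter-token latency (ITL) is a difference between consecutive token timestamps, so it is undefined when only one token is generated. A tiny standalone sketch of that point (the timestamps are made-up data, not benchmark output):

# ITL = gaps between consecutive token arrival times, so at least two
# generated tokens are needed before there is anything to measure.
token_times = [0.000, 0.031, 0.060, 0.093]  # seconds; illustrative values
itl = [t1 - t0 for t0, t1 in zip(token_times, token_times[1:])]
print(itl)  # three gaps for four tokens; an empty list if only one token arrived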

docker/transformers-pytorch-amd-gpu/Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-FROM rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.7.1
+FROM rocm/pytorch:rocm7.1_ubuntu22.04_py3.10_pytorch_release_2.8.0
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
@@ -34,7 +34,7 @@ RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y
 RUN python3 -m pip uninstall -y kernels
 
 # On ROCm, torchcodec is required to decode audio files and 0.4 or 0.6 fails
-RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
+RUN python3 -m pip install --no-cache-dir "torchcodec==0.7"
 
 # Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
 RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \

docs/source/de/add_new_model.md

Lines changed: 7 additions & 7 deletions
@@ -508,16 +508,16 @@ BERT `_init_weights` Methode:
 def _init_weights(self, module):
     """Initialize the weights"""
     if isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
     elif isinstance(module, nn.Embedding):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.padding_idx is not None:
             module.weight.data[module.padding_idx].zero_()
     elif isinstance(module, nn.LayerNorm):
-        module.bias.data.zero_()
-        module.weight.data.fill_(1.0)
+        module.bias.zero_()
+        module.weight.fill_(1.0)
 ```
 
 Sie können weitere benutzerdefinierte Schemata verwenden, wenn Sie eine spezielle Initialisierung für einige Module benötigen. Zum Beispiel in
@@ -533,9 +533,9 @@ def _init_weights(self, module):
         module.project_hid._is_hf_initialized = True
         module.project_q._is_hf_initialized = True
     elif isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
 ```
 
 Das Flag `_is_hf_initialized` wird intern verwendet, um sicherzustellen, dass wir ein Submodul nur einmal initialisieren. Wenn Sie es auf

docs/source/en/_toctree.yml

Lines changed: 11 additions & 49 deletions
@@ -118,7 +118,7 @@
 - local: tools
   title: Tools
 - local: transformers_as_backend
-  title: Inference server backends
+  title: Transformers as modeling backend
 - local: continuous_batching
   title: Continuous Batching
 title: Inference
@@ -422,8 +422,6 @@
   title: BLOOM
 - local: model_doc/blt
   title: BLT
-- local: model_doc/bort
-  title: BORT
 - local: model_doc/byt5
   title: ByT5
 - local: model_doc/camembert
@@ -478,8 +476,6 @@
   title: Ernie4_5
 - local: model_doc/ernie4_5_moe
   title: Ernie4_5_MoE
-- local: model_doc/ernie_m
-  title: ErnieM
 - local: model_doc/esm
   title: ESM
 - local: model_doc/exaone4
@@ -534,8 +530,6 @@
   title: GPTBigCode
 - local: model_doc/gpt_oss
   title: GptOss
-- local: model_doc/gptsan-japanese
-  title: GPTSAN Japanese
 - local: model_doc/gpt-sw3
   title: GPTSw3
 - local: model_doc/granite
@@ -560,8 +554,6 @@
   title: Jamba
 - local: model_doc/jetmoe
   title: JetMoe
-- local: model_doc/jukebox
-  title: Jukebox
 - local: model_doc/led
   title: LED
 - local: model_doc/lfm2
@@ -596,8 +588,6 @@
   title: MarkupLM
 - local: model_doc/mbart
   title: MBart and MBart-50
-- local: model_doc/mega
-  title: MEGA
 - local: model_doc/megatron-bert
   title: MegatronBERT
 - local: model_doc/megatron_gpt2
@@ -632,8 +622,6 @@
   title: myt5
 - local: model_doc/nemotron
   title: Nemotron
-- local: model_doc/nezha
-  title: NEZHA
 - local: model_doc/nllb
   title: NLLB
 - local: model_doc/nllb-moe
@@ -648,8 +636,6 @@
   title: Olmo3
 - local: model_doc/olmoe
   title: OLMoE
-- local: model_doc/open-llama
-  title: Open-Llama
 - local: model_doc/opt
   title: OPT
 - local: model_doc/pegasus
@@ -670,8 +656,6 @@
   title: PLBart
 - local: model_doc/prophetnet
   title: ProphetNet
-- local: model_doc/qdqbert
-  title: QDQBert
 - local: model_doc/qwen2
   title: Qwen2
 - local: model_doc/qwen2_moe
@@ -684,16 +668,12 @@
   title: Qwen3Next
 - local: model_doc/rag
   title: RAG
-- local: model_doc/realm
-  title: REALM
 - local: model_doc/recurrent_gemma
   title: RecurrentGemma
 - local: model_doc/reformer
   title: Reformer
 - local: model_doc/rembert
   title: RemBERT
-- local: model_doc/retribert
-  title: RetriBERT
 - local: model_doc/roberta
   title: RoBERTa
 - local: model_doc/roberta-prelayernorm
@@ -722,10 +702,6 @@
   title: T5Gemma
 - local: model_doc/t5v1.1
   title: T5v1.1
-- local: model_doc/tapex
-  title: TAPEX
-- local: model_doc/transfo-xl
-  title: Transformer XL
 - local: model_doc/ul2
   title: UL2
 - local: model_doc/umt5
@@ -738,8 +714,6 @@
   title: XGLM
 - local: model_doc/xlm
   title: XLM
-- local: model_doc/xlm-prophetnet
-  title: XLM-ProphetNet
 - local: model_doc/xlm-roberta
   title: XLM-RoBERTa
 - local: model_doc/xlm-roberta-xl
@@ -786,8 +760,6 @@
   title: Depth Anything V2
 - local: model_doc/depth_pro
   title: DepthPro
-- local: model_doc/deta
-  title: DETA
 - local: model_doc/detr
   title: DETR
 - local: model_doc/dinat
@@ -802,8 +774,6 @@
   title: DiT
 - local: model_doc/dpt
   title: DPT
-- local: model_doc/efficientformer
-  title: EfficientFormer
 - local: model_doc/efficientloftr
   title: EfficientLoFTR
 - local: model_doc/efficientnet
@@ -840,8 +810,6 @@
   title: MobileViT
 - local: model_doc/mobilevitv2
   title: MobileViTV2
-- local: model_doc/nat
-  title: NAT
 - local: model_doc/poolformer
   title: PoolFormer
 - local: model_doc/prompt_depth_anything
@@ -860,6 +828,8 @@
   title: RT-DETRv2
 - local: model_doc/sam2
   title: SAM2
+- local: model_doc/sam3_tracker
+  title: Sam3Tracker
 - local: model_doc/segformer
   title: SegFormer
 - local: model_doc/seggpt
@@ -888,12 +858,8 @@
   title: Timm Wrapper
 - local: model_doc/upernet
   title: UperNet
-- local: model_doc/van
-  title: VAN
 - local: model_doc/vit
   title: Vision Transformer (ViT)
-- local: model_doc/vit_hybrid
-  title: ViT Hybrid
 - local: model_doc/vitdet
   title: ViTDet
 - local: model_doc/vit_mae
@@ -932,8 +898,6 @@
   title: Hubert
 - local: model_doc/kyutai_speech_to_text
   title: Kyutai Speech-To-Text
-- local: model_doc/mctct
-  title: MCTCT
 - local: model_doc/mimi
   title: Mimi
 - local: model_doc/mms
@@ -960,8 +924,6 @@
   title: SEW-D
 - local: model_doc/speech_to_text
   title: Speech2Text
-- local: model_doc/speech_to_text_2
-  title: Speech2Text2
 - local: model_doc/speecht5
   title: SpeechT5
 - local: model_doc/unispeech
@@ -994,6 +956,8 @@
 - sections:
 - local: model_doc/sam2_video
   title: SAM2 Video
+- local: model_doc/sam3_tracker_video
+  title: Sam3TrackerVideo
 - local: model_doc/timesformer
   title: TimeSformer
 - local: model_doc/vjepa2
@@ -1068,6 +1032,8 @@
   title: Gemma3n
 - local: model_doc/git
   title: GIT
+- local: model_doc/glm46v
+  title: Glm46V
 - local: model_doc/glm4v
   title: glm4v
 - local: model_doc/glm4v_moe
@@ -1172,6 +1138,10 @@
   title: Qwen3VL
 - local: model_doc/qwen3_vl_moe
   title: Qwen3VLMoe
+- local: model_doc/sam3
+  title: SAM3
+- local: model_doc/sam3_video
+  title: SAM3 Video
 - local: model_doc/shieldgemma2
   title: ShieldGemma2
 - local: model_doc/siglip
@@ -1188,8 +1158,6 @@
   title: TAPAS
 - local: model_doc/trocr
   title: TrOCR
-- local: model_doc/tvlt
-  title: TVLT
 - local: model_doc/tvp
   title: TVP
 - local: model_doc/udop
@@ -1216,8 +1184,6 @@
 - sections:
 - local: model_doc/decision_transformer
   title: Decision Transformer
-- local: model_doc/trajectory_transformer
-  title: Trajectory Transformer
 title: Reinforcement learning models
 - sections:
 - local: model_doc/autoformer
@@ -1233,10 +1199,6 @@
 - local: model_doc/timesfm
   title: TimesFM
 title: Time series models
-- sections:
-- local: model_doc/graphormer
-  title: Graphormer
-title: Graph models
 title: Models
 - sections:
 - local: internal/modeling_utils

docs/source/en/add_new_model.md

Lines changed: 7 additions & 7 deletions
@@ -314,16 +314,16 @@ Random initialization occurs in the `_init_weights` method of `BrandNewLlamaPreT
 def _init_weights(self, module):
     """Initialize the weights"""
     if isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
     elif isinstance(module, nn.Embedding):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.padding_idx is not None:
             module.weight.data[module.padding_idx].zero_()
     elif isinstance(module, nn.LayerNorm):
-        module.bias.data.zero_()
-        module.weight.data.fill_(1.0)
+        module.bias.zero_()
+        module.weight.fill_(1.0)
 ```
 
 The initialization scheme can look different if you need to adapt it to your model. For example, [`Wav2Vec2ForPreTraining`] initializes [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) in its last two linear layers.
@@ -339,9 +339,9 @@ def _init_weights(self, module):
         module.project_hid._is_hf_initialized = True
         module.project_q._is_hf_initialized = True
     elif isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
         if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
 ```
 
 ### Convert checkpoints to Transformers
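These documentation hunks (and the matching German ones above) drop the `.data` indirection when initializing weights: the in-place init ops are applied to the `nn.Parameter` itself rather than to its `.data` attribute, which sidesteps autograd's bookkeeping. A standalone illustrative snippet of the same idea, outside Transformers' own init machinery (not code from this commit):

import torch
from torch import nn

linear = nn.Linear(8, 8)

# Initialize the parameters directly; no_grad() makes the in-place ops legal
# on leaf tensors that require grad, without reaching through .data.
with torch.no_grad():
    linear.weight.normal_(mean=0.0, std=0.02)
    if linear.bias is not None:
        linear.bias.zero_()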
