Fix broken image inference for Fuyu model (#39915)

Isotr0py · ArthurZucker · commit 95ae07d11fb9 · 2025-08-13T10:22:01.000+02:00
* fix fuyu

Signed-off-by: Isotr0py &lt;2037008807@qq.com&gt;

* oops

Signed-off-by: Isotr0py &lt;2037008807@qq.com&gt;

* run test on GPU

Signed-off-by: Isotr0py &lt;mozf@mail2.sysu.edu.cn&gt;

* clean unused

Signed-off-by: Isotr0py &lt;mozf@mail2.sysu.edu.cn&gt;

* revert

Signed-off-by: Isotr0py &lt;mozf@mail2.sysu.edu.cn&gt;

* add fuyu multimodal test

Signed-off-by: Isotr0py &lt;2037008807@qq.com&gt;

* fix

Signed-off-by: Isotr0py &lt;2037008807@qq.com&gt;

---------

Signed-off-by: Isotr0py &lt;2037008807@qq.com&gt;
Signed-off-by: Isotr0py &lt;mozf@mail2.sysu.edu.cn&gt;
diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py
@@ -225,7 +225,7 @@ def forward(
         if image_patches is not None:
             patch_embeddings = self.get_image_features(image_patches)
             patch_embeddings = torch.cat(patch_embeddings, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
-            special_image_mask = self.get_placeholder_tokens(
+            special_image_mask = self.get_placeholder_mask(
                 input_ids, inputs_embeds=inputs_embeds, image_features=patch_embeddings
             )
             inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, patch_embeddings)
@@ -379,6 +379,7 @@ def prepare_inputs_for_generation(
         inputs_embeds=None,
         image_patches=None,
         image_patches_indices=None,
+        cache_position=None,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -390,10 +391,12 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
             image_patches=image_patches,
             image_patches_indices=image_patches_indices,
+            cache_position=cache_position,
             **kwargs,
         )
 
-        if past_key_values is not None:
+        if cache_position[0] != 0:
+            # set image_patches and image_patches_indices to `None` for decoding stage
             model_inputs["image_patches_indices"] = None
             model_inputs["image_patches"] = None
 
diff --git a/tests/models/fuyu/test_modeling_fuyu.py b/tests/models/fuyu/test_modeling_fuyu.py
@@ -13,19 +13,21 @@
 # limitations under the License.
 """Testing suite for the PyTorch Fuyu model."""
 
+import copy
 import io
 import unittest
 
 import pytest
 import requests
+import torch
 from parameterized import parameterized
 
 from transformers import FuyuConfig, is_torch_available, is_vision_available
-from transformers.testing_utils import require_torch, require_torch_accelerator, slow
+from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device
 from transformers.utils import cached_property
 
 from ...generation.test_utils import GenerationTesterMixin
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -47,6 +49,7 @@ def __init__(
         parent,
         batch_size=13,
         seq_length=7,
+        num_image_tokens=2,
         image_size=30,
         patch_size=15,
         num_channels=3,
@@ -67,12 +70,14 @@ def __init__(
         initializer_range=0.02,
         num_labels=3,
         num_choices=4,
-        pad_token_id=0,
+        pad_token_id=10,
+        image_token_id=1,
         scope=None,
     ):
         self.parent = parent
         self.batch_size = batch_size
-        self.seq_length = seq_length
+        self.num_image_tokens = num_image_tokens
+        self.seq_length = seq_length + num_image_tokens
         self.image_size = image_size
         self.patch_size = patch_size
         self.num_channels = num_channels
@@ -94,10 +99,15 @@ def __init__(
         self.num_labels = num_labels
         self.num_choices = num_choices
         self.pad_token_id = pad_token_id
+        self.image_token_id = image_token_id
         self.scope = scope
 
     def prepare_config_and_inputs(self):
+        config = self.get_config()
+
         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        input_ids[input_ids == config.image_token_id] = self.pad_token_id
+        input_ids[:, : self.num_image_tokens] = config.image_token_id
 
         input_mask = None
         if self.use_input_mask:
@@ -109,8 +119,6 @@ def prepare_config_and_inputs(self):
             sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
             token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
 
-        config = self.get_config()
-
         return config, input_ids, input_mask, sequence_labels, token_labels
 
     def get_config(self):
@@ -128,6 +136,7 @@ def get_config(self):
             is_decoder=False,
             initializer_range=self.initializer_range,
             pad_token_id=self.pad_token_id,
+            image_token_id=self.image_token_id,
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -139,7 +148,10 @@ def prepare_config_and_inputs_for_common(self):
             sequence_labels,
             token_labels,
         ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+        image_patches = floats_tensor(
+            [self.batch_size, self.num_image_tokens, config.num_channels * config.patch_size**2]
+        )
+        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask, "image_patches": image_patches}
         return config, inputs_dict
 
 
@@ -166,6 +178,27 @@ class FuyuModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
     def setUp(self):
         self.model_tester = FuyuModelTester(self)
 
+    def test_mismatching_image_patches(self):
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device)
+            curr_input_dict = copy.deepcopy(input_dict)  # in=place modifications further
+
+            # two image token and two image
+            _ = model(**curr_input_dict)  # successful forward with no modifications
+
+            # remove one image but leave the image token in text
+            input_ids = curr_input_dict["input_ids"]
+            image_patches = curr_input_dict["image_patches"][1:, ...]
+            with self.assertRaises(ValueError):
+                _ = model(input_ids=input_ids, image_patches=image_patches)
+
+            # remove one image token from text
+            input_ids = curr_input_dict["input_ids"][2:]
+            image_patches = curr_input_dict["image_patches"]
+            with self.assertRaises(ValueError):
+                _ = model(input_ids=input_ids, image_patches=image_patches)
+
     @unittest.skip(
         reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
     )
@@ -232,7 +265,7 @@ def default_processor(self):
 
     @cached_property
     def default_model(self):
-        return FuyuForCausalLM.from_pretrained("adept/fuyu-8b")
+        return FuyuForCausalLM.from_pretrained("adept/fuyu-8b", torch_dtype="float16", device_map=torch_device)
 
     def test_greedy_generation(self):
         processor = self.default_processor
@@ -243,7 +276,9 @@ def test_greedy_generation(self):
 
         text_prompt_coco_captioning = "Generate a coco-style caption.\n"
 
-        inputs = processor(images=image, text=text_prompt_coco_captioning, return_tensors="pt")
+        inputs = processor(images=image, text=text_prompt_coco_captioning, return_tensors="pt").to(
+            torch_device, torch.float16
+        )
         generated_ids = model.generate(**inputs, max_new_tokens=10)
 
         # take the last 8 tokens (in order to skip special \n\x04 characters) and decode them