@@ -6350,8 +6350,10 @@ def variable_length_memory_efficient_attention(
     if key.shape[1] != num_heads:
         # Repeat key and value along the num_heads dimension
         repeat_factor = num_heads // key.shape[1]
-        key = key.repeat(1, repeat_factor, 1, 1)
-        value = value.repeat(1, repeat_factor, 1, 1)
+        # key = key.repeat(1, repeat_factor, 1, 1)
+        # value = value.repeat(1, repeat_factor, 1, 1)
+        key = key.unsqueeze(2).expand(-1, -1, repeat_factor, -1, -1).reshape(batch_size, num_heads, key_seq_len, head_size)
+        value = value.unsqueeze(2).expand(-1, -1, repeat_factor, -1, -1).reshape(batch_size, num_heads, key_seq_len, head_size)
     # Default scale if not provided
     if scale is None:
         scale = math.sqrt(1.0 / head_size)
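
The unsqueeze/expand/reshape pattern above duplicates each key/value head repeat_factor times in place (equivalent to repeat_interleave along the head dimension), whereas .repeat(1, repeat_factor, 1, 1) tiles the whole head dimension, which pairs query heads with the wrong key/value heads under the usual grouped-query convention where consecutive query heads share one KV head. A minimal sketch of the difference, with toy shapes assumed for illustration and not taken from the patch:

import torch

# Assumed toy shapes, for illustration only.
batch_size, kv_heads, repeat_factor, key_seq_len, head_size = 1, 2, 2, 3, 4
num_heads = kv_heads * repeat_factor
key = torch.randn(batch_size, kv_heads, key_seq_len, head_size)

# Old behaviour: tiles heads as [kv0, kv1, kv0, kv1].
tiled = key.repeat(1, repeat_factor, 1, 1)

# New behaviour: groups heads as [kv0, kv0, kv1, kv1], matching repeat_interleave.
grouped = (key.unsqueeze(2)
              .expand(-1, -1, repeat_factor, -1, -1)
              .reshape(batch_size, num_heads, key_seq_len, head_size))

assert torch.equal(grouped, key.repeat_interleave(repeat_factor, dim=1))
assert not torch.equal(tiled, grouped)  # head orderings differ once kv_heads > 1
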
@@ -6380,8 +6382,9 @@ def variable_length_memory_efficient_attention(
     qk_res = torch.matmul(query, key.transpose(-1, -2))  # [batch_size, num_heads, query_seq_len, key_seq_len]
     # Apply scale
     attention = qk_res * scale
-    attention = attention.masked_fill(~seq_mask, torch.finfo(attention.dtype).min)
+    # attention = attention.masked_fill(~seq_mask, torch.finfo(attention.dtype).min)
     attention = attention + mask
+    attention = attention.masked_fill(~seq_mask, torch.finfo(attention.dtype).min)
     # Softmax over the last dimension
     softmax_result = torch.nn.functional.softmax(attention, dim=-1)
     softmax_result = softmax_result.masked_fill(~seq_mask, 0.0)
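
One plausible reason for moving the masked_fill after the additive mask (not stated in the commit itself): filling with torch.finfo(attention.dtype).min first and then adding a large negative mask can overflow to -inf in low-precision dtypes, while filling last leaves masked logits at exactly finfo.min. A small sketch of that failure mode, assuming fp16 logits and an additive mask that is also finfo.min at padded positions:

import torch

# Assumed fp16 logits and an additive mask of finfo.min at padded positions.
logits = torch.zeros(1, 4, dtype=torch.float16)
mask = torch.full((1, 4), torch.finfo(torch.float16).min, dtype=torch.float16)
seq_mask = torch.tensor([[True, True, False, False]])

# Old order: fill first, then add the mask -> min + min overflows to -inf.
old = logits.masked_fill(~seq_mask, torch.finfo(torch.float16).min) + mask

# New order: add the mask first, then fill -> masked logits stay at finfo.min.
new = (logits + mask).masked_fill(~seq_mask, torch.finfo(torch.float16).min)

print(old)  # -inf at the padded positions
print(new)  # finite everywhere
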