Skip to content

Commit 3128bf1

Browse files
Merge pull request #543 from cangtianhuang/develop
[Accuracy diff No.95] Fix accuracy diff for paddle.incubate.nn.functional.fused_rotary_position_embedding API
2 parents a1dc900 + 7120e38 commit 3128bf1

File tree

3 files changed

+107
-167
lines changed

3 files changed

+107
-167
lines changed

tester/accuracy.py

Lines changed: 64 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def test(self):
5252
print("gen_numpy_input failed")
5353
return
5454
except Exception as err:
55-
print("[numpy error]", self.api_config.config, "\n", str(err))
55+
print(f"[numpy error] {self.api_config.config}\n{str(err)}")
5656
traceback.print_exc()
5757
write_to_log("numpy_error", self.api_config.config)
5858
return
@@ -121,7 +121,7 @@ def test(self):
121121

122122
paddle.base.core.eager._for_test_check_cuda_error()
123123
except Exception as err:
124-
print("[torch error]", self.api_config.config, "\n", str(err), flush=True)
124+
print(f"[torch error] {self.api_config.config}\n{str(err)}", flush=True)
125125
traceback.print_exc()
126126
write_to_log("torch_error", self.api_config.config)
127127
if "CUDA error" in str(err) or "memory corruption" in str(err) or "CUDA out of memory" in str(err):
@@ -145,7 +145,7 @@ def test(self):
145145
del inputs_list, result_outputs, result_outputs_grads
146146
except Exception as err:
147147
if str(err).startswith("Too large tensor to get cached numpy: "):
148-
print("[numpy error]", self.api_config.config, "\n", str(err))
148+
print(f"[numpy error] {self.api_config.config}\n{str(err)}")
149149
write_to_log("numpy_error", self.api_config.config)
150150
return
151151
print(str(err), flush=True)
@@ -154,7 +154,7 @@ def test(self):
154154
try:
155155
paddle.base.core.eager._for_test_check_cuda_error()
156156
except Exception as err:
157-
print("[torch error] backward", self.api_config.config, "\n", str(err), flush=True)
157+
print(f"[torch error] backward {self.api_config.config}\n{str(err)}", flush=True)
158158
write_to_log("torch_error", self.api_config.config)
159159
raise
160160
else:
@@ -200,10 +200,10 @@ def process_torch_outputs(obj):
200200
paddle_output = self.paddle_args[0] if len(self.paddle_args) > 0 else next(iter(self.paddle_kwargs.values()))
201201
except Exception as err:
202202
if self.should_ignore_paddle_error(str(err)):
203-
print("[Pass]", self.api_config.config, flush=True)
203+
print(f"[Pass] {self.api_config.config}", flush=True)
204204
write_to_log("pass", self.api_config.config)
205205
return
206-
print("[paddle error]", self.api_config.config, "\n", str(err), flush=True)
206+
print(f"[paddle error] {self.api_config.config}\n{str(err)}", flush=True)
207207
write_to_log("paddle_error", self.api_config.config)
208208
if "CUDA error" in str(err) or "memory corruption" in str(err):
209209
raise err
@@ -214,14 +214,14 @@ def process_torch_outputs(obj):
214214
try:
215215
paddle.base.core.eager._for_test_check_cuda_error()
216216
except Exception as err:
217-
print("[cuda error]", self.api_config.config, "\n", str(err), flush=True)
217+
print(f"[cuda error] {self.api_config.config}\n{str(err)}", flush=True)
218218
write_to_log("paddle_error", self.api_config.config)
219219
raise
220220

221221
paddle_output, torch_output = process_output(self.api_config, paddle_output, torch_output)
222222

223223
self.is_backward = False
224-
def compare_paddle_and_torch(paddle_tensor, torch_tensor) -> bool:
224+
def compare_paddle_and_torch(paddle_tensor, torch_tensor, idx=0) -> bool:
225225
try:
226226
# if paddle_tensor.dtype == paddle.bfloat16:
227227
# paddle_tensor = paddle.cast(paddle_tensor, dtype="float32")
@@ -231,13 +231,14 @@ def compare_paddle_and_torch(paddle_tensor, torch_tensor) -> bool:
231231
self.torch_assert_accuracy(paddle_tensor, torch_tensor, atol=self.atol, rtol=self.rtol)
232232
except Exception as err:
233233
if self.is_backward:
234-
print(f"[accuracy error] backward {self.api_config.config}\n{str(err)}", flush=True)
234+
print(f"[accuracy error] backward at {idx} {self.api_config.config}\n{str(err)}", flush=True)
235235
else:
236-
print(f"[accuracy error] {self.api_config.config}\n{str(err)}", flush=True)
236+
print(f"[accuracy error] at {idx} {self.api_config.config}\n{str(err)}", flush=True)
237237
write_to_log("accuracy_error", self.api_config.config)
238238
return False
239239
return True
240240

241+
# Forward output check:
241242
if isinstance(paddle_output, paddle.Tensor):
242243
if isinstance(torch_output, torch.Tensor):
243244
if not compare_paddle_and_torch(paddle_output, torch_output):
@@ -248,52 +249,65 @@ def compare_paddle_and_torch(paddle_tensor, torch_tensor) -> bool:
248249
assert paddle_output.shape == [], "paddle_output shape is not []"
249250
assert bool(paddle_output) == torch_output, f"paddle_output {bool(paddle_output)} is not equal to torch_output {torch_output}"
250251
except Exception as err:
251-
print("[accuracy error]", self.api_config.config, "\n", str(err), flush=True)
252+
print(f"[not compare] {self.api_config.config}\n{str(err)}", flush=True)
252253
write_to_log("accuracy_error", self.api_config.config)
253254
return
254255
elif isinstance(torch_output, (torch.return_types.max, torch.return_types.min)):
255256
torch_output = torch_output.values
256257
if not compare_paddle_and_torch(paddle_output, torch_output):
257258
return
258259
else:
259-
print("[accuracy error]", self.api_config.config, "\n[output type diff error1], ", type(torch_output), flush=True)
260+
print(
261+
f"[not compare] {self.api_config.config}\n"
262+
f"torch is {type(torch_output)} but paddle is {type(paddle_output)}",
263+
flush=True,
264+
)
260265
write_to_log("accuracy_error", self.api_config.config)
261266
return
262267
elif isinstance(paddle_output, (list, tuple)):
263268
if not isinstance(torch_output, (list, tuple)):
264-
print("[output type diff error]", self.api_config.config, flush=True)
269+
print(f"[not compare] {self.api_config.config}\n"
270+
f"torch is {type(torch_output)} but paddle is {type(paddle_output)}",
271+
flush=True)
272+
write_to_log("accuracy_error", self.api_config.config)
265273
return
266274
paddle_output = list(paddle_output)
267275
torch_output = list(torch_output)
268276
if len(paddle_output) != len(torch_output):
269-
print("[accuracy error]", self.api_config.config, "\n[output type diff error2], ", len(paddle_output), len(torch_output), flush=True)
277+
print(f"[not compare] {self.api_config.config}\n"
278+
f"torch len is {len(torch_output)} but paddle len is {len(paddle_output)}",
279+
flush=True)
270280
write_to_log("accuracy_error", self.api_config.config)
271281
return
272-
for paddle_item, torch_item in zip(paddle_output, torch_output):
282+
for i, (paddle_item, torch_item) in enumerate(zip(paddle_output, torch_output)):
273283
if isinstance(paddle_item, int) or self.api_config.api_name.endswith('tolist'):
274284
self.np_assert_accuracy(numpy.array(paddle_item), numpy.array(torch_item), atol=self.atol, rtol=self.rtol)
275285
# especially for paddle.vision.ops.distribute_fpn_proposals
276286
elif isinstance(paddle_item, list) and isinstance(torch_item, list):
277287
if any(isinstance(x, paddle.Tensor) for x in paddle_item) and any(isinstance(x, torch.Tensor) for x in torch_item):
278288
for paddle_item_sub, torch_item_sub in zip(paddle_item, torch_item):
279-
if not compare_paddle_and_torch(paddle_item_sub, torch_item_sub):
289+
if not compare_paddle_and_torch(paddle_item_sub, torch_item_sub, i):
280290
return
281291
else:
282-
print("[accuracy error]", self.api_config.config, "\n[output type diff error4]", flush=True)
292+
print(f"[not compare] at {i} {self.api_config.config}\n"
293+
f"torch is {type(torch_item)} but paddle is {type(paddle_item)}",
294+
flush=True)
283295
write_to_log("accuracy_error", self.api_config.config)
284296
return
285-
elif not isinstance(paddle_item, paddle.Tensor):
286-
print("[not compare]", paddle_item, torch_item, flush=True)
287-
write_to_log("accuracy_error", self.api_config.config)
288-
return
289-
elif not isinstance(torch_item, torch.Tensor):
290-
print("[accuracy error]", self.api_config.config, "\n[output type diff error3], ", type(torch_item), flush=True)
297+
elif (paddle_item is None or not paddle_item._is_initialized()) and torch_item is None:
298+
pass
299+
elif not isinstance(paddle_item, paddle.Tensor) or not isinstance(torch_item, torch.Tensor):
300+
print(f"[not compare] at {i} {self.api_config.config}\n"
301+
f"torch is {type(torch_item)} but paddle is {type(paddle_item)}",
302+
flush=True)
291303
write_to_log("accuracy_error", self.api_config.config)
292304
return
293305
else:
294-
if not compare_paddle_and_torch(paddle_item, torch_item):
306+
if not compare_paddle_and_torch(paddle_item, torch_item, i):
295307
return
296308

309+
# Forward check has passed.
310+
# Next, run the paddle backward pass and check the backward results.
297311
if torch_grad_success:
298312
self.is_backward = True
299313
try:
@@ -306,14 +320,14 @@ def compare_paddle_and_torch(paddle_tensor, torch_tensor) -> bool:
306320
del inputs_list, result_outputs, result_outputs_grads
307321
except Exception as err:
308322
if str(err).startswith("Too large tensor to get cached numpy: "):
309-
print("[numpy error]", self.api_config.config, "\n", str(err))
323+
print(f"[numpy error] backward {self.api_config.config}\n{str(err)}", flush=True)
310324
write_to_log("numpy_error", self.api_config.config)
311325
return
312326
if self.should_ignore_paddle_error(str(err)):
313-
print("[Pass]", self.api_config.config, flush=True)
327+
print(f"[Pass] {self.api_config.config}", flush=True)
314328
write_to_log("pass", self.api_config.config)
315329
return
316-
print("[paddle error] backward", self.api_config.config, "\n", str(err), flush=True)
330+
print(f"[paddle error] backward {self.api_config.config}\n{str(err)}", flush=True)
317331
write_to_log("paddle_error", self.api_config.config)
318332
if "CUDA error" in str(err) or "memory corruption" in str(err):
319333
raise err
@@ -324,46 +338,51 @@ def compare_paddle_and_torch(paddle_tensor, torch_tensor) -> bool:
324338
try:
325339
paddle.base.core.eager._for_test_check_cuda_error()
326340
except Exception as err:
327-
print("[cuda error] backward", self.api_config.config, "\n", str(err), flush=True)
341+
print(f"[cuda error] backward {self.api_config.config}\n{str(err)}", flush=True)
328342
write_to_log("paddle_error", self.api_config.config)
329343
raise
330344

331345
paddle_out_grads, torch_out_grads = process_grad_output(self.api_config, paddle_out_grads, torch_out_grads)
332346

347+
# Backward output check:
333348
if isinstance(paddle_out_grads, paddle.Tensor):
334349
if isinstance(torch_out_grads, torch.Tensor):
335350
if not compare_paddle_and_torch(paddle_out_grads, torch_out_grads):
336351
return
337352
else:
338-
print("[accuracy error] backward", self.api_config.config, "\n[output type diff error1], ", type(torch_out_grads), flush=True)
353+
print(f"[not compare] backward {self.api_config.config}\n"
354+
f"torch is {type(torch_out_grads)} but paddle is {type(paddle_out_grads)}", flush=True)
339355
write_to_log("accuracy_error", self.api_config.config)
340356
return
341357
elif isinstance(paddle_out_grads, (list, tuple)):
342358
if not isinstance(torch_out_grads, (list, tuple)):
343-
print("[output type diff error]", self.api_config.config, flush=True)
359+
print(f"[not compare] backward {self.api_config.config}\n"
360+
f"torch is {type(torch_out_grads)} but paddle is {type(paddle_out_grads)}", flush=True)
361+
write_to_log("accuracy_error", self.api_config.config)
344362
return
345363
paddle_out_grads = list(paddle_out_grads)
346364
torch_out_grads = list(torch_out_grads)
347365
if len(paddle_out_grads) != len(torch_out_grads):
348-
print("[accuracy error] backward", self.api_config.config, "\n[output type diff error2], ", len(paddle_out_grads), len(torch_out_grads), flush=True)
366+
print(f"[not compare] backward {self.api_config.config}\n"
367+
f"torch len is {len(torch_out_grads)} but paddle len is {len(paddle_out_grads)}", flush=True)
349368
write_to_log("accuracy_error", self.api_config.config)
350369
return
351-
for paddle_item, torch_item in zip(paddle_out_grads, torch_out_grads):
370+
for i, (paddle_item, torch_item) in enumerate(zip(paddle_out_grads, torch_out_grads)):
352371
if isinstance(paddle_item, int):
353372
self.np_assert_accuracy(numpy.array(paddle_item), numpy.array(torch_item), atol=self.atol, rtol=self.rtol)
354-
elif not isinstance(paddle_item, paddle.Tensor):
355-
print("[not compare]", paddle_item, torch_item, flush=True)
356-
write_to_log("accuracy_error", self.api_config.config)
357-
return
358-
elif not isinstance(torch_item, torch.Tensor):
359-
print("[accuracy error] backward", self.api_config.config, "\n[output type diff error3], ", type(torch_out_grads[i]), flush=True)
373+
elif (paddle_item is None or not paddle_item._is_initialized()) and torch_item is None:
374+
pass
375+
elif not isinstance(paddle_item, paddle.Tensor) or not isinstance(torch_item, torch.Tensor):
376+
print(f"[not compare] backward at {i} {self.api_config.config}\n"
377+
f"torch is {type(torch_item)} but paddle is {type(paddle_item)}",
378+
flush=True)
360379
write_to_log("accuracy_error", self.api_config.config)
361380
return
362381
else:
363-
if not compare_paddle_and_torch(paddle_item, torch_item):
382+
if not compare_paddle_and_torch(paddle_item, torch_item, i):
364383
return
365384

366-
print("[Pass]", self.api_config.config, flush=True)
385+
print(f"[Pass] {self.api_config.config}", flush=True)
367386
write_to_log("pass", self.api_config.config)
368387

369388

@@ -467,4 +486,9 @@ def process_grad_output(api_config, paddle_out_grads, torch_out_grads):
467486
if is_upper
468487
else torch.tril(torch_out_grads[1])
469488
)
489+
elif api_config.api_name == "paddle.incubate.nn.functional.fused_rotary_position_embedding":
490+
# Paddle only has 3 outputs/grads Q, K, V
491+
valid_out_num = len([out for out in paddle_out_grads if out is not None])
492+
paddle_out_grads = paddle_out_grads[:valid_out_num]
493+
torch_out_grads = torch_out_grads[:valid_out_num]
470494
return paddle_out_grads, torch_out_grads

0 commit comments

Comments
 (0)