[Backend Tester] Add SNR validation

GregoryComer · GregoryComer · commit 1b11fd3b3ebb · 2025-07-28T13:17:06.000-07:00
ghstack-source-id: 8c6f72f ghstack-comment-id: 3129418402 Pull-Request: #12924
diff --git a/backends/test/harness/tester.py b/backends/test/harness/tester.py
@@ -1,7 +1,9 @@
-import random
 from collections import Counter, OrderedDict
+from torch.ao.ns.fx.utils import compute_sqnr
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
+import math
+import random
 import torch
 
 from executorch.backends.test.harness.stages import (
@@ -302,17 +304,18 @@ def run_method_and_compare_outputs(
         atol=1e-03,
         rtol=1e-03,
         qtol=0,
+        snr: float | None = None,
     ):
         number_of_runs = 1 if inputs is not None else num_runs
         reference_stage = self.stages[StageType.EXPORT]
 
         stage = stage or self.cur
 
-        print(f"Comparing Stage {stage} with Stage {reference_stage}")
+        #print(f"Comparing Stage {stage} with Stage {reference_stage}")
         for run_iteration in range(number_of_runs):
             inputs_to_run = inputs if inputs else next(self.generate_random_inputs())
             input_shapes = [generated_input.shape for generated_input in inputs_to_run]
-            print(f"Run {run_iteration} with input shapes: {input_shapes}")
+            #print(f"Run {run_iteration} with input shapes: {input_shapes}")
 
             # Reference output (and quantization scale)
             (
@@ -325,13 +328,13 @@ def run_method_and_compare_outputs(
             # Output from running artifact at stage
             stage_output = self.stages[stage].run_artifact(inputs_to_run)
             self._compare_outputs(
-                reference_output, stage_output, quantization_scale, atol, rtol, qtol
+                reference_output, stage_output, quantization_scale, atol, rtol, qtol, snr
             )
 
         return self
 
     @staticmethod
-    def _assert_outputs_equal(model_output, ref_output, atol=1e-03, rtol=1e-03):
+    def _assert_outputs_equal(model_output, ref_output, atol=1e-03, rtol=1e-03, snr: float | None = None):
         """
         Helper testing function that asserts that the model output and the reference output
         are equal with some tolerance. Due to numerical differences between eager mode and
@@ -356,15 +359,18 @@ def _assert_outputs_equal(model_output, ref_output, atol=1e-03, rtol=1e-03):
                     f"\tMismatched count: {(model != ref).sum().item()} / {model.numel()}\n"
                 )
             else:
+                computed_snr = compute_sqnr(model.to(torch.float), ref.to(torch.float))
+                snr = snr or float("-inf")
+
                 assert torch.allclose(
                     model,
                     ref,
                     atol=atol,
                     rtol=rtol,
                     equal_nan=True,
-                ), (
+                ) and computed_snr >= snr or math.isnan(computed_snr), (
                     f"Output {i} does not match reference output.\n"
-                    f"\tGiven atol: {atol}, rtol: {rtol}.\n"
+                    f"\tGiven atol: {atol}, rtol: {rtol}, snr: {snr}.\n"
                     f"\tOutput tensor shape: {model.shape}, dtype: {model.dtype}\n"
                     f"\tDifference: max: {torch.max(model-ref)}, abs: {torch.max(torch.abs(model-ref))}, mean abs error: {torch.mean(torch.abs(model-ref).to(torch.double))}.\n"
                     f"\t-- Model vs. Reference --\n"
@@ -373,8 +379,10 @@ def _assert_outputs_equal(model_output, ref_output, atol=1e-03, rtol=1e-03):
                     f"\t  Mean: {model.to(torch.double).mean()}, {ref.to(torch.double).mean()}\n"
                     f"\t   Max: {model.max()}, {ref.max()}\n"
                     f"\t   Min: {model.min()}, {ref.min()}\n"
+                    f"\t   SNR: {computed_snr}\n"
                 )
 
+
     @staticmethod
     def _compare_outputs(
         reference_output,
@@ -383,6 +391,7 @@ def _compare_outputs(
         atol=1e-03,
         rtol=1e-03,
         qtol=0,
+        snr: float | None = None,
     ):
         """
         Compares the original of the original nn module with the output of the generated artifact.
@@ -405,6 +414,7 @@ def _compare_outputs(
             reference_output,
             atol=atol,
             rtol=rtol,
+            snr=snr,
         )
 
     @staticmethod
diff --git a/backends/test/suite/operators/test_abs.py b/backends/test/suite/operators/test_abs.py
@@ -86,11 +86,11 @@ def test_abs_edge_cases(self, flow: TestFlow) -> None:
         )
 
         # Tensor with infinity
-        x = torch.tensor([float("inf"), float("-inf"), 1.0, -1.0])
+        x = (torch.tensor([float("inf"), float("-inf"), 1.0, -1.0]),)
         self._test_op(AbsModel(), (x,), flow, generate_random_test_inputs=False)
 
         # Tensor with NaN
-        x = torch.tensor([float("nan"), 1.0, -1.0])
+        x = (torch.tensor([float("nan"), 1.0, -1.0]),)
         self._test_op(AbsModel(), (x,), flow, generate_random_test_inputs=False)
 
     def test_abs_scalar(self, flow: TestFlow) -> None:
diff --git a/backends/test/suite/runner.py b/backends/test/suite/runner.py
@@ -104,7 +104,10 @@ def build_result(
         # AssertionErrors to catch output mismatches, but this might catch more than that.
         try:
             tester.run_method_and_compare_outputs(
-                None if generate_random_test_inputs else inputs
+                inputs=None if generate_random_test_inputs else inputs,
+                atol=5e-2,
+                rtol=5e-2,
+                snr=40,
             )
         except AssertionError as e:
             return build_result(TestResult.OUTPUT_MISMATCH_FAIL, e)