Add EMTEST_RETRY_COUNT to force retrying failed tests (#25565)

juj · web-flow · commit c862f75b7054 · 2025-10-16T15:47:42.000Z
The way this differs from the existing EMTEST_RETRY_FLAKY is that this
retries any failed test, whereas EMTEST_RETRY_FLAKY only retries those
tests that had explicitly been deemed to be flaky beforehand.

The rationale for adding this feature is twofold:

1. There is currently so much flakiness in the test suites that I would
be flagging flaky tests in my own CI for many months to come. It is
unclear whether some of that flakiness is a harness problem or a systemic
problem rather than an individual test problem, so I could end up
flagging a majority of all tests in the suites as flaky.

2. Whenever a test fails in my CI, the very first thing I need to check
is whether the failure was just a one-off or whether it was
deterministic. Being able to run with `EMTEST_RETRY_COUNT=5` will
automate that check for me and immediately tell me whether any test
failure was deterministic or intermittent.
diff --git a/test/common.py b/test/common.py
@@ -10,6 +10,7 @@
 from typing import Dict, Tuple
 from urllib.parse import unquote, unquote_plus, urlparse, parse_qs
 from http.server import ThreadingHTTPServer, SimpleHTTPRequestHandler
+from retryable_unittest import RetryableTestCase
 import contextlib
 import difflib
 import hashlib
@@ -286,8 +287,17 @@ def is_slow_test(func):
   return decorated
 
 
-def record_flaky_test(test_name, attempt_count, exception_msg):
-  logging.info(f'Retrying flaky test "{test_name}" (attempt {attempt_count}/{EMTEST_RETRY_FLAKY} failed):\n{exception_msg}')
-  open(flaky_tests_log_filename, 'a').write(f'{test_name}\n')
+def record_flaky_test(test_name, attempt_count, max_attempts, exception_msg):
+  """Log one failed attempt of a test that is being retried, and append the
+  test's name to the flaky tests log file.
+
+  `attempt_count` and `max_attempts` are only used to format the log message;
+  `exception_msg` is the failure to report (anything an f-string can format).
+  """
+  logging.info(f'Retrying flaky test "{test_name}" (attempt {attempt_count}/{max_attempts} failed):\n{exception_msg}')
+  # Use a context manager so the log file is flushed and closed right away
+  # instead of whenever the file object happens to be garbage collected.
+  with open(flaky_tests_log_filename, 'a') as f:
+    f.write(f'{test_name}\n')
 
 
@@ -313,7 +314,7 @@ def modified(self, *args, **kwargs):
           return func(self, *args, **kwargs)
         except (AssertionError, subprocess.TimeoutExpired) as exc:
           preserved_exc = exc
-          record_flaky_test(self.id(), i, exc)
+          record_flaky_test(self.id(), i, EMTEST_RETRY_FLAKY, exc)
 
       raise AssertionError('Flaky test has failed too many times') from preserved_exc
 
@@ -1032,7 +1033,7 @@ def __new__(mcs, name, bases, attrs):
     return type.__new__(mcs, name, bases, new_attrs)
 
 
-class RunnerCore(unittest.TestCase, metaclass=RunnerMeta):
+class RunnerCore(RetryableTestCase, metaclass=RunnerMeta):
   # default temporary directory settings. set_temp_dir may be called later to
   # override these
   temp_dir = shared.TEMP_DIR
@@ -2774,7 +2775,7 @@ def run_browser(self, html_file, expected=None, message=None, timeout=None, extr
             self.assertContained(expected, output)
           except self.failureException as e:
             if extra_tries > 0:
-              record_flaky_test(self.id(), EMTEST_RETRY_FLAKY - extra_tries, e)
+              record_flaky_test(self.id(), EMTEST_RETRY_FLAKY - extra_tries, EMTEST_RETRY_FLAKY, e)
               if not self.capture_stdio:
                 print('[enabling stdio/stderr reporting]')
                 self.capture_stdio = True
diff --git a/test/parallel_testsuite.py b/test/parallel_testsuite.py
@@ -244,6 +244,8 @@ def __init__(self, lock, progress_counter, num_tests):
     self.lock = lock
     self.progress_counter = progress_counter
     self.num_tests = num_tests
+    self.failures = []
+    self.errors = []
 
   @property
   def test(self):
@@ -336,12 +338,17 @@ def addFailure(self, test, err):
     errlog(f'{self.compute_progress()}{with_color(RED, msg)}')
     self.buffered_result = BufferedTestFailure(test, err)
     self.test_result = 'failed'
+    # Keep a per-result list of failed tests so that code comparing
+    # len(result.failures) around a run can tell that this test failed.
+    self.failures.append(test)
 
   def addError(self, test, err):
     msg = f'{test} ... ERROR'
     errlog(f'{self.compute_progress()}{with_color(RED, msg)}')
     self.buffered_result = BufferedTestError(test, err)
     self.test_result = 'errored'
+    # Same bookkeeping as for failures, for tests that raised an error.
+    self.errors.append(test)
 
 
 class BufferedTestBase:
diff --git a/test/retryable_unittest.py b/test/retryable_unittest.py
@@ -0,0 +1,66 @@
+import common
+import os
+import unittest
+
+EMTEST_RETRY_COUNT = int(os.getenv('EMTEST_RETRY_COUNT', '0'))
+
+
+class RetryableTestCase(unittest.TestCase):
+  '''This class patches in to the Python unittest TestCase object to incorporate
+  support for an environment variable EMTEST_RETRY_COUNT=x, which enables a
+  failed test to be automatically re-run to test if the failure might have been
+  due to an instability.'''
+
+  def run(self, result=None):
+    '''Run the test, re-running it up to EMTEST_RETRY_COUNT extra times while
+    it keeps failing.
+
+    Failures and errors recorded by an attempt that is going to be retried are
+    removed from `result` again and logged via common.record_flaky_test(); only
+    the outcome of the last attempt stays in `result`.
+
+    As with unittest.TestCase.run(), a temporary result object is created when
+    `result` is None, and the result object is returned to the caller.'''
+    # Mirror unittest.TestCase.run(): without a caller-supplied result we own a
+    # temporary one and are responsible for its startTestRun/stopTestRun hooks.
+    owns_result = result is None
+    if owns_result:
+      result = self.defaultTestResult()
+      start_test_run = getattr(result, 'startTestRun', None)
+      if start_test_run is not None:
+        start_test_run()
+
+    try:
+      # Anything recorded past these marks belongs to the current test.
+      num_fails = len(result.failures)
+      num_errors = len(result.errors)
+
+      retries_left = EMTEST_RETRY_COUNT
+      while True:
+        super().run(result)
+
+        new_fails = result.failures[num_fails:]
+        new_errors = result.errors[num_errors:]
+        # The test passed if it didn't accumulate a failure or an error.
+        if not new_fails and not new_errors:
+          break
+
+        retries_left -= 1
+        if retries_left < 0:
+          # Out of retries: leave the last attempt's outcome in the result.
+          break
+
+        # A single attempt can record both a failure (test body) and an error
+        # (e.g. from tearDown), so roll both lists back; a stale entry would
+        # otherwise make a later passing attempt look like it failed.
+        del result.failures[num_fails:]
+        del result.errors[num_errors:]
+        err = new_fails[-1] if new_fails else new_errors[-1]
+        common.record_flaky_test(self.id(), EMTEST_RETRY_COUNT - retries_left, EMTEST_RETRY_COUNT, str(err))
+
+      return result
+    finally:
+      if owns_result:
+        stop_test_run = getattr(result, 'stopTestRun', None)
+        if stop_test_run is not None:
+          stop_test_run()