Skip to content

Commit c862f75

Browse files
authored
Add EMTEST_RETRY_COUNT to force retrying failed tests (#25565)
The way this differs from the existing EMTEST_RETRY_FLAKY is that this retries any failed test, whereas EMTEST_RETRY_FLAKY only retries those tests that had explicitly been deemed to be flaky beforehand. The rationale for adding this feature is twofold: 1. There is currently so much flakiness in tests in the current test suites, that I would be flagging flaky tests for many months to come in my own CI. It is unclear if some of that flakiness is a harness problem or a systemic problem rather than an individual test problem, so I could end up flagging a majority of all tests in the suites as flaky. 2. Whenever a test fails in my CI, the very first thing I need to check is whether the failure was just a one-off, or whether the failure was a deterministic failure. So being able to run with `EMTEST_RETRY_COUNT=5` will automate such testing for me and immediately give me feedback whether any test failure was deterministic or intermittent.
1 parent a4a8121 commit c862f75

File tree

3 files changed

+46
-5
lines changed

3 files changed

+46
-5
lines changed

test/common.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from typing import Dict, Tuple
1111
from urllib.parse import unquote, unquote_plus, urlparse, parse_qs
1212
from http.server import ThreadingHTTPServer, SimpleHTTPRequestHandler
13+
from retryable_unittest import RetryableTestCase
1314
import contextlib
1415
import difflib
1516
import hashlib
@@ -286,8 +287,8 @@ def is_slow_test(func):
286287
return decorated
287288

288289

289-
def record_flaky_test(test_name, attempt_count, exception_msg):
290-
logging.info(f'Retrying flaky test "{test_name}" (attempt {attempt_count}/{EMTEST_RETRY_FLAKY} failed):\n{exception_msg}')
290+
def record_flaky_test(test_name, attempt_count, max_attempts, exception_msg):
291+
logging.info(f'Retrying flaky test "{test_name}" (attempt {attempt_count}/{max_attempts} failed):\n{exception_msg}')
291292
open(flaky_tests_log_filename, 'a').write(f'{test_name}\n')
292293

293294

@@ -313,7 +314,7 @@ def modified(self, *args, **kwargs):
313314
return func(self, *args, **kwargs)
314315
except (AssertionError, subprocess.TimeoutExpired) as exc:
315316
preserved_exc = exc
316-
record_flaky_test(self.id(), i, exc)
317+
record_flaky_test(self.id(), i, EMTEST_RETRY_FLAKY, exc)
317318

318319
raise AssertionError('Flaky test has failed too many times') from preserved_exc
319320

@@ -1032,7 +1033,7 @@ def __new__(mcs, name, bases, attrs):
10321033
return type.__new__(mcs, name, bases, new_attrs)
10331034

10341035

1035-
class RunnerCore(unittest.TestCase, metaclass=RunnerMeta):
1036+
class RunnerCore(RetryableTestCase, metaclass=RunnerMeta):
10361037
# default temporary directory settings. set_temp_dir may be called later to
10371038
# override these
10381039
temp_dir = shared.TEMP_DIR
@@ -2774,7 +2775,7 @@ def run_browser(self, html_file, expected=None, message=None, timeout=None, extr
27742775
self.assertContained(expected, output)
27752776
except self.failureException as e:
27762777
if extra_tries > 0:
2777-
record_flaky_test(self.id(), EMTEST_RETRY_FLAKY - extra_tries, e)
2778+
record_flaky_test(self.id(), EMTEST_RETRY_FLAKY - extra_tries, EMTEST_RETRY_FLAKY, e)
27782779
if not self.capture_stdio:
27792780
print('[enabling stdio/stderr reporting]')
27802781
self.capture_stdio = True

test/parallel_testsuite.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,8 @@ def __init__(self, lock, progress_counter, num_tests):
244244
self.lock = lock
245245
self.progress_counter = progress_counter
246246
self.num_tests = num_tests
247+
self.failures = []
248+
self.errors = []
247249

248250
@property
249251
def test(self):
@@ -336,12 +338,14 @@ def addFailure(self, test, err):
336338
errlog(f'{self.compute_progress()}{with_color(RED, msg)}')
337339
self.buffered_result = BufferedTestFailure(test, err)
338340
self.test_result = 'failed'
341+
self.failures += [test]
339342

340343
def addError(self, test, err):
341344
msg = f'{test} ... ERROR'
342345
errlog(f'{self.compute_progress()}{with_color(RED, msg)}')
343346
self.buffered_result = BufferedTestError(test, err)
344347
self.test_result = 'errored'
348+
self.errors += [test]
345349

346350

347351
class BufferedTestBase:

test/retryable_unittest.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import common
2+
import os
3+
import unittest
4+
5+
EMTEST_RETRY_COUNT = int(os.getenv('EMTEST_RETRY_COUNT', '0'))
6+
7+
8+
class RetryableTestCase(unittest.TestCase):
9+
'''This class patches in to the Python unittest TestCase object to incorporate
10+
support for an environment variable EMTEST_RETRY_COUNT=x, which enables a
11+
failed test to be automatically re-run to test if the failure might have been
12+
due to an instability.'''
13+
14+
def run(self, result=None):
15+
retries_left = EMTEST_RETRY_COUNT
16+
17+
num_fails = len(result.failures)
18+
num_errors = len(result.errors)
19+
20+
while retries_left >= 0:
21+
super().run(result)
22+
23+
# The test passed if it didn't accumulate an error.
24+
if len(result.failures) == num_fails and len(result.errors) == num_errors:
25+
return
26+
27+
retries_left -= 1
28+
if retries_left >= 0:
29+
if len(result.failures) != num_fails:
30+
err = result.failures.pop(-1)
31+
elif len(result.errors) != num_errors:
32+
err = result.errors.pop(-1)
33+
else:
34+
raise Exception('Internal error in RetryableTestCase: did not detect an error')
35+
36+
common.record_flaky_test(self.id(), EMTEST_RETRY_COUNT - retries_left, EMTEST_RETRY_COUNT, str(err))

0 commit comments

Comments
 (0)