Skip to content

Commit b1732c6

Browse files
stats/opentelemetry: fix flaky TestTraceSpan_WithRetriesAndNameResolutionDelay
This commit fixes the flaky test TestTraceSpan_WithRetriesAndNameResolutionDelay, which was introduced in the previous commit and caused PR #8342 to be reverted.

Root cause: the test had timing-related race conditions:
1. The goroutine that updates the resolver state could complete before or after the delayed resolution event was fully processed and recorded in spans.
2. Span export timing was not synchronized with test validation, so the test sometimes checked spans before they were fully exported.

Fix:
1. Added a 'stateUpdated' event to synchronize the completion of the resolver state update with the start of span validation.
2. Added an explicit wait for the stateUpdated event before validating spans.
3. Added a 50ms sleep after RPC completion to give the span exporter time to process and export all spans before validation.

Testing:
- The test now passes consistently (10+ consecutive runs).
- It passes with the race detector enabled (-race flag).
- No data races were detected.

Fixes #8700
1 parent adce981 commit b1732c6

File tree

1 file changed

+20
-0
lines changed

1 file changed

+20
-0
lines changed

stats/opentelemetry/e2e_test.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1704,6 +1704,7 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) {
17041704
for _, tt := range tests {
17051705
t.Run(tt.name, func(t *testing.T) {
17061706
resolutionWait := grpcsync.NewEvent()
1707+
stateUpdated := grpcsync.NewEvent()
17071708
prevHook := internal.NewStreamWaitingForResolver
17081709
internal.NewStreamWaitingForResolver = func() { resolutionWait.Fire() }
17091710
defer func() { internal.NewStreamWaitingForResolver = prevHook }()
@@ -1746,13 +1747,32 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) {
17461747
ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
17471748
defer cancel()
17481749

1750+
// Start the goroutine that will update resolver state once the stream
1751+
// is waiting for resolution. Use stateUpdated event to ensure the
1752+
// resolver state is updated before we start validating spans.
17491753
go func() {
17501754
<-resolutionWait.Done()
17511755
rb.UpdateState(resolver.State{Addresses: []resolver.Address{{Addr: ss.Address}}})
1756+
stateUpdated.Fire()
17521757
}()
1758+
17531759
if err := tt.doCall(ctx, client); err != nil {
17541760
t.Fatalf("%s call failed: %v", tt.name, err)
17551761
}
1762+
1763+
// Wait for the resolver state to be updated to ensure the delayed
1764+
// resolution event has been processed.
1765+
select {
1766+
case <-stateUpdated.Done():
1767+
case <-ctx.Done():
1768+
t.Fatal("Timed out waiting for resolver state update")
1769+
}
1770+
1771+
// Give the span exporter a small amount of time to process and export
1772+
// all spans from the completed RPC. This reduces flakiness by ensuring
1773+
// all trace events have been fully recorded before validation.
1774+
time.Sleep(50 * time.Millisecond)
1775+
17561776
spans, err := waitForTraceSpans(ctx, exporter, tt.wantSpanInfos)
17571777
if err != nil {
17581778
t.Fatal(err)

0 commit comments

Comments
 (0)