Skip to content

Commit 469cd7d

Browse files
Remove infinite snapshot logging (#137821)
Modifies the SnapshotShutdownProgressTracker to only log the periodic progress report when there are still snapshots left to process. This removes the existing behaviour where the report will log indefinitely, even once all snapshots are completed or paused. Closes: ES-12794
1 parent f3069c6 commit 469cd7d

File tree

3 files changed

+238
-79
lines changed

3 files changed

+238
-79
lines changed

server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -566,28 +566,28 @@ public void testSnapshotShutdownProgressTracker() throws Exception {
566566
})
567567
);
568568

569+
// The SnapshotShutdownProgressTracker periodically logs the status of in-flight snapshots during node shutdown.
570+
// The periodic logger ends when the last snapshot is either finished or paused. Since this is asynchronous, to avoid race
571+
// conditions we must define all of our mocklog assertions up front to ensure that the tracker is acting as expected
572+
573+
// Check that the SnapshotShutdownProgressTracker observed the request sent to the master node.
574+
// Note that we cannot guarantee how many requests the tracker receives before finishing its logging
569575
mockLog.addExpectation(
570576
new MockLog.SeenEventExpectation(
571577
"SnapshotShutdownProgressTracker shard snapshot has paused log message",
572578
SnapshotShutdownProgressTracker.class.getCanonicalName(),
573579
Level.INFO,
574-
"*Number shard snapshots waiting for master node reply to status update request [" + numShards + "]*"
580+
"*Number shard snapshots waiting for master node reply to status update request [*]*"
575581
)
576582
);
577583

578-
// Let the shard snapshot proceed. It will still get stuck waiting for the master node to respond.
579-
unblockNode(repoName, nodeForRemoval);
580-
581-
// Check that the SnapshotShutdownProgressTracker observed the request sent to the master node.
582-
mockLog.awaitAllExpectationsMatched();
583-
resetMockLog();
584-
584+
// Check that the SnapshotShutdownProgressTracker observed the shard snapshot finishing as paused.
585585
mockLog.addExpectation(
586586
new MockLog.SeenEventExpectation(
587587
"SnapshotShutdownProgressTracker shard snapshot has paused log message",
588588
SnapshotShutdownProgressTracker.class.getCanonicalName(),
589589
Level.INFO,
590-
"Current active shard snapshot stats on data node [" + nodeForRemovalId + "]*Paused [" + numShards + "]"
590+
"*on data node [" + nodeForRemovalId + "]*Paused [" + numShards + "]"
591591
)
592592
);
593593
mockLog.addExpectation(
@@ -599,13 +599,15 @@ public void testSnapshotShutdownProgressTracker() throws Exception {
599599
)
600600
);
601601

602+
// Let the shard snapshot proceed. It will still get stuck waiting for the master node to respond.
603+
unblockNode(repoName, nodeForRemoval);
604+
602605
// Release the master node to respond
603606
snapshotStatusUpdateLatch.countDown();
604607

605608
// Wait for the snapshot to fully pause.
606609
safeAwait(snapshotPausedListener);
607610

608-
// Check that the SnapshotShutdownProgressTracker observed the shard snapshot finishing as paused.
609611
mockLog.awaitAllExpectationsMatched();
610612
resetMockLog();
611613

server/src/main/java/org/elasticsearch/snapshots/SnapshotShutdownProgressTracker.java

Lines changed: 55 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,9 @@
3030
import java.util.function.Supplier;
3131

3232
/**
33-
* Tracks progress of shard snapshots during shutdown, on this single data node. Periodically reports progress via logging, the interval for
34-
* which see {@link #SNAPSHOT_PROGRESS_DURING_SHUTDOWN_LOG_INTERVAL_SETTING}.
33+
* Tracks progress of shard snapshots during shutdown, on this single data node.
34+
* Always tracks the progress of snapshots, but only logs the period progress report once node shutdown starts.
35+
* See {@link #SNAPSHOT_PROGRESS_DURING_SHUTDOWN_LOG_INTERVAL_SETTING} for the logging interval.
3536
*/
3637
public class SnapshotShutdownProgressTracker {
3738

@@ -68,6 +69,7 @@ public class SnapshotShutdownProgressTracker {
6869

6970
/**
7071
* Tracks the number of shard snapshots that have started on the data node but not yet finished.
72+
* If the node starts shutting down, when this reaches 0, we stop logging the periodic progress report
7173
*/
7274
private final AtomicLong numberOfShardSnapshotsInProgressOnDataNode = new AtomicLong();
7375

@@ -87,6 +89,11 @@ public class SnapshotShutdownProgressTracker {
8789
private final AtomicLong abortedCount = new AtomicLong();
8890
private final AtomicLong pausedCount = new AtomicLong();
8991

92+
/**
93+
* Ensure that we only log the exit message once
94+
*/
95+
private volatile boolean exitLogMessageCompleted = false;
96+
9097
public SnapshotShutdownProgressTracker(
9198
Supplier<String> localNodeIdSupplier,
9299
Consumer<Logger> logShardStatuses,
@@ -135,27 +142,51 @@ private void cancelProgressLogger() {
135142
* Logs information about shard snapshot progress.
136143
*/
137144
private void logProgressReport() {
138-
logger.info(
139-
"""
140-
Current active shard snapshot stats on data node [{}]. \
141-
Node shutdown cluster state update received at [{} UTC]. \
142-
Finished signalling shard snapshots to pause at [{} UTC]. \
143-
Time between the node shutdown cluster state update and signalling shard snapshots to pause is [{} millis]. \
144-
Number shard snapshots running [{}]. \
145-
Number shard snapshots waiting for master node reply to status update request [{}] \
146-
Shard snapshot completion stats since shutdown began: Done [{}]; Failed [{}]; Aborted [{}]; Paused [{}]\
147-
""",
148-
getLocalNodeId.get(),
149-
DATE_TIME_FORMATTER.formatMillis(shutdownStartMillis),
150-
DATE_TIME_FORMATTER.formatMillis(shutdownFinishedSignallingPausingMillis),
151-
shutdownFinishedSignallingPausingMillis - shutdownStartMillis,
152-
numberOfShardSnapshotsInProgressOnDataNode.get(),
153-
shardSnapshotRequests.size(),
154-
doneCount.get(),
155-
failureCount.get(),
156-
abortedCount.get(),
157-
pausedCount.get()
158-
);
145+
assert exitLogMessageCompleted == false : "We should not log after the exit message";
146+
// If there are no more snapshots in progress, then stop logging periodic progress reports
147+
if (numberOfShardSnapshotsInProgressOnDataNode.get() == 0 && shardSnapshotRequests.isEmpty()) {
148+
logger.info(
149+
"""
150+
All shard snapshots have finished or been paused on data node [{}].\
151+
Node shutdown cluster state update received at [{} UTC]. \
152+
Progress logging completed at [{} UTC]. \
153+
Number shard snapshots waiting for master node reply to status update request [{}] \
154+
Shard snapshot completion stats since shutdown began: Done [{}]; Failed [{}]; Aborted [{}]; Paused [{}]\
155+
""",
156+
getLocalNodeId.get(),
157+
DATE_TIME_FORMATTER.formatMillis(shutdownStartMillis),
158+
DATE_TIME_FORMATTER.formatMillis(threadPool.relativeTimeInMillis()),
159+
shardSnapshotRequests.size(),
160+
doneCount.get(),
161+
failureCount.get(),
162+
abortedCount.get(),
163+
pausedCount.get()
164+
);
165+
cancelProgressLogger();
166+
this.exitLogMessageCompleted = true;
167+
} else {
168+
logger.info(
169+
"""
170+
Current active shard snapshot stats on data node [{}]. \
171+
Node shutdown cluster state update received at [{} UTC]. \
172+
Finished signalling shard snapshots to pause at [{} UTC]. \
173+
Time between the node shutdown cluster state update and signalling shard snapshots to pause is [{} millis]. \
174+
Number shard snapshots running [{}]. \
175+
Number shard snapshots waiting for master node reply to status update request [{}] \
176+
Shard snapshot completion stats since shutdown began: Done [{}]; Failed [{}]; Aborted [{}]; Paused [{}]\
177+
""",
178+
getLocalNodeId.get(),
179+
DATE_TIME_FORMATTER.formatMillis(shutdownStartMillis),
180+
DATE_TIME_FORMATTER.formatMillis(shutdownFinishedSignallingPausingMillis),
181+
shutdownFinishedSignallingPausingMillis - shutdownStartMillis,
182+
numberOfShardSnapshotsInProgressOnDataNode.get(),
183+
shardSnapshotRequests.size(),
184+
doneCount.get(),
185+
failureCount.get(),
186+
abortedCount.get(),
187+
pausedCount.get()
188+
);
189+
}
159190
// Use a callback to log the shard snapshot details.
160191
logIndexShardSnapshotStatuses.accept(logger);
161192
}
@@ -204,6 +235,7 @@ public void onClusterStateRemoveShutdown() {
204235

205236
// Turn off the progress logger, which we only want to run during shutdown.
206237
cancelProgressLogger();
238+
this.exitLogMessageCompleted = false;
207239
}
208240

209241
/**

0 commit comments

Comments
 (0)