3030import java .util .function .Supplier ;
3131
3232/**
33- * Tracks progress of shard snapshots during shutdown, on this single data node. Periodically reports progress via logging, the interval for
34- * which see {@link #SNAPSHOT_PROGRESS_DURING_SHUTDOWN_LOG_INTERVAL_SETTING}.
33+ * Tracks progress of shard snapshots during shutdown, on this single data node.
34+ * Always tracks the progress of snapshots, but only logs the period progress report once node shutdown starts.
35+ * See {@link #SNAPSHOT_PROGRESS_DURING_SHUTDOWN_LOG_INTERVAL_SETTING} for the logging interval.
3536 */
3637public class SnapshotShutdownProgressTracker {
3738
@@ -68,6 +69,7 @@ public class SnapshotShutdownProgressTracker {
6869
6970 /**
7071 * Tracks the number of shard snapshots that have started on the data node but not yet finished.
72+ * If the node starts shutting down, when this reaches 0, we stop logging the periodic progress report
7173 */
7274 private final AtomicLong numberOfShardSnapshotsInProgressOnDataNode = new AtomicLong ();
7375
@@ -87,6 +89,11 @@ public class SnapshotShutdownProgressTracker {
8789 private final AtomicLong abortedCount = new AtomicLong ();
8890 private final AtomicLong pausedCount = new AtomicLong ();
8991
92+ /**
93+ * Ensure that we only log the exit message once
94+ */
95+ private volatile boolean exitLogMessageCompleted = false ;
96+
9097 public SnapshotShutdownProgressTracker (
9198 Supplier <String > localNodeIdSupplier ,
9299 Consumer <Logger > logShardStatuses ,
@@ -135,27 +142,51 @@ private void cancelProgressLogger() {
135142 * Logs information about shard snapshot progress.
136143 */
137144 private void logProgressReport () {
138- logger .info (
139- """
140- Current active shard snapshot stats on data node [{}]. \
141- Node shutdown cluster state update received at [{} UTC]. \
142- Finished signalling shard snapshots to pause at [{} UTC]. \
143- Time between the node shutdown cluster state update and signalling shard snapshots to pause is [{} millis]. \
144- Number shard snapshots running [{}]. \
145- Number shard snapshots waiting for master node reply to status update request [{}] \
146- Shard snapshot completion stats since shutdown began: Done [{}]; Failed [{}]; Aborted [{}]; Paused [{}]\
147- """ ,
148- getLocalNodeId .get (),
149- DATE_TIME_FORMATTER .formatMillis (shutdownStartMillis ),
150- DATE_TIME_FORMATTER .formatMillis (shutdownFinishedSignallingPausingMillis ),
151- shutdownFinishedSignallingPausingMillis - shutdownStartMillis ,
152- numberOfShardSnapshotsInProgressOnDataNode .get (),
153- shardSnapshotRequests .size (),
154- doneCount .get (),
155- failureCount .get (),
156- abortedCount .get (),
157- pausedCount .get ()
158- );
145+ assert exitLogMessageCompleted == false : "We should not log after the exit message" ;
146+ // If there are no more snapshots in progress, then stop logging periodic progress reports
147+ if (numberOfShardSnapshotsInProgressOnDataNode .get () == 0 && shardSnapshotRequests .isEmpty ()) {
148+ logger .info (
149+ """
150+ All shard snapshots have finished or been paused on data node [{}].\
151+ Node shutdown cluster state update received at [{} UTC]. \
152+ Progress logging completed at [{} UTC]. \
153+ Number shard snapshots waiting for master node reply to status update request [{}] \
154+ Shard snapshot completion stats since shutdown began: Done [{}]; Failed [{}]; Aborted [{}]; Paused [{}]\
155+ """ ,
156+ getLocalNodeId .get (),
157+ DATE_TIME_FORMATTER .formatMillis (shutdownStartMillis ),
158+ DATE_TIME_FORMATTER .formatMillis (threadPool .relativeTimeInMillis ()),
159+ shardSnapshotRequests .size (),
160+ doneCount .get (),
161+ failureCount .get (),
162+ abortedCount .get (),
163+ pausedCount .get ()
164+ );
165+ cancelProgressLogger ();
166+ this .exitLogMessageCompleted = true ;
167+ } else {
168+ logger .info (
169+ """
170+ Current active shard snapshot stats on data node [{}]. \
171+ Node shutdown cluster state update received at [{} UTC]. \
172+ Finished signalling shard snapshots to pause at [{} UTC]. \
173+ Time between the node shutdown cluster state update and signalling shard snapshots to pause is [{} millis]. \
174+ Number shard snapshots running [{}]. \
175+ Number shard snapshots waiting for master node reply to status update request [{}] \
176+ Shard snapshot completion stats since shutdown began: Done [{}]; Failed [{}]; Aborted [{}]; Paused [{}]\
177+ """ ,
178+ getLocalNodeId .get (),
179+ DATE_TIME_FORMATTER .formatMillis (shutdownStartMillis ),
180+ DATE_TIME_FORMATTER .formatMillis (shutdownFinishedSignallingPausingMillis ),
181+ shutdownFinishedSignallingPausingMillis - shutdownStartMillis ,
182+ numberOfShardSnapshotsInProgressOnDataNode .get (),
183+ shardSnapshotRequests .size (),
184+ doneCount .get (),
185+ failureCount .get (),
186+ abortedCount .get (),
187+ pausedCount .get ()
188+ );
189+ }
159190 // Use a callback to log the shard snapshot details.
160191 logIndexShardSnapshotStatuses .accept (logger );
161192 }
@@ -204,6 +235,7 @@ public void onClusterStateRemoveShutdown() {
204235
205236 // Turn off the progress logger, which we only want to run during shutdown.
206237 cancelProgressLogger ();
238+ this .exitLogMessageCompleted = false ;
207239 }
208240
209241 /**
0 commit comments