@@ -17,6 +17,8 @@ limitations under the License.
1717package generic
1818
1919import (
20+ "fmt"
21+ "strings"
2022 "sync"
2123 "time"
2224
@@ -82,19 +84,15 @@ type ConfigMetaController struct {
8284 // controllers
8385 Configs []* v1alpha1.GenericController
8486
85- // Total timeout for any condition to succeed.
86- //
87- // NOTE:
88- // This is currently used to load config that is required
89- // to run Metac.
90- WaitTimeoutForCondition time.Duration
87+ // This will allow executing start logic to be retried
88+ // indefinitely till all the watch controllers are started
89+ RetryIndefinitelyUntilSucceed * bool
9190
92- // Interval between retries for any condition to succeed.
93- //
94- // NOTE:
95- // This is currently used to load config that is required
96- // to run Metac
97- WaitIntervalForCondition time.Duration
91+ // Maximum time to wait to start all watch controllers
92+ WaitTimeoutForStartAttempt time.Duration
93+
94+ // Interval between retries to start all watch controllers
95+ WaitIntervalBetweenRestarts time.Duration
9896
9997 opts []ConfigMetaControllerOption
10098 err error
@@ -106,8 +104,29 @@ type ConfigMetaController struct {
106104// This follows **functional options** pattern
107105type ConfigMetaControllerOption func (* ConfigMetaController ) error
108106
109- // SetMetaControllerConfigLoadFn sets the config loader function
110- func SetMetaControllerConfigLoadFn (
107+ // SetMetacConfigToRetryIndefinitelyForStart will let this
108+ // controller to retry indefinitely till all its watch controllers
109+ // are started
110+ //
111+ // NOTE:
112+ // Indefinite retry is set only when the provided flag is true
113+ func SetMetacConfigToRetryIndefinitelyForStart (enabled * bool ) ConfigMetaControllerOption {
114+ return func (c * ConfigMetaController ) error {
115+ // indefinite retry is set only if enabled is true
116+ if enabled == nil || ! * enabled {
117+ return nil
118+ }
119+ // We are not bothered about performance. We do not
120+ // need to be fast. We can be lazy but not to the
121+ // extent of sleeping for hours.
122+ c .WaitIntervalBetweenRestarts = 1 * time .Minute
123+ c .RetryIndefinitelyUntilSucceed = k8s .BoolPtr (true )
124+ return nil
125+ }
126+ }
127+
128+ // SetMetacConfigLoadFn sets the config loader function
129+ func SetMetacConfigLoadFn (
111130 fn func () ([]* v1alpha1.GenericController , error ),
112131) ConfigMetaControllerOption {
113132 return func (c * ConfigMetaController ) error {
@@ -116,8 +135,8 @@ func SetMetaControllerConfigLoadFn(
116135 }
117136}
118137
119- // SetMetaControllerConfigPath sets the config path
120- func SetMetaControllerConfigPath (path string ) ConfigMetaControllerOption {
138+ // SetMetacConfigPath sets the config path
139+ func SetMetacConfigPath (path string ) ConfigMetaControllerOption {
121140 return func (c * ConfigMetaController ) error {
122141 c .ConfigPath = path
123142 return nil
@@ -134,9 +153,12 @@ func NewConfigMetaController(
134153) (* ConfigMetaController , error ) {
135154 // initialize with defaults & the provided values
136155 ctl := & ConfigMetaController {
137- WaitTimeoutForCondition : 30 * time .Minute ,
138- WaitIntervalForCondition : 1 * time .Second ,
139- opts : opts ,
156+ // Default setting for retry
157+ // - Retry times out in 30 minutes
158+ WaitTimeoutForStartAttempt : 30 * time .Minute ,
159+ // - Interval between retries is 1 second
160+ WaitIntervalBetweenRestarts : 1 * time .Second ,
161+ opts : opts ,
140162 BaseMetaController : BaseMetaController {
141163 ResourceManager : resourceMgr ,
142164 DynClientset : dynClientset ,
@@ -230,24 +252,27 @@ func (mc *ConfigMetaController) Start() {
230252
231253 glog .Infof ("Starting %s" , mc )
232254
233- // we run this as a continuous process
234- // until all the configs are loaded
235- err := mc .wait (mc .startAllWatchControllers )
255+ // Run this with retries until all the configs are loaded
256+ // and started. In other words, this starts all the
257+ // generic controllers configured in config file eventually.
258+ err := mc .startWithRetries (mc .startAllWatchControllers )
236259 if err != nil {
237260 glog .Fatalf ("Failed to start %s: %+v" , mc , err )
238261 }
239262 }()
240263}
241264
242- // wait polls the condition until it's true, with a configured
243- // interval and timeout.
265+ // startWithRetries polls the condition until it's true, with
266+ // a configured interval and timeout.
244267//
245268// The condition function returns a bool indicating whether it
246269// is satisfied, as well as an error which should be non-nil if
247270// and only if the function was unable to determine whether or
248271// not the condition is satisfied (for example if the check
249272// involves calling a remote server and the request failed).
250- func (mc * ConfigMetaController ) wait (condition func () (bool , error )) error {
273+ func (mc * ConfigMetaController ) startWithRetries (
274+ condition func () (bool , error ),
275+ ) error {
251276 // mark the start time
252277 start := time .Now ()
253278 for {
@@ -257,31 +282,38 @@ func (mc *ConfigMetaController) wait(condition func() (bool, error)) error {
257282 // returning nil implies the condition has completed
258283 return nil
259284 }
260- if time .Since (start ) > mc .WaitTimeoutForCondition {
261- return errors .Wrapf (
262- err ,
263- "Condition timed out after %s: %s" ,
264- mc .WaitTimeoutForCondition ,
265- mc ,
266- )
285+ if time .Since (start ) > mc .WaitTimeoutForStartAttempt &&
286+ (mc .RetryIndefinitelyUntilSucceed == nil ||
287+ ! * mc .RetryIndefinitelyUntilSucceed ) {
288+ {
289+ return errors .Errorf (
290+ "Condition timed out after %s: %+v: %s" ,
291+ mc .WaitTimeoutForStartAttempt ,
292+ err ,
293+ mc ,
294+ )
295+ }
267296 }
268297 if err != nil {
269- // Log error, but keep trying until timeout.
298+ // condition resulted in error
299+ // keep trying until timeout
270300 glog .V (7 ).Infof (
271- "Condition failed: Will retry after %s: %s: %v" ,
272- mc .WaitIntervalForCondition ,
273- mc ,
301+ "Condition failed: Will retry after %s: %+v: %s" ,
302+ mc .WaitIntervalBetweenRestarts ,
274303 err ,
304+ mc ,
275305 )
276306 } else {
307+ // condition did not pass
308+ // keep trying until timeout
277309 glog .V (7 ).Infof (
278310 "Waiting for condition to succeed: Will retry after %s: %s" ,
279- mc .WaitIntervalForCondition ,
311+ mc .WaitIntervalBetweenRestarts ,
280312 mc ,
281313 )
282314 }
283315 // wait & then continue retrying
284- time .Sleep (mc .WaitIntervalForCondition )
316+ time .Sleep (mc .WaitIntervalBetweenRestarts )
285317 }
286318}
287319
@@ -290,38 +322,56 @@ func (mc *ConfigMetaController) wait(condition func() (bool, error)) error {
290322// binary
291323//
292324// NOTE:
293- // This method is used as a condition and hence can be invoked
294- // more than once .
325+ // This method is used as a condition and is repeatedly executed
326+ // under a loop till this condition is not met .
295327func (mc * ConfigMetaController ) startAllWatchControllers () (bool , error ) {
296- // In this metacontroller, we are only responsible for
297- // starting/stopping the relevant watch based controllers
328+ var errs []string
329+ // This logic is responsible to start all the generic controllers
330+ // configured in the config file
298331 for _ , conf := range mc .Configs {
299332 key := conf .AsNamespaceNameKey ()
300333 if _ , ok := mc .WatchControllers [key ]; ok {
301334 // Already added; perhaps during earlier condition
302335 // checks
303336 continue
304337 }
305- // watch controller i.e. a controller based on the resource
306- // specified in the **watch field** of GenericController
338+ // Initialize the GenericController
307339 wc , err := NewWatchController (
308340 mc .ResourceManager ,
309341 mc .DynClientset ,
310342 mc .DynInformerFactory ,
311343 conf ,
312344 )
313345 if err != nil {
314- return false , errors .Wrapf (
315- err ,
316- "Failed to init %s: %s" ,
317- key ,
318- mc ,
346+ errs = append (
347+ errs ,
348+ fmt .Sprintf (
349+ "Failed to init gctl %s: %s" ,
350+ key ,
351+ err .Error (),
352+ ),
319353 )
354+ // continue to initialise & start remaining controllers
355+ //
356+ // NOTE:
357+ // This helps operating controllers independently
358+ // from other controllers that are currently experiencing
359+ // issues. Most of the times these issues are temporary
360+ // in nature.
361+ continue
320362 }
321- // start this watch controller
363+ // start this controller
322364 wc .Start (mc .WorkerCount )
323365 mc .WatchControllers [key ] = wc
324366 }
367+ if len (errs ) != 0 {
368+ return false , errors .Errorf (
369+ "Failed to start all gctl controllers: %d errors found: %s: %s" ,
370+ len (errs ),
371+ strings .Join (errs , ": " ),
372+ mc ,
373+ )
374+ }
325375 return true , nil
326376}
327377
0 commit comments