This repository was archived by the owner on May 14, 2025. It is now read-only.

Commit 370ef1a

Author: Amit Kumar Das
feat(gctl): #142 add indefinite retry option during metac startup (#143)
This commit closes #142.

A new boolean flag named 'retry-indefinitely-to-start' is added that lets metac retry indefinitely to start all the configured generic controllers. When this flag is set to true, retries happen even when one or more configured Kubernetes custom resource definitions are not yet available in the cluster. Once these custom definitions become available and are discovered by metac, the retries stop completely.

When this flag is not set, which is also the default, the metac binary panics (i.e. the metac container stops running) if any of the configured controllers make use of custom resources whose definitions are not yet available in the cluster. The binary panics after exhausting the configured timeout, which defaults to 30 minutes.

This feature helps teams manage transient phases during controller upgrades, for example when the upgraded controllers make use of new custom resource definition(s) that are yet to be applied against the cluster.

Signed-off-by: AmitKumarDas <amit.das@mayadata.io>
1 parent 55357e9 commit 370ef1a

File tree

12 files changed: +351 −91 lines

Makefile

Lines changed: 1 addition & 1 deletion

@@ -109,7 +109,7 @@ integration-dependencies: all
 .PHONY: integration-test
 integration-test: integration-dependencies
 	@go test ./test/integration/... \
-		-v -timeout=10m -args --logtostderr --alsologtostderr -v=1
+		-v -short -timeout=10m -args --logtostderr --alsologtostderr -v=1

 # integration-test-crd-mode runs tests with metac loading
 # metacontrollers as kubernetes custom resources
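The only change here passes Go's standard -short flag to the integration-test run, so any test that checks testing.Short() skips itself during this target. As a reminder of how that gate works, here is a minimal, illustrative Go test; it is not a file from this commit:

package integration_test

import "testing"

// TestExampleIntegration shows how the -short flag is typically honoured:
// the test skips itself when `go test -short` is used.
func TestExampleIntegration(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping long-running scenario in short mode")
	}
	// ... exercise the long-running behaviour here ...
}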

controller/generic/metacontroller.go

Lines changed: 100 additions & 50 deletions

@@ -17,6 +17,8 @@ limitations under the License.
 package generic

 import (
+	"fmt"
+	"strings"
 	"sync"
 	"time"

@@ -82,19 +84,15 @@ type ConfigMetaController struct {
 	// controllers
 	Configs []*v1alpha1.GenericController

-	// Total timeout for any condition to succeed.
-	//
-	// NOTE:
-	//	This is currently used to load config that is required
-	//	to run Metac.
-	WaitTimeoutForCondition time.Duration
+	// This will allow executing start logic to be retried
+	// indefinitely till all the watch controllers are started
+	RetryIndefinitelyUntilSucceed *bool

-	// Interval between retries for any condition to succeed.
-	//
-	// NOTE:
-	//	This is currently used to load config that is required
-	//	to run Metac
-	WaitIntervalForCondition time.Duration
+	// Maximum time to wait to start all watch controllers
+	WaitTimeoutForStartAttempt time.Duration
+
+	// Interval between retries to start all watch controllers
+	WaitIntervalBetweenRestarts time.Duration

 	opts []ConfigMetaControllerOption
 	err  error

@@ -106,8 +104,29 @@ type ConfigMetaController struct {
 // This follows **functional options** pattern
 type ConfigMetaControllerOption func(*ConfigMetaController) error

-// SetMetaControllerConfigLoadFn sets the config loader function
-func SetMetaControllerConfigLoadFn(
+// SetMetacConfigToRetryIndefinitelyForStart will let this
+// controller to retry indefinitely till all its watch controllers
+// are started
+//
+// NOTE:
+//	Indefinite retry is set only when the provided flag is true
+func SetMetacConfigToRetryIndefinitelyForStart(enabled *bool) ConfigMetaControllerOption {
+	return func(c *ConfigMetaController) error {
+		// indefinite retry is set only if enabled is true
+		if enabled == nil || !*enabled {
+			return nil
+		}
+		// We are not bothered about performance. We do not
+		// need to be fast. We can be lazy but not to the
+		// extent of sleeping for hours.
+		c.WaitIntervalBetweenRestarts = 1 * time.Minute
+		c.RetryIndefinitelyUntilSucceed = k8s.BoolPtr(true)
+		return nil
+	}
+}
+
+// SetMetacConfigLoadFn sets the config loader function
+func SetMetacConfigLoadFn(
 	fn func() ([]*v1alpha1.GenericController, error),
 ) ConfigMetaControllerOption {
 	return func(c *ConfigMetaController) error {

@@ -116,8 +135,8 @@ func SetMetaControllerConfigLoadFn(
 	}
 }

-// SetMetaControllerConfigPath sets the config path
-func SetMetaControllerConfigPath(path string) ConfigMetaControllerOption {
+// SetMetacConfigPath sets the config path
+func SetMetacConfigPath(path string) ConfigMetaControllerOption {
 	return func(c *ConfigMetaController) error {
 		c.ConfigPath = path
 		return nil

@@ -134,9 +153,12 @@ func NewConfigMetaController(
 ) (*ConfigMetaController, error) {
 	// initialize with defaults & the provided values
 	ctl := &ConfigMetaController{
-		WaitTimeoutForCondition:  30 * time.Minute,
-		WaitIntervalForCondition: 1 * time.Second,
-		opts:                     opts,
+		// Default setting for retry
+		// - Retry times out in 30 minutes
+		WaitTimeoutForStartAttempt: 30 * time.Minute,
+		// - Interval between retries is 1 second
+		WaitIntervalBetweenRestarts: 1 * time.Second,
+		opts:                        opts,
 		BaseMetaController: BaseMetaController{
 			ResourceManager:    resourceMgr,
 			DynClientset:       dynClientset,

@@ -230,24 +252,27 @@ func (mc *ConfigMetaController) Start() {

 		glog.Infof("Starting %s", mc)

-		// we run this as a continuous process
-		// until all the configs are loaded
-		err := mc.wait(mc.startAllWatchControllers)
+		// Run this with retries until all the configs are loaded
+		// and started. In other words, this starts all the
+		// generic controllers configured in config file eventually.
+		err := mc.startWithRetries(mc.startAllWatchControllers)
 		if err != nil {
 			glog.Fatalf("Failed to start %s: %+v", mc, err)
 		}
 	}()
 }

-// wait polls the condition until it's true, with a configured
-// interval and timeout.
+// startWithRetries polls the condition until it's true, with
+// a configured interval and timeout.
 //
 // The condition function returns a bool indicating whether it
 // is satisfied, as well as an error which should be non-nil if
 // and only if the function was unable to determine whether or
 // not the condition is satisfied (for example if the check
 // involves calling a remote server and the request failed).
-func (mc *ConfigMetaController) wait(condition func() (bool, error)) error {
+func (mc *ConfigMetaController) startWithRetries(
+	condition func() (bool, error),
+) error {
 	// mark the start time
 	start := time.Now()
 	for {

@@ -257,31 +282,38 @@ func (mc *ConfigMetaController) wait(condition func() (bool, error)) error {
 			// returning nil implies the condition has completed
 			return nil
 		}
-		if time.Since(start) > mc.WaitTimeoutForCondition {
-			return errors.Wrapf(
-				err,
-				"Condition timed out after %s: %s",
-				mc.WaitTimeoutForCondition,
-				mc,
-			)
+		if time.Since(start) > mc.WaitTimeoutForStartAttempt &&
+			(mc.RetryIndefinitelyUntilSucceed == nil ||
+				!*mc.RetryIndefinitelyUntilSucceed) {
+			{
+				return errors.Errorf(
+					"Condition timed out after %s: %+v: %s",
+					mc.WaitTimeoutForStartAttempt,
+					err,
+					mc,
+				)
+			}
 		}
 		if err != nil {
-			// Log error, but keep trying until timeout.
+			// condition resulted in error
+			// keep trying until timeout
 			glog.V(7).Infof(
-				"Condition failed: Will retry after %s: %s: %v",
-				mc.WaitIntervalForCondition,
-				mc,
+				"Condition failed: Will retry after %s: %+v: %s",
+				mc.WaitIntervalBetweenRestarts,
 				err,
+				mc,
 			)
 		} else {
+			// condition did not pass
+			// keep trying until timeout
 			glog.V(7).Infof(
 				"Waiting for condition to succeed: Will retry after %s: %s",
-				mc.WaitIntervalForCondition,
+				mc.WaitIntervalBetweenRestarts,
 				mc,
 			)
 		}
 		// wait & then continue retrying
-		time.Sleep(mc.WaitIntervalForCondition)
+		time.Sleep(mc.WaitIntervalBetweenRestarts)
 	}
 }

@@ -290,38 +322,56 @@ func (mc *ConfigMetaController) wait(condition func() (bool, error)) error {
 // binary
 //
 // NOTE:
-//	This method is used as a condition and hence can be invoked
-//	more than once.
+//	This method is used as a condition and is repeatedly executed
+//	under a loop till this condition is not met.
 func (mc *ConfigMetaController) startAllWatchControllers() (bool, error) {
-	// In this metacontroller, we are only responsible for
-	// starting/stopping the relevant watch based controllers
+	var errs []string
+	// This logic is responsible to start all the generic controllers
+	// configured in the config file
 	for _, conf := range mc.Configs {
 		key := conf.AsNamespaceNameKey()
 		if _, ok := mc.WatchControllers[key]; ok {
 			// Already added; perhaps during earlier condition
 			// checks
 			continue
 		}
-		// watch controller i.e. a controller based on the resource
-		// specified in the **watch field** of GenericController
+		// Initialize the GenericController
 		wc, err := NewWatchController(
 			mc.ResourceManager,
 			mc.DynClientset,
 			mc.DynInformerFactory,
 			conf,
 		)
 		if err != nil {
-			return false, errors.Wrapf(
-				err,
-				"Failed to init %s: %s",
-				key,
-				mc,
+			errs = append(
+				errs,
+				fmt.Sprintf(
+					"Failed to init gctl %s: %s",
+					key,
+					err.Error(),
+				),
 			)
+			// continue to initialise & start remaining controllers
+			//
+			// NOTE:
+			//	This helps operating controllers independently
+			//	from other controllers that are currently experiencing
+			//	issues. Most of the times these issues are temporary
+			//	in nature.
+			continue
 		}
-		// start this watch controller
+		// start this controller
 		wc.Start(mc.WorkerCount)
 		mc.WatchControllers[key] = wc
 	}
+	if len(errs) != 0 {
+		return false, errors.Errorf(
+			"Failed to start all gctl controllers: %d errors found: %s: %s",
+			len(errs),
+			strings.Join(errs, ": "),
+			mc,
		)
+	}
 	return true, nil
 }
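The core behavioural change sits in the timeout check of startWithRetries above: the timeout is enforced only when RetryIndefinitelyUntilSucceed is nil or false; otherwise the loop keeps retrying forever. Below is a stripped-down, self-contained sketch of that decision using hypothetical names rather than the actual metac types:

package main

import (
	"errors"
	"fmt"
	"time"
)

// retryUntil is a hypothetical, simplified version of the startWithRetries
// logic shown in the diff above: keep invoking condition until it returns
// true; give up after timeout unless indefinite retry is enabled.
func retryUntil(
	condition func() (bool, error),
	interval, timeout time.Duration,
	indefinite bool,
) error {
	start := time.Now()
	for {
		done, err := condition()
		if done {
			return nil
		}
		// the timeout applies only when indefinite retry is off
		if !indefinite && time.Since(start) > timeout {
			return fmt.Errorf("condition timed out after %s: %v", timeout, err)
		}
		time.Sleep(interval)
	}
}

func main() {
	attempts := 0
	err := retryUntil(
		func() (bool, error) {
			attempts++
			if attempts < 3 {
				return false, errors.New("CRD not yet discovered")
			}
			return true, nil
		},
		10*time.Millisecond, time.Second, false,
	)
	fmt.Println("attempts:", attempts, "err:", err)
}

With indefinite set to true the timeout branch is never taken, which mirrors how metac keeps retrying until the missing custom resource definitions eventually show up.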

controller/generic/metacontroller_test.go

Lines changed: 9 additions & 9 deletions

@@ -159,16 +159,16 @@ func TestConfigMetaControllerIsDuplicateConfig(t *testing.T) {
 	}
 }

-func TestConfigMetaControllerWait(t *testing.T) {
+func TestConfigMetaControllerStartWithRetry(t *testing.T) {
 	var tests = map[string]struct {
 		controller *ConfigMetaController
 		condition  func() (bool, error)
 		isErr      bool
 	}{
 		"cond returns error": {
 			controller: &ConfigMetaController{
-				WaitIntervalForCondition: 1 * time.Second,
-				WaitTimeoutForCondition:  5 * time.Second,
+				WaitIntervalBetweenRestarts: 1 * time.Second,
+				WaitTimeoutForStartAttempt:  5 * time.Second,
 			},
 			condition: func() (bool, error) {
 				return false, errors.Errorf("Err")

@@ -177,8 +177,8 @@ func TestConfigMetaControllerWait(t *testing.T) {
 		},
 		"cond returns true": {
 			controller: &ConfigMetaController{
-				WaitIntervalForCondition: 1 * time.Second,
-				WaitTimeoutForCondition:  5 * time.Second,
+				WaitIntervalBetweenRestarts: 1 * time.Second,
+				WaitTimeoutForStartAttempt:  5 * time.Second,
 			},
 			condition: func() (bool, error) {
 				return true, nil

@@ -187,21 +187,21 @@ func TestConfigMetaControllerWait(t *testing.T) {
 		},
 		"cond returns false": {
 			controller: &ConfigMetaController{
-				WaitIntervalForCondition: 1 * time.Second,
-				WaitTimeoutForCondition:  5 * time.Second,
+				WaitIntervalBetweenRestarts: 1 * time.Second,
+				WaitTimeoutForStartAttempt:  5 * time.Second,
 			},
 			condition: func() (bool, error) {
 				return false, nil
 			},
-			isErr: false,
+			isErr: true,
 		},
 	}
 	for name, mock := range tests {
 		name := name
 		mock := mock
 		t.Run(name, func(t *testing.T) {
 			ctl := mock.controller
-			err := ctl.wait(mock.condition)
+			err := ctl.startWithRetries(mock.condition)
 			if mock.isErr && err == nil {
 				t.Fatalf("Expected error got none")
 			}

go.mod

Lines changed: 0 additions & 1 deletion

@@ -6,7 +6,6 @@ go 1.13

 require (
 	contrib.go.opencensus.io/exporter/prometheus v0.1.0
-	github.com/coreos/etcd v3.3.15+incompatible // indirect
 	github.com/evanphx/json-patch v4.5.0+incompatible // indirect
 	github.com/ghodss/yaml v1.0.0
 	github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b

server/server.go

Lines changed: 7 additions & 4 deletions

@@ -240,6 +240,10 @@ type ConfigServer struct {
 	// ConfigPath has higher priority
 	GenericControllerConfigLoadFn func() ([]*v1alpha1.GenericController, error)

+	// This will allow executing start logic to be retried
+	// indefinitely till all the watch controllers are started
+	RetryIndefinitelyForStart *bool
+
 	// Number of workers per watch controller
 	workerCount int
 }

@@ -273,10 +277,9 @@ func (s *ConfigServer) Start(workerCount int) (stop func(), err error) {

 	// various generic meta controller options to setup meta controller
 	configOpts := []generic.ConfigMetaControllerOption{
-		generic.SetMetaControllerConfigLoadFn(
-			s.GenericControllerConfigLoadFn,
-		),
-		generic.SetMetaControllerConfigPath(s.ConfigPath),
+		generic.SetMetacConfigLoadFn(s.GenericControllerConfigLoadFn),
+		generic.SetMetacConfigPath(s.ConfigPath),
+		generic.SetMetacConfigToRetryIndefinitelyForStart(s.RetryIndefinitelyForStart),
 	}

 	genericMetac, err := generic.NewConfigMetaController(
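The configOpts slice above is built with the functional options pattern referenced in the ConfigMetaControllerOption comment: each option is a function that mutates the controller being constructed and may return an error. Here is a self-contained sketch of that pattern using hypothetical types, not metac's actual API:

package main

import "fmt"

// Config is a hypothetical struct configured via functional options.
type Config struct {
	Path  string
	Retry bool
}

// Option mutates a Config and may report an error.
type Option func(*Config) error

// WithPath is an illustrative option constructor.
func WithPath(p string) Option {
	return func(c *Config) error {
		c.Path = p
		return nil
	}
}

// WithRetry mirrors the nil / false handling used by
// SetMetacConfigToRetryIndefinitelyForStart above.
func WithRetry(enabled *bool) Option {
	return func(c *Config) error {
		if enabled == nil || !*enabled {
			return nil
		}
		c.Retry = true
		return nil
	}
}

// New applies each option in order, stopping at the first error.
func New(opts ...Option) (*Config, error) {
	c := &Config{}
	for _, o := range opts {
		if err := o(c); err != nil {
			return nil, err
		}
	}
	return c, nil
}

func main() {
	retry := true
	c, _ := New(WithPath("/etc/config/metac/"), WithRetry(&retry))
	fmt.Printf("%+v\n", c)
}

Treating an unset or false flag as a no-op in the option keeps the caller's wiring trivial: it can always pass the *bool through without checking it first.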

start/start.go

Lines changed: 10 additions & 3 deletions

@@ -88,7 +88,13 @@ var (
 		"metac-config-path",
 		"/etc/config/metac/",
 		`Path to metac config file to let metac run as a self contained binary.
-		Needs run-as-local set to true`,
+		Applicable if run-as-local is set to true`,
+	)
+	retryIndefinitelyToStart = flag.Bool(
+		"retry-indefinitely-to-start",
+		false,
+		`When true will let metac to retry continuously till all its controllers are started.
+		Applicable if run-as-local is set to true`,
 	)
 )

@@ -145,8 +151,9 @@ func Start() {
 	// looking up various MetaController resources as
 	// config files
 	configServer := &server.ConfigServer{
-		Server:     mserver,
-		ConfigPath: *metacConfigPath,
+		Server:                    mserver,
+		ConfigPath:                *metacConfigPath,
+		RetryIndefinitelyForStart: retryIndefinitelyToStart,
 	}
 	stopServer, err = configServer.Start(*workerCount)
 } else {
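For reference, a hypothetical invocation that exercises the new flag when metac runs as a self-contained binary. The binary name and the run-as-local flag are inferred from the help text above, and any other flags a real deployment would need are omitted:

metac --run-as-local=true \
      --metac-config-path=/etc/config/metac/ \
      --retry-indefinitely-to-start=true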
