Skip to content

Commit d7dbcd0

Browse files
committed
Add ignore-instance-creation-errors
1 parent 8e47b51 commit d7dbcd0

File tree

6 files changed

+482
-316
lines changed

6 files changed

+482
-316
lines changed

cluster-autoscaler/clusterstate/clusterstate.go

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -204,6 +204,11 @@ func (csr *ClusterStateRegistry) MaxNodeProvisionTime(nodeGroup cloudprovider.No
204204
return csr.nodeGroupConfigProcessor.GetMaxNodeProvisionTime(nodeGroup)
205205
}
206206

207+
// IgnoreInstanceCreationErrors reports whether instance creation errors should be ignored for the given NodeGroup, returning the value and error from the node group config processor unchanged.
208+
func (csr *ClusterStateRegistry) IgnoreInstanceCreationErrors(nodeGroup cloudprovider.NodeGroup) (bool, error) {
209+
return csr.nodeGroupConfigProcessor.GetIgnoreInstanceCreationErrors(nodeGroup)
210+
}
211+
207212
func (csr *ClusterStateRegistry) registerOrUpdateScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, delta int, currentTime time.Time) {
208213
maxNodeProvisionTime, err := csr.MaxNodeProvisionTime(nodeGroup)
209214
if err != nil {
@@ -1128,6 +1133,13 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup(
11281133
}
11291134
}
11301135

1136+
ignoreErrors, err := csr.nodeGroupConfigProcessor.GetIgnoreInstanceCreationErrors(nodeGroup)
1137+
if err != nil {
1138+
klog.V(1).Infof("Failed to find IgnoreInstanceCreationErrors for nodeGroup %v with error: %v", nodeGroup.Id(), err)
1139+
} else if ignoreErrors {
1140+
return // The scaleUp will timeout after MaxNodeProvisionTime is reached (ClusterStateRegistry::updateScaleRequests).
1141+
}
1142+
11311143
// If node group is scaling up and there are new node-create requests which cannot be satisfied because of
11321144
// out-of-resources errors we:
11331145
// - emit event

cluster-autoscaler/config/autoscaling_options.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,8 @@ type NodeGroupAutoscalingOptions struct {
5050
ScaleDownUnreadyTime time.Duration
5151
// Maximum time CA waits for node to be provisioned
5252
MaxNodeProvisionTime time.Duration
53+
// Whether CA should ignore instance creation errors
54+
IgnoreInstanceCreationErrors bool
5355
// ZeroOrMaxNodeScaling means that a node group should be scaled up to maximum size or down to zero nodes all at once instead of one-by-one.
5456
ZeroOrMaxNodeScaling bool
5557
// IgnoreDaemonSetsUtilization sets if daemonsets utilization should be considered during node scale-down

cluster-autoscaler/config/flags/flags.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -119,13 +119,14 @@ var (
119119
maxBulkSoftTaintTime = flag.Duration("max-bulk-soft-taint-time", 3*time.Second, "Maximum duration of tainting/untainting nodes as PreferNoSchedule at the same time.")
120120
maxGracefulTerminationFlag = flag.Int("max-graceful-termination-sec", 10*60, "Maximum number of seconds CA waits for pod termination when trying to scale down a node. "+
121121
"This flag is mutually exclusion with drain-priority-config flag which allows more configuration options.")
122-
maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
123-
okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
124-
scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
125-
parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
126-
maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
127-
maxPodEvictionTime = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
128-
nodeGroupsFlag = multiStringFlag(
122+
maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
123+
okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
124+
scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
125+
parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
126+
maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
127+
ignoreInstanceCreationErrors = flag.Bool("ignore-instance-creation-errors", false, "Whether CA should ignore instance creation errors")
128+
maxPodEvictionTime = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
129+
nodeGroupsFlag = multiStringFlag(
129130
"nodes",
130131
"sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: <min>:<max>:<other...>")
131132
nodeGroupAutoDiscoveryFlag = multiStringFlag(
@@ -289,6 +290,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
289290
ScaleDownUnreadyTime: *scaleDownUnreadyTime,
290291
IgnoreDaemonSetsUtilization: *ignoreDaemonSetsUtilization,
291292
MaxNodeProvisionTime: *maxNodeProvisionTime,
293+
IgnoreInstanceCreationErrors: *ignoreInstanceCreationErrors,
292294
},
293295
CloudConfig: *cloudConfig,
294296
CloudProviderName: *cloudProviderFlag,

cluster-autoscaler/core/static_autoscaler.go

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -879,8 +879,13 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
879879
nodeGroup := nodeGroups[nodeGroupId]
880880
if nodeGroup == nil {
881881
err = fmt.Errorf("node group %s not found", nodeGroupId)
882-
} else if nodesToDelete, err = overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete); err == nil {
883-
err = nodeGroup.DeleteNodes(nodesToDelete)
882+
} else {
883+
if ignoreErrors, _ := a.clusterStateRegistry.IgnoreInstanceCreationErrors(nodeGroup); ignoreErrors {
884+
continue // These nodes will be deleted after MaxNodeProvisionTime is reached.
885+
}
886+
if nodesToDelete, err = overrideNodesToDeleteForZeroOrMax(a.NodeGroupDefaults, nodeGroup, nodesToDelete); err == nil {
887+
err = nodeGroup.DeleteNodes(nodesToDelete)
888+
}
884889
}
885890

886891
if err != nil {

0 commit comments

Comments
 (0)