Commit 325a37c

Add ignore-instance-creation-errors
1 parent 8e47b51 commit 325a37c

6 files changed: +611 -323 lines changed


cluster-autoscaler/clusterstate/clusterstate.go

Lines changed: 31 additions & 2 deletions
@@ -204,6 +204,11 @@ func (csr *ClusterStateRegistry) MaxNodeProvisionTime(nodeGroup cloudprovider.No
     return csr.nodeGroupConfigProcessor.GetMaxNodeProvisionTime(nodeGroup)
 }
 
+// IgnoreInstanceCreationStockoutErrors returns IgnoreInstanceCreationStockoutErrors value that should be used for a given NodeGroup.
+func (csr *ClusterStateRegistry) IgnoreInstanceCreationStockoutErrors(nodeGroup cloudprovider.NodeGroup) (bool, error) {
+    return csr.nodeGroupConfigProcessor.GetIgnoreInstanceCreationStockoutErrors(nodeGroup)
+}
+
 func (csr *ClusterStateRegistry) registerOrUpdateScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, delta int, currentTime time.Time) {
     maxNodeProvisionTime, err := csr.MaxNodeProvisionTime(nodeGroup)
     if err != nil {
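
The new accessor only delegates to the node group config processor; the processor-side getter itself lives in one of the changed files not shown in this excerpt. A minimal sketch of what it presumably looks like, assuming it follows the same delegating pattern as the existing GetMaxNodeProvisionTime getter (the receiver type DelegatingNodeGroupConfigProcessor and its nodeGroupDefaults field are assumptions here, not part of this diff):

// Sketch only, not part of this diff: per-NodeGroup options returned by
// GetOptions take precedence, otherwise fall back to the configured defaults.
func (p *DelegatingNodeGroupConfigProcessor) GetIgnoreInstanceCreationStockoutErrors(nodeGroup cloudprovider.NodeGroup) (bool, error) {
    ngConfig, err := nodeGroup.GetOptions(p.nodeGroupDefaults)
    if err != nil && err != cloudprovider.ErrNotImplemented {
        return false, err
    }
    if ngConfig == nil || err == cloudprovider.ErrNotImplemented {
        return p.nodeGroupDefaults.IgnoreInstanceCreationStockoutErrors, nil
    }
    return ngConfig.IgnoreInstanceCreationStockoutErrors, nil
}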
@@ -1128,13 +1133,22 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup(
         }
     }
 
+    ignoreStockoutErrors, err := csr.nodeGroupConfigProcessor.GetIgnoreInstanceCreationStockoutErrors(nodeGroup)
+    if err != nil {
+        klog.V(1).Infof("Failed to find IgnoreInstanceCreationStockoutErrors for nodeGroup %v with error: %v", nodeGroup.Id(), err)
+    }
+
     // If node group is scaling up and there are new node-create requests which cannot be satisfied because of
     // out-of-resources errors we:
     //  - emit event
     //  - alter the scale-up
     //  - increase scale-up failure metric
     //  - backoff the node group
     for errorCode, instances := range currentErrorCodeToInstance {
+        if errorCode.class == cloudprovider.OutOfResourcesErrorClass && ignoreStockoutErrors {
+            continue
+        }
+
         unseenInstanceIds := make([]string, 0)
         for _, instance := range instances {
             if _, seen := previousInstanceToErrorCode[instance.Id]; !seen {
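
The skip is keyed on errorCode.class, which buildInstanceToErrorCodeMappings derives from the error info the cloud provider attaches to an instance that failed to come up. For illustration, a stockout typically surfaces roughly like the instance below (all field values here are made up; only OutOfResourcesErrorClass matters for the new check):

// Illustrative values only: an instance reported by the cloud provider during a
// stockout. Error buckets whose class is OutOfResourcesErrorClass are the ones
// skipped above when ignoreStockoutErrors is true.
stockedOut := cloudprovider.Instance{
    Id: "instance-1",
    Status: &cloudprovider.InstanceStatus{
        State: cloudprovider.InstanceCreating,
        ErrorInfo: &cloudprovider.InstanceErrorInfo{
            ErrorClass:   cloudprovider.OutOfResourcesErrorClass,
            ErrorCode:    "STOCKOUT",
            ErrorMessage: "no capacity available for the requested machine type",
        },
    },
}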
@@ -1228,14 +1242,29 @@ func (csr *ClusterStateRegistry) buildInstanceToErrorCodeMappings(instances []cl
 }
 
 // GetCreatedNodesWithErrors returns a map from node group id to list of nodes which reported a create error.
-func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() map[string][]*apiv1.Node {
+func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors(nodeGroups map[string]cloudprovider.NodeGroup) map[string][]*apiv1.Node {
     csr.Lock()
     defer csr.Unlock()
 
+    ignoreStockoutErrors := func(nodeGroupId string) bool {
+        nodeGroup := nodeGroups[nodeGroupId]
+        if nodeGroup == nil {
+            return false
+        }
+        ignore, err := csr.IgnoreInstanceCreationStockoutErrors(nodeGroup)
+        if err != nil {
+            klog.V(1).Infof("Failed to find IgnoreInstanceCreationStockoutErrors for nodeGroup %v with error: %v", nodeGroupId, err)
+        }
+        return ignore
+    }
+
     nodesWithCreateErrors := make(map[string][]*apiv1.Node)
     for nodeGroupId, nodeGroupInstances := range csr.cloudProviderNodeInstances {
         _, _, instancesByErrorCode := csr.buildInstanceToErrorCodeMappings(nodeGroupInstances)
-        for _, instances := range instancesByErrorCode {
+        for errorCode, instances := range instancesByErrorCode {
+            if errorCode.class == cloudprovider.OutOfResourcesErrorClass && ignoreStockoutErrors(nodeGroupId) {
+                continue
+            }
             for _, instance := range instances {
                 nodesWithCreateErrors[nodeGroupId] = append(nodesWithCreateErrors[nodeGroupId], FakeNode(instance, cloudprovider.FakeNodeCreateError))
             }
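
GetCreatedNodesWithErrors now takes the id-to-NodeGroup mapping so it can resolve each node group's setting without reaching into the cloud provider itself; the existing caller in static_autoscaler.go (further down) passes a.nodeGroupsById(). A small sketch of building such a map from a cloud provider, for callers that do not already have one (the helper name is made up):

// Hypothetical helper for illustration: build the map[nodeGroupId]NodeGroup
// argument that GetCreatedNodesWithErrors now expects.
func nodeGroupsByID(provider cloudprovider.CloudProvider) map[string]cloudprovider.NodeGroup {
    byID := make(map[string]cloudprovider.NodeGroup)
    for _, ng := range provider.NodeGroups() {
        byID[ng.Id()] = ng
    }
    return byID
}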

cluster-autoscaler/config/autoscaling_options.go

Lines changed: 2 additions & 0 deletions
@@ -50,6 +50,8 @@ type NodeGroupAutoscalingOptions struct {
     ScaleDownUnreadyTime time.Duration
     // Maximum time CA waits for node to be provisioned
     MaxNodeProvisionTime time.Duration
+    // Whether CA should ignore instance creation stockout errors
+    IgnoreInstanceCreationStockoutErrors bool
     // ZeroOrMaxNodeScaling means that a node group should be scaled up to maximum size or down to zero nodes all at once instead of one-by-one.
     ZeroOrMaxNodeScaling bool
     // IgnoreDaemonSetsUtilization sets if daemonsets utilization should be considered during node scale-down
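
Because the new field lives in NodeGroupAutoscalingOptions, it can in principle be overridden per node group through the usual GetOptions mechanism rather than only set globally. A sketch of such an override, with a hypothetical node group type and example values:

// Sketch with made-up type and values: a node group that opts into ignoring
// stockouts regardless of the cluster-wide default.
func (ng *exampleNodeGroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
    opts := defaults
    opts.IgnoreInstanceCreationStockoutErrors = true
    return &opts, nil
}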

cluster-autoscaler/config/flags/flags.go

Lines changed: 15 additions & 13 deletions
@@ -119,13 +119,14 @@ var (
     maxBulkSoftTaintTime       = flag.Duration("max-bulk-soft-taint-time", 3*time.Second, "Maximum duration of tainting/untainting nodes as PreferNoSchedule at the same time.")
     maxGracefulTerminationFlag = flag.Int("max-graceful-termination-sec", 10*60, "Maximum number of seconds CA waits for pod termination when trying to scale down a node. "+
         "This flag is mutually exclusion with drain-priority-config flag which allows more configuration options.")
-    maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
-    okTotalUnreadyCount       = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
-    scaleUpFromZero           = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
-    parallelScaleUp           = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
-    maxNodeProvisionTime      = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
-    maxPodEvictionTime        = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
-    nodeGroupsFlag            = multiStringFlag(
+    maxTotalUnreadyPercentage            = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
+    okTotalUnreadyCount                  = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
+    scaleUpFromZero                      = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
+    parallelScaleUp                      = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
+    maxNodeProvisionTime                 = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
+    ignoreInstanceCreationStockoutErrors = flag.Bool("ignore-instance-creation-stockout-errors", false, "Whether CA should ignore instance creation stockout errors")
+    maxPodEvictionTime                   = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
+    nodeGroupsFlag                       = multiStringFlag(
         "nodes",
         "sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: <min>:<max>:<other...>")
     nodeGroupAutoDiscoveryFlag = multiStringFlag(
@@ -283,12 +284,13 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 
     return config.AutoscalingOptions{
         NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
-            ScaleDownUtilizationThreshold:    *scaleDownUtilizationThreshold,
-            ScaleDownGpuUtilizationThreshold: *scaleDownGpuUtilizationThreshold,
-            ScaleDownUnneededTime:            *scaleDownUnneededTime,
-            ScaleDownUnreadyTime:             *scaleDownUnreadyTime,
-            IgnoreDaemonSetsUtilization:      *ignoreDaemonSetsUtilization,
-            MaxNodeProvisionTime:             *maxNodeProvisionTime,
+            ScaleDownUtilizationThreshold:        *scaleDownUtilizationThreshold,
+            ScaleDownGpuUtilizationThreshold:     *scaleDownGpuUtilizationThreshold,
+            ScaleDownUnneededTime:                *scaleDownUnneededTime,
+            ScaleDownUnreadyTime:                 *scaleDownUnreadyTime,
+            IgnoreDaemonSetsUtilization:          *ignoreDaemonSetsUtilization,
+            MaxNodeProvisionTime:                 *maxNodeProvisionTime,
+            IgnoreInstanceCreationStockoutErrors: *ignoreInstanceCreationStockoutErrors,
         },
         CloudConfig:       *cloudConfig,
         CloudProviderName: *cloudProviderFlag,
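
The flag defaults to false, so behaviour is unchanged unless --ignore-instance-creation-stockout-errors=true is passed to the cluster-autoscaler binary. The parsed value only populates NodeGroupDefaults above; per-node-group resolution goes through the node group config processor that the ClusterStateRegistry consults in clusterstate.go.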

cluster-autoscaler/core/static_autoscaler.go

Lines changed: 1 addition & 1 deletion
@@ -868,7 +868,7 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
     // We always schedule deleting of incoming errornous nodes
     // TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
     nodeGroups := a.nodeGroupsById()
-    nodesToDeleteByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors()
+    nodesToDeleteByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors(nodeGroups)
 
     deletedAny := false
 
0 commit comments

Comments
 (0)