Commit 325a37c

Add ignore-instance-creation-errors
1 parent 8e47b51 commit 325a37c

6 files changed: +611 -323 lines changed


cluster-autoscaler/clusterstate/clusterstate.go

Lines changed: 31 additions & 2 deletions
@@ -204,6 +204,11 @@ func (csr *ClusterStateRegistry) MaxNodeProvisionTime(nodeGroup cloudprovider.No
     return csr.nodeGroupConfigProcessor.GetMaxNodeProvisionTime(nodeGroup)
 }
 
+// IgnoreInstanceCreationStockoutErrors returns IgnoreInstanceCreationStockoutErrors value that should be used for a given NodeGroup.
+func (csr *ClusterStateRegistry) IgnoreInstanceCreationStockoutErrors(nodeGroup cloudprovider.NodeGroup) (bool, error) {
+    return csr.nodeGroupConfigProcessor.GetIgnoreInstanceCreationStockoutErrors(nodeGroup)
+}
+
 func (csr *ClusterStateRegistry) registerOrUpdateScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, delta int, currentTime time.Time) {
     maxNodeProvisionTime, err := csr.MaxNodeProvisionTime(nodeGroup)
     if err != nil {
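
The new accessor only delegates to the node group config processor; the processor-side getter itself lives in one of the changed files not shown in this excerpt. A minimal sketch of what it presumably looks like, assuming it follows the same delegating pattern as the existing GetMaxNodeProvisionTime getter (the receiver type DelegatingNodeGroupConfigProcessor and its nodeGroupDefaults field are assumptions here, not part of this diff):

// Sketch only, not part of this diff: per-NodeGroup options returned by
// GetOptions take precedence, otherwise fall back to the configured defaults.
func (p *DelegatingNodeGroupConfigProcessor) GetIgnoreInstanceCreationStockoutErrors(nodeGroup cloudprovider.NodeGroup) (bool, error) {
    ngConfig, err := nodeGroup.GetOptions(p.nodeGroupDefaults)
    if err != nil && err != cloudprovider.ErrNotImplemented {
        return false, err
    }
    if ngConfig == nil || err == cloudprovider.ErrNotImplemented {
        return p.nodeGroupDefaults.IgnoreInstanceCreationStockoutErrors, nil
    }
    return ngConfig.IgnoreInstanceCreationStockoutErrors, nil
}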
@@ -1128,13 +1133,22 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup(
         }
     }
 
+    ignoreStockoutErrors, err := csr.nodeGroupConfigProcessor.GetIgnoreInstanceCreationStockoutErrors(nodeGroup)
+    if err != nil {
+        klog.V(1).Infof("Failed to find IgnoreInstanceCreationStockoutErrors for nodeGroup %v with error: %v", nodeGroup.Id(), err)
+    }
+
     // If node group is scaling up and there are new node-create requests which cannot be satisfied because of
     // out-of-resources errors we:
     //  - emit event
     //  - alter the scale-up
     //  - increase scale-up failure metric
     //  - backoff the node group
     for errorCode, instances := range currentErrorCodeToInstance {
+        if errorCode.class == cloudprovider.OutOfResourcesErrorClass && ignoreStockoutErrors {
+            continue
+        }
+
         unseenInstanceIds := make([]string, 0)
         for _, instance := range instances {
             if _, seen := previousInstanceToErrorCode[instance.Id]; !seen {
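
The skip is keyed on errorCode.class, which buildInstanceToErrorCodeMappings derives from the error info the cloud provider attaches to an instance that failed to come up. For illustration, a stockout typically surfaces roughly like the instance below (all field values here are made up; only OutOfResourcesErrorClass matters for the new check):

// Illustrative values only: an instance reported by the cloud provider during a
// stockout. Error buckets whose class is OutOfResourcesErrorClass are the ones
// skipped above when ignoreStockoutErrors is true.
stockedOut := cloudprovider.Instance{
    Id: "instance-1",
    Status: &cloudprovider.InstanceStatus{
        State: cloudprovider.InstanceCreating,
        ErrorInfo: &cloudprovider.InstanceErrorInfo{
            ErrorClass:   cloudprovider.OutOfResourcesErrorClass,
            ErrorCode:    "STOCKOUT",
            ErrorMessage: "no capacity available for the requested machine type",
        },
    },
}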
@@ -1228,14 +1242,29 @@ func (csr *ClusterStateRegistry) buildInstanceToErrorCodeMappings(instances []cl
 }
 
 // GetCreatedNodesWithErrors returns a map from node group id to list of nodes which reported a create error.
-func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() map[string][]*apiv1.Node {
+func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors(nodeGroups map[string]cloudprovider.NodeGroup) map[string][]*apiv1.Node {
     csr.Lock()
     defer csr.Unlock()
 
+    ignoreStockoutErrors := func(nodeGroupId string) bool {
+        nodeGroup := nodeGroups[nodeGroupId]
+        if nodeGroup == nil {
+            return false
+        }
+        ignore, err := csr.IgnoreInstanceCreationStockoutErrors(nodeGroup)
+        if err != nil {
+            klog.V(1).Infof("Failed to find IgnoreInstanceCreationStockoutErrors for nodeGroup %v with error: %v", nodeGroupId, err)
+        }
+        return ignore
+    }
+
     nodesWithCreateErrors := make(map[string][]*apiv1.Node)
     for nodeGroupId, nodeGroupInstances := range csr.cloudProviderNodeInstances {
         _, _, instancesByErrorCode := csr.buildInstanceToErrorCodeMappings(nodeGroupInstances)
-        for _, instances := range instancesByErrorCode {
+        for errorCode, instances := range instancesByErrorCode {
+            if errorCode.class == cloudprovider.OutOfResourcesErrorClass && ignoreStockoutErrors(nodeGroupId) {
+                continue
+            }
             for _, instance := range instances {
                 nodesWithCreateErrors[nodeGroupId] = append(nodesWithCreateErrors[nodeGroupId], FakeNode(instance, cloudprovider.FakeNodeCreateError))
             }
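
GetCreatedNodesWithErrors now takes the id-to-NodeGroup mapping so it can resolve each node group's setting without reaching into the cloud provider itself; the existing caller in static_autoscaler.go (further down) passes a.nodeGroupsById(). A small sketch of building such a map from a cloud provider, for callers that do not already have one (the helper name is made up):

// Hypothetical helper for illustration: build the map[nodeGroupId]NodeGroup
// argument that GetCreatedNodesWithErrors now expects.
func nodeGroupsByID(provider cloudprovider.CloudProvider) map[string]cloudprovider.NodeGroup {
    byID := make(map[string]cloudprovider.NodeGroup)
    for _, ng := range provider.NodeGroups() {
        byID[ng.Id()] = ng
    }
    return byID
}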

cluster-autoscaler/config/autoscaling_options.go

Lines changed: 2 additions & 0 deletions
@@ -50,6 +50,8 @@ type NodeGroupAutoscalingOptions struct {
     ScaleDownUnreadyTime time.Duration
     // Maximum time CA waits for node to be provisioned
     MaxNodeProvisionTime time.Duration
+    // Whether CA should ignore instance creation stockout errors
+    IgnoreInstanceCreationStockoutErrors bool
     // ZeroOrMaxNodeScaling means that a node group should be scaled up to maximum size or down to zero nodes all at once instead of one-by-one.
     ZeroOrMaxNodeScaling bool
     // IgnoreDaemonSetsUtilization sets if daemonsets utilization should be considered during node scale-down
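
Because the new field lives in NodeGroupAutoscalingOptions, it can in principle be overridden per node group through the usual GetOptions mechanism rather than only set globally. A sketch of such an override, with a hypothetical node group type and example values:

// Sketch with made-up type and values: a node group that opts into ignoring
// stockouts regardless of the cluster-wide default.
func (ng *exampleNodeGroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
    opts := defaults
    opts.IgnoreInstanceCreationStockoutErrors = true
    return &opts, nil
}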

cluster-autoscaler/config/flags/flags.go

Lines changed: 15 additions & 13 deletions
@@ -119,13 +119,14 @@ var (
     maxBulkSoftTaintTime       = flag.Duration("max-bulk-soft-taint-time", 3*time.Second, "Maximum duration of tainting/untainting nodes as PreferNoSchedule at the same time.")
     maxGracefulTerminationFlag = flag.Int("max-graceful-termination-sec", 10*60, "Maximum number of seconds CA waits for pod termination when trying to scale down a node. "+
         "This flag is mutually exclusion with drain-priority-config flag which allows more configuration options.")
-    maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
-    okTotalUnreadyCount       = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
-    scaleUpFromZero           = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
-    parallelScaleUp           = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
-    maxNodeProvisionTime      = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
-    maxPodEvictionTime        = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
-    nodeGroupsFlag            = multiStringFlag(
+    maxTotalUnreadyPercentage            = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
+    okTotalUnreadyCount                  = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
+    scaleUpFromZero                      = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
+    parallelScaleUp                      = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
+    maxNodeProvisionTime                 = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
+    ignoreInstanceCreationStockoutErrors = flag.Bool("ignore-instance-creation-stockout-errors", false, "Whether CA should ignore instance creation stockout errors")
+    maxPodEvictionTime                   = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
+    nodeGroupsFlag                       = multiStringFlag(
         "nodes",
         "sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: <min>:<max>:<other...>")
     nodeGroupAutoDiscoveryFlag = multiStringFlag(
@@ -283,12 +284,13 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 
     return config.AutoscalingOptions{
         NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
-            ScaleDownUtilizationThreshold:    *scaleDownUtilizationThreshold,
-            ScaleDownGpuUtilizationThreshold: *scaleDownGpuUtilizationThreshold,
-            ScaleDownUnneededTime:            *scaleDownUnneededTime,
-            ScaleDownUnreadyTime:             *scaleDownUnreadyTime,
-            IgnoreDaemonSetsUtilization:      *ignoreDaemonSetsUtilization,
-            MaxNodeProvisionTime:             *maxNodeProvisionTime,
+            ScaleDownUtilizationThreshold:        *scaleDownUtilizationThreshold,
+            ScaleDownGpuUtilizationThreshold:     *scaleDownGpuUtilizationThreshold,
+            ScaleDownUnneededTime:                *scaleDownUnneededTime,
+            ScaleDownUnreadyTime:                 *scaleDownUnreadyTime,
+            IgnoreDaemonSetsUtilization:          *ignoreDaemonSetsUtilization,
+            MaxNodeProvisionTime:                 *maxNodeProvisionTime,
+            IgnoreInstanceCreationStockoutErrors: *ignoreInstanceCreationStockoutErrors,
         },
         CloudConfig:       *cloudConfig,
         CloudProviderName: *cloudProviderFlag,
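
The flag defaults to false, so behaviour is unchanged unless --ignore-instance-creation-stockout-errors=true is passed to the cluster-autoscaler binary. The parsed value only populates NodeGroupDefaults above; per-node-group resolution goes through the node group config processor that the ClusterStateRegistry consults in clusterstate.go.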

cluster-autoscaler/core/static_autoscaler.go

Lines changed: 1 addition & 1 deletion
@@ -868,7 +868,7 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
     // We always schedule deleting of incoming errornous nodes
     // TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
     nodeGroups := a.nodeGroupsById()
-    nodesToDeleteByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors()
+    nodesToDeleteByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors(nodeGroups)
 
     deletedAny := false
 
0 commit comments

Comments
 (0)