Skip to content

Commit 1fbc7a9

Browse files
committed
Handle Out of host capacity scenario in OCI nodepools
1 parent 637d9ad commit 1fbc7a9

File tree

3 files changed

+33
-33
lines changed

3 files changed

+33
-33
lines changed

cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -461,12 +461,6 @@ func (m *ociManagerImpl) GetExistingNodePoolSizeViaCompute(np NodePool) (int, er
461461
if !strings.HasPrefix(*item.DisplayName, displayNamePrefix) {
462462
continue
463463
}
464-
// A node pool can fail to scale up if there's no capacity in the region. In that case, the node pool will be
465-
// returned by the API, but it will not actually exist or have an ID, so we don't want to tell the autoscaler about it.
466-
if *item.Id == "" {
467-
klog.V(4).Infof("skipping node as it doesn't have a scaled-up instance")
468-
continue
469-
}
470464
switch item.LifecycleState {
471465
case core.InstanceLifecycleStateStopped, core.InstanceLifecycleStateTerminated:
472466
klog.V(4).Infof("skipping instance is in stopped/terminated state: %q", *item.Id)
@@ -525,25 +519,23 @@ func (m *ociManagerImpl) GetNodePoolNodes(np NodePool) ([]cloudprovider.Instance
525519

526520
nodePool, err := m.nodePoolCache.get(np.Id())
527521
if err != nil {
522+
klog.Error(err, "error while performing GetNodePoolNodes call")
528523
return nil, err
529524
}
530525

531526
var instances []cloudprovider.Instance
532527
for _, node := range nodePool.Nodes {
533528

534-
// A node pool can fail to scale up if there's no capacity in the region. In that case, the node pool will be
535-
// returned by the API, but it will not actually exist or have an ID, so we don't want to tell the autoscaler about it.
536-
if *node.Id == "" {
537-
klog.V(4).Infof("skipping node as it doesn't have a scaled-up instance")
538-
continue
539-
}
540-
541529
if node.NodeError != nil {
542530

531+
// We should move away from the approach of determining a node error as an Out of host capacity
532+
// through string comparison. An error code specifically for Out of host capacity must be set
533+
// and returned in the API response.
543534
errorClass := cloudprovider.OtherErrorClass
544535
if *node.NodeError.Code == "LimitExceeded" ||
545-
(*node.NodeError.Code == "InternalServerError" &&
546-
strings.Contains(*node.NodeError.Message, "quota")) {
536+
*node.NodeError.Code == "QuotaExceeded" ||
537+
(*node.NodeError.Code == "InternalError" &&
538+
strings.Contains(*node.NodeError.Message, "Out of host capacity")) {
547539
errorClass = cloudprovider.OutOfResourcesErrorClass
548540
}
549541

cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,11 @@ package nodepools
66

77
import (
88
"context"
9+
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/nodepools/consts"
910
"net/http"
1011
"reflect"
1112
"testing"
1213

13-
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/nodepools/consts"
14-
1514
apiv1 "k8s.io/api/core/v1"
1615
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
1716
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/vendor-internal/github.com/oracle/oci-go-sdk/v65/common"
@@ -120,16 +119,10 @@ func TestGetNodePoolNodes(t *testing.T) {
120119
{
121120
Id: common.String("node8"),
122121
NodeError: &oke.NodeError{
123-
Code: common.String("InternalServerError"),
124-
Message: common.String("blah blah quota exceeded blah blah"),
122+
Code: common.String("InternalError"),
123+
Message: common.String("blah blah Out of host capacity blah blah"),
125124
},
126125
},
127-
{
128-
// This case happens if a node fails to scale up due to lack of capacity in the region.
129-
// It's not a real node, so we shouldn't return it in the list of nodes.
130-
Id: common.String(""),
131-
LifecycleState: oke.NodeLifecycleStateCreating,
132-
},
133126
},
134127
}
135128

@@ -186,8 +179,8 @@ func TestGetNodePoolNodes(t *testing.T) {
186179
State: cloudprovider.InstanceCreating,
187180
ErrorInfo: &cloudprovider.InstanceErrorInfo{
188181
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
189-
ErrorCode: "InternalServerError",
190-
ErrorMessage: "blah blah quota exceeded blah blah",
182+
ErrorCode: "InternalError",
183+
ErrorMessage: "blah blah Out of host capacity blah blah",
191184
},
192185
},
193186
},

cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,27 @@ func (np *nodePool) DecreaseTargetSize(delta int) error {
214214
}
215215
}
216216
klog.V(4).Infof("DECREASE_TARGET_CHECK_VIA_COMPUTE: %v", decreaseTargetCheckViaComputeBool)
217+
np.manager.InvalidateAndRefreshCache()
218+
nodes, err := np.manager.GetNodePoolNodes(np)
219+
if err != nil {
220+
klog.V(4).Error(err, "error while performing GetNodePoolNodes call")
221+
return err
222+
}
223+
// We do not have an OCI API that allows us to delete a node with a compute instance. So we rely on
224+
// the below approach to determine the number of running instances in a nodepool from the compute API and
225+
// update the size of the nodepool accordingly. We should move away from this approach once we have an API
226+
// to delete a specific node without a compute instance.
227+
if !decreaseTargetCheckViaComputeBool {
228+
for _, node := range nodes {
229+
if node.Status != nil && node.Status.ErrorInfo != nil {
230+
if node.Status.ErrorInfo.ErrorClass == cloudprovider.OutOfResourcesErrorClass {
231+
klog.Infof("Using Compute to calculate nodepool size as nodepool may contain nodes without a compute instance.")
232+
decreaseTargetCheckViaComputeBool = true
233+
break
234+
}
235+
}
236+
}
237+
}
217238
var nodesLen int
218239
if decreaseTargetCheckViaComputeBool {
219240
nodesLen, err = np.manager.GetExistingNodePoolSizeViaCompute(np)
@@ -222,12 +243,6 @@ func (np *nodePool) DecreaseTargetSize(delta int) error {
222243
return err
223244
}
224245
} else {
225-
np.manager.InvalidateAndRefreshCache()
226-
nodes, err := np.manager.GetNodePoolNodes(np)
227-
if err != nil {
228-
klog.V(4).Error(err, "error while performing GetNodePoolNodes call")
229-
return err
230-
}
231246
nodesLen = len(nodes)
232247
}
233248

0 commit comments

Comments
 (0)