Skip to content

Commit e75b305

Browse files
authored
Merge pull request #8357 from k8s-infra-cherrypick-robot/cherry-pick-8315-to-cluster-autoscaler-release-1.30
[cluster-autoscaler-release-1.30] Handle Out of host capacity scenario in OCI nodepools
2 parents ecdc67a + 5e68acd commit e75b305

File tree

3 files changed

+32
-12
lines changed

3 files changed

+32
-12
lines changed

cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,7 @@ func (m *ociManagerImpl) GetNodePoolNodes(np NodePool) ([]cloudprovider.Instance
516516

517517
nodePool, err := m.nodePoolCache.get(np.Id())
518518
if err != nil {
519+
klog.Error(err, "error while performing GetNodePoolNodes call")
519520
return nil, err
520521
}
521522

@@ -524,10 +525,14 @@ func (m *ociManagerImpl) GetNodePoolNodes(np NodePool) ([]cloudprovider.Instance
524525

525526
if node.NodeError != nil {
526527

528+
// We should move away from the approach of identifying a node error as an Out of host capacity error
529+
// through string comparison. An error code specifically for Out of host capacity must be set
530+
// and returned in the API response.
527531
errorClass := cloudprovider.OtherErrorClass
528532
if *node.NodeError.Code == "LimitExceeded" ||
529-
(*node.NodeError.Code == "InternalServerError" &&
530-
strings.Contains(*node.NodeError.Message, "quota")) {
533+
*node.NodeError.Code == "QuotaExceeded" ||
534+
(*node.NodeError.Code == "InternalError" &&
535+
strings.Contains(*node.NodeError.Message, "Out of host capacity")) {
531536
errorClass = cloudprovider.OutOfResourcesErrorClass
532537
}
533538

cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,8 @@ func TestGetNodePoolNodes(t *testing.T) {
119119
{
120120
Id: common.String("node8"),
121121
NodeError: &oke.NodeError{
122-
Code: common.String("InternalServerError"),
123-
Message: common.String("blah blah quota exceeded blah blah"),
122+
Code: common.String("InternalError"),
123+
Message: common.String("blah blah Out of host capacity blah blah"),
124124
},
125125
},
126126
},
@@ -176,8 +176,8 @@ func TestGetNodePoolNodes(t *testing.T) {
176176
Status: &cloudprovider.InstanceStatus{
177177
ErrorInfo: &cloudprovider.InstanceErrorInfo{
178178
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
179-
ErrorCode: "InternalServerError",
180-
ErrorMessage: "blah blah quota exceeded blah blah",
179+
ErrorCode: "InternalError",
180+
ErrorMessage: "blah blah Out of host capacity blah blah",
181181
},
182182
},
183183
},

cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,27 @@ func (np *nodePool) DecreaseTargetSize(delta int) error {
209209
}
210210
}
211211
klog.V(4).Infof("DECREASE_TARGET_CHECK_VIA_COMPUTE: %v", decreaseTargetCheckViaComputeBool)
212+
np.manager.InvalidateAndRefreshCache()
213+
nodes, err := np.manager.GetNodePoolNodes(np)
214+
if err != nil {
215+
klog.V(4).Error(err, "error while performing GetNodePoolNodes call")
216+
return err
217+
}
218+
// We do not have an OCI API that allows us to delete a node without a compute instance. So we rely on
219+
// the approach below to determine the number of running instances in a nodepool from the compute API and
220+
// update the size of the nodepool accordingly. We should move away from this approach once we have an API
221+
// to delete a specific node without a compute instance.
222+
if !decreaseTargetCheckViaComputeBool {
223+
for _, node := range nodes {
224+
if node.Status != nil && node.Status.ErrorInfo != nil {
225+
if node.Status.ErrorInfo.ErrorClass == cloudprovider.OutOfResourcesErrorClass {
226+
klog.Infof("Using Compute to calculate nodepool size as nodepool may contain nodes without a compute instance.")
227+
decreaseTargetCheckViaComputeBool = true
228+
break
229+
}
230+
}
231+
}
232+
}
212233
var nodesLen int
213234
if decreaseTargetCheckViaComputeBool {
214235
nodesLen, err = np.manager.GetExistingNodePoolSizeViaCompute(np)
@@ -217,12 +238,6 @@ func (np *nodePool) DecreaseTargetSize(delta int) error {
217238
return err
218239
}
219240
} else {
220-
np.manager.InvalidateAndRefreshCache()
221-
nodes, err := np.manager.GetNodePoolNodes(np)
222-
if err != nil {
223-
klog.V(4).Error(err, "error while performing GetNodePoolNodes call")
224-
return err
225-
}
226241
nodesLen = len(nodes)
227242
}
228243

0 commit comments

Comments
 (0)