Skip to content

Commit 9da0756

Browse files
committed
Handle Out of host capacity scenario in OCI nodepools
1 parent 637d9ad commit 9da0756

File tree

3 files changed

+32
-12
lines changed

3 files changed

+32
-12
lines changed

cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,7 @@ func (m *ociManagerImpl) GetNodePoolNodes(np NodePool) ([]cloudprovider.Instance
525525

526526
nodePool, err := m.nodePoolCache.get(np.Id())
527527
if err != nil {
528+
klog.Error(err, "error while performing GetNodePoolNodes call")
528529
return nil, err
529530
}
530531

@@ -540,10 +541,14 @@ func (m *ociManagerImpl) GetNodePoolNodes(np NodePool) ([]cloudprovider.Instance
540541

541542
if node.NodeError != nil {
542543

544+
// We should move away from the approach of determining a node error as an Out of host capacity
545+
// through string comparison. An error code specifically for Out of host capacity must be set
546+
// and returned in the API response.
543547
errorClass := cloudprovider.OtherErrorClass
544548
if *node.NodeError.Code == "LimitExceeded" ||
545-
(*node.NodeError.Code == "InternalServerError" &&
546-
strings.Contains(*node.NodeError.Message, "quota")) {
549+
*node.NodeError.Code == "QuotaExceeded" ||
550+
(*node.NodeError.Code == "InternalError" &&
551+
strings.Contains(*node.NodeError.Message, "Out of host capacity")) {
547552
errorClass = cloudprovider.OutOfResourcesErrorClass
548553
}
549554

cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,8 @@ func TestGetNodePoolNodes(t *testing.T) {
120120
{
121121
Id: common.String("node8"),
122122
NodeError: &oke.NodeError{
123-
Code: common.String("InternalServerError"),
124-
Message: common.String("blah blah quota exceeded blah blah"),
123+
Code: common.String("InternalError"),
124+
Message: common.String("blah blah Out of host capacity blah blah"),
125125
},
126126
},
127127
{
@@ -186,8 +186,8 @@ func TestGetNodePoolNodes(t *testing.T) {
186186
State: cloudprovider.InstanceCreating,
187187
ErrorInfo: &cloudprovider.InstanceErrorInfo{
188188
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
189-
ErrorCode: "InternalServerError",
190-
ErrorMessage: "blah blah quota exceeded blah blah",
189+
ErrorCode: "InternalError",
190+
ErrorMessage: "blah blah Out of host capacity blah blah",
191191
},
192192
},
193193
},

cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,27 @@ func (np *nodePool) DecreaseTargetSize(delta int) error {
214214
}
215215
}
216216
klog.V(4).Infof("DECREASE_TARGET_CHECK_VIA_COMPUTE: %v", decreaseTargetCheckViaComputeBool)
217+
np.manager.InvalidateAndRefreshCache()
218+
nodes, err := np.manager.GetNodePoolNodes(np)
219+
if err != nil {
220+
klog.V(4).Error(err, "error while performing GetNodePoolNodes call")
221+
return err
222+
}
223+
// We do not have an OCI API that allows us to delete a node without a compute instance. So we rely on
224+
// the below approach to determine the number of running instances in a nodepool from the compute API and
225+
// update the size of the nodepool accordingly. We should move away from this approach once we have an API
226+
// to delete a specific node without a compute instance.
227+
if !decreaseTargetCheckViaComputeBool {
228+
for _, node := range nodes {
229+
if node.Status != nil && node.Status.ErrorInfo != nil {
230+
if node.Status.ErrorInfo.ErrorClass == cloudprovider.OutOfResourcesErrorClass {
231+
klog.Infof("Using Compute to calculate nodepool size as nodepool may contain nodes without a compute instance.")
232+
decreaseTargetCheckViaComputeBool = true
233+
break
234+
}
235+
}
236+
}
237+
}
217238
var nodesLen int
218239
if decreaseTargetCheckViaComputeBool {
219240
nodesLen, err = np.manager.GetExistingNodePoolSizeViaCompute(np)
@@ -222,12 +243,6 @@ func (np *nodePool) DecreaseTargetSize(delta int) error {
222243
return err
223244
}
224245
} else {
225-
np.manager.InvalidateAndRefreshCache()
226-
nodes, err := np.manager.GetNodePoolNodes(np)
227-
if err != nil {
228-
klog.V(4).Error(err, "error while performing GetNodePoolNodes call")
229-
return err
230-
}
231246
nodesLen = len(nodes)
232247
}
233248

0 commit comments

Comments
 (0)