From 32e7d743b0763839ebcd13aab0163728adbd6165 Mon Sep 17 00:00:00 2001
From: Liangquan Li
Date: Mon, 24 Nov 2025 16:21:03 +0800
Subject: [PATCH] fix(controllers): add watches to resolve AWSMachineTemplate e2e race condition

Add MachineDeployment and KubeadmControlPlane watchers to trigger
AWSMachineTemplate reconciliation, ensuring nodeInfo is populated
before cache sync completes.

Related: #5711
---
 controllers/awsmachinetemplate_controller.go  | 80 ++++++++++++++++++-
 main.go                                       |  2 +
 .../unmanaged/unmanaged_functional_test.go    | 58 ++++++++------
 3 files changed, 112 insertions(+), 28 deletions(-)

diff --git a/controllers/awsmachinetemplate_controller.go b/controllers/awsmachinetemplate_controller.go
index 3c098ff155..f692a3f386 100644
--- a/controllers/awsmachinetemplate_controller.go
+++ b/controllers/awsmachinetemplate_controller.go
@@ -33,6 +33,7 @@ import (
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller"
+	"sigs.k8s.io/controller-runtime/pkg/handler"

 	infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2"
 	ekscontrolplanev1 "sigs.k8s.io/cluster-api-provider-aws/v2/controlplane/eks/api/v1beta2"
@@ -47,6 +48,11 @@ import (
 	"sigs.k8s.io/cluster-api/util/predicates"
 )

+const (
+	// awsMachineTemplateKind is the Kind name for AWSMachineTemplate resources.
+	awsMachineTemplateKind = "AWSMachineTemplate"
+)
+
 // AWSMachineTemplateReconciler reconciles AWSMachineTemplate objects.
 //
 // This controller automatically populates capacity information for AWSMachineTemplate resources
@@ -62,11 +68,27 @@ type AWSMachineTemplateReconciler struct {
 func (r *AWSMachineTemplateReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
 	log := logger.FromContext(ctx)

-	return ctrl.NewControllerManagedBy(mgr).
+	b := ctrl.NewControllerManagedBy(mgr).
 		For(&infrav1.AWSMachineTemplate{}).
 		WithOptions(options).
 		WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(mgr.GetScheme(), log.GetLogger(), r.WatchFilterValue)).
-		Complete(r)
+		Watches(
+			&clusterv1.MachineDeployment{},
+			handler.EnqueueRequestsFromMapFunc(r.machineDeploymentToAWSMachineTemplate),
+		)
+
+	// Watch KubeadmControlPlane only if the REST mapper resolves its kind at setup time; otherwise skip the watch.
+	if _, err := mgr.GetRESTMapper().RESTMapping(schema.GroupKind{Group: controlplanev1.GroupVersion.Group, Kind: "KubeadmControlPlane"}, controlplanev1.GroupVersion.Version); err == nil {
+		b = b.Watches(&controlplanev1.KubeadmControlPlane{},
+			handler.EnqueueRequestsFromMapFunc(r.kubeadmControlPlaneToAWSMachineTemplate))
+	}
+
+	_, err := b.Build(r)
+	if err != nil {
+		return errors.Wrap(err, "failed setting up with a controller manager")
+	}
+
+	return nil
 }

 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachinetemplates,verbs=get;list;watch
@@ -373,7 +395,7 @@ func (r *AWSMachineTemplateReconciler) getKubernetesVersion(ctx context.Context,

 	// Find MachineDeployments that reference this AWSMachineTemplate
 	for _, md := range machineDeploymentList.Items {
-		if md.Spec.Template.Spec.InfrastructureRef.Kind == "AWSMachineTemplate" &&
+		if md.Spec.Template.Spec.InfrastructureRef.Kind == awsMachineTemplateKind &&
 			md.Spec.Template.Spec.InfrastructureRef.Name == template.Name &&
 			md.Spec.Template.Spec.Version != "" {
 			return md.Spec.Template.Spec.Version, nil
@@ -388,7 +410,7 @@ func (r *AWSMachineTemplateReconciler) getKubernetesVersion(ctx context.Context,

 	// Find KubeadmControlPlanes that reference this AWSMachineTemplate
 	for _, kcp := range kcpList.Items {
-		if kcp.Spec.MachineTemplate.Spec.InfrastructureRef.Kind == "AWSMachineTemplate" &&
+		if kcp.Spec.MachineTemplate.Spec.InfrastructureRef.Kind == awsMachineTemplateKind &&
 			kcp.Spec.MachineTemplate.Spec.InfrastructureRef.Name == template.Name &&
 			kcp.Spec.Version != "" {
 			return kcp.Spec.Version, nil
@@ -420,3 +442,53 @@ func getParentListOptions(obj metav1.ObjectMeta) ([]client.ListOption, error) {
 	}
 	return listOpts, nil
 }
+
+// kubeadmControlPlaneToAWSMachineTemplate maps a KubeadmControlPlane event to a reconcile request for the
+// AWSMachineTemplate named by its infrastructure ref, in the KubeadmControlPlane's namespace. It returns nil when
+// the object is not a KubeadmControlPlane, the ref is not an AWSMachineTemplate, or the ref has no name.
+func (r *AWSMachineTemplateReconciler) kubeadmControlPlaneToAWSMachineTemplate(_ context.Context, o client.Object) []ctrl.Request {
+	kcp, ok := o.(*controlplanev1.KubeadmControlPlane)
+	if !ok {
+		return nil
+	}
+
+	ref := kcp.Spec.MachineTemplate.Spec.InfrastructureRef
+	if ref.Kind != awsMachineTemplateKind || ref.Name == "" {
+		return nil
+	}
+
+	// Enqueue the referenced template; it is addressed in the KubeadmControlPlane's namespace.
+	return []ctrl.Request{
+		{
+			NamespacedName: client.ObjectKey{
+				Namespace: kcp.Namespace,
+				Name:      ref.Name,
+			},
+		},
+	}
+}
+
+// machineDeploymentToAWSMachineTemplate maps a MachineDeployment event to a reconcile request for the
+// AWSMachineTemplate named by its infrastructure ref, in the MachineDeployment's namespace. It returns nil when
+// the object is not a MachineDeployment, the ref is not an AWSMachineTemplate, or the ref has no name.
+func (r *AWSMachineTemplateReconciler) machineDeploymentToAWSMachineTemplate(_ context.Context, o client.Object) []ctrl.Request {
+	md, ok := o.(*clusterv1.MachineDeployment)
+	if !ok {
+		return nil
+	}
+
+	ref := md.Spec.Template.Spec.InfrastructureRef
+	if ref.Kind != awsMachineTemplateKind || ref.Name == "" {
+		return nil
+	}
+
+	// Enqueue the referenced template; it is addressed in the MachineDeployment's namespace.
+	return []ctrl.Request{
+		{
+			NamespacedName: client.ObjectKey{
+				Namespace: md.Namespace,
+				Name:      ref.Name,
+			},
+		},
+	}
+}
diff --git a/main.go b/main.go
index 83330f4cdd..62a64cc53e 100644
--- a/main.go
+++ b/main.go
@@ -65,6 +65,7 @@ import (
 	"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/logger"
 	"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record"
 	"sigs.k8s.io/cluster-api-provider-aws/v2/version"
+	controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2"
 	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
 	"sigs.k8s.io/cluster-api/util/flags"
 )
@@ -79,6 +80,7 @@ func init() {
 	_ = eksbootstrapv1beta1.AddToScheme(scheme)
 	_ = cgscheme.AddToScheme(scheme)
 	_ = clusterv1.AddToScheme(scheme)
+	_ = controlplanev1.AddToScheme(scheme)
 	_ = ekscontrolplanev1.AddToScheme(scheme)
 	_ = ekscontrolplanev1beta1.AddToScheme(scheme)
 	_ = rosacontrolplanev1.AddToScheme(scheme)
diff --git a/test/e2e/suites/unmanaged/unmanaged_functional_test.go b/test/e2e/suites/unmanaged/unmanaged_functional_test.go
index 3c4d485ced..17ca07c3f0 100644
--- a/test/e2e/suites/unmanaged/unmanaged_functional_test.go
+++ b/test/e2e/suites/unmanaged/unmanaged_functional_test.go
@@ -355,31 +355,41 @@ var _ = ginkgo.Context("[unmanaged] [functional]", func() {
 			Expect(len(controlPlaneMachines)).To(Equal(1))

 			ginkgo.By("Verifying AWSMachineTemplate capacity is populated for autoscaling from zero")
-			awsMachineTemplateList := &infrav1.AWSMachineTemplateList{}
-			err := e2eCtx.Environment.BootstrapClusterProxy.GetClient().List(ctx, awsMachineTemplateList, client.InNamespace(namespace.Name))
-			Expect(err).To(BeNil())
-			Expect(len(awsMachineTemplateList.Items)).To(BeNumerically(">", 0), "Expected at least one AWSMachineTemplate")
-
-			foundTemplateWithCapacity := false
-			foundTemplateWithNodeInfo := false
-			for _, template := range awsMachineTemplateList.Items {
-				if len(template.Status.Capacity) > 0 {
-					foundTemplateWithCapacity = true
-					ginkgo.By(fmt.Sprintf("AWSMachineTemplate %s has capacity populated: %v", template.Name, template.Status.Capacity))
-					Expect(template.Status.Capacity).To(HaveKey(corev1.ResourceCPU), "Expected CPU to be set in capacity")
-					Expect(template.Status.Capacity).To(HaveKey(corev1.ResourceMemory), "Expected Memory to be set in capacity")
+			Eventually(func(g Gomega) {
+				awsMachineTemplateList := &infrav1.AWSMachineTemplateList{}
+				g.Expect(e2eCtx.Environment.BootstrapClusterProxy.GetClient().List(ctx, awsMachineTemplateList, client.InNamespace(namespace.Name))).To(Succeed())
+				g.Expect(awsMachineTemplateList.Items).ToNot(BeEmpty())
+
+				for _, template := range awsMachineTemplateList.Items {
+					capacity := template.Status.Capacity
+					_, hasCPU := capacity[corev1.ResourceCPU]
+					_, hasMemory := capacity[corev1.ResourceMemory]
+					if hasCPU && hasMemory {
+						ginkgo.By(fmt.Sprintf("AWSMachineTemplate %s has capacity populated: %v", template.Name, capacity))
+						return
+					}
 				}
-				if template.Status.NodeInfo != nil {
-					foundTemplateWithNodeInfo = true
-					ginkgo.By(fmt.Sprintf("AWSMachineTemplate %s has nodeInfo populated: %v", template.Name, template.Status.NodeInfo))
-					// Verify architecture is set (should be either amd64 or arm64 for AWS)
-					Expect(template.Status.NodeInfo.Architecture).ToNot(BeEmpty(), "Expected architecture to be set in nodeInfo")
-					Expect(string(template.Status.NodeInfo.Architecture)).To(MatchRegexp("^(amd64|arm64)$"), "Expected architecture to be amd64 or arm64")
-					// Verify operating system is set
-					Expect(template.Status.NodeInfo.OperatingSystem).ToNot(BeEmpty(), "Expected operatingSystem to be set in nodeInfo")
+				g.Expect(false).To(BeTrue(), "Expected at least one AWSMachineTemplate to have capacity with CPU and Memory")
+			}, e2eCtx.E2EConfig.GetIntervals(specName, "wait-deployment")...).Should(Succeed())
+
+			ginkgo.By("Verifying AWSMachineTemplate nodeInfo is populated")
+			Eventually(func(g Gomega) {
+				awsMachineTemplateList := &infrav1.AWSMachineTemplateList{}
+				g.Expect(e2eCtx.Environment.BootstrapClusterProxy.GetClient().List(ctx, awsMachineTemplateList, client.InNamespace(namespace.Name))).To(Succeed())
+				g.Expect(awsMachineTemplateList.Items).ToNot(BeEmpty())
+
+				for _, template := range awsMachineTemplateList.Items {
+					nodeInfo := template.Status.NodeInfo
+					if nodeInfo == nil {
+						continue
+					}
+					arch := string(nodeInfo.Architecture)
+					if (arch == "amd64" || arch == "arm64") && nodeInfo.OperatingSystem != "" {
+						ginkgo.By(fmt.Sprintf("AWSMachineTemplate %s has nodeInfo populated: %v", template.Name, nodeInfo))
+						return
+					}
 				}
-			}
-			Expect(foundTemplateWithCapacity).To(BeTrue(), "Expected at least one AWSMachineTemplate to have capacity populated")
-			Expect(foundTemplateWithNodeInfo).To(BeTrue(), "Expected at least one AWSMachineTemplate to have nodeInfo populated")
+				g.Expect(false).To(BeTrue(), "Expected at least one AWSMachineTemplate to have valid nodeInfo")
+			}, e2eCtx.E2EConfig.GetIntervals(specName, "wait-deployment")...).Should(Succeed())
 		})
 	})