@@ -1137,6 +1137,21 @@ func (r *RayClusterReconciler) shouldRecreatePodsForUpgrade(ctx context.Context,
11371137 return false
11381138 }
11391139
1140+ // Case 1: If the KubeRay version has changed, update annotations then check in the next reconciliation
1141+ for _ , pod := range allPods .Items {
1142+ podVersion := pod .Annotations [utils .KubeRayVersion ]
1143+ if podVersion != "" && podVersion != utils .KUBERAY_VERSION {
1144+ logger .Info ("Pods have different KubeRay version, updating pod annotations" ,
1145+ "pod" , pod .Name ,
1146+ "podVersion" , podVersion ,
1147+ "currentVersion" , utils .KUBERAY_VERSION )
1148+ if err := r .updatePodsAnnotations (ctx , instance , & allPods ); err != nil {
1149+ logger .Error (err , "Failed to update pod annotations for KubeRay version change" )
1150+ }
1151+ return false
1152+ }
1153+ }
1154+
11401155 headHash , err := common .GeneratePodTemplateHash (instance .Spec .HeadGroupSpec .Template )
11411156 if err != nil {
11421157 logger .Error (err , "Failed to generate head template hash" )
@@ -1153,7 +1168,7 @@ func (r *RayClusterReconciler) shouldRecreatePodsForUpgrade(ctx context.Context,
11531168 workerHashMap [workerGroup .GroupName ] = hash
11541169 }
11551170
1156- // Check each pod to see if its template hash matches the current spec
1171+ // Case 2: If the pod template hash has changed, recreate all pods
11571172 for _ , pod := range allPods .Items {
11581173 nodeType := pod .Labels [utils .RayNodeTypeLabelKey ]
11591174 actualHash := pod .Annotations [utils .PodTemplateHashKey ]
@@ -1183,6 +1198,61 @@ func (r *RayClusterReconciler) shouldRecreatePodsForUpgrade(ctx context.Context,
11831198 return false
11841199}
11851200
1201+ // updatePodsAnnotations updates pod annotations to match the current KubeRay version and PodTemplateHashKey
1202+ func (r * RayClusterReconciler ) updatePodsAnnotations (ctx context.Context , instance * rayv1.RayCluster , allPods * corev1.PodList ) error {
1203+ logger := ctrl .LoggerFrom (ctx )
1204+
1205+ for i := range allPods .Items {
1206+ pod := & allPods .Items [i ]
1207+ podVersion := pod .Annotations [utils .KubeRayVersion ]
1208+
1209+ if podVersion == utils .KUBERAY_VERSION || podVersion == "" {
1210+ continue
1211+ }
1212+
1213+ newHash , err := r .calculatePodTemplateHash (instance , pod )
1214+ if err != nil {
1215+ return err
1216+ }
1217+
1218+ if pod .Annotations == nil {
1219+ pod .Annotations = make (map [string ]string )
1220+ }
1221+ pod .Annotations [utils .KubeRayVersion ] = utils .KUBERAY_VERSION
1222+ pod .Annotations [utils .PodTemplateHashKey ] = newHash
1223+
1224+ if err := r .Update (ctx , pod ); err != nil {
1225+ return err
1226+ }
1227+
1228+ logger .Info ("Updated pod annotations" , "pod" , pod .Name , "version" , utils .KUBERAY_VERSION )
1229+ }
1230+
1231+ return nil
1232+ }
1233+
1234+ // calculatePodTemplateHash calculates the hash for a pod's template based on its node type and group
1235+ func (r * RayClusterReconciler ) calculatePodTemplateHash (instance * rayv1.RayCluster , pod * corev1.Pod ) (string , error ) {
1236+ nodeType := pod .Labels [utils .RayNodeTypeLabelKey ]
1237+
1238+ switch rayv1 .RayNodeType (nodeType ) {
1239+ case rayv1 .HeadNode :
1240+ return common .GeneratePodTemplateHash (instance .Spec .HeadGroupSpec .Template )
1241+
1242+ case rayv1 .WorkerNode :
1243+ groupName := pod .Labels [utils .RayNodeGroupLabelKey ]
1244+ for _ , workerGroup := range instance .Spec .WorkerGroupSpecs {
1245+ if workerGroup .GroupName == groupName {
1246+ return common .GeneratePodTemplateHash (workerGroup .Template )
1247+ }
1248+ }
1249+ return "" , fmt .Errorf ("worker group %s not found in RayCluster spec" , groupName )
1250+
1251+ default :
1252+ return "" , fmt .Errorf ("unknown node type: %s" , nodeType )
1253+ }
1254+ }
1255+
11861256// shouldDeletePod returns whether the Pod should be deleted and the reason
11871257//
11881258// @param pod: The Pod to be checked.
0 commit comments