From 714f132e0a14826cd51dc71ca74001b1bad4a43d Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Thu, 11 Dec 2025 15:33:18 -0700 Subject: [PATCH 01/23] feat: added two new fields to cr and generated boilerplate --- api/v1alpha1/vmdiskimage_types.go | 6 +++++- api/v1alpha1/zz_generated.deepcopy.go | 4 ++++ config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml | 9 +++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go index 51c82aa..606a82f 100644 --- a/api/v1alpha1/vmdiskimage_types.go +++ b/api/v1alpha1/vmdiskimage_types.go @@ -34,6 +34,7 @@ const ( // Condition Reasons const ( + MissingSourceArtifact string = "MissingSourceArtifact" ReasonResourceCreationFailed string = "ResourceCreationFailed" ReasonResouceUpdateFailed string = "ResourceUpdateFailed" ReasonQueued string = "Queued" @@ -103,13 +104,16 @@ type VMDiskImageStatus struct { // Conditions of the VMDiskImage resource. Conditions []metav1.Condition `json:"conditions,omitempty"` - FailureCount int `json:"failureCount,omitempty"` + FailureCount int `json:"failureCount,omitempty"` + MissingSourceArtifact bool `json:"missingSourceArtifact,omitempty"` + FirstFailureTimestamp *metav1.Time `json:"firstFailureTimestamp,omitempty"` } // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:path=vmdiskimages,scope=Namespaced,shortName=vmdi,singular=vmdiskimage // +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase",description="The current phase of the VMDiskImage." +// +kubebuilder:printcolumn:name="Missing Source Artifact",type="boolean",JSONPath=".status.missingSourceArtifact",description="If the referenced artifact can be found at the given URL." // +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" type VMDiskImage struct { metav1.TypeMeta `json:",inline"` diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 93ddcca..7e05fb3 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -124,6 +124,10 @@ func (in *VMDiskImageStatus) DeepCopyInto(out *VMDiskImageStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.FirstFailureTimestamp != nil { + in, out := &in.FirstFailureTimestamp, &out.FirstFailureTimestamp + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VMDiskImageStatus. diff --git a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml index 2188d92..ac31962 100644 --- a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml +++ b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml @@ -21,6 +21,10 @@ spec: jsonPath: .status.phase name: Phase type: string + - description: If the referenced artifact can be found at the given URL. + jsonPath: .status.missingSourceArtifact + name: Missing Source Artifact + type: boolean - jsonPath: .metadata.creationTimestamp name: Age type: date @@ -140,10 +144,15 @@ spec: type: array failureCount: type: integer + firstFailureTimestamp: + format: date-time + type: string message: description: A human-readable message providing more details about the current phase. type: string + missingSourceArtifact: + type: boolean phase: enum: - Queued From 6af6e457f1d6b0bad5d2b36036227502bad46704 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Thu, 11 Dec 2025 15:35:49 -0700 Subject: [PATCH 02/23] feat: add checks to see if we have a missing artifact in provisioner --- internal/vm-disk-image/service/provisioner.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/internal/vm-disk-image/service/provisioner.go b/internal/vm-disk-image/service/provisioner.go index 55b2b4f..667844c 100644 --- a/internal/vm-disk-image/service/provisioner.go +++ b/internal/vm-disk-image/service/provisioner.go @@ -2,8 +2,10 @@ package service import ( "context" + "errors" "fmt" crdv1 "pelotech/data-sync-operator/api/v1alpha1" + "strings" "time" snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1" @@ -31,6 +33,8 @@ type K8sVMDIProvisioner struct { const dataVolumeDonePhase = "Succeeded" +var ErrMissingSourceArtifact = errors.New("the requested artifact does not exist") + // Create resources for a given VMDiskImage. Stops creating them if // a single resource fails to create. Does not cleanup after itself func (p K8sVMDIProvisioner) CreateResources( @@ -190,6 +194,12 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors( if dv.Status.RestartCount >= int32(p.RetryLimit) { return fmt.Errorf("a datavolume has restarted more than the max for a sync") } + + for _, cond := range dv.Status.Conditions { + if strings.Contains(cond.Message, "404") || strings.Contains(strings.ToLower(cond.Message), "not found") { + return ErrMissingSourceArtifact + } + } } return nil From 48b244cd864463c8c00c356940a2e58a6af0dd42 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Thu, 11 Dec 2025 15:36:35 -0700 Subject: [PATCH 03/23] refactor: rename fields to be more descriptive --- .../vm-disk-image/controller/controller.go | 12 +++++----- .../vm-disk-image/service/orchestrator.go | 24 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go index 5675549..4c70e6a 100644 --- a/internal/vm-disk-image/controller/controller.go +++ b/internal/vm-disk-image/controller/controller.go @@ -118,12 +118,12 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error { RetryLimit: config.RetryLimit, } orchestrator := vmdi.Orchestrator{ - Client: client, - Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName), - Provisioner: vmdiProvisioner, - RetryLimit: config.RetryLimit, - RetryBackoff: config.RetryBackoffDuration, - SyncLimit: config.Concurrency, + Client: client, + Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName), + Provisioner: vmdiProvisioner, + SyncRetryLimit: config.RetryLimit, + SyncAttemptRetryBackoff: config.RetryBackoffDuration, + ConcurrentSyncLimit: config.Concurrency, } reconciler := &VMDiskImageReconciler{ Scheme: mgr.GetScheme(), diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go index 4b8ddd5..1109913 100644 --- a/internal/vm-disk-image/service/orchestrator.go +++ b/internal/vm-disk-image/service/orchestrator.go @@ -32,11 +32,11 @@ type VMDiskImageOrchestrator interface { type Orchestrator struct { client.Client - Recorder record.EventRecorder - Provisioner VMDiskImageProvisioner - RetryLimit int - RetryBackoff time.Duration - SyncLimit int + Recorder record.EventRecorder + Provisioner VMDiskImageProvisioner + SyncRetryLimit int + SyncAttemptRetryBackoff time.Duration + ConcurrentSyncLimit int } func (o Orchestrator) GetVMDiskImage(ctx context.Context, namespace types.NamespacedName, vmdi *crdv1.VMDiskImage) error { @@ -106,9 +106,9 @@ func (o Orchestrator) AttemptSyncingOfResource( logger.Error(err, "Failed to list syncing resources") return ctrl.Result{}, err } - if len(syncingList.Items) >= o.SyncLimit { - o.Recorder.Eventf(vmdi, "Normal", "WaitingToSync", "No more than %d VMDiskImages can be syncing at once. Waiting...", o.SyncLimit) - return ctrl.Result{RequeueAfter: o.RetryBackoff}, nil + if len(syncingList.Items) >= o.ConcurrentSyncLimit { + o.Recorder.Eventf(vmdi, "Normal", "WaitingToSync", "No more than %d VMDiskImages can be syncing at once. Waiting...", o.ConcurrentSyncLimit) + return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil } err = o.Provisioner.CreateResources(ctx, vmdi) @@ -152,7 +152,7 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi } if !isDone { logger.Info("Sync is not complete. Requeuing.") - return ctrl.Result{RequeueAfter: o.RetryBackoff}, nil + return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil } vmdi.Status.Phase = crdv1.PhaseReady @@ -246,11 +246,11 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma if err := o.Status().Update(ctx, vmdi); err != nil { logger.Error(err, "Failed to update resource failure count") } - if vmdi.Status.FailureCount < o.RetryLimit { - return ctrl.Result{RequeueAfter: o.RetryBackoff}, nil + if vmdi.Status.FailureCount < o.SyncRetryLimit { + return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil } - o.Recorder.Eventf(vmdi, "Warning", "SyncExceededRetryCount", "The sync has failed beyond the set retry limit of %d", o.RetryLimit) + o.Recorder.Eventf(vmdi, "Warning", "SyncExceededRetryCount", "The sync has failed beyond the set retry limit of %d", o.SyncRetryLimit) vmdi.Status.Phase = crdv1.PhaseFailed vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error() meta.SetStatusCondition(&vmdi.Status.Conditions, metav1.Condition{ From 3cab66b2446524dedcfb2c825cb2acd998a9af59 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Thu, 11 Dec 2025 15:48:19 -0700 Subject: [PATCH 04/23] feat: add new values to config to handle retrying totally failed syncs --- .../config/vmdi-controller-config-reader.go | 44 ++++++++++++------- .../vm-disk-image/controller/controller.go | 6 +-- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go index 3c5cd49..a648ff3 100644 --- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go +++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go @@ -6,38 +6,50 @@ import ( ) const ( - defaultConcurrency = 10 // TODO: We will need to tune this default - defaultRetryLimit = 2 - defaultBackoffDuration = 10 * time.Second - defaultMaxSyncDuration = 1 * time.Hour + defaultConcurrency = 10 // TODO: We will need to tune this default + defaultSyncRetryLimit = 2 + defaultSyncBackoffDuration = 10 * time.Second + defaultMaxSyncDuration = 1 * time.Hour + defaultResurrectionTimeout = 8 * time.Hour + defaultResurrectionBackoffDuration = 30 * time.Minute ) type VMDiskImageControllerConfig struct { - Concurrency int - RetryLimit int - RetryBackoffDuration time.Duration - MaxSyncDuration time.Duration + Concurrency int + SyncRetryLimit int + SyncRetryBackoffDuration time.Duration + MaxSyncDuration time.Duration + ResurrectionTimeout time.Duration + ResurrectionBackoffDuration time.Duration } // This function will allow us to get the required config variables from the environment. // Locally this is your "env" and in production these values will come from a configmap func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig { // The max amount of VMDIs we can have syncing at one time. - concurrency := corecfg.GetIntEnvOrDefault("CONCURRENCY", defaultConcurrency) + concurrency := corecfg.GetIntEnvOrDefault("VMDI_SYNC_CONCURRENCY", defaultConcurrency) // How many times we will retry a failed sync. - retryLimit := corecfg.GetIntEnvOrDefault("RETRY_LIMIT", defaultRetryLimit) + retryLimit := corecfg.GetIntEnvOrDefault("SYNC_RETRY_LIMIT", defaultSyncRetryLimit) - // How long we want to wait before trying to resync a failed VMDI. - retryBackoffDuration := corecfg.GetDurationEnvOrDefault("RETRY_BACKOFF_DURATION", defaultBackoffDuration) + // How long we want to wait before trying another sync on a VMDI in SYNCING status if it fails. + retryBackoffDuration := corecfg.GetDurationEnvOrDefault("SYNC_RETRY_BACKOFF_DURATION", defaultSyncBackoffDuration) // How long we will let a VMDI sit in syncing status. maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_DURATION", defaultMaxSyncDuration) + // How long we will attempt to resend a VMDI through loop until it is permanently failed + resurrectionTimeout := corecfg.GetDurationEnvOrDefault("VMDI_RESURRECTION_TIMEOUT", defaultResurrectionTimeout) + + // How long we will wait before attempting to send a VMDI through the loop if it enters a failed state + resurrectionBackoffDuration := corecfg.GetDurationEnvOrDefault("VMDI_RESURRECTION_BACKOFF_DURATION", defaultResurrectionBackoffDuration) + return VMDiskImageControllerConfig{ - Concurrency: concurrency, - RetryLimit: retryLimit, - RetryBackoffDuration: retryBackoffDuration, - MaxSyncDuration: maxSyncDuration, + Concurrency: concurrency, + SyncRetryLimit: retryLimit, + SyncRetryBackoffDuration: retryBackoffDuration, + MaxSyncDuration: maxSyncDuration, + ResurrectionTimeout: resurrectionTimeout, + ResurrectionBackoffDuration: resurrectionBackoffDuration, } } diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go index 4c70e6a..ec9703d 100644 --- a/internal/vm-disk-image/controller/controller.go +++ b/internal/vm-disk-image/controller/controller.go @@ -115,14 +115,14 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error { Client: client, ResourceGenerator: resourceGenerator, MaxSyncDuration: config.MaxSyncDuration, - RetryLimit: config.RetryLimit, + RetryLimit: config.SyncRetryLimit, } orchestrator := vmdi.Orchestrator{ Client: client, Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName), Provisioner: vmdiProvisioner, - SyncRetryLimit: config.RetryLimit, - SyncAttemptRetryBackoff: config.RetryBackoffDuration, + SyncRetryLimit: config.SyncRetryLimit, + SyncAttemptRetryBackoff: config.SyncRetryBackoffDuration, ConcurrentSyncLimit: config.Concurrency, } reconciler := &VMDiskImageReconciler{ From 4b5d33d9df6691a53f8ad7dc5883b12d3ca42978 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Thu, 11 Dec 2025 19:30:04 -0700 Subject: [PATCH 05/23] feat: wired up --- .../vm-disk-image/controller/controller.go | 18 +++++---- .../vm-disk-image/service/orchestrator.go | 40 ++++++++++++++++--- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go index ec9703d..7eac241 100644 --- a/internal/vm-disk-image/controller/controller.go +++ b/internal/vm-disk-image/controller/controller.go @@ -93,7 +93,9 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request) return r.AttemptSyncingOfResource(ctx, &VMDiskImage) case crdv1.PhaseSyncing: return r.TransitonFromSyncing(ctx, &VMDiskImage) - case crdv1.PhaseReady, crdv1.PhaseFailed: + case crdv1.PhaseFailed: + return r.AttemptResurrectionOfResource(ctx, &VMDiskImage) + case crdv1.PhaseReady: return ctrl.Result{}, nil default: logger.Error(nil, "Unknown phase detected", "Phase", currentPhase) @@ -118,12 +120,14 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error { RetryLimit: config.SyncRetryLimit, } orchestrator := vmdi.Orchestrator{ - Client: client, - Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName), - Provisioner: vmdiProvisioner, - SyncRetryLimit: config.SyncRetryLimit, - SyncAttemptRetryBackoff: config.SyncRetryBackoffDuration, - ConcurrentSyncLimit: config.Concurrency, + Client: client, + Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName), + Provisioner: vmdiProvisioner, + SyncRetryLimit: config.SyncRetryLimit, + SyncAttemptRetryBackoff: config.SyncRetryBackoffDuration, + ConcurrentSyncLimit: config.Concurrency, + ResurrectionTimeout: config.ResurrectionTimeout, + ResurrectionBackoffDuration: config.ResurrectionBackoffDuration, } reconciler := &VMDiskImageReconciler{ Scheme: mgr.GetScheme(), diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go index 1109913..bddce9f 100644 --- a/internal/vm-disk-image/service/orchestrator.go +++ b/internal/vm-disk-image/service/orchestrator.go @@ -2,6 +2,7 @@ package service import ( "context" + "errors" crdv1 "pelotech/data-sync-operator/api/v1alpha1" "time" @@ -25,6 +26,7 @@ type VMDiskImageOrchestrator interface { QueueResourceCreation(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) AttemptSyncingOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) + AttemptResurrectionOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) HandleResourceUpdateError(ctx context.Context, ds *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error) @@ -32,11 +34,13 @@ type VMDiskImageOrchestrator interface { type Orchestrator struct { client.Client - Recorder record.EventRecorder - Provisioner VMDiskImageProvisioner - SyncRetryLimit int - SyncAttemptRetryBackoff time.Duration - ConcurrentSyncLimit int + Recorder record.EventRecorder + Provisioner VMDiskImageProvisioner + SyncRetryLimit int + SyncAttemptRetryBackoff time.Duration + ConcurrentSyncLimit int + ResurrectionTimeout time.Duration + ResurrectionBackoffDuration time.Duration } func (o Orchestrator) GetVMDiskImage(ctx context.Context, namespace types.NamespacedName, vmdi *crdv1.VMDiskImage) error { @@ -172,6 +176,26 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi return ctrl.Result{}, nil } +func (o Orchestrator) AttemptResurrectionOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) { + isRetryable := + vmdi.Status.Phase == crdv1.PhaseFailed && + time.Since(vmdi.Status.FirstFailureTimestamp.Time) > o.ResurrectionTimeout && + vmdi.Status.MissingSourceArtifact + + if !isRetryable { + return ctrl.Result{}, nil + } + + vmdi.Status.MissingSourceArtifact = false + vmdi.Status.FailureCount = 0 + vmdi.Status.Phase = "" + if err := o.Status().Update(ctx, vmdi); err != nil { + return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on resurrection") + } + + return ctrl.Result{RequeueAfter: o.ResurrectionBackoffDuration}, nil +} + func (o Orchestrator) DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) { logger := logf.FromContext(ctx) @@ -250,6 +274,12 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil } + if errors.Is(originalErr, ErrMissingSourceArtifact) { + vmdi.Status.MissingSourceArtifact = true + } + + now := metav1.Now() + vmdi.Status.FirstFailureTimestamp = &now o.Recorder.Eventf(vmdi, "Warning", "SyncExceededRetryCount", "The sync has failed beyond the set retry limit of %d", o.SyncRetryLimit) vmdi.Status.Phase = crdv1.PhaseFailed vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error() From 8d84152091e47f66271f39eefc325d79f286cf29 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Thu, 11 Dec 2025 22:13:19 -0700 Subject: [PATCH 06/23] refactor: moved to time based failure --- api/v1alpha1/vmdiskimage_types.go | 1 - .../config/vmdi-controller-config-reader.go | 50 ++++++++----------- .../vm-disk-image/controller/controller.go | 23 ++++----- .../vm-disk-image/service/orchestrator.go | 50 ++++++++----------- internal/vm-disk-image/service/provisioner.go | 20 ++------ 5 files changed, 57 insertions(+), 87 deletions(-) diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go index 606a82f..981567d 100644 --- a/api/v1alpha1/vmdiskimage_types.go +++ b/api/v1alpha1/vmdiskimage_types.go @@ -105,7 +105,6 @@ type VMDiskImageStatus struct { Conditions []metav1.Condition `json:"conditions,omitempty"` FailureCount int `json:"failureCount,omitempty"` - MissingSourceArtifact bool `json:"missingSourceArtifact,omitempty"` FirstFailureTimestamp *metav1.Time `json:"firstFailureTimestamp,omitempty"` } diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go index a648ff3..50cd2d6 100644 --- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go +++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go @@ -6,21 +6,19 @@ import ( ) const ( - defaultConcurrency = 10 // TODO: We will need to tune this default - defaultSyncRetryLimit = 2 - defaultSyncBackoffDuration = 10 * time.Second - defaultMaxSyncDuration = 1 * time.Hour - defaultResurrectionTimeout = 8 * time.Hour - defaultResurrectionBackoffDuration = 30 * time.Minute + defaultConcurrency = 10 // TODO: We will need to tune this default + defaultMaxBackoffDelay = 1 * time.Hour + defaultMaxSyncDuration = 12 * time.Hour + defaultMaxSyncAttemptRetry = 3 + defaultMaxSyncAttemptDuration = 1 * time.Hour ) type VMDiskImageControllerConfig struct { - Concurrency int - SyncRetryLimit int - SyncRetryBackoffDuration time.Duration - MaxSyncDuration time.Duration - ResurrectionTimeout time.Duration - ResurrectionBackoffDuration time.Duration + Concurrency int + MaxBackoffDelay time.Duration + MaxSyncDuration time.Duration + MaxSyncAttemptDuration time.Duration + MaxSyncAttemptRetry int } // This function will allow us to get the required config variables from the environment. @@ -29,27 +27,23 @@ func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig { // The max amount of VMDIs we can have syncing at one time. concurrency := corecfg.GetIntEnvOrDefault("VMDI_SYNC_CONCURRENCY", defaultConcurrency) - // How many times we will retry a failed sync. - retryLimit := corecfg.GetIntEnvOrDefault("SYNC_RETRY_LIMIT", defaultSyncRetryLimit) + // The longest we will ever wait to retry. + maxBackoffDelay := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxBackoffDelay) - // How long we want to wait before trying another sync on a VMDI in SYNCING status if it fails. - retryBackoffDuration := corecfg.GetDurationEnvOrDefault("SYNC_RETRY_BACKOFF_DURATION", defaultSyncBackoffDuration) + // How long we will try to run a sync before we fail it forever. + maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxSyncDuration) // How long we will let a VMDI sit in syncing status. - maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_DURATION", defaultMaxSyncDuration) + maxAttemptDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptDuration) - // How long we will attempt to resend a VMDI through loop until it is permanently failed - resurrectionTimeout := corecfg.GetDurationEnvOrDefault("VMDI_RESURRECTION_TIMEOUT", defaultResurrectionTimeout) - - // How long we will wait before attempting to send a VMDI through the loop if it enters a failed state - resurrectionBackoffDuration := corecfg.GetDurationEnvOrDefault("VMDI_RESURRECTION_BACKOFF_DURATION", defaultResurrectionBackoffDuration) + // How many times we will retry on a given attempt. + maxRetryPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptRetry) return VMDiskImageControllerConfig{ - Concurrency: concurrency, - SyncRetryLimit: retryLimit, - SyncRetryBackoffDuration: retryBackoffDuration, - MaxSyncDuration: maxSyncDuration, - ResurrectionTimeout: resurrectionTimeout, - ResurrectionBackoffDuration: resurrectionBackoffDuration, + Concurrency: concurrency, + MaxBackoffDelay: maxBackoffDelay, + MaxSyncAttemptDuration: maxAttemptDuration, + MaxSyncAttemptRetry: maxRetryPerAttempt, + MaxSyncDuration: maxSyncDuration, } } diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go index 7eac241..00df396 100644 --- a/internal/vm-disk-image/controller/controller.go +++ b/internal/vm-disk-image/controller/controller.go @@ -94,7 +94,7 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request) case crdv1.PhaseSyncing: return r.TransitonFromSyncing(ctx, &VMDiskImage) case crdv1.PhaseFailed: - return r.AttemptResurrectionOfResource(ctx, &VMDiskImage) + return r.AttemptRetry(ctx, &VMDiskImage) case crdv1.PhaseReady: return ctrl.Result{}, nil default: @@ -114,20 +114,17 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error { resourceGenerator := &vmdi.Generator{} vmdiProvisioner := vmdi.K8sVMDIProvisioner{ - Client: client, - ResourceGenerator: resourceGenerator, - MaxSyncDuration: config.MaxSyncDuration, - RetryLimit: config.SyncRetryLimit, + Client: client, + ResourceGenerator: resourceGenerator, + MaxSyncAttemptDuration: config.MaxSyncAttemptDuration, + MaxRetryPerAttempt: config.MaxSyncAttemptRetry, } orchestrator := vmdi.Orchestrator{ - Client: client, - Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName), - Provisioner: vmdiProvisioner, - SyncRetryLimit: config.SyncRetryLimit, - SyncAttemptRetryBackoff: config.SyncRetryBackoffDuration, - ConcurrentSyncLimit: config.Concurrency, - ResurrectionTimeout: config.ResurrectionTimeout, - ResurrectionBackoffDuration: config.ResurrectionBackoffDuration, + Client: client, + Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName), + Provisioner: vmdiProvisioner, + MaxSyncAttemptBackoff: config.MaxBackoffDelay, + ConcurrentSyncLimit: config.Concurrency, } reconciler := &VMDiskImageReconciler{ Scheme: mgr.GetScheme(), diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go index bddce9f..7ad30dc 100644 --- a/internal/vm-disk-image/service/orchestrator.go +++ b/internal/vm-disk-image/service/orchestrator.go @@ -2,7 +2,7 @@ package service import ( "context" - "errors" + "math" crdv1 "pelotech/data-sync-operator/api/v1alpha1" "time" @@ -26,7 +26,7 @@ type VMDiskImageOrchestrator interface { QueueResourceCreation(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) AttemptSyncingOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) - AttemptResurrectionOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) + AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) HandleResourceUpdateError(ctx context.Context, ds *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error) @@ -34,13 +34,11 @@ type VMDiskImageOrchestrator interface { type Orchestrator struct { client.Client - Recorder record.EventRecorder - Provisioner VMDiskImageProvisioner - SyncRetryLimit int - SyncAttemptRetryBackoff time.Duration - ConcurrentSyncLimit int - ResurrectionTimeout time.Duration - ResurrectionBackoffDuration time.Duration + Recorder record.EventRecorder + Provisioner VMDiskImageProvisioner + MaxSyncAttemptBackoff time.Duration + MaxSyncTime time.Duration + ConcurrentSyncLimit int } func (o Orchestrator) GetVMDiskImage(ctx context.Context, namespace types.NamespacedName, vmdi *crdv1.VMDiskImage) error { @@ -112,7 +110,7 @@ func (o Orchestrator) AttemptSyncingOfResource( } if len(syncingList.Items) >= o.ConcurrentSyncLimit { o.Recorder.Eventf(vmdi, "Normal", "WaitingToSync", "No more than %d VMDiskImages can be syncing at once. Waiting...", o.ConcurrentSyncLimit) - return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil + return ctrl.Result{RequeueAfter: o.MaxSyncAttemptBackoff}, nil } err = o.Provisioner.CreateResources(ctx, vmdi) @@ -156,7 +154,7 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi } if !isDone { logger.Info("Sync is not complete. Requeuing.") - return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil + return ctrl.Result{RequeueAfter: o.MaxSyncAttemptBackoff}, nil } vmdi.Status.Phase = crdv1.PhaseReady @@ -176,24 +174,26 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi return ctrl.Result{}, nil } -func (o Orchestrator) AttemptResurrectionOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) { - isRetryable := - vmdi.Status.Phase == crdv1.PhaseFailed && - time.Since(vmdi.Status.FirstFailureTimestamp.Time) > o.ResurrectionTimeout && - vmdi.Status.MissingSourceArtifact +func (o Orchestrator) AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) { + syncDeadline := metav1.NewTime(vmdi.CreationTimestamp.Add(o.MaxSyncTime)) - if !isRetryable { + // leave it in a failed state + if syncDeadline.After(time.Now()) { return ctrl.Result{}, nil } - vmdi.Status.MissingSourceArtifact = false - vmdi.Status.FailureCount = 0 vmdi.Status.Phase = "" if err := o.Status().Update(ctx, vmdi); err != nil { return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on resurrection") } - return ctrl.Result{RequeueAfter: o.ResurrectionBackoffDuration}, nil + // Expontential retry + var backoffInterval time.Duration + nextBackoffMinutes := int(math.Floor(math.Pow(3, float64(vmdi.Status.FailureCount)))) + nextBackoffDuration := time.Duration(nextBackoffMinutes) * time.Minute + backoffInterval = min(nextBackoffDuration, o.MaxSyncAttemptBackoff) + + return ctrl.Result{RequeueAfter: backoffInterval}, nil } func (o Orchestrator) DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) { @@ -270,17 +270,7 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma if err := o.Status().Update(ctx, vmdi); err != nil { logger.Error(err, "Failed to update resource failure count") } - if vmdi.Status.FailureCount < o.SyncRetryLimit { - return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil - } - - if errors.Is(originalErr, ErrMissingSourceArtifact) { - vmdi.Status.MissingSourceArtifact = true - } - now := metav1.Now() - vmdi.Status.FirstFailureTimestamp = &now - o.Recorder.Eventf(vmdi, "Warning", "SyncExceededRetryCount", "The sync has failed beyond the set retry limit of %d", o.SyncRetryLimit) vmdi.Status.Phase = crdv1.PhaseFailed vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error() meta.SetStatusCondition(&vmdi.Status.Conditions, metav1.Condition{ diff --git a/internal/vm-disk-image/service/provisioner.go b/internal/vm-disk-image/service/provisioner.go index 667844c..c08e24d 100644 --- a/internal/vm-disk-image/service/provisioner.go +++ b/internal/vm-disk-image/service/provisioner.go @@ -2,10 +2,8 @@ package service import ( "context" - "errors" "fmt" crdv1 "pelotech/data-sync-operator/api/v1alpha1" - "strings" "time" snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1" @@ -26,15 +24,13 @@ type VMDiskImageProvisioner interface { type K8sVMDIProvisioner struct { client.Client - ResourceGenerator VMDIResourceGenerator - MaxSyncDuration time.Duration - RetryLimit int + ResourceGenerator VMDIResourceGenerator + MaxSyncAttemptDuration time.Duration + MaxRetryPerAttempt int } const dataVolumeDonePhase = "Succeeded" -var ErrMissingSourceArtifact = errors.New("the requested artifact does not exist") - // Create resources for a given VMDiskImage. Stops creating them if // a single resource fails to create. Does not cleanup after itself func (p K8sVMDIProvisioner) CreateResources( @@ -176,7 +172,7 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors( // Normal calculation timeSyncing = now.Sub(syncStartTime) } - if timeSyncing > p.MaxSyncDuration { + if timeSyncing > p.MaxSyncAttemptDuration { return fmt.Errorf("the VMDiskImage %s has been syncing longer than the allowed sync time", vmdi.Name) } @@ -191,15 +187,9 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors( } for _, dv := range dataVolumeList.Items { - if dv.Status.RestartCount >= int32(p.RetryLimit) { + if dv.Status.RestartCount >= int32(p.MaxRetryPerAttempt) { return fmt.Errorf("a datavolume has restarted more than the max for a sync") } - - for _, cond := range dv.Status.Conditions { - if strings.Contains(cond.Message, "404") || strings.Contains(strings.ToLower(cond.Message), "not found") { - return ErrMissingSourceArtifact - } - } } return nil From 0e36d0029cad218283c19878089e77ec978c9410 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Thu, 11 Dec 2025 22:17:29 -0700 Subject: [PATCH 07/23] refactor: let library check on sync --- internal/vm-disk-image/service/orchestrator.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go index 7ad30dc..c29dbe9 100644 --- a/internal/vm-disk-image/service/orchestrator.go +++ b/internal/vm-disk-image/service/orchestrator.go @@ -154,7 +154,7 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi } if !isDone { logger.Info("Sync is not complete. Requeuing.") - return ctrl.Result{RequeueAfter: o.MaxSyncAttemptBackoff}, nil + return ctrl.Result{}, nil } vmdi.Status.Phase = crdv1.PhaseReady From a40b2c411110e017fcfaa807ea51b0dda7045a54 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Thu, 11 Dec 2025 22:19:32 -0700 Subject: [PATCH 08/23] refactor: remove other unneeded fields --- api/v1alpha1/vmdiskimage_types.go | 3 --- api/v1alpha1/zz_generated.deepcopy.go | 4 ---- config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml | 5 ----- 3 files changed, 12 deletions(-) diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go index 981567d..87c447e 100644 --- a/api/v1alpha1/vmdiskimage_types.go +++ b/api/v1alpha1/vmdiskimage_types.go @@ -34,7 +34,6 @@ const ( // Condition Reasons const ( - MissingSourceArtifact string = "MissingSourceArtifact" ReasonResourceCreationFailed string = "ResourceCreationFailed" ReasonResouceUpdateFailed string = "ResourceUpdateFailed" ReasonQueued string = "Queued" @@ -105,14 +104,12 @@ type VMDiskImageStatus struct { Conditions []metav1.Condition `json:"conditions,omitempty"` FailureCount int `json:"failureCount,omitempty"` - FirstFailureTimestamp *metav1.Time `json:"firstFailureTimestamp,omitempty"` } // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:path=vmdiskimages,scope=Namespaced,shortName=vmdi,singular=vmdiskimage // +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase",description="The current phase of the VMDiskImage." -// +kubebuilder:printcolumn:name="Missing Source Artifact",type="boolean",JSONPath=".status.missingSourceArtifact",description="If the referenced artifact can be found at the given URL." // +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" type VMDiskImage struct { metav1.TypeMeta `json:",inline"` diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 7e05fb3..93ddcca 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -124,10 +124,6 @@ func (in *VMDiskImageStatus) DeepCopyInto(out *VMDiskImageStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - if in.FirstFailureTimestamp != nil { - in, out := &in.FirstFailureTimestamp, &out.FirstFailureTimestamp - *out = (*in).DeepCopy() - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VMDiskImageStatus. diff --git a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml index ac31962..0025a0c 100644 --- a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml +++ b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml @@ -144,15 +144,10 @@ spec: type: array failureCount: type: integer - firstFailureTimestamp: - format: date-time - type: string message: description: A human-readable message providing more details about the current phase. type: string - missingSourceArtifact: - type: boolean phase: enum: - Queued From 2477554842f009afa8144cb20ba49a52ef61e15c Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Thu, 11 Dec 2025 22:20:16 -0700 Subject: [PATCH 09/23] refactor: formatting --- api/v1alpha1/vmdiskimage_types.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go index 87c447e..51c82aa 100644 --- a/api/v1alpha1/vmdiskimage_types.go +++ b/api/v1alpha1/vmdiskimage_types.go @@ -103,7 +103,7 @@ type VMDiskImageStatus struct { // Conditions of the VMDiskImage resource. Conditions []metav1.Condition `json:"conditions,omitempty"` - FailureCount int `json:"failureCount,omitempty"` + FailureCount int `json:"failureCount,omitempty"` } // +kubebuilder:object:root=true From 19ac0229a6365eaa543fd6d181a7cea55d054235 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Thu, 11 Dec 2025 22:21:04 -0700 Subject: [PATCH 10/23] chore: generate manifests again --- config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml index 0025a0c..2188d92 100644 --- a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml +++ b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml @@ -21,10 +21,6 @@ spec: jsonPath: .status.phase name: Phase type: string - - description: If the referenced artifact can be found at the given URL. - jsonPath: .status.missingSourceArtifact - name: Missing Source Artifact - type: boolean - jsonPath: .metadata.creationTimestamp name: Age type: date From 2fbcbdeb0751e05d806f9dcdfe533f4a69d2240e Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Thu, 11 Dec 2025 22:22:47 -0700 Subject: [PATCH 11/23] fix: messy names and duplicate env var targeting --- .../vm-disk-image/config/vmdi-controller-config-reader.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go index 50cd2d6..99ef396 100644 --- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go +++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go @@ -9,7 +9,7 @@ const ( defaultConcurrency = 10 // TODO: We will need to tune this default defaultMaxBackoffDelay = 1 * time.Hour defaultMaxSyncDuration = 12 * time.Hour - defaultMaxSyncAttemptRetry = 3 + defaultMaxSyncAttemptRetries = 3 defaultMaxSyncAttemptDuration = 1 * time.Hour ) @@ -31,13 +31,13 @@ func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig { maxBackoffDelay := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxBackoffDelay) // How long we will try to run a sync before we fail it forever. - maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxSyncDuration) + maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_DURATION", defaultMaxSyncDuration) // How long we will let a VMDI sit in syncing status. maxAttemptDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptDuration) // How many times we will retry on a given attempt. - maxRetryPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptRetry) + maxRetryPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries) return VMDiskImageControllerConfig{ Concurrency: concurrency, From b11ce83291f986b0c1db749914f64e02b2c3f294 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Fri, 12 Dec 2025 09:07:18 -0700 Subject: [PATCH 12/23] feat: added more condition reasons --- api/v1alpha1/vmdiskimage_types.go | 15 +++++++----- .../vm-disk-image/service/orchestrator.go | 24 +++++++++++++------ internal/vm-disk-image/service/provisioner.go | 18 ++++++++++++-- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go index 51c82aa..72a7342 100644 --- a/api/v1alpha1/vmdiskimage_types.go +++ b/api/v1alpha1/vmdiskimage_types.go @@ -34,12 +34,15 @@ const ( // Condition Reasons const ( - ReasonResourceCreationFailed string = "ResourceCreationFailed" - ReasonResouceUpdateFailed string = "ResourceUpdateFailed" - ReasonQueued string = "Queued" - ReasonSyncing string = "Syncing" - ReasonRetryLimitExceeded string = "RetryLimitExceeded" - ReasonSynced string = "Synced" + ReasonResourceCreationFailed string = "ResourceCreationFailed" + ReasonResouceUpdateFailed string = "ResourceUpdateFailed" + ReasonQueued string = "Queued" + ReasonSyncing string = "Syncing" + ReasonRetryLimitExceeded string = "RetryLimitExceeded" + ReasonMissingSourceArtifact string = "MissingSourceArtifact" + ReasonSyncAttemptDurationExceeded string = "SyncAttemptDurationExceeded" + ReasonUnknownSyncFailure string = "UnknownSyncFailure" + ReasonSynced string = "Synced" ) // CRD phases diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go index c29dbe9..e50400d 100644 --- a/internal/vm-disk-image/service/orchestrator.go +++ b/internal/vm-disk-image/service/orchestrator.go @@ -2,11 +2,13 @@ package service import ( "context" + "errors" "math" crdv1 "pelotech/data-sync-operator/api/v1alpha1" "time" types "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" crutils "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -176,15 +178,16 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi func (o Orchestrator) AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) { syncDeadline := metav1.NewTime(vmdi.CreationTimestamp.Add(o.MaxSyncTime)) + exceededSyncDeadline := syncDeadline.Before(ptr.To(metav1.Now())) // leave it in a failed state - if syncDeadline.After(time.Now()) { + if exceededSyncDeadline { return ctrl.Result{}, nil } vmdi.Status.Phase = "" if err := o.Status().Update(ctx, vmdi); err != nil { - return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on resurrection") + return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on retry") } // Expontential retry @@ -267,16 +270,23 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma o.Recorder.Eventf(vmdi, "Warning", "SyncErrorOccurred", originalErr.Error()) vmdi.Status.FailureCount += 1 - if err := o.Status().Update(ctx, vmdi); err != nil { - logger.Error(err, "Failed to update resource failure count") - } - vmdi.Status.Phase = crdv1.PhaseFailed vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error() + + reason := crdv1.ReasonUnknownSyncFailure + switch { + case errors.Is(originalErr, ErrSyncAttemptExceedsMaxDuration): + reason = crdv1.ReasonSyncAttemptDurationExceeded + case errors.Is(originalErr, ErrSyncAttemptExceedsRetries): + reason = crdv1.ReasonRetryLimitExceeded + case errors.Is(originalErr, ErrMissingSourceArtifact): + reason = crdv1.ReasonMissingSourceArtifact + } + meta.SetStatusCondition(&vmdi.Status.Conditions, metav1.Condition{ Type: crdv1.ConditionTypeReady, Status: metav1.ConditionFalse, - Reason: crdv1.ReasonRetryLimitExceeded, + Reason: reason, Message: originalErr.Error(), }) diff --git a/internal/vm-disk-image/service/provisioner.go b/internal/vm-disk-image/service/provisioner.go index c08e24d..cd8581b 100644 --- a/internal/vm-disk-image/service/provisioner.go +++ b/internal/vm-disk-image/service/provisioner.go @@ -2,8 +2,10 @@ package service import ( "context" + "errors" "fmt" crdv1 "pelotech/data-sync-operator/api/v1alpha1" + "strings" "time" snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1" @@ -31,6 +33,10 @@ type K8sVMDIProvisioner struct { const dataVolumeDonePhase = "Succeeded" +var ErrMissingSourceArtifact = errors.New("the requested artifact does not exist") +var ErrSyncAttemptExceedsRetries = errors.New("the sync attempt has failed beyond the retry limit") +var ErrSyncAttemptExceedsMaxDuration = errors.New("the sync attempt has lasted beyond its max duration") + // Create resources for a given VMDiskImage. Stops creating them if // a single resource fails to create. Does not cleanup after itself func (p K8sVMDIProvisioner) CreateResources( @@ -173,7 +179,7 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors( timeSyncing = now.Sub(syncStartTime) } if timeSyncing > p.MaxSyncAttemptDuration { - return fmt.Errorf("the VMDiskImage %s has been syncing longer than the allowed sync time", vmdi.Name) + return ErrSyncAttemptExceedsMaxDuration } searchLabels := getLabelsToMatch(vmdi) @@ -187,9 +193,17 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors( } for _, dv := range dataVolumeList.Items { + for _, cond := range dv.Status.Conditions { + missingSourceArtifact := strings.Contains(cond.Message, "404") || strings.Contains(strings.ToLower(cond.Message), "not found") + if missingSourceArtifact { + return ErrMissingSourceArtifact + } + } + if dv.Status.RestartCount >= int32(p.MaxRetryPerAttempt) { - return fmt.Errorf("a datavolume has restarted more than the max for a sync") + return ErrSyncAttemptExceedsRetries } + } return nil From e884a61ce5aea3d8b50b314c281f9515600611ba Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Fri, 12 Dec 2025 09:19:01 -0700 Subject: [PATCH 13/23] feat: added new phase for visibility into retry loop --- api/v1alpha1/vmdiskimage_types.go | 11 ++++++----- config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go index 72a7342..da0d571 100644 --- a/api/v1alpha1/vmdiskimage_types.go +++ b/api/v1alpha1/vmdiskimage_types.go @@ -47,10 +47,11 @@ const ( // CRD phases const ( - PhaseQueued string = "Queued" - PhaseSyncing string = "Syncing" - PhaseReady string = "Ready" - PhaseFailed string = "Failed" + PhaseQueued string = "Queued" + PhaseSyncing string = "Syncing" + PhaseReady string = "Ready" + PhaseRetryableFailure string = "RetryableFailure" + PhaseFailed string = "Failed" ) // VMDiskImage Labels @@ -97,7 +98,7 @@ type VMDiskImageStatus struct { // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster // Important: Run "make" to regenerate code after modifying this file - // +kubebuilder:validation:Enum=Queued;Syncing;Ready;Failed + // +kubebuilder:validation:Enum=Queued;Syncing;Ready;Failed;RetryableFailure Phase string `json:"phase"` // A human-readable message providing more details about the current phase. diff --git a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml index 2188d92..242985e 100644 --- a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml +++ b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml @@ -150,6 +150,7 @@ spec: - Syncing - Ready - Failed + - RetryableFailure type: string required: - phase From 25c789a7b8ece47880c6c2bcdf9fc6a5611420e9 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Fri, 12 Dec 2025 09:19:31 -0700 Subject: [PATCH 14/23] feat: handling new phase in controller and service code --- internal/vm-disk-image/controller/controller.go | 4 ++-- internal/vm-disk-image/service/orchestrator.go | 17 ++++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go index 00df396..e18fb07 100644 --- a/internal/vm-disk-image/controller/controller.go +++ b/internal/vm-disk-image/controller/controller.go @@ -93,9 +93,9 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request) return r.AttemptSyncingOfResource(ctx, &VMDiskImage) case crdv1.PhaseSyncing: return r.TransitonFromSyncing(ctx, &VMDiskImage) - case crdv1.PhaseFailed: + case crdv1.PhaseRetryableFailure: return r.AttemptRetry(ctx, &VMDiskImage) - case crdv1.PhaseReady: + case crdv1.PhaseReady, crdv1.PhaseFailed: return ctrl.Result{}, nil default: logger.Error(nil, "Unknown phase detected", "Phase", currentPhase) diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go index e50400d..289c13c 100644 --- a/internal/vm-disk-image/service/orchestrator.go +++ b/internal/vm-disk-image/service/orchestrator.go @@ -180,17 +180,20 @@ func (o Orchestrator) AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage) syncDeadline := metav1.NewTime(vmdi.CreationTimestamp.Add(o.MaxSyncTime)) exceededSyncDeadline := syncDeadline.Before(ptr.To(metav1.Now())) - // leave it in a failed state + // Fail forever if we're past the deadline if exceededSyncDeadline { - return ctrl.Result{}, nil + vmdi.Status.Phase = crdv1.PhaseFailed + vmdi.Status.Message = "Exceeded overall sync retry window. Failed Permanently" + } else { + vmdi.Status.Phase = "" + vmdi.Status.Message = "" } - vmdi.Status.Phase = "" if err := o.Status().Update(ctx, vmdi); err != nil { return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on retry") } - // Expontential retry + // Exponential retry var backoffInterval time.Duration nextBackoffMinutes := int(math.Floor(math.Pow(3, float64(vmdi.Status.FailureCount)))) nextBackoffDuration := time.Duration(nextBackoffMinutes) * time.Minute @@ -220,7 +223,7 @@ func (o Orchestrator) HandleResourceUpdateError( logger.Error(originalErr, message) // Mark the resource as Failed - vmdi.Status.Phase = crdv1.PhaseFailed + vmdi.Status.Phase = crdv1.PhaseRetryableFailure vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error() meta.SetStatusCondition(&vmdi.Status.Conditions, metav1.Condition{ Type: crdv1.ConditionTypeReady, @@ -242,7 +245,7 @@ func (o Orchestrator) HandleResourceCreationError(ctx context.Context, vmdi *crd logger.Error(originalErr, "Failed to create a resource.") o.Recorder.Eventf(vmdi, "Warning", "ResourceCreationFailed", "Failed to create resources.") - vmdi.Status.Phase = crdv1.PhaseFailed + vmdi.Status.Phase = crdv1.PhaseRetryableFailure vmdi.Status.Message = "Failed while creating resources: " + originalErr.Error() meta.SetStatusCondition(&vmdi.Status.Conditions, metav1.Condition{ Type: crdv1.ConditionTypeReady, @@ -270,7 +273,7 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma o.Recorder.Eventf(vmdi, "Warning", "SyncErrorOccurred", originalErr.Error()) vmdi.Status.FailureCount += 1 - vmdi.Status.Phase = crdv1.PhaseFailed + vmdi.Status.Phase = crdv1.PhaseRetryableFailure vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error() reason := crdv1.ReasonUnknownSyncFailure From d0dd06201d0bdd53fdee9e167d726f2b8d05402a Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Fri, 12 Dec 2025 09:20:21 -0700 Subject: [PATCH 15/23] chore: regenerate chart to include new phase --- .../templates/crd/vmdiskimages.crd.pelotech.ot.yaml | 1 + dist/install.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml b/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml index c86c8d0..ec110ee 100644 --- a/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml +++ b/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml @@ -146,6 +146,7 @@ spec: - Syncing - Ready - Failed + - RetryableFailure type: string required: - phase diff --git a/dist/install.yaml b/dist/install.yaml index e43aff7..63fccb0 100644 --- a/dist/install.yaml +++ b/dist/install.yaml @@ -158,6 +158,7 @@ spec: - Syncing - Ready - Failed + - RetryableFailure type: string required: - phase From f43bbb0d6cfcf153981194c95032e786696e9ccd Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Fri, 12 Dec 2025 10:02:11 -0700 Subject: [PATCH 16/23] fix: wait 10 seconds before checking sync status again --- internal/vm-disk-image/service/orchestrator.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go index 289c13c..c3020b2 100644 --- a/internal/vm-disk-image/service/orchestrator.go +++ b/internal/vm-disk-image/service/orchestrator.go @@ -156,7 +156,7 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi } if !isDone { logger.Info("Sync is not complete. Requeuing.") - return ctrl.Result{}, nil + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } vmdi.Status.Phase = crdv1.PhaseReady From 2ef59b3b13f6966a585cb9f939a04fd761873642 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Fri, 12 Dec 2025 13:50:20 -0700 Subject: [PATCH 17/23] chore: regenerate resources after adding LastFailureTime to spec --- api/v1alpha1/vmdiskimage_types.go | 3 ++- api/v1alpha1/zz_generated.deepcopy.go | 4 ++++ config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go index da0d571..c997f1a 100644 --- a/api/v1alpha1/vmdiskimage_types.go +++ b/api/v1alpha1/vmdiskimage_types.go @@ -107,7 +107,8 @@ type VMDiskImageStatus struct { // Conditions of the VMDiskImage resource. Conditions []metav1.Condition `json:"conditions,omitempty"` - FailureCount int `json:"failureCount,omitempty"` + FailureCount int `json:"failureCount,omitempty"` + LastFailureTime *metav1.Time `json:"lastFailureTime,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 93ddcca..db29f87 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -124,6 +124,10 @@ func (in *VMDiskImageStatus) DeepCopyInto(out *VMDiskImageStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.LastFailureTime != nil { + in, out := &in.LastFailureTime, &out.LastFailureTime + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VMDiskImageStatus. diff --git a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml index 242985e..8d60673 100644 --- a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml +++ b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml @@ -140,6 +140,9 @@ spec: type: array failureCount: type: integer + lastFailureTime: + format: date-time + type: string message: description: A human-readable message providing more details about the current phase. From c9f8e6bb63cec0f7a946ae5cc27cce6fc1ce7412 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Fri, 12 Dec 2025 13:54:01 -0700 Subject: [PATCH 18/23] fix: resolve finalizer conflict and now respecting backoff behavior --- .../vm-disk-image/controller/controller.go | 17 +++--- .../vm-disk-image/service/orchestrator.go | 55 +++++++++++-------- 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go index e18fb07..cb9ae07 100644 --- a/internal/vm-disk-image/controller/controller.go +++ b/internal/vm-disk-image/controller/controller.go @@ -77,11 +77,7 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request) resourceHasFinalizer := !crutils.ContainsFinalizer(&VMDiskImage, crdv1.VMDiskImageFinalizer) if resourceHasFinalizer { - err := r.AddControllerFinalizer(ctx, &VMDiskImage) - if err != nil { - return r.HandleResourceUpdateError(ctx, &VMDiskImage, err, "Failed to add finalizer to our resource") - } - + return r.AddControllerFinalizer(ctx, &VMDiskImage) } currentPhase := VMDiskImage.Status.Phase @@ -120,11 +116,12 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error { MaxRetryPerAttempt: config.MaxSyncAttemptRetry, } orchestrator := vmdi.Orchestrator{ - Client: client, - Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName), - Provisioner: vmdiProvisioner, - MaxSyncAttemptBackoff: config.MaxBackoffDelay, - ConcurrentSyncLimit: config.Concurrency, + Client: client, + Recorder: mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName), + Provisioner: vmdiProvisioner, + MaxRetryBackoff: config.MaxBackoffDelay, + MaxSyncTime: config.MaxSyncDuration, + ConcurrentSyncLimit: config.Concurrency, } reconciler := &VMDiskImageReconciler{ Scheme: mgr.GetScheme(), diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go index c3020b2..87a8116 100644 --- a/internal/vm-disk-image/service/orchestrator.go +++ b/internal/vm-disk-image/service/orchestrator.go @@ -22,7 +22,7 @@ import ( type VMDiskImageOrchestrator interface { GetVMDiskImage(ctx context.Context, namespace types.NamespacedName, vmdi *crdv1.VMDiskImage) error - AddControllerFinalizer(ctx context.Context, vmdi *crdv1.VMDiskImage) error + AddControllerFinalizer(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) IndexVMDiskImageByPhase(rawObj client.Object) []string ListVMDiskImagesByPhase(ctx context.Context, phase string) (*crdv1.VMDiskImageList, error) QueueResourceCreation(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) @@ -30,27 +30,30 @@ type VMDiskImageOrchestrator interface { TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) - HandleResourceUpdateError(ctx context.Context, ds *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error) - HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error) } type Orchestrator struct { client.Client - Recorder record.EventRecorder - Provisioner VMDiskImageProvisioner - MaxSyncAttemptBackoff time.Duration - MaxSyncTime time.Duration - ConcurrentSyncLimit int + Recorder record.EventRecorder + Provisioner VMDiskImageProvisioner + MaxRetryBackoff time.Duration + MaxSyncTime time.Duration + ConcurrentSyncLimit int } func (o Orchestrator) GetVMDiskImage(ctx context.Context, namespace types.NamespacedName, vmdi *crdv1.VMDiskImage) error { return o.Get(ctx, namespace, vmdi) } -func (o Orchestrator) AddControllerFinalizer(ctx context.Context, vmdi *crdv1.VMDiskImage) error { +func (o Orchestrator) AddControllerFinalizer(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) { crutils.AddFinalizer(vmdi, crdv1.VMDiskImageFinalizer) - return o.Update(ctx, vmdi) + err := o.Update(ctx, vmdi) + if err != nil { + return o.HandleResourceUpdateError(ctx, vmdi, err, "Failed to add finalizer to our resource") + } + + return ctrl.Result{}, nil } func (o Orchestrator) ListVMDiskImagesByPhase(ctx context.Context, phase string) (*crdv1.VMDiskImageList, error) { @@ -96,7 +99,7 @@ func (o Orchestrator) QueueResourceCreation(ctx context.Context, vmdi *crdv1.VMD o.Recorder.Eventf(vmdi, "Normal", "Queued", "Resource successfully queued for sync orchestration") - return ctrl.Result{Requeue: true}, nil + return ctrl.Result{}, nil } func (o Orchestrator) AttemptSyncingOfResource( @@ -112,7 +115,7 @@ func (o Orchestrator) AttemptSyncingOfResource( } if len(syncingList.Items) >= o.ConcurrentSyncLimit { o.Recorder.Eventf(vmdi, "Normal", "WaitingToSync", "No more than %d VMDiskImages can be syncing at once. Waiting...", o.ConcurrentSyncLimit) - return ctrl.Result{RequeueAfter: o.MaxSyncAttemptBackoff}, nil + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } err = o.Provisioner.CreateResources(ctx, vmdi) @@ -145,7 +148,6 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi // Check if there is an error occurring in the sync syncError := o.Provisioner.ResourcesHaveErrors(ctx, vmdi) if syncError != nil { - logger.Error(syncError, "A sync error has occurred.") return o.HandleSyncError(ctx, vmdi, syncError, "A error has occurred while syncing") } @@ -178,28 +180,34 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi func (o Orchestrator) AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) { syncDeadline := metav1.NewTime(vmdi.CreationTimestamp.Add(o.MaxSyncTime)) - exceededSyncDeadline := syncDeadline.Before(ptr.To(metav1.Now())) + exceededSyncDeadline := metav1.Now().After(syncDeadline.Time) // Fail forever if we're past the deadline if exceededSyncDeadline { vmdi.Status.Phase = crdv1.PhaseFailed vmdi.Status.Message = "Exceeded overall sync retry window. Failed Permanently" - } else { - vmdi.Status.Phase = "" - vmdi.Status.Message = "" - } - if err := o.Status().Update(ctx, vmdi); err != nil { - return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on retry") + if err := o.Status().Update(ctx, vmdi); err != nil { + return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on retry") + } + + return ctrl.Result{}, nil } // Exponential retry var backoffInterval time.Duration nextBackoffMinutes := int(math.Floor(math.Pow(3, float64(vmdi.Status.FailureCount)))) nextBackoffDuration := time.Duration(nextBackoffMinutes) * time.Minute - backoffInterval = min(nextBackoffDuration, o.MaxSyncAttemptBackoff) + backoffInterval = min(nextBackoffDuration, o.MaxRetryBackoff) + timeSinceFailure := time.Since(vmdi.Status.LastFailureTime.Time) + // If we haven't waited as long as we need to backoff and requeue + if timeSinceFailure < backoffInterval { + remaingWaitTime := backoffInterval - timeSinceFailure + return ctrl.Result{RequeueAfter: remaingWaitTime}, nil + } + + return o.QueueResourceCreation(ctx, vmdi) - return ctrl.Result{RequeueAfter: backoffInterval}, nil } func (o Orchestrator) DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) { @@ -274,6 +282,7 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma vmdi.Status.FailureCount += 1 vmdi.Status.Phase = crdv1.PhaseRetryableFailure + vmdi.Status.LastFailureTime = ptr.To(metav1.Now()) vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error() reason := crdv1.ReasonUnknownSyncFailure @@ -302,5 +311,5 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma logger.Error(err, "Failed to teardown resources.") } - return ctrl.Result{}, originalErr + return ctrl.Result{}, nil } From 597ac9e0368483603a9df68ad541ba956f9ecbba Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Fri, 12 Dec 2025 13:55:00 -0700 Subject: [PATCH 19/23] chore: regenerate the chart --- .../templates/crd/vmdiskimages.crd.pelotech.ot.yaml | 3 +++ dist/install.yaml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml b/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml index ec110ee..032c1a7 100644 --- a/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml +++ b/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml @@ -137,6 +137,9 @@ spec: type: array failureCount: type: integer + lastFailureTime: + format: date-time + type: string message: description: A human-readable message providing more details about the current phase. type: string diff --git a/dist/install.yaml b/dist/install.yaml index 63fccb0..18b4cc7 100644 --- a/dist/install.yaml +++ b/dist/install.yaml @@ -148,6 +148,9 @@ spec: type: array failureCount: type: integer + lastFailureTime: + format: date-time + type: string message: description: A human-readable message providing more details about the current phase. From 9bb15bd2f6c091a5143791ba54b0f97cfce8c929 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Tue, 16 Dec 2025 08:41:07 -0700 Subject: [PATCH 20/23] fix: pr feedback --- .../vm-disk-image/config/vmdi-controller-config-reader.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go index 99ef396..69f724b 100644 --- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go +++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go @@ -6,7 +6,7 @@ import ( ) const ( - defaultConcurrency = 10 // TODO: We will need to tune this default + defaultConcurrency = 5 // TODO: We will need to tune this default defaultMaxBackoffDelay = 1 * time.Hour defaultMaxSyncDuration = 12 * time.Hour defaultMaxSyncAttemptRetries = 3 @@ -25,7 +25,7 @@ type VMDiskImageControllerConfig struct { // Locally this is your "env" and in production these values will come from a configmap func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig { // The max amount of VMDIs we can have syncing at one time. - concurrency := corecfg.GetIntEnvOrDefault("VMDI_SYNC_CONCURRENCY", defaultConcurrency) + concurrency := corecfg.GetIntEnvOrDefault("MAX_VMDI_SYNC_CONCURRENCY", defaultConcurrency) // The longest we will ever wait to retry. maxBackoffDelay := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxBackoffDelay) From cb5996f958589c8d8efe0d5d2c9bcd12a73f97a8 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Tue, 16 Dec 2025 08:42:06 -0700 Subject: [PATCH 21/23] fix: resolve nit --- .../vm-disk-image/config/vmdi-controller-config-reader.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go index 69f724b..4c396c2 100644 --- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go +++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go @@ -37,13 +37,13 @@ func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig { maxAttemptDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptDuration) // How many times we will retry on a given attempt. - maxRetryPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries) + maxRetriesPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries) return VMDiskImageControllerConfig{ Concurrency: concurrency, MaxBackoffDelay: maxBackoffDelay, MaxSyncAttemptDuration: maxAttemptDuration, - MaxSyncAttemptRetry: maxRetryPerAttempt, + MaxSyncAttemptRetry: maxRetriesPerAttempt, MaxSyncDuration: maxSyncDuration, } } From db2350c3891e6b9f0886c76f772893c28a24012a Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Tue, 16 Dec 2025 08:45:54 -0700 Subject: [PATCH 22/23] fix: consistent names --- .../vm-disk-image/config/vmdi-controller-config-reader.go | 6 +++--- internal/vm-disk-image/controller/controller.go | 2 +- internal/vm-disk-image/service/provisioner.go | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go index 4c396c2..8583781 100644 --- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go +++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go @@ -18,7 +18,7 @@ type VMDiskImageControllerConfig struct { MaxBackoffDelay time.Duration MaxSyncDuration time.Duration MaxSyncAttemptDuration time.Duration - MaxSyncAttemptRetry int + MaxSyncAttemptRetries int } // This function will allow us to get the required config variables from the environment. @@ -37,13 +37,13 @@ func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig { maxAttemptDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptDuration) // How many times we will retry on a given attempt. - maxRetriesPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries) + maxSyncAttemptRetries := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries) return VMDiskImageControllerConfig{ Concurrency: concurrency, MaxBackoffDelay: maxBackoffDelay, MaxSyncAttemptDuration: maxAttemptDuration, - MaxSyncAttemptRetry: maxRetriesPerAttempt, + MaxSyncAttemptRetries: maxSyncAttemptRetries, MaxSyncDuration: maxSyncDuration, } } diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go index cb9ae07..c9ccb7e 100644 --- a/internal/vm-disk-image/controller/controller.go +++ b/internal/vm-disk-image/controller/controller.go @@ -113,7 +113,7 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error { Client: client, ResourceGenerator: resourceGenerator, MaxSyncAttemptDuration: config.MaxSyncAttemptDuration, - MaxRetryPerAttempt: config.MaxSyncAttemptRetry, + MaxSyncAttemptRetries: config.MaxSyncAttemptRetries, } orchestrator := vmdi.Orchestrator{ Client: client, diff --git a/internal/vm-disk-image/service/provisioner.go b/internal/vm-disk-image/service/provisioner.go index cd8581b..4e3bcae 100644 --- a/internal/vm-disk-image/service/provisioner.go +++ b/internal/vm-disk-image/service/provisioner.go @@ -28,7 +28,7 @@ type K8sVMDIProvisioner struct { client.Client ResourceGenerator VMDIResourceGenerator MaxSyncAttemptDuration time.Duration - MaxRetryPerAttempt int + MaxSyncAttemptRetries int } const dataVolumeDonePhase = "Succeeded" @@ -200,7 +200,7 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors( } } - if dv.Status.RestartCount >= int32(p.MaxRetryPerAttempt) { + if dv.Status.RestartCount >= int32(p.MaxSyncAttemptRetries) { return ErrSyncAttemptExceedsRetries } From dca4302a936666024ba1507a389b43ae9acfc092 Mon Sep 17 00:00:00 2001 From: Harrison Billings Date: Tue, 16 Dec 2025 08:48:48 -0700 Subject: [PATCH 23/23] fix: better boolean name --- internal/vm-disk-image/controller/controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go index c9ccb7e..7d92cd2 100644 --- a/internal/vm-disk-image/controller/controller.go +++ b/internal/vm-disk-image/controller/controller.go @@ -75,8 +75,8 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request) return r.VMDiskImageOrchestrator.DeleteResource(ctx, &VMDiskImage) } - resourceHasFinalizer := !crutils.ContainsFinalizer(&VMDiskImage, crdv1.VMDiskImageFinalizer) - if resourceHasFinalizer { + resourceMissingFinalizer := !crutils.ContainsFinalizer(&VMDiskImage, crdv1.VMDiskImageFinalizer) + if resourceMissingFinalizer { return r.AddControllerFinalizer(ctx, &VMDiskImage) }