From 714f132e0a14826cd51dc71ca74001b1bad4a43d Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Thu, 11 Dec 2025 15:33:18 -0700
Subject: [PATCH 01/23] feat: added two new fields to cr and generated
 boilerplate

---
 api/v1alpha1/vmdiskimage_types.go                  | 6 +++++-
 api/v1alpha1/zz_generated.deepcopy.go              | 4 ++++
 config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml | 9 +++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go
index 51c82aa..606a82f 100644
--- a/api/v1alpha1/vmdiskimage_types.go
+++ b/api/v1alpha1/vmdiskimage_types.go
@@ -34,6 +34,7 @@ const (
 
 // Condition Reasons
 const (
+	MissingSourceArtifact        string = "MissingSourceArtifact"
 	ReasonResourceCreationFailed string = "ResourceCreationFailed"
 	ReasonResouceUpdateFailed    string = "ResourceUpdateFailed"
 	ReasonQueued                 string = "Queued"
@@ -103,13 +104,16 @@ type VMDiskImageStatus struct {
 	// Conditions of the VMDiskImage resource.
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 
-	FailureCount int `json:"failureCount,omitempty"`
+	FailureCount          int          `json:"failureCount,omitempty"`
+	MissingSourceArtifact bool         `json:"missingSourceArtifact,omitempty"`
+	FirstFailureTimestamp *metav1.Time `json:"firstFailureTimestamp,omitempty"`
 }
 
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
 // +kubebuilder:resource:path=vmdiskimages,scope=Namespaced,shortName=vmdi,singular=vmdiskimage
 // +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase",description="The current phase of the VMDiskImage."
+// +kubebuilder:printcolumn:name="Missing Source Artifact",type="boolean",JSONPath=".status.missingSourceArtifact",description="If the referenced artifact can be found at the given URL."
 // +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
 type VMDiskImage struct {
 	metav1.TypeMeta   `json:",inline"`
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index 93ddcca..7e05fb3 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -124,6 +124,10 @@ func (in *VMDiskImageStatus) DeepCopyInto(out *VMDiskImageStatus) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
+	if in.FirstFailureTimestamp != nil {
+		in, out := &in.FirstFailureTimestamp, &out.FirstFailureTimestamp
+		*out = (*in).DeepCopy()
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VMDiskImageStatus.
diff --git a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
index 2188d92..ac31962 100644
--- a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
+++ b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
@@ -21,6 +21,10 @@ spec:
       jsonPath: .status.phase
       name: Phase
       type: string
+    - description: If the referenced artifact can be found at the given URL.
+      jsonPath: .status.missingSourceArtifact
+      name: Missing Source Artifact
+      type: boolean
     - jsonPath: .metadata.creationTimestamp
       name: Age
       type: date
@@ -140,10 +144,15 @@ spec:
                 type: array
               failureCount:
                 type: integer
+              firstFailureTimestamp:
+                format: date-time
+                type: string
               message:
                 description: A human-readable message providing more details about
                   the current phase.
                 type: string
+              missingSourceArtifact:
+                type: boolean
               phase:
                 enum:
                 - Queued

From 6af6e457f1d6b0bad5d2b36036227502bad46704 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Thu, 11 Dec 2025 15:35:49 -0700
Subject: [PATCH 02/23] feat: add checks to see if we have a missing artifact
 in provisioner

---
 internal/vm-disk-image/service/provisioner.go | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/internal/vm-disk-image/service/provisioner.go b/internal/vm-disk-image/service/provisioner.go
index 55b2b4f..667844c 100644
--- a/internal/vm-disk-image/service/provisioner.go
+++ b/internal/vm-disk-image/service/provisioner.go
@@ -2,8 +2,10 @@ package service
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	crdv1 "pelotech/data-sync-operator/api/v1alpha1"
+	"strings"
 	"time"
 
 	snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
@@ -31,6 +33,8 @@ type K8sVMDIProvisioner struct {
 
 const dataVolumeDonePhase = "Succeeded"
 
+var ErrMissingSourceArtifact = errors.New("the requested artifact does not exist")
+
 // Create resources for a given VMDiskImage. Stops creating them if
 // a single resource fails to create. Does not cleanup after itself
 func (p K8sVMDIProvisioner) CreateResources(
@@ -190,6 +194,12 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors(
 		if dv.Status.RestartCount >= int32(p.RetryLimit) {
 			return fmt.Errorf("a datavolume has restarted more than the max for a sync")
 		}
+
+		for _, cond := range dv.Status.Conditions {
+			if strings.Contains(cond.Message, "404") || strings.Contains(strings.ToLower(cond.Message), "not found") {
+				return ErrMissingSourceArtifact
+			}
+		}
 	}
 
 	return nil

From 48b244cd864463c8c00c356940a2e58a6af0dd42 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Thu, 11 Dec 2025 15:36:35 -0700
Subject: [PATCH 03/23] refactor: rename fields to be more descriptive

---
 .../vm-disk-image/controller/controller.go    | 12 +++++-----
 .../vm-disk-image/service/orchestrator.go     | 24 +++++++++----------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go
index 5675549..4c70e6a 100644
--- a/internal/vm-disk-image/controller/controller.go
+++ b/internal/vm-disk-image/controller/controller.go
@@ -118,12 +118,12 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error {
 		RetryLimit:        config.RetryLimit,
 	}
 	orchestrator := vmdi.Orchestrator{
-		Client:       client,
-		Recorder:     mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
-		Provisioner:  vmdiProvisioner,
-		RetryLimit:   config.RetryLimit,
-		RetryBackoff: config.RetryBackoffDuration,
-		SyncLimit:    config.Concurrency,
+		Client:                  client,
+		Recorder:                mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
+		Provisioner:             vmdiProvisioner,
+		SyncRetryLimit:          config.RetryLimit,
+		SyncAttemptRetryBackoff: config.RetryBackoffDuration,
+		ConcurrentSyncLimit:     config.Concurrency,
 	}
 	reconciler := &VMDiskImageReconciler{
 		Scheme:                  mgr.GetScheme(),
diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go
index 4b8ddd5..1109913 100644
--- a/internal/vm-disk-image/service/orchestrator.go
+++ b/internal/vm-disk-image/service/orchestrator.go
@@ -32,11 +32,11 @@ type VMDiskImageOrchestrator interface {
 
 type Orchestrator struct {
 	client.Client
-	Recorder     record.EventRecorder
-	Provisioner  VMDiskImageProvisioner
-	RetryLimit   int
-	RetryBackoff time.Duration
-	SyncLimit    int
+	Recorder                record.EventRecorder
+	Provisioner             VMDiskImageProvisioner
+	SyncRetryLimit          int
+	SyncAttemptRetryBackoff time.Duration
+	ConcurrentSyncLimit     int
 }
 
 func (o Orchestrator) GetVMDiskImage(ctx context.Context, namespace types.NamespacedName, vmdi *crdv1.VMDiskImage) error {
@@ -106,9 +106,9 @@ func (o Orchestrator) AttemptSyncingOfResource(
 		logger.Error(err, "Failed to list syncing resources")
 		return ctrl.Result{}, err
 	}
-	if len(syncingList.Items) >= o.SyncLimit {
-		o.Recorder.Eventf(vmdi, "Normal", "WaitingToSync", "No more than %d VMDiskImages can be syncing at once. Waiting...", o.SyncLimit)
-		return ctrl.Result{RequeueAfter: o.RetryBackoff}, nil
+	if len(syncingList.Items) >= o.ConcurrentSyncLimit {
+		o.Recorder.Eventf(vmdi, "Normal", "WaitingToSync", "No more than %d VMDiskImages can be syncing at once. Waiting...", o.ConcurrentSyncLimit)
+		return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil
 	}
 
 	err = o.Provisioner.CreateResources(ctx, vmdi)
@@ -152,7 +152,7 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi
 	}
 	if !isDone {
 		logger.Info("Sync is not complete. Requeuing.")
-		return ctrl.Result{RequeueAfter: o.RetryBackoff}, nil
+		return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil
 	}
 
 	vmdi.Status.Phase = crdv1.PhaseReady
@@ -246,11 +246,11 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma
 	if err := o.Status().Update(ctx, vmdi); err != nil {
 		logger.Error(err, "Failed to update resource failure count")
 	}
-	if vmdi.Status.FailureCount < o.RetryLimit {
-		return ctrl.Result{RequeueAfter: o.RetryBackoff}, nil
+	if vmdi.Status.FailureCount < o.SyncRetryLimit {
+		return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil
 	}
 
-	o.Recorder.Eventf(vmdi, "Warning", "SyncExceededRetryCount", "The sync has failed beyond the set retry limit of %d", o.RetryLimit)
+	o.Recorder.Eventf(vmdi, "Warning", "SyncExceededRetryCount", "The sync has failed beyond the set retry limit of %d", o.SyncRetryLimit)
 	vmdi.Status.Phase = crdv1.PhaseFailed
 	vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error()
 	meta.SetStatusCondition(&vmdi.Status.Conditions, metav1.Condition{

From 3cab66b2446524dedcfb2c825cb2acd998a9af59 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Thu, 11 Dec 2025 15:48:19 -0700
Subject: [PATCH 04/23] feat: add new values to config to handle retrying
 totally failed syncs

---
 .../config/vmdi-controller-config-reader.go   | 44 ++++++++++++-------
 .../vm-disk-image/controller/controller.go    |  6 +--
 2 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
index 3c5cd49..a648ff3 100644
--- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go
+++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
@@ -6,38 +6,50 @@ import (
 )
 
 const (
-	defaultConcurrency     = 10 // TODO: We will need to tune this default
-	defaultRetryLimit      = 2
-	defaultBackoffDuration = 10 * time.Second
-	defaultMaxSyncDuration = 1 * time.Hour
+	defaultConcurrency                 = 10 // TODO: We will need to tune this default
+	defaultSyncRetryLimit              = 2
+	defaultSyncBackoffDuration         = 10 * time.Second
+	defaultMaxSyncDuration             = 1 * time.Hour
+	defaultResurrectionTimeout         = 8 * time.Hour
+	defaultResurrectionBackoffDuration = 30 * time.Minute
 )
 
 type VMDiskImageControllerConfig struct {
-	Concurrency          int
-	RetryLimit           int
-	RetryBackoffDuration time.Duration
-	MaxSyncDuration      time.Duration
+	Concurrency                 int
+	SyncRetryLimit              int
+	SyncRetryBackoffDuration    time.Duration
+	MaxSyncDuration             time.Duration
+	ResurrectionTimeout         time.Duration
+	ResurrectionBackoffDuration time.Duration
 }
 
 // This function will allow us to get the required config variables from the environment.
 // Locally this is your "env" and in production these values will come from a configmap
 func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig {
 	// The max amount of VMDIs we can have syncing at one time.
-	concurrency := corecfg.GetIntEnvOrDefault("CONCURRENCY", defaultConcurrency)
+	concurrency := corecfg.GetIntEnvOrDefault("VMDI_SYNC_CONCURRENCY", defaultConcurrency)
 
 	// How many times we will retry a failed sync.
-	retryLimit := corecfg.GetIntEnvOrDefault("RETRY_LIMIT", defaultRetryLimit)
+	retryLimit := corecfg.GetIntEnvOrDefault("SYNC_RETRY_LIMIT", defaultSyncRetryLimit)
 
-	// How long we want to wait before trying to resync a failed VMDI.
-	retryBackoffDuration := corecfg.GetDurationEnvOrDefault("RETRY_BACKOFF_DURATION", defaultBackoffDuration)
+	// How long we want to wait before trying another sync on a VMDI in SYNCING status if it fails.
+	retryBackoffDuration := corecfg.GetDurationEnvOrDefault("SYNC_RETRY_BACKOFF_DURATION", defaultSyncBackoffDuration)
 
 	// How long we will let a VMDI sit in syncing status.
 	maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_DURATION", defaultMaxSyncDuration)
 
+	// How long we will attempt to resend a VMDI through loop until it is permanently failed
+	resurrectionTimeout := corecfg.GetDurationEnvOrDefault("VMDI_RESURRECTION_TIMEOUT", defaultResurrectionTimeout)
+
+	// How long we will wait before attempting to send a VMDI through the loop if it enters a failed state
+	resurrectionBackoffDuration := corecfg.GetDurationEnvOrDefault("VMDI_RESURRECTION_BACKOFF_DURATION", defaultResurrectionBackoffDuration)
+
 	return VMDiskImageControllerConfig{
-		Concurrency:          concurrency,
-		RetryLimit:           retryLimit,
-		RetryBackoffDuration: retryBackoffDuration,
-		MaxSyncDuration:      maxSyncDuration,
+		Concurrency:                 concurrency,
+		SyncRetryLimit:              retryLimit,
+		SyncRetryBackoffDuration:    retryBackoffDuration,
+		MaxSyncDuration:             maxSyncDuration,
+		ResurrectionTimeout:         resurrectionTimeout,
+		ResurrectionBackoffDuration: resurrectionBackoffDuration,
 	}
 }
diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go
index 4c70e6a..ec9703d 100644
--- a/internal/vm-disk-image/controller/controller.go
+++ b/internal/vm-disk-image/controller/controller.go
@@ -115,14 +115,14 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error {
 		Client:            client,
 		ResourceGenerator: resourceGenerator,
 		MaxSyncDuration:   config.MaxSyncDuration,
-		RetryLimit:        config.RetryLimit,
+		RetryLimit:        config.SyncRetryLimit,
 	}
 	orchestrator := vmdi.Orchestrator{
 		Client:                  client,
 		Recorder:                mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
 		Provisioner:             vmdiProvisioner,
-		SyncRetryLimit:          config.RetryLimit,
-		SyncAttemptRetryBackoff: config.RetryBackoffDuration,
+		SyncRetryLimit:          config.SyncRetryLimit,
+		SyncAttemptRetryBackoff: config.SyncRetryBackoffDuration,
 		ConcurrentSyncLimit:     config.Concurrency,
 	}
 	reconciler := &VMDiskImageReconciler{

From 4b5d33d9df6691a53f8ad7dc5883b12d3ca42978 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Thu, 11 Dec 2025 19:30:04 -0700
Subject: [PATCH 05/23] feat: wired up

---
 .../vm-disk-image/controller/controller.go    | 18 +++++----
 .../vm-disk-image/service/orchestrator.go     | 40 ++++++++++++++++---
 2 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go
index ec9703d..7eac241 100644
--- a/internal/vm-disk-image/controller/controller.go
+++ b/internal/vm-disk-image/controller/controller.go
@@ -93,7 +93,9 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		return r.AttemptSyncingOfResource(ctx, &VMDiskImage)
 	case crdv1.PhaseSyncing:
 		return r.TransitonFromSyncing(ctx, &VMDiskImage)
-	case crdv1.PhaseReady, crdv1.PhaseFailed:
+	case crdv1.PhaseFailed:
+		return r.AttemptResurrectionOfResource(ctx, &VMDiskImage)
+	case crdv1.PhaseReady:
 		return ctrl.Result{}, nil
 	default:
 		logger.Error(nil, "Unknown phase detected", "Phase", currentPhase)
@@ -118,12 +120,14 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error {
 		RetryLimit:        config.SyncRetryLimit,
 	}
 	orchestrator := vmdi.Orchestrator{
-		Client:                  client,
-		Recorder:                mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
-		Provisioner:             vmdiProvisioner,
-		SyncRetryLimit:          config.SyncRetryLimit,
-		SyncAttemptRetryBackoff: config.SyncRetryBackoffDuration,
-		ConcurrentSyncLimit:     config.Concurrency,
+		Client:                      client,
+		Recorder:                    mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
+		Provisioner:                 vmdiProvisioner,
+		SyncRetryLimit:              config.SyncRetryLimit,
+		SyncAttemptRetryBackoff:     config.SyncRetryBackoffDuration,
+		ConcurrentSyncLimit:         config.Concurrency,
+		ResurrectionTimeout:         config.ResurrectionTimeout,
+		ResurrectionBackoffDuration: config.ResurrectionBackoffDuration,
 	}
 	reconciler := &VMDiskImageReconciler{
 		Scheme:                  mgr.GetScheme(),
diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go
index 1109913..bddce9f 100644
--- a/internal/vm-disk-image/service/orchestrator.go
+++ b/internal/vm-disk-image/service/orchestrator.go
@@ -2,6 +2,7 @@ package service
 
 import (
 	"context"
+	"errors"
 	crdv1 "pelotech/data-sync-operator/api/v1alpha1"
 	"time"
 
@@ -25,6 +26,7 @@ type VMDiskImageOrchestrator interface {
 	QueueResourceCreation(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
 	AttemptSyncingOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
 	TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
+	AttemptResurrectionOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
 	DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
 	HandleResourceUpdateError(ctx context.Context, ds *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error)
 	HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error)
@@ -32,11 +34,13 @@ type VMDiskImageOrchestrator interface {
 
 type Orchestrator struct {
 	client.Client
-	Recorder                record.EventRecorder
-	Provisioner             VMDiskImageProvisioner
-	SyncRetryLimit          int
-	SyncAttemptRetryBackoff time.Duration
-	ConcurrentSyncLimit     int
+	Recorder                    record.EventRecorder
+	Provisioner                 VMDiskImageProvisioner
+	SyncRetryLimit              int
+	SyncAttemptRetryBackoff     time.Duration
+	ConcurrentSyncLimit         int
+	ResurrectionTimeout         time.Duration
+	ResurrectionBackoffDuration time.Duration
 }
 
 func (o Orchestrator) GetVMDiskImage(ctx context.Context, namespace types.NamespacedName, vmdi *crdv1.VMDiskImage) error {
@@ -172,6 +176,26 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi
 	return ctrl.Result{}, nil
 }
 
+func (o Orchestrator) AttemptResurrectionOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) {
+	isRetryable :=
+		vmdi.Status.Phase == crdv1.PhaseFailed &&
+			time.Since(vmdi.Status.FirstFailureTimestamp.Time) > o.ResurrectionTimeout &&
+			vmdi.Status.MissingSourceArtifact
+
+	if !isRetryable {
+		return ctrl.Result{}, nil
+	}
+
+	vmdi.Status.MissingSourceArtifact = false
+	vmdi.Status.FailureCount = 0
+	vmdi.Status.Phase = ""
+	if err := o.Status().Update(ctx, vmdi); err != nil {
+		return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on resurrection")
+	}
+
+	return ctrl.Result{RequeueAfter: o.ResurrectionBackoffDuration}, nil
+}
+
 func (o Orchestrator) DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) {
 	logger := logf.FromContext(ctx)
 
@@ -250,6 +274,12 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma
 		return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil
 	}
 
+	if errors.Is(originalErr, ErrMissingSourceArtifact) {
+		vmdi.Status.MissingSourceArtifact = true
+	}
+
+	now := metav1.Now()
+	vmdi.Status.FirstFailureTimestamp = &now
 	o.Recorder.Eventf(vmdi, "Warning", "SyncExceededRetryCount", "The sync has failed beyond the set retry limit of %d", o.SyncRetryLimit)
 	vmdi.Status.Phase = crdv1.PhaseFailed
 	vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error()

From 8d84152091e47f66271f39eefc325d79f286cf29 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Thu, 11 Dec 2025 22:13:19 -0700
Subject: [PATCH 06/23] refactor: moved to time based failure

---
 api/v1alpha1/vmdiskimage_types.go             |  1 -
 .../config/vmdi-controller-config-reader.go   | 50 ++++++++-----------
 .../vm-disk-image/controller/controller.go    | 23 ++++-----
 .../vm-disk-image/service/orchestrator.go     | 50 ++++++++-----------
 internal/vm-disk-image/service/provisioner.go | 20 ++------
 5 files changed, 57 insertions(+), 87 deletions(-)

diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go
index 606a82f..981567d 100644
--- a/api/v1alpha1/vmdiskimage_types.go
+++ b/api/v1alpha1/vmdiskimage_types.go
@@ -105,7 +105,6 @@ type VMDiskImageStatus struct {
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 
 	FailureCount          int          `json:"failureCount,omitempty"`
-	MissingSourceArtifact bool         `json:"missingSourceArtifact,omitempty"`
 	FirstFailureTimestamp *metav1.Time `json:"firstFailureTimestamp,omitempty"`
 }
 
diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
index a648ff3..50cd2d6 100644
--- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go
+++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
@@ -6,21 +6,19 @@ import (
 )
 
 const (
-	defaultConcurrency                 = 10 // TODO: We will need to tune this default
-	defaultSyncRetryLimit              = 2
-	defaultSyncBackoffDuration         = 10 * time.Second
-	defaultMaxSyncDuration             = 1 * time.Hour
-	defaultResurrectionTimeout         = 8 * time.Hour
-	defaultResurrectionBackoffDuration = 30 * time.Minute
+	defaultConcurrency            = 10 // TODO: We will need to tune this default
+	defaultMaxBackoffDelay        = 1 * time.Hour
+	defaultMaxSyncDuration        = 12 * time.Hour
+	defaultMaxSyncAttemptRetry    = 3
+	defaultMaxSyncAttemptDuration = 1 * time.Hour
 )
 
 type VMDiskImageControllerConfig struct {
-	Concurrency                 int
-	SyncRetryLimit              int
-	SyncRetryBackoffDuration    time.Duration
-	MaxSyncDuration             time.Duration
-	ResurrectionTimeout         time.Duration
-	ResurrectionBackoffDuration time.Duration
+	Concurrency            int
+	MaxBackoffDelay        time.Duration
+	MaxSyncDuration        time.Duration
+	MaxSyncAttemptDuration time.Duration
+	MaxSyncAttemptRetry    int
 }
 
 // This function will allow us to get the required config variables from the environment.
@@ -29,27 +27,23 @@ func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig {
 	// The max amount of VMDIs we can have syncing at one time.
 	concurrency := corecfg.GetIntEnvOrDefault("VMDI_SYNC_CONCURRENCY", defaultConcurrency)
 
-	// How many times we will retry a failed sync.
-	retryLimit := corecfg.GetIntEnvOrDefault("SYNC_RETRY_LIMIT", defaultSyncRetryLimit)
+	// The longest we will ever wait to retry.
+	maxBackoffDelay := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxBackoffDelay)
 
-	// How long we want to wait before trying another sync on a VMDI in SYNCING status if it fails.
-	retryBackoffDuration := corecfg.GetDurationEnvOrDefault("SYNC_RETRY_BACKOFF_DURATION", defaultSyncBackoffDuration)
+	// How long we will try to run a sync before we fail it forever.
+	maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxSyncDuration)
 
 	// How long we will let a VMDI sit in syncing status.
-	maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_DURATION", defaultMaxSyncDuration)
+	maxAttemptDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptDuration)
 
-	// How long we will attempt to resend a VMDI through loop until it is permanently failed
-	resurrectionTimeout := corecfg.GetDurationEnvOrDefault("VMDI_RESURRECTION_TIMEOUT", defaultResurrectionTimeout)
-
-	// How long we will wait before attempting to send a VMDI through the loop if it enters a failed state
-	resurrectionBackoffDuration := corecfg.GetDurationEnvOrDefault("VMDI_RESURRECTION_BACKOFF_DURATION", defaultResurrectionBackoffDuration)
+	// How many times we will retry on a given attempt.
+	maxRetryPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptRetry)
 
 	return VMDiskImageControllerConfig{
-		Concurrency:                 concurrency,
-		SyncRetryLimit:              retryLimit,
-		SyncRetryBackoffDuration:    retryBackoffDuration,
-		MaxSyncDuration:             maxSyncDuration,
-		ResurrectionTimeout:         resurrectionTimeout,
-		ResurrectionBackoffDuration: resurrectionBackoffDuration,
+		Concurrency:            concurrency,
+		MaxBackoffDelay:        maxBackoffDelay,
+		MaxSyncAttemptDuration: maxAttemptDuration,
+		MaxSyncAttemptRetry:    maxRetryPerAttempt,
+		MaxSyncDuration:        maxSyncDuration,
 	}
 }
diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go
index 7eac241..00df396 100644
--- a/internal/vm-disk-image/controller/controller.go
+++ b/internal/vm-disk-image/controller/controller.go
@@ -94,7 +94,7 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 	case crdv1.PhaseSyncing:
 		return r.TransitonFromSyncing(ctx, &VMDiskImage)
 	case crdv1.PhaseFailed:
-		return r.AttemptResurrectionOfResource(ctx, &VMDiskImage)
+		return r.AttemptRetry(ctx, &VMDiskImage)
 	case crdv1.PhaseReady:
 		return ctrl.Result{}, nil
 	default:
@@ -114,20 +114,17 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error {
 
 	resourceGenerator := &vmdi.Generator{}
 	vmdiProvisioner := vmdi.K8sVMDIProvisioner{
-		Client:            client,
-		ResourceGenerator: resourceGenerator,
-		MaxSyncDuration:   config.MaxSyncDuration,
-		RetryLimit:        config.SyncRetryLimit,
+		Client:                 client,
+		ResourceGenerator:      resourceGenerator,
+		MaxSyncAttemptDuration: config.MaxSyncAttemptDuration,
+		MaxRetryPerAttempt:     config.MaxSyncAttemptRetry,
 	}
 	orchestrator := vmdi.Orchestrator{
-		Client:                      client,
-		Recorder:                    mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
-		Provisioner:                 vmdiProvisioner,
-		SyncRetryLimit:              config.SyncRetryLimit,
-		SyncAttemptRetryBackoff:     config.SyncRetryBackoffDuration,
-		ConcurrentSyncLimit:         config.Concurrency,
-		ResurrectionTimeout:         config.ResurrectionTimeout,
-		ResurrectionBackoffDuration: config.ResurrectionBackoffDuration,
+		Client:                client,
+		Recorder:              mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
+		Provisioner:           vmdiProvisioner,
+		MaxSyncAttemptBackoff: config.MaxBackoffDelay,
+		ConcurrentSyncLimit:   config.Concurrency,
 	}
 	reconciler := &VMDiskImageReconciler{
 		Scheme:                  mgr.GetScheme(),
diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go
index bddce9f..7ad30dc 100644
--- a/internal/vm-disk-image/service/orchestrator.go
+++ b/internal/vm-disk-image/service/orchestrator.go
@@ -2,7 +2,7 @@ package service
 
 import (
 	"context"
-	"errors"
+	"math"
 	crdv1 "pelotech/data-sync-operator/api/v1alpha1"
 	"time"
 
@@ -26,7 +26,7 @@ type VMDiskImageOrchestrator interface {
 	QueueResourceCreation(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
 	AttemptSyncingOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
 	TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
-	AttemptResurrectionOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
+	AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
 	DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
 	HandleResourceUpdateError(ctx context.Context, ds *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error)
 	HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error)
@@ -34,13 +34,11 @@ type VMDiskImageOrchestrator interface {
 
 type Orchestrator struct {
 	client.Client
-	Recorder                    record.EventRecorder
-	Provisioner                 VMDiskImageProvisioner
-	SyncRetryLimit              int
-	SyncAttemptRetryBackoff     time.Duration
-	ConcurrentSyncLimit         int
-	ResurrectionTimeout         time.Duration
-	ResurrectionBackoffDuration time.Duration
+	Recorder              record.EventRecorder
+	Provisioner           VMDiskImageProvisioner
+	MaxSyncAttemptBackoff time.Duration
+	MaxSyncTime           time.Duration
+	ConcurrentSyncLimit   int
 }
 
 func (o Orchestrator) GetVMDiskImage(ctx context.Context, namespace types.NamespacedName, vmdi *crdv1.VMDiskImage) error {
@@ -112,7 +110,7 @@ func (o Orchestrator) AttemptSyncingOfResource(
 	}
 	if len(syncingList.Items) >= o.ConcurrentSyncLimit {
 		o.Recorder.Eventf(vmdi, "Normal", "WaitingToSync", "No more than %d VMDiskImages can be syncing at once. Waiting...", o.ConcurrentSyncLimit)
-		return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil
+		return ctrl.Result{RequeueAfter: o.MaxSyncAttemptBackoff}, nil
 	}
 
 	err = o.Provisioner.CreateResources(ctx, vmdi)
@@ -156,7 +154,7 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi
 	}
 	if !isDone {
 		logger.Info("Sync is not complete. Requeuing.")
-		return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil
+		return ctrl.Result{RequeueAfter: o.MaxSyncAttemptBackoff}, nil
 	}
 
 	vmdi.Status.Phase = crdv1.PhaseReady
@@ -176,24 +174,26 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi
 	return ctrl.Result{}, nil
 }
 
-func (o Orchestrator) AttemptResurrectionOfResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) {
-	isRetryable :=
-		vmdi.Status.Phase == crdv1.PhaseFailed &&
-			time.Since(vmdi.Status.FirstFailureTimestamp.Time) > o.ResurrectionTimeout &&
-			vmdi.Status.MissingSourceArtifact
+func (o Orchestrator) AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) {
+	syncDeadline := metav1.NewTime(vmdi.CreationTimestamp.Add(o.MaxSyncTime))
 
-	if !isRetryable {
+	// leave it in a failed state
+	if syncDeadline.After(time.Now()) {
 		return ctrl.Result{}, nil
 	}
 
-	vmdi.Status.MissingSourceArtifact = false
-	vmdi.Status.FailureCount = 0
 	vmdi.Status.Phase = ""
 	if err := o.Status().Update(ctx, vmdi); err != nil {
 		return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on resurrection")
 	}
 
-	return ctrl.Result{RequeueAfter: o.ResurrectionBackoffDuration}, nil
+	// Expontential retry
+	var backoffInterval time.Duration
+	nextBackoffMinutes := int(math.Floor(math.Pow(3, float64(vmdi.Status.FailureCount))))
+	nextBackoffDuration := time.Duration(nextBackoffMinutes) * time.Minute
+	backoffInterval = min(nextBackoffDuration, o.MaxSyncAttemptBackoff)
+
+	return ctrl.Result{RequeueAfter: backoffInterval}, nil
 }
 
 func (o Orchestrator) DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) {
@@ -270,17 +270,7 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma
 	if err := o.Status().Update(ctx, vmdi); err != nil {
 		logger.Error(err, "Failed to update resource failure count")
 	}
-	if vmdi.Status.FailureCount < o.SyncRetryLimit {
-		return ctrl.Result{RequeueAfter: o.SyncAttemptRetryBackoff}, nil
-	}
-
-	if errors.Is(originalErr, ErrMissingSourceArtifact) {
-		vmdi.Status.MissingSourceArtifact = true
-	}
 
-	now := metav1.Now()
-	vmdi.Status.FirstFailureTimestamp = &now
-	o.Recorder.Eventf(vmdi, "Warning", "SyncExceededRetryCount", "The sync has failed beyond the set retry limit of %d", o.SyncRetryLimit)
 	vmdi.Status.Phase = crdv1.PhaseFailed
 	vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error()
 	meta.SetStatusCondition(&vmdi.Status.Conditions, metav1.Condition{
diff --git a/internal/vm-disk-image/service/provisioner.go b/internal/vm-disk-image/service/provisioner.go
index 667844c..c08e24d 100644
--- a/internal/vm-disk-image/service/provisioner.go
+++ b/internal/vm-disk-image/service/provisioner.go
@@ -2,10 +2,8 @@ package service
 
 import (
 	"context"
-	"errors"
 	"fmt"
 	crdv1 "pelotech/data-sync-operator/api/v1alpha1"
-	"strings"
 	"time"
 
 	snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
@@ -26,15 +24,13 @@ type VMDiskImageProvisioner interface {
 
 type K8sVMDIProvisioner struct {
 	client.Client
-	ResourceGenerator VMDIResourceGenerator
-	MaxSyncDuration   time.Duration
-	RetryLimit        int
+	ResourceGenerator      VMDIResourceGenerator
+	MaxSyncAttemptDuration time.Duration
+	MaxRetryPerAttempt     int
 }
 
 const dataVolumeDonePhase = "Succeeded"
 
-var ErrMissingSourceArtifact = errors.New("the requested artifact does not exist")
-
 // Create resources for a given VMDiskImage. Stops creating them if
 // a single resource fails to create. Does not cleanup after itself
 func (p K8sVMDIProvisioner) CreateResources(
@@ -176,7 +172,7 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors(
 		// Normal calculation
 		timeSyncing = now.Sub(syncStartTime)
 	}
-	if timeSyncing > p.MaxSyncDuration {
+	if timeSyncing > p.MaxSyncAttemptDuration {
 		return fmt.Errorf("the VMDiskImage %s has been syncing longer than the allowed sync time", vmdi.Name)
 	}
 
@@ -191,15 +187,9 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors(
 	}
 
 	for _, dv := range dataVolumeList.Items {
-		if dv.Status.RestartCount >= int32(p.RetryLimit) {
+		if dv.Status.RestartCount >= int32(p.MaxRetryPerAttempt) {
 			return fmt.Errorf("a datavolume has restarted more than the max for a sync")
 		}
-
-		for _, cond := range dv.Status.Conditions {
-			if strings.Contains(cond.Message, "404") || strings.Contains(strings.ToLower(cond.Message), "not found") {
-				return ErrMissingSourceArtifact
-			}
-		}
 	}
 
 	return nil

From 0e36d0029cad218283c19878089e77ec978c9410 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Thu, 11 Dec 2025 22:17:29 -0700
Subject: [PATCH 07/23] refactor: let library check on sync

---
 internal/vm-disk-image/service/orchestrator.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go
index 7ad30dc..c29dbe9 100644
--- a/internal/vm-disk-image/service/orchestrator.go
+++ b/internal/vm-disk-image/service/orchestrator.go
@@ -154,7 +154,7 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi
 	}
 	if !isDone {
 		logger.Info("Sync is not complete. Requeuing.")
-		return ctrl.Result{RequeueAfter: o.MaxSyncAttemptBackoff}, nil
+		return ctrl.Result{}, nil
 	}
 
 	vmdi.Status.Phase = crdv1.PhaseReady

From a40b2c411110e017fcfaa807ea51b0dda7045a54 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Thu, 11 Dec 2025 22:19:32 -0700
Subject: [PATCH 08/23] refactor: remove other unneeded fields

---
 api/v1alpha1/vmdiskimage_types.go                  | 3 ---
 api/v1alpha1/zz_generated.deepcopy.go              | 4 ----
 config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml | 5 -----
 3 files changed, 12 deletions(-)

diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go
index 981567d..87c447e 100644
--- a/api/v1alpha1/vmdiskimage_types.go
+++ b/api/v1alpha1/vmdiskimage_types.go
@@ -34,7 +34,6 @@ const (
 
 // Condition Reasons
 const (
-	MissingSourceArtifact        string = "MissingSourceArtifact"
 	ReasonResourceCreationFailed string = "ResourceCreationFailed"
 	ReasonResouceUpdateFailed    string = "ResourceUpdateFailed"
 	ReasonQueued                 string = "Queued"
@@ -105,14 +104,12 @@ type VMDiskImageStatus struct {
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 
 	FailureCount          int          `json:"failureCount,omitempty"`
-	FirstFailureTimestamp *metav1.Time `json:"firstFailureTimestamp,omitempty"`
 }
 
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
 // +kubebuilder:resource:path=vmdiskimages,scope=Namespaced,shortName=vmdi,singular=vmdiskimage
 // +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase",description="The current phase of the VMDiskImage."
-// +kubebuilder:printcolumn:name="Missing Source Artifact",type="boolean",JSONPath=".status.missingSourceArtifact",description="If the referenced artifact can be found at the given URL."
 // +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
 type VMDiskImage struct {
 	metav1.TypeMeta   `json:",inline"`
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index 7e05fb3..93ddcca 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -124,10 +124,6 @@ func (in *VMDiskImageStatus) DeepCopyInto(out *VMDiskImageStatus) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
-	if in.FirstFailureTimestamp != nil {
-		in, out := &in.FirstFailureTimestamp, &out.FirstFailureTimestamp
-		*out = (*in).DeepCopy()
-	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VMDiskImageStatus.
diff --git a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
index ac31962..0025a0c 100644
--- a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
+++ b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
@@ -144,15 +144,10 @@ spec:
                 type: array
               failureCount:
                 type: integer
-              firstFailureTimestamp:
-                format: date-time
-                type: string
               message:
                 description: A human-readable message providing more details about
                   the current phase.
                 type: string
-              missingSourceArtifact:
-                type: boolean
               phase:
                 enum:
                 - Queued

From 2477554842f009afa8144cb20ba49a52ef61e15c Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Thu, 11 Dec 2025 22:20:16 -0700
Subject: [PATCH 09/23] refactor: formatting

---
 api/v1alpha1/vmdiskimage_types.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go
index 87c447e..51c82aa 100644
--- a/api/v1alpha1/vmdiskimage_types.go
+++ b/api/v1alpha1/vmdiskimage_types.go
@@ -103,7 +103,7 @@ type VMDiskImageStatus struct {
 	// Conditions of the VMDiskImage resource.
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 
-	FailureCount          int          `json:"failureCount,omitempty"`
+	FailureCount int `json:"failureCount,omitempty"`
 }
 
 // +kubebuilder:object:root=true

From 19ac0229a6365eaa543fd6d181a7cea55d054235 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Thu, 11 Dec 2025 22:21:04 -0700
Subject: [PATCH 10/23] chore: generate manifests again

---
 config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
index 0025a0c..2188d92 100644
--- a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
+++ b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
@@ -21,10 +21,6 @@ spec:
       jsonPath: .status.phase
       name: Phase
       type: string
-    - description: If the referenced artifact can be found at the given URL.
-      jsonPath: .status.missingSourceArtifact
-      name: Missing Source Artifact
-      type: boolean
     - jsonPath: .metadata.creationTimestamp
       name: Age
       type: date

From 2fbcbdeb0751e05d806f9dcdfe533f4a69d2240e Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Thu, 11 Dec 2025 22:22:47 -0700
Subject: [PATCH 11/23] fix: messy names and duplicate env var targeting

---
 .../vm-disk-image/config/vmdi-controller-config-reader.go   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
index 50cd2d6..99ef396 100644
--- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go
+++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
@@ -9,7 +9,7 @@ const (
 	defaultConcurrency            = 10 // TODO: We will need to tune this default
 	defaultMaxBackoffDelay        = 1 * time.Hour
 	defaultMaxSyncDuration        = 12 * time.Hour
-	defaultMaxSyncAttemptRetry    = 3
+	defaultMaxSyncAttemptRetries  = 3
 	defaultMaxSyncAttemptDuration = 1 * time.Hour
 )
 
@@ -31,13 +31,13 @@ func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig {
 	maxBackoffDelay := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxBackoffDelay)
 
 	// How long we will try to run a sync before we fail it forever.
-	maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxSyncDuration)
+	maxSyncDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_DURATION", defaultMaxSyncDuration)
 
 	// How long we will let a VMDI sit in syncing status.
 	maxAttemptDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptDuration)
 
 	// How many times we will retry on a given attempt.
-	maxRetryPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptRetry)
+	maxRetryPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries)
 
 	return VMDiskImageControllerConfig{
 		Concurrency:            concurrency,

From b11ce83291f986b0c1db749914f64e02b2c3f294 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Fri, 12 Dec 2025 09:07:18 -0700
Subject: [PATCH 12/23] feat: added more condition reasons

---
 api/v1alpha1/vmdiskimage_types.go             | 15 +++++++-----
 .../vm-disk-image/service/orchestrator.go     | 24 +++++++++++++------
 internal/vm-disk-image/service/provisioner.go | 18 ++++++++++++--
 3 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go
index 51c82aa..72a7342 100644
--- a/api/v1alpha1/vmdiskimage_types.go
+++ b/api/v1alpha1/vmdiskimage_types.go
@@ -34,12 +34,15 @@ const (
 
 // Condition Reasons
 const (
-	ReasonResourceCreationFailed string = "ResourceCreationFailed"
-	ReasonResouceUpdateFailed    string = "ResourceUpdateFailed"
-	ReasonQueued                 string = "Queued"
-	ReasonSyncing                string = "Syncing"
-	ReasonRetryLimitExceeded     string = "RetryLimitExceeded"
-	ReasonSynced                 string = "Synced"
+	ReasonResourceCreationFailed      string = "ResourceCreationFailed"
+	ReasonResouceUpdateFailed         string = "ResourceUpdateFailed"
+	ReasonQueued                      string = "Queued"
+	ReasonSyncing                     string = "Syncing"
+	ReasonRetryLimitExceeded          string = "RetryLimitExceeded"
+	ReasonMissingSourceArtifact       string = "MissingSourceArtifact"
+	ReasonSyncAttemptDurationExceeded string = "SyncAttemptDurationExceeded"
+	ReasonUnknownSyncFailure          string = "UnknownSyncFailure"
+	ReasonSynced                      string = "Synced"
 )
 
 // CRD phases
diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go
index c29dbe9..e50400d 100644
--- a/internal/vm-disk-image/service/orchestrator.go
+++ b/internal/vm-disk-image/service/orchestrator.go
@@ -2,11 +2,13 @@ package service
 
 import (
 	"context"
+	"errors"
 	"math"
 	crdv1 "pelotech/data-sync-operator/api/v1alpha1"
 	"time"
 
 	types "k8s.io/apimachinery/pkg/types"
+	"k8s.io/utils/ptr"
 	crutils "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -176,15 +178,16 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi
 
 func (o Orchestrator) AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) {
 	syncDeadline := metav1.NewTime(vmdi.CreationTimestamp.Add(o.MaxSyncTime))
+	exceededSyncDeadline := syncDeadline.Before(ptr.To(metav1.Now()))
 
 	// leave it in a failed state
-	if syncDeadline.After(time.Now()) {
+	if exceededSyncDeadline {
 		return ctrl.Result{}, nil
 	}
 
 	vmdi.Status.Phase = ""
 	if err := o.Status().Update(ctx, vmdi); err != nil {
-		return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on resurrection")
+		return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on retry")
 	}
 
 	// Expontential retry
@@ -267,16 +270,23 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma
 	o.Recorder.Eventf(vmdi, "Warning", "SyncErrorOccurred", originalErr.Error())
 
 	vmdi.Status.FailureCount += 1
-	if err := o.Status().Update(ctx, vmdi); err != nil {
-		logger.Error(err, "Failed to update resource failure count")
-	}
-
 	vmdi.Status.Phase = crdv1.PhaseFailed
 	vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error()
+
+	reason := crdv1.ReasonUnknownSyncFailure
+	switch {
+	case errors.Is(originalErr, ErrSyncAttemptExceedsMaxDuration):
+		reason = crdv1.ReasonSyncAttemptDurationExceeded
+	case errors.Is(originalErr, ErrSyncAttemptExceedsRetries):
+		reason = crdv1.ReasonRetryLimitExceeded
+	case errors.Is(originalErr, ErrMissingSourceArtifact):
+		reason = crdv1.ReasonMissingSourceArtifact
+	}
+
 	meta.SetStatusCondition(&vmdi.Status.Conditions, metav1.Condition{
 		Type:    crdv1.ConditionTypeReady,
 		Status:  metav1.ConditionFalse,
-		Reason:  crdv1.ReasonRetryLimitExceeded,
+		Reason:  reason,
 		Message: originalErr.Error(),
 	})
 
diff --git a/internal/vm-disk-image/service/provisioner.go b/internal/vm-disk-image/service/provisioner.go
index c08e24d..cd8581b 100644
--- a/internal/vm-disk-image/service/provisioner.go
+++ b/internal/vm-disk-image/service/provisioner.go
@@ -2,8 +2,10 @@ package service
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	crdv1 "pelotech/data-sync-operator/api/v1alpha1"
+	"strings"
 	"time"
 
 	snapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
@@ -31,6 +33,10 @@ type K8sVMDIProvisioner struct {
 
 const dataVolumeDonePhase = "Succeeded"
 
+var ErrMissingSourceArtifact = errors.New("the requested artifact does not exist")
+var ErrSyncAttemptExceedsRetries = errors.New("the sync attempt has failed beyond the retry limit")
+var ErrSyncAttemptExceedsMaxDuration = errors.New("the sync attempt has lasted beyond its max duration")
+
 // Create resources for a given VMDiskImage. Stops creating them if
 // a single resource fails to create. Does not cleanup after itself
 func (p K8sVMDIProvisioner) CreateResources(
@@ -173,7 +179,7 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors(
 		timeSyncing = now.Sub(syncStartTime)
 	}
 	if timeSyncing > p.MaxSyncAttemptDuration {
-		return fmt.Errorf("the VMDiskImage %s has been syncing longer than the allowed sync time", vmdi.Name)
+		return ErrSyncAttemptExceedsMaxDuration
 	}
 
 	searchLabels := getLabelsToMatch(vmdi)
@@ -187,9 +193,17 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors(
 	}
 
 	for _, dv := range dataVolumeList.Items {
+		for _, cond := range dv.Status.Conditions {
+			missingSourceArtifact := strings.Contains(cond.Message, "404") || strings.Contains(strings.ToLower(cond.Message), "not found")
+			if missingSourceArtifact {
+				return ErrMissingSourceArtifact
+			}
+		}
+
 		if dv.Status.RestartCount >= int32(p.MaxRetryPerAttempt) {
-			return fmt.Errorf("a datavolume has restarted more than the max for a sync")
+			return ErrSyncAttemptExceedsRetries
 		}
+
 	}
 
 	return nil

From e884a61ce5aea3d8b50b314c281f9515600611ba Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Fri, 12 Dec 2025 09:19:01 -0700
Subject: [PATCH 13/23] feat: added new phase for visibility into retry loop

---
 api/v1alpha1/vmdiskimage_types.go                  | 11 ++++++-----
 config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go
index 72a7342..da0d571 100644
--- a/api/v1alpha1/vmdiskimage_types.go
+++ b/api/v1alpha1/vmdiskimage_types.go
@@ -47,10 +47,11 @@ const (
 
 // CRD phases
 const (
-	PhaseQueued  string = "Queued"
-	PhaseSyncing string = "Syncing"
-	PhaseReady   string = "Ready"
-	PhaseFailed  string = "Failed"
+	PhaseQueued           string = "Queued"
+	PhaseSyncing          string = "Syncing"
+	PhaseReady            string = "Ready"
+	PhaseRetryableFailure string = "RetryableFailure"
+	PhaseFailed           string = "Failed"
 )
 
 // VMDiskImage Labels
@@ -97,7 +98,7 @@ type VMDiskImageStatus struct {
 	// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
 	// Important: Run "make" to regenerate code after modifying this file
 
-	// +kubebuilder:validation:Enum=Queued;Syncing;Ready;Failed
+	// +kubebuilder:validation:Enum=Queued;Syncing;Ready;Failed;RetryableFailure
 	Phase string `json:"phase"`
 
 	// A human-readable message providing more details about the current phase.
diff --git a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
index 2188d92..242985e 100644
--- a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
+++ b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
@@ -150,6 +150,7 @@ spec:
                 - Syncing
                 - Ready
                 - Failed
+                - RetryableFailure
                 type: string
             required:
             - phase

From 25c789a7b8ece47880c6c2bcdf9fc6a5611420e9 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Fri, 12 Dec 2025 09:19:31 -0700
Subject: [PATCH 14/23] feat: handling new phase in controller and service code

---
 internal/vm-disk-image/controller/controller.go |  4 ++--
 internal/vm-disk-image/service/orchestrator.go  | 17 ++++++++++-------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go
index 00df396..e18fb07 100644
--- a/internal/vm-disk-image/controller/controller.go
+++ b/internal/vm-disk-image/controller/controller.go
@@ -93,9 +93,9 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		return r.AttemptSyncingOfResource(ctx, &VMDiskImage)
 	case crdv1.PhaseSyncing:
 		return r.TransitonFromSyncing(ctx, &VMDiskImage)
-	case crdv1.PhaseFailed:
+	case crdv1.PhaseRetryableFailure:
 		return r.AttemptRetry(ctx, &VMDiskImage)
-	case crdv1.PhaseReady:
+	case crdv1.PhaseReady, crdv1.PhaseFailed:
 		return ctrl.Result{}, nil
 	default:
 		logger.Error(nil, "Unknown phase detected", "Phase", currentPhase)
diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go
index e50400d..289c13c 100644
--- a/internal/vm-disk-image/service/orchestrator.go
+++ b/internal/vm-disk-image/service/orchestrator.go
@@ -180,17 +180,20 @@ func (o Orchestrator) AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage)
 	syncDeadline := metav1.NewTime(vmdi.CreationTimestamp.Add(o.MaxSyncTime))
 	exceededSyncDeadline := syncDeadline.Before(ptr.To(metav1.Now()))
 
-	// leave it in a failed state
+	// Fail forever if we're past the deadline
 	if exceededSyncDeadline {
-		return ctrl.Result{}, nil
+		vmdi.Status.Phase = crdv1.PhaseFailed
+		vmdi.Status.Message = "Exceeded overall sync retry window. Failed Permanently"
+	} else {
+		vmdi.Status.Phase = ""
+		vmdi.Status.Message = ""
 	}
 
-	vmdi.Status.Phase = ""
 	if err := o.Status().Update(ctx, vmdi); err != nil {
 		return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on retry")
 	}
 
-	// Expontential retry
+	// Exponential retry
 	var backoffInterval time.Duration
 	nextBackoffMinutes := int(math.Floor(math.Pow(3, float64(vmdi.Status.FailureCount))))
 	nextBackoffDuration := time.Duration(nextBackoffMinutes) * time.Minute
@@ -220,7 +223,7 @@ func (o Orchestrator) HandleResourceUpdateError(
 	logger.Error(originalErr, message)
 
 	// Mark the resource as Failed
-	vmdi.Status.Phase = crdv1.PhaseFailed
+	vmdi.Status.Phase = crdv1.PhaseRetryableFailure
 	vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error()
 	meta.SetStatusCondition(&vmdi.Status.Conditions, metav1.Condition{
 		Type:    crdv1.ConditionTypeReady,
@@ -242,7 +245,7 @@ func (o Orchestrator) HandleResourceCreationError(ctx context.Context, vmdi *crd
 	logger.Error(originalErr, "Failed to create a resource.")
 
 	o.Recorder.Eventf(vmdi, "Warning", "ResourceCreationFailed", "Failed to create resources.")
-	vmdi.Status.Phase = crdv1.PhaseFailed
+	vmdi.Status.Phase = crdv1.PhaseRetryableFailure
 	vmdi.Status.Message = "Failed while creating resources: " + originalErr.Error()
 	meta.SetStatusCondition(&vmdi.Status.Conditions, metav1.Condition{
 		Type:    crdv1.ConditionTypeReady,
@@ -270,7 +273,7 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma
 	o.Recorder.Eventf(vmdi, "Warning", "SyncErrorOccurred", originalErr.Error())
 
 	vmdi.Status.FailureCount += 1
-	vmdi.Status.Phase = crdv1.PhaseFailed
+	vmdi.Status.Phase = crdv1.PhaseRetryableFailure
 	vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error()
 
 	reason := crdv1.ReasonUnknownSyncFailure

From d0dd06201d0bdd53fdee9e167d726f2b8d05402a Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Fri, 12 Dec 2025 09:20:21 -0700
Subject: [PATCH 15/23] chore: regenerate chart to include new phase

---
 .../templates/crd/vmdiskimages.crd.pelotech.ot.yaml              | 1 +
 dist/install.yaml                                                | 1 +
 2 files changed, 2 insertions(+)

diff --git a/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml b/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml
index c86c8d0..ec110ee 100644
--- a/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml
+++ b/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml
@@ -146,6 +146,7 @@ spec:
                                     - Syncing
                                     - Ready
                                     - Failed
+                                    - RetryableFailure
                                 type: string
                         required:
                             - phase
diff --git a/dist/install.yaml b/dist/install.yaml
index e43aff7..63fccb0 100644
--- a/dist/install.yaml
+++ b/dist/install.yaml
@@ -158,6 +158,7 @@ spec:
                 - Syncing
                 - Ready
                 - Failed
+                - RetryableFailure
                 type: string
             required:
             - phase

From f43bbb0d6cfcf153981194c95032e786696e9ccd Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Fri, 12 Dec 2025 10:02:11 -0700
Subject: [PATCH 16/23] fix: wait 10 seconds before checking sync status again

---
 internal/vm-disk-image/service/orchestrator.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go
index 289c13c..c3020b2 100644
--- a/internal/vm-disk-image/service/orchestrator.go
+++ b/internal/vm-disk-image/service/orchestrator.go
@@ -156,7 +156,7 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi
 	}
 	if !isDone {
 		logger.Info("Sync is not complete. Requeuing.")
-		return ctrl.Result{}, nil
+		return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
 	}
 
 	vmdi.Status.Phase = crdv1.PhaseReady

From 2ef59b3b13f6966a585cb9f939a04fd761873642 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Fri, 12 Dec 2025 13:50:20 -0700
Subject: [PATCH 17/23] chore: regenerate resources after adding
 LastFailureTime to spec

---
 api/v1alpha1/vmdiskimage_types.go                  | 3 ++-
 api/v1alpha1/zz_generated.deepcopy.go              | 4 ++++
 config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml | 3 +++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/api/v1alpha1/vmdiskimage_types.go b/api/v1alpha1/vmdiskimage_types.go
index da0d571..c997f1a 100644
--- a/api/v1alpha1/vmdiskimage_types.go
+++ b/api/v1alpha1/vmdiskimage_types.go
@@ -107,7 +107,8 @@ type VMDiskImageStatus struct {
 	// Conditions of the VMDiskImage resource.
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 
-	FailureCount int `json:"failureCount,omitempty"`
+	FailureCount    int          `json:"failureCount,omitempty"`
+	LastFailureTime *metav1.Time `json:"lastFailureTime,omitempty"`
 }
 
 // +kubebuilder:object:root=true
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index 93ddcca..db29f87 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -124,6 +124,10 @@ func (in *VMDiskImageStatus) DeepCopyInto(out *VMDiskImageStatus) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
+	if in.LastFailureTime != nil {
+		in, out := &in.LastFailureTime, &out.LastFailureTime
+		*out = (*in).DeepCopy()
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VMDiskImageStatus.
diff --git a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
index 242985e..8d60673 100644
--- a/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
+++ b/config/crd/bases/crd.pelotech.ot_vmdiskimages.yaml
@@ -140,6 +140,9 @@ spec:
                 type: array
               failureCount:
                 type: integer
+              lastFailureTime:
+                format: date-time
+                type: string
               message:
                 description: A human-readable message providing more details about
                   the current phase.

From c9f8e6bb63cec0f7a946ae5cc27cce6fc1ce7412 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Fri, 12 Dec 2025 13:54:01 -0700
Subject: [PATCH 18/23] fix: resolve finalizer conflict and now respecting
 backoff behavior

---
 .../vm-disk-image/controller/controller.go    | 17 +++---
 .../vm-disk-image/service/orchestrator.go     | 55 +++++++++++--------
 2 files changed, 39 insertions(+), 33 deletions(-)

diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go
index e18fb07..cb9ae07 100644
--- a/internal/vm-disk-image/controller/controller.go
+++ b/internal/vm-disk-image/controller/controller.go
@@ -77,11 +77,7 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 
 	resourceHasFinalizer := !crutils.ContainsFinalizer(&VMDiskImage, crdv1.VMDiskImageFinalizer)
 	if resourceHasFinalizer {
-		err := r.AddControllerFinalizer(ctx, &VMDiskImage)
-		if err != nil {
-			return r.HandleResourceUpdateError(ctx, &VMDiskImage, err, "Failed to add finalizer to our resource")
-		}
-
+		return r.AddControllerFinalizer(ctx, &VMDiskImage)
 	}
 
 	currentPhase := VMDiskImage.Status.Phase
@@ -120,11 +116,12 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error {
 		MaxRetryPerAttempt:     config.MaxSyncAttemptRetry,
 	}
 	orchestrator := vmdi.Orchestrator{
-		Client:                client,
-		Recorder:              mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
-		Provisioner:           vmdiProvisioner,
-		MaxSyncAttemptBackoff: config.MaxBackoffDelay,
-		ConcurrentSyncLimit:   config.Concurrency,
+		Client:              client,
+		Recorder:            mgr.GetEventRecorderFor(crdv1.VMDiskImageControllerName),
+		Provisioner:         vmdiProvisioner,
+		MaxRetryBackoff:     config.MaxBackoffDelay,
+		MaxSyncTime:         config.MaxSyncDuration,
+		ConcurrentSyncLimit: config.Concurrency,
 	}
 	reconciler := &VMDiskImageReconciler{
 		Scheme:                  mgr.GetScheme(),
diff --git a/internal/vm-disk-image/service/orchestrator.go b/internal/vm-disk-image/service/orchestrator.go
index c3020b2..87a8116 100644
--- a/internal/vm-disk-image/service/orchestrator.go
+++ b/internal/vm-disk-image/service/orchestrator.go
@@ -22,7 +22,7 @@ import (
 
 type VMDiskImageOrchestrator interface {
 	GetVMDiskImage(ctx context.Context, namespace types.NamespacedName, vmdi *crdv1.VMDiskImage) error
-	AddControllerFinalizer(ctx context.Context, vmdi *crdv1.VMDiskImage) error
+	AddControllerFinalizer(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
 	IndexVMDiskImageByPhase(rawObj client.Object) []string
 	ListVMDiskImagesByPhase(ctx context.Context, phase string) (*crdv1.VMDiskImageList, error)
 	QueueResourceCreation(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
@@ -30,27 +30,30 @@ type VMDiskImageOrchestrator interface {
 	TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
 	AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
 	DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error)
-	HandleResourceUpdateError(ctx context.Context, ds *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error)
-	HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskImage, originalErr error, message string) (ctrl.Result, error)
 }
 
 type Orchestrator struct {
 	client.Client
-	Recorder              record.EventRecorder
-	Provisioner           VMDiskImageProvisioner
-	MaxSyncAttemptBackoff time.Duration
-	MaxSyncTime           time.Duration
-	ConcurrentSyncLimit   int
+	Recorder            record.EventRecorder
+	Provisioner         VMDiskImageProvisioner
+	MaxRetryBackoff     time.Duration
+	MaxSyncTime         time.Duration
+	ConcurrentSyncLimit int
 }
 
 func (o Orchestrator) GetVMDiskImage(ctx context.Context, namespace types.NamespacedName, vmdi *crdv1.VMDiskImage) error {
 	return o.Get(ctx, namespace, vmdi)
 }
 
-func (o Orchestrator) AddControllerFinalizer(ctx context.Context, vmdi *crdv1.VMDiskImage) error {
+func (o Orchestrator) AddControllerFinalizer(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) {
 	crutils.AddFinalizer(vmdi, crdv1.VMDiskImageFinalizer)
 
-	return o.Update(ctx, vmdi)
+	err := o.Update(ctx, vmdi)
+	if err != nil {
+		return o.HandleResourceUpdateError(ctx, vmdi, err, "Failed to add finalizer to our resource")
+	}
+
+	return ctrl.Result{}, nil
 }
 
 func (o Orchestrator) ListVMDiskImagesByPhase(ctx context.Context, phase string) (*crdv1.VMDiskImageList, error) {
@@ -96,7 +99,7 @@ func (o Orchestrator) QueueResourceCreation(ctx context.Context, vmdi *crdv1.VMD
 
 	o.Recorder.Eventf(vmdi, "Normal", "Queued", "Resource successfully queued for sync orchestration")
 
-	return ctrl.Result{Requeue: true}, nil
+	return ctrl.Result{}, nil
 }
 
 func (o Orchestrator) AttemptSyncingOfResource(
@@ -112,7 +115,7 @@ func (o Orchestrator) AttemptSyncingOfResource(
 	}
 	if len(syncingList.Items) >= o.ConcurrentSyncLimit {
 		o.Recorder.Eventf(vmdi, "Normal", "WaitingToSync", "No more than %d VMDiskImages can be syncing at once. Waiting...", o.ConcurrentSyncLimit)
-		return ctrl.Result{RequeueAfter: o.MaxSyncAttemptBackoff}, nil
+		return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
 	}
 
 	err = o.Provisioner.CreateResources(ctx, vmdi)
@@ -145,7 +148,6 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi
 	// Check if there is an error occurring in the sync
 	syncError := o.Provisioner.ResourcesHaveErrors(ctx, vmdi)
 	if syncError != nil {
-		logger.Error(syncError, "A sync error has occurred.")
 		return o.HandleSyncError(ctx, vmdi, syncError, "A error has occurred while syncing")
 	}
 
@@ -178,28 +180,34 @@ func (o Orchestrator) TransitonFromSyncing(ctx context.Context, vmdi *crdv1.VMDi
 
 func (o Orchestrator) AttemptRetry(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) {
 	syncDeadline := metav1.NewTime(vmdi.CreationTimestamp.Add(o.MaxSyncTime))
-	exceededSyncDeadline := syncDeadline.Before(ptr.To(metav1.Now()))
+	exceededSyncDeadline := metav1.Now().After(syncDeadline.Time)
 
 	// Fail forever if we're past the deadline
 	if exceededSyncDeadline {
 		vmdi.Status.Phase = crdv1.PhaseFailed
 		vmdi.Status.Message = "Exceeded overall sync retry window. Failed Permanently"
-	} else {
-		vmdi.Status.Phase = ""
-		vmdi.Status.Message = ""
-	}
 
-	if err := o.Status().Update(ctx, vmdi); err != nil {
-		return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on retry")
+		if err := o.Status().Update(ctx, vmdi); err != nil {
+			return o.HandleResourceUpdateError(ctx, vmdi, err, "failed to reset VMDI on retry")
+		}
+
+		return ctrl.Result{}, nil
 	}
 
 	// Exponential retry
 	var backoffInterval time.Duration
 	nextBackoffMinutes := int(math.Floor(math.Pow(3, float64(vmdi.Status.FailureCount))))
 	nextBackoffDuration := time.Duration(nextBackoffMinutes) * time.Minute
-	backoffInterval = min(nextBackoffDuration, o.MaxSyncAttemptBackoff)
+	backoffInterval = min(nextBackoffDuration, o.MaxRetryBackoff)
+	timeSinceFailure := time.Since(vmdi.Status.LastFailureTime.Time)
+	// If we haven't waited as long as we need to backoff and requeue
+	if timeSinceFailure < backoffInterval {
+		remaingWaitTime := backoffInterval - timeSinceFailure
+		return ctrl.Result{RequeueAfter: remaingWaitTime}, nil
+	}
+
+	return o.QueueResourceCreation(ctx, vmdi)
 
-	return ctrl.Result{RequeueAfter: backoffInterval}, nil
 }
 
 func (o Orchestrator) DeleteResource(ctx context.Context, vmdi *crdv1.VMDiskImage) (ctrl.Result, error) {
@@ -274,6 +282,7 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma
 
 	vmdi.Status.FailureCount += 1
 	vmdi.Status.Phase = crdv1.PhaseRetryableFailure
+	vmdi.Status.LastFailureTime = ptr.To(metav1.Now())
 	vmdi.Status.Message = "An error occurred during reconciliation: " + originalErr.Error()
 
 	reason := crdv1.ReasonUnknownSyncFailure
@@ -302,5 +311,5 @@ func (o Orchestrator) HandleSyncError(ctx context.Context, vmdi *crdv1.VMDiskIma
 		logger.Error(err, "Failed to teardown resources.")
 	}
 
-	return ctrl.Result{}, originalErr
+	return ctrl.Result{}, nil
 }

From 597ac9e0368483603a9df68ad541ba956f9ecbba Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Fri, 12 Dec 2025 13:55:00 -0700
Subject: [PATCH 19/23] chore: regenerate the chart

---
 .../templates/crd/vmdiskimages.crd.pelotech.ot.yaml            | 3 +++
 dist/install.yaml                                              | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml b/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml
index ec110ee..032c1a7 100644
--- a/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml
+++ b/charts/data-sync-operator/templates/crd/vmdiskimages.crd.pelotech.ot.yaml
@@ -137,6 +137,9 @@ spec:
                                 type: array
                             failureCount:
                                 type: integer
+                            lastFailureTime:
+                                format: date-time
+                                type: string
                             message:
                                 description: A human-readable message providing more details about the current phase.
                                 type: string
diff --git a/dist/install.yaml b/dist/install.yaml
index 63fccb0..18b4cc7 100644
--- a/dist/install.yaml
+++ b/dist/install.yaml
@@ -148,6 +148,9 @@ spec:
                 type: array
               failureCount:
                 type: integer
+              lastFailureTime:
+                format: date-time
+                type: string
               message:
                 description: A human-readable message providing more details about
                   the current phase.

From 9bb15bd2f6c091a5143791ba54b0f97cfce8c929 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Tue, 16 Dec 2025 08:41:07 -0700
Subject: [PATCH 20/23] fix: pr feedback

---
 .../vm-disk-image/config/vmdi-controller-config-reader.go     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
index 99ef396..69f724b 100644
--- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go
+++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
@@ -6,7 +6,7 @@ import (
 )
 
 const (
-	defaultConcurrency            = 10 // TODO: We will need to tune this default
+	defaultConcurrency            = 5 // TODO: We will need to tune this default
 	defaultMaxBackoffDelay        = 1 * time.Hour
 	defaultMaxSyncDuration        = 12 * time.Hour
 	defaultMaxSyncAttemptRetries  = 3
@@ -25,7 +25,7 @@ type VMDiskImageControllerConfig struct {
 // Locally this is your "env" and in production these values will come from a configmap
 func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig {
 	// The max amount of VMDIs we can have syncing at one time.
-	concurrency := corecfg.GetIntEnvOrDefault("VMDI_SYNC_CONCURRENCY", defaultConcurrency)
+	concurrency := corecfg.GetIntEnvOrDefault("MAX_VMDI_SYNC_CONCURRENCY", defaultConcurrency)
 
 	// The longest we will ever wait to retry.
 	maxBackoffDelay := corecfg.GetDurationEnvOrDefault("MAX_SYNC_RETRY_BACKOFF_DURATION", defaultMaxBackoffDelay)

From cb5996f958589c8d8efe0d5d2c9bcd12a73f97a8 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Tue, 16 Dec 2025 08:42:06 -0700
Subject: [PATCH 21/23] fix: resolve nit

---
 .../vm-disk-image/config/vmdi-controller-config-reader.go     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
index 69f724b..4c396c2 100644
--- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go
+++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
@@ -37,13 +37,13 @@ func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig {
 	maxAttemptDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptDuration)
 
 	// How many times we will retry on a given attempt.
-	maxRetryPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries)
+	maxRetriesPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries)
 
 	return VMDiskImageControllerConfig{
 		Concurrency:            concurrency,
 		MaxBackoffDelay:        maxBackoffDelay,
 		MaxSyncAttemptDuration: maxAttemptDuration,
-		MaxSyncAttemptRetry:    maxRetryPerAttempt,
+		MaxSyncAttemptRetry:    maxRetriesPerAttempt,
 		MaxSyncDuration:        maxSyncDuration,
 	}
 }

From db2350c3891e6b9f0886c76f772893c28a24012a Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Tue, 16 Dec 2025 08:45:54 -0700
Subject: [PATCH 22/23] fix: consistent names

---
 .../vm-disk-image/config/vmdi-controller-config-reader.go   | 6 +++---
 internal/vm-disk-image/controller/controller.go             | 2 +-
 internal/vm-disk-image/service/provisioner.go               | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/internal/vm-disk-image/config/vmdi-controller-config-reader.go b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
index 4c396c2..8583781 100644
--- a/internal/vm-disk-image/config/vmdi-controller-config-reader.go
+++ b/internal/vm-disk-image/config/vmdi-controller-config-reader.go
@@ -18,7 +18,7 @@ type VMDiskImageControllerConfig struct {
 	MaxBackoffDelay        time.Duration
 	MaxSyncDuration        time.Duration
 	MaxSyncAttemptDuration time.Duration
-	MaxSyncAttemptRetry    int
+	MaxSyncAttemptRetries  int
 }
 
 // This function will allow us to get the required config variables from the environment.
@@ -37,13 +37,13 @@ func LoadVMDIControllerConfigFromEnv() VMDiskImageControllerConfig {
 	maxAttemptDuration := corecfg.GetDurationEnvOrDefault("MAX_SYNC_ATTEMPT_DURATION", defaultMaxSyncAttemptDuration)
 
 	// How many times we will retry on a given attempt.
-	maxRetriesPerAttempt := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries)
+	maxSyncAttemptRetries := corecfg.GetIntEnvOrDefault("MAX_SYNC_ATTEMPT_RETRIES", defaultMaxSyncAttemptRetries)
 
 	return VMDiskImageControllerConfig{
 		Concurrency:            concurrency,
 		MaxBackoffDelay:        maxBackoffDelay,
 		MaxSyncAttemptDuration: maxAttemptDuration,
-		MaxSyncAttemptRetry:    maxRetriesPerAttempt,
+		MaxSyncAttemptRetries:  maxSyncAttemptRetries,
 		MaxSyncDuration:        maxSyncDuration,
 	}
 }
diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go
index cb9ae07..c9ccb7e 100644
--- a/internal/vm-disk-image/controller/controller.go
+++ b/internal/vm-disk-image/controller/controller.go
@@ -113,7 +113,7 @@ func (r *VMDiskImageReconciler) SetupWithManager(mgr ctrl.Manager) error {
 		Client:                 client,
 		ResourceGenerator:      resourceGenerator,
 		MaxSyncAttemptDuration: config.MaxSyncAttemptDuration,
-		MaxRetryPerAttempt:     config.MaxSyncAttemptRetry,
+		MaxSyncAttemptRetries:  config.MaxSyncAttemptRetries,
 	}
 	orchestrator := vmdi.Orchestrator{
 		Client:              client,
diff --git a/internal/vm-disk-image/service/provisioner.go b/internal/vm-disk-image/service/provisioner.go
index cd8581b..4e3bcae 100644
--- a/internal/vm-disk-image/service/provisioner.go
+++ b/internal/vm-disk-image/service/provisioner.go
@@ -28,7 +28,7 @@ type K8sVMDIProvisioner struct {
 	client.Client
 	ResourceGenerator      VMDIResourceGenerator
 	MaxSyncAttemptDuration time.Duration
-	MaxRetryPerAttempt     int
+	MaxSyncAttemptRetries  int
 }
 
 const dataVolumeDonePhase = "Succeeded"
@@ -200,7 +200,7 @@ func (p K8sVMDIProvisioner) ResourcesHaveErrors(
 			}
 		}
 
-		if dv.Status.RestartCount >= int32(p.MaxRetryPerAttempt) {
+		if dv.Status.RestartCount >= int32(p.MaxSyncAttemptRetries) {
 			return ErrSyncAttemptExceedsRetries
 		}
 

From dca4302a936666024ba1507a389b43ae9acfc092 Mon Sep 17 00:00:00 2001
From: Harrison Billings <harrison.billings@pelo.tech>
Date: Tue, 16 Dec 2025 08:48:48 -0700
Subject: [PATCH 23/23] fix: better boolean name

---
 internal/vm-disk-image/controller/controller.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/internal/vm-disk-image/controller/controller.go b/internal/vm-disk-image/controller/controller.go
index c9ccb7e..7d92cd2 100644
--- a/internal/vm-disk-image/controller/controller.go
+++ b/internal/vm-disk-image/controller/controller.go
@@ -75,8 +75,8 @@ func (r *VMDiskImageReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		return r.VMDiskImageOrchestrator.DeleteResource(ctx, &VMDiskImage)
 	}
 
-	resourceHasFinalizer := !crutils.ContainsFinalizer(&VMDiskImage, crdv1.VMDiskImageFinalizer)
-	if resourceHasFinalizer {
+	resourceMissingFinalizer := !crutils.ContainsFinalizer(&VMDiskImage, crdv1.VMDiskImageFinalizer)
+	if resourceMissingFinalizer {
 		return r.AddControllerFinalizer(ctx, &VMDiskImage)
 	}