feat: implement ancestor cleanup during termination

ParidelPooya · ParidelPooya · commit 6f8c74b09da5 · 2025-12-16T13:42:37.000-08:00
- Add ancestor cleanup logic in checkpoint manager termination evaluation
- Use original stepId from metadata to avoid hashed stepId issues
- Clean up operations with finished ancestors in RETRY_WAITING, IDLE_NOT_AWAITED, and IDLE_AWAITED states
- Add comprehensive unit tests for ancestor cleanup functionality
- Update CompletionConfig TSDoc with race condition behavior note

Fixes memory leaks and ensures proper resource cleanup when ancestor contexts complete.
diff --git a/packages/aws-durable-execution-sdk-js-examples/src/examples/parallel/min-successful-with-passing-threshold/min-successful-with-passing-threshold.test.ts b/packages/aws-durable-execution-sdk-js-examples/src/examples/parallel/min-successful-with-passing-threshold/min-successful-with-passing-threshold.test.ts
@@ -0,0 +1,42 @@
+import { handler } from "./min-successful-with-passing-threshold";
+import { createTests } from "../../../utils/test-helper";
+import { OperationStatus } from "@aws/durable-execution-sdk-js-testing";
+
+createTests({
+  localRunnerConfig: {
+    skipTime: false,
+    checkpointDelay: 100,
+  },
+  handler,
+  tests: (runner) => {
+    it("should complete early when minSuccessful is reached", async () => {
+      const execution = await runner.run();
+      const result = execution.getResult() as any;
+
+      // Overall result: two successes are reported out of five branches
+      expect(result.successCount).toBe(2);
+      expect(result.completionReason).toBe("MIN_SUCCESSFUL_REACHED");
+      expect(result.totalCount).toBe(5);
+
+      // Look up each branch operation by name so that its individual status
+      // can be asserted below
+      const branch1 = runner.getOperation("branch-1");
+      const branch2 = runner.getOperation("branch-2");
+      const branch3 = runner.getOperation("branch-3");
+      const branch4 = runner.getOperation("branch-4");
+      const branch5 = runner.getOperation("branch-5");
+
+      // Branches 1-4 are all expected to be SUCCEEDED, although successCount above is only 2
+      expect(branch1?.getStatus()).toBe(OperationStatus.SUCCEEDED);
+      expect(branch2?.getStatus()).toBe(OperationStatus.SUCCEEDED);
+      expect(branch3?.getStatus()).toBe(OperationStatus.SUCCEEDED);
+      expect(branch4?.getStatus()).toBe(OperationStatus.SUCCEEDED);
+
+      // branch-5 is expected to still be in STARTED state (not completed)
+      expect(branch5?.getStatus()).toBe(OperationStatus.STARTED);
+
+      // Only the first two branch results are expected in the returned results array
+      expect(result.results).toEqual(["Branch 1 result", "Branch 2 result"]);
+    });
+  },
+});
diff --git a/packages/aws-durable-execution-sdk-js-examples/src/examples/parallel/min-successful-with-passing-threshold/min-successful-with-passing-threshold.ts b/packages/aws-durable-execution-sdk-js-examples/src/examples/parallel/min-successful-with-passing-threshold/min-successful-with-passing-threshold.ts
@@ -0,0 +1,88 @@
+import {
+  DurableContext,
+  withDurableExecution,
+} from "@aws/durable-execution-sdk-js";
+import { ExampleConfig } from "../../../types";
+import { log } from "../../../utils/logger";
+
+export const config: ExampleConfig = {
+  name: "Parallel minSuccessful", // NOTE(review): template.yml FunctionName "ParallelminSuccessful-22x-NodeJS-Local" looks derived from this name - confirm before renaming
+  description: "Parallel execution with minSuccessful completion config",
+};
+
+export const handler = withDurableExecution(
+  async (event: any, context: DurableContext) => {
+    log("Starting parallel execution with minSuccessful: 2");
+
+    // Branch 1 finishes first (10 ms); branches 2-4 all take 50 ms and
+    // finish at about the same time; branch 5 takes 2000 ms and finishes
+    // last. Only two successes are needed (minSuccessful: 2 below).
+    const results = await context.parallel(
+      "min-successful-branches",
+      [
+        {
+          name: "branch-1",
+          func: async (ctx) => {
+            return await ctx.step("branch-1", async () => {
+              await new Promise((resolve) => setTimeout(resolve, 10));
+              return "Branch 1 result";
+            });
+          },
+        },
+        {
+          name: "branch-2",
+          func: async (ctx) => {
+            return await ctx.step("branch-2", async () => {
+              await new Promise((resolve) => setTimeout(resolve, 50));
+              return "Branch 2 result";
+            });
+          },
+        },
+        {
+          name: "branch-3",
+          func: async (ctx) => {
+            return await ctx.step("branch-3", async () => {
+              await new Promise((resolve) => setTimeout(resolve, 50));
+              return "Branch 3 result";
+            });
+          },
+        },
+        {
+          name: "branch-4",
+          func: async (ctx) => {
+            return await ctx.step("branch-4", async () => {
+              await new Promise((resolve) => setTimeout(resolve, 50));
+              return "Branch 4 result";
+            });
+          },
+        },
+        {
+          name: "branch-5",
+          func: async (ctx) => {
+            return await ctx.step("branch-5", async () => {
+              await new Promise((resolve) => setTimeout(resolve, 2000));
+              return "Branch 5 result";
+            });
+          },
+        },
+      ],
+      {
+        completionConfig: {
+          minSuccessful: 2,
+        },
+      },
+    );
+
+    await context.wait({ seconds: 1 });
+
+    log(`Completed with ${results.successCount} successes`);
+    log(`Completion reason: ${results.completionReason}`);
+
+    return {
+      successCount: results.successCount,
+      totalCount: results.totalCount,
+      completionReason: results.completionReason,
+      results: results.getResults(),
+    };
+  },
+);
diff --git a/packages/aws-durable-execution-sdk-js-examples/template.yml b/packages/aws-durable-execution-sdk-js-examples/template.yml
@@ -1123,6 +1123,31 @@ Resources:
           DURABLE_EXAMPLES_VERBOSE: "true"
     Metadata:
       SkipBuild: "True"
+  MinSuccessfulWithPassingThreshold:
+    Type: AWS::Serverless::Function
+    Properties:
+      FunctionName: ParallelminSuccessful-22x-NodeJS-Local
+      CodeUri: ./dist
+      Handler: min-successful-with-passing-threshold.handler
+      Runtime: nodejs22.x
+      Architectures:
+        - x86_64
+      MemorySize: 128
+      Timeout: 60
+      Role:
+        Fn::GetAtt:
+          - DurableFunctionRole
+          - Arn
+      DurableConfig:
+        ExecutionTimeout: 60
+        RetentionPeriodInDays: 7
+      Environment:
+        Variables:
+          AWS_ENDPOINT_URL_LAMBDA: http://host.docker.internal:5000
+          DURABLE_VERBOSE_MODE: "false"
+          DURABLE_EXAMPLES_VERBOSE: "true"
+    Metadata:
+      SkipBuild: "True"
   ParallelToleratedFailureCount:
     Type: AWS::Serverless::Function
     Properties:
diff --git a/packages/aws-durable-execution-sdk-js/src/types/batch.ts b/packages/aws-durable-execution-sdk-js/src/types/batch.ts
@@ -69,6 +69,14 @@ export interface BatchResult<TResult> {
 }
 
 /**
+ * Configuration for early completion of map/parallel operations
+ *
+ * @remarks
+ * **Race Condition Behavior**: When multiple children complete simultaneously,
+ * the parent operation may have more completed children than the specified threshold
+ * by the time the completion check occurs. This is expected behavior due to the
+ * asynchronous nature of concurrent execution.
+ *
  * @public
  */
 export interface CompletionConfig {
diff --git a/packages/aws-durable-execution-sdk-js/src/utils/checkpoint/checkpoint-ancestor.test.ts b/packages/aws-durable-execution-sdk-js/src/utils/checkpoint/checkpoint-ancestor.test.ts
@@ -4,6 +4,8 @@ import { createTestCheckpointManager } from "../../testing/create-test-checkpoin
 import { createMockExecutionContext } from "../../testing/mock-context";
 import { EventEmitter } from "events";
 import { createDefaultLogger } from "../logger/default-logger";
+import { OperationLifecycleState } from "../../types/operation-lifecycle-state";
+import { OperationSubType } from "../../types/core";
 
 describe("CheckpointManager - Ancestor Functionality", () => {
   let checkpointManager: CheckpointManager;
@@ -92,6 +94,107 @@ describe("CheckpointManager - Ancestor Functionality", () => {
     });
   });
 
+  describe("ancestor cleanup during termination", () => {
+    it("should clean up operations with finished ancestors during termination", () => {
+      // Register two operations under parent "1-2" before that ancestor is marked finished
+      checkpointManager.markOperationState(
+        "1-2-3",
+        OperationLifecycleState.RETRY_WAITING,
+        {
+          metadata: {
+            stepId: "1-2-3",
+            name: "test-step",
+            type: "STEP",
+            subType: OperationSubType.STEP,
+            parentId: "1-2",
+          },
+        },
+      );
+
+      checkpointManager.markOperationState(
+        "1-2-4",
+        OperationLifecycleState.IDLE_AWAITED,
+        {
+          metadata: {
+            stepId: "1-2-4",
+            name: "test-step-2",
+            type: "STEP",
+            subType: OperationSubType.STEP,
+            parentId: "1-2",
+          },
+        },
+      );
+
+      // Register an operation under a different parent, "1-3", which is never marked finished
+      checkpointManager.markOperationState(
+        "1-3-1",
+        OperationLifecycleState.RETRY_WAITING,
+        {
+          metadata: {
+            stepId: "1-3-1",
+            name: "test-step-3",
+            type: "STEP",
+            subType: OperationSubType.STEP,
+            parentId: "1-3",
+          },
+        },
+      );
+
+      // Sanity check: all three operations report their state before cleanup runs
+      expect(checkpointManager.getOperationState("1-2-3")).toBe(
+        OperationLifecycleState.RETRY_WAITING,
+      );
+      expect(checkpointManager.getOperationState("1-2-4")).toBe(
+        OperationLifecycleState.IDLE_AWAITED,
+      );
+      expect(checkpointManager.getOperationState("1-3-1")).toBe(
+        OperationLifecycleState.RETRY_WAITING,
+      );
+
+      // Mark only "1-2" as a finished ancestor
+      checkpointManager.markAncestorFinished("1-2");
+
+      // Run the termination check; it is reached through an `any` cast (presumably not public - confirm)
+      (checkpointManager as any).checkAndTerminate();
+
+      // Both operations under "1-2" should no longer have a tracked state
+      expect(checkpointManager.getOperationState("1-2-3")).toBeUndefined();
+      expect(checkpointManager.getOperationState("1-2-4")).toBeUndefined();
+
+      // Operation without finished ancestor should remain
+      expect(checkpointManager.getOperationState("1-3-1")).toBe(
+        OperationLifecycleState.RETRY_WAITING,
+      );
+    });
+
+    it("should not clean up operations in EXECUTING state even with finished ancestors", () => {
+      // Register an EXECUTING operation under parent "1-2"
+      checkpointManager.markOperationState(
+        "1-2-3",
+        OperationLifecycleState.EXECUTING,
+        {
+          metadata: {
+            stepId: "1-2-3",
+            name: "test-step",
+            type: "STEP",
+            subType: OperationSubType.STEP,
+            parentId: "1-2",
+          },
+        },
+      );
+
+      // Then mark its ancestor "1-2" as finished
+      checkpointManager.markAncestorFinished("1-2");
+
+      (checkpointManager as any).checkAndTerminate();
+
+      // EXECUTING operation should not be cleaned up
+      expect(checkpointManager.getOperationState("1-2-3")).toBe(
+        OperationLifecycleState.EXECUTING,
+      );
+    });
+  });
+
   describe("checkpoint with finished ancestors", () => {
     it("should skip checkpoint when ancestor is finished", async () => {
       checkpointManager.markAncestorFinished("1-2");
diff --git a/packages/aws-durable-execution-sdk-js/src/utils/checkpoint/checkpoint-manager.ts b/packages/aws-durable-execution-sdk-js/src/utils/checkpoint/checkpoint-manager.ts
@@ -612,7 +612,16 @@ export class CheckpointManager implements Checkpoint {
         op.state === OperationLifecycleState.IDLE_NOT_AWAITED ||
         op.state === OperationLifecycleState.IDLE_AWAITED
       ) {
-        // Note: Ancestor completion checking removed - operations will continue normally
+        // Use the original stepId from metadata, not the potentially hashed op.stepId
+        const originalStepId = op.metadata.stepId;
+        if (this.hasFinishedAncestor(originalStepId)) {
+          log(
+            "🧹",
+            `Cleaning up operation with completed ancestor: ${originalStepId}`,
+          );
+          this.cleanupOperation(op.stepId);
+          this.operations.delete(op.stepId);
+        }
       }
     }