Add directly modified and restatement triggers

treysp · treysp · commit 22c272c73bd7 · 2025-08-05T21:56:31.000-05:00
diff --git a/sqlmesh/core/console.py b/sqlmesh/core/console.py
@@ -3830,12 +3830,10 @@ def update_snapshot_evaluation_progress(
                 message += f" | auto_restatement_triggers={','.join(trigger.name for trigger in snapshot_evaluation_triggers.auto_restatement_triggers)}"
             if snapshot_evaluation_triggers.select_snapshot_triggers:
                 message += f" | select_snapshot_triggers={','.join(trigger.name for trigger in snapshot_evaluation_triggers.select_snapshot_triggers)}"
-
-        if snapshot_evaluation_triggers:
-            if snapshot_evaluation_triggers.auto_restatement_triggers:
-                message += f" | auto_restatement_triggers={','.join(trigger.name for trigger in snapshot_evaluation_triggers.auto_restatement_triggers)}"
-            if snapshot_evaluation_triggers.select_snapshot_triggers:
-                message += f" | select_snapshot_triggers={','.join(trigger.name for trigger in snapshot_evaluation_triggers.select_snapshot_triggers)}"
+            if snapshot_evaluation_triggers.directly_modified_triggers:
+                message += f" | directly_modified_triggers={','.join(trigger.name for trigger in snapshot_evaluation_triggers.directly_modified_triggers)}"
+            if snapshot_evaluation_triggers.restatement_triggers:
+                message += f" | restatement_triggers={','.join(trigger.name for trigger in snapshot_evaluation_triggers.restatement_triggers)}"
 
         if audit_only:
             message = f"Audited {snapshot.name} duration={duration_ms}ms | num_audits_passed={num_audits_passed} | num_audits_failed={num_audits_failed}"
diff --git a/sqlmesh/core/plan/evaluator.py b/sqlmesh/core/plan/evaluator.py
@@ -37,6 +37,7 @@
     SnapshotCreationFailedError,
     SnapshotNameVersion,
 )
+from sqlmesh.core.snapshot.definition import SnapshotEvaluationTriggers
 from sqlmesh.utils import to_snake_case
 from sqlmesh.core.state_sync import StateSync
 from sqlmesh.utils import CorrelationId
@@ -83,6 +84,7 @@ def __init__(
         self.default_catalog = default_catalog
         self.console = console or get_console()
         self._circuit_breaker: t.Optional[t.Callable[[], bool]] = None
+        self._restatement_triggers: t.Dict[SnapshotId, t.List[SnapshotId]] = {}
 
     def evaluate(
         self,
@@ -234,6 +236,27 @@ def visit_backfill_stage(self, stage: stages.BackfillStage, plan: EvaluatablePla
             self.console.log_success("SKIP: No model batches to execute")
             return
 
+        directly_modified_triggers: t.Dict[SnapshotId, t.List[SnapshotId]] = {}
+        for parent, children in plan.indirectly_modified_snapshots.items():
+            parent_id = stage.all_snapshots[parent].snapshot_id
+            directly_modified_triggers[parent_id] = directly_modified_triggers.get(
+                parent_id, []
+            ) + [parent_id]
+            for child in children:
+                directly_modified_triggers[child] = directly_modified_triggers.get(child, []) + [
+                    parent_id
+                ]
+        directly_modified_triggers = {
+            k: list(dict.fromkeys(v)) for k, v in directly_modified_triggers.items()
+        }
+        snapshot_evaluation_triggers = {
+            s_id: SnapshotEvaluationTriggers(
+                directly_modified_triggers=directly_modified_triggers.get(s_id, []),
+                restatement_triggers=self._restatement_triggers.get(s_id, []),
+            )
+            for s_id in [s.snapshot_id for s in stage.all_snapshots.values()]
+        }
+
         scheduler = self.create_scheduler(stage.all_snapshots.values(), self.snapshot_evaluator)
         # Convert model name restatements to snapshot ID restatements
         restatements_by_snapshot_id = {
@@ -249,6 +272,7 @@ def visit_backfill_stage(self, stage: stages.BackfillStage, plan: EvaluatablePla
             start=plan.start,
             end=plan.end,
             restatements=restatements_by_snapshot_id,
+            snapshot_evaluation_triggers=snapshot_evaluation_triggers,
         )
         if errors:
             raise PlanError("Plan application failed.")
@@ -286,13 +310,14 @@ def visit_restatement_stage(
         # by forcing dev environments to re-run intervals that changed in prod
         #
         # Without this rule, its possible that promoting a dev table to prod will introduce old data to prod
-        snapshot_intervals_to_restate.update(
+        restatement_intervals_all_environments, self._restatement_triggers = (
             self._restatement_intervals_across_all_environments(
                 prod_restatements=plan.restatements,
                 disable_restatement_models=plan.disabled_restatement_models,
                 loaded_snapshots={s.snapshot_id: s for s in stage.all_snapshots.values()},
             )
         )
+        snapshot_intervals_to_restate.update(restatement_intervals_all_environments)
 
         self.state_sync.remove_intervals(
             snapshot_intervals=list(snapshot_intervals_to_restate),
@@ -415,7 +440,9 @@ def _restatement_intervals_across_all_environments(
         prod_restatements: t.Dict[str, Interval],
         disable_restatement_models: t.Set[str],
         loaded_snapshots: t.Dict[SnapshotId, Snapshot],
-    ) -> t.Set[t.Tuple[SnapshotTableInfo, Interval]]:
+    ) -> t.Tuple[
+        t.Set[t.Tuple[SnapshotTableInfo, Interval]], t.Dict[SnapshotId, t.List[SnapshotId]]
+    ]:
         """
         Given a map of snapshot names + intervals to restate in prod:
          - Look up matching snapshots across all environments (match based on name - regardless of version)
@@ -426,14 +453,14 @@ def _restatement_intervals_across_all_environments(
         run in those environments causes the intervals to be repopulated
         """
         if not prod_restatements:
-            return set()
+            return set(), {}
 
         prod_name_versions: t.Set[SnapshotNameVersion] = {
             s.name_version for s in loaded_snapshots.values()
         }
 
         snapshots_to_restate: t.Dict[SnapshotId, t.Tuple[SnapshotTableInfo, Interval]] = {}
-
+        restatement_downstream_ids: t.Dict[SnapshotId, t.List[SnapshotId]] = {}
         for env_summary in self.state_sync.get_environments_summary():
             # Fetch the full environment object one at a time to avoid loading all environments into memory at once
             env = self.state_sync.get_environment(env_summary.name)
@@ -450,10 +477,17 @@ def _restatement_intervals_across_all_environments(
             for restatement, intervals in prod_restatements.items():
                 if restatement not in keyed_snapshots:
                     continue
+
+                downstream = env_dag.downstream(restatement)
+                if not env.is_dev and restatement not in disable_restatement_models:
+                    restatement_downstream_ids[keyed_snapshots[restatement].snapshot_id] = [
+                        keyed_snapshots[name].snapshot_id
+                        for name in downstream
+                        if name not in disable_restatement_models
+                    ]
+
                 affected_snapshot_names = [
-                    x
-                    for x in ([restatement] + env_dag.downstream(restatement))
-                    if x not in disable_restatement_models
+                    x for x in ([restatement] + downstream) if x not in disable_restatement_models
                 ]
                 snapshots_to_restate.update(
                     {
@@ -464,6 +498,14 @@ def _restatement_intervals_across_all_environments(
                     }
                 )
 
+        restatement_triggers: t.Dict[SnapshotId, t.List[SnapshotId]] = {
+            id: [id] for id in restatement_downstream_ids
+        }
+        for parent, children in restatement_downstream_ids.items():
+            for child in children:
+                restatement_triggers[child] = restatement_triggers.get(child, []) + [parent]
+        restatement_triggers = {k: list(dict.fromkeys(v)) for k, v in restatement_triggers.items()}
+
         # for any affected full_history_restatement_only snapshots, we need to widen the intervals being restated to
         # include the whole time range for that snapshot. This requires a call to state to load the full snapshot record,
         # so we only do it if necessary
@@ -499,7 +541,7 @@ def _restatement_intervals_across_all_environments(
                 )
                 snapshots_to_restate[full_snapshot_id] = (full_snapshot.table_info, new_intervals)
 
-        return set(snapshots_to_restate.values())
+        return set(snapshots_to_restate.values()), restatement_triggers
 
     def _update_intervals_for_new_snapshots(self, snapshots: t.Collection[Snapshot]) -> None:
         snapshots_intervals: t.List[SnapshotIntervals] = []
diff --git a/sqlmesh/core/snapshot/definition.py b/sqlmesh/core/snapshot/definition.py
@@ -330,6 +330,8 @@ class SnapshotEvaluationTriggers(PydanticModel):
     cron_ready: t.Optional[bool] = None
     auto_restatement_triggers: t.List[SnapshotId] = []
     select_snapshot_triggers: t.List[SnapshotId] = []
+    directly_modified_triggers: t.List[SnapshotId] = []
+    restatement_triggers: t.List[SnapshotId] = []
 
 
 class SnapshotInfoMixin(ModelKindMixin):
diff --git a/tests/core/test_integration.py b/tests/core/test_integration.py
@@ -1784,25 +1784,44 @@ def test_snapshot_triggers(init_and_plan_context: t.Callable, mocker: MockerFixt
     context, plan = init_and_plan_context("examples/sushi")
     context.apply(plan)
 
+    # Modify three models before planning:
+    # - two breaking changes (marketing, customers) to exercise the plan's directly modified triggers
+    # - one auto-restatement added to orders for the subsequent `run` test
+    marketing = context.get_model("sushi.marketing")
+    marketing_kwargs = {
+        **marketing.dict(),
+        "query": d.parse_one(
+            f"{marketing.query.sql(dialect='duckdb')} ORDER BY customer_id", dialect="duckdb"
+        ),
+    }
+    context.upsert_model(SqlModel.parse_obj(marketing_kwargs))
+
+    customers = context.get_model("sushi.customers")
+    customers_kwargs = {
+        **customers.dict(),
+        "query": d.parse_one(
+            f"{customers.query.sql(dialect='duckdb')} ORDER BY customer_id", dialect="duckdb"
+        ),
+    }
+    context.upsert_model(SqlModel.parse_obj(customers_kwargs))
+
     # add auto restatement to orders
-    model = context.get_model("sushi.orders")
-    kind = {
-        **model.kind.dict(),
+    orders = context.get_model("sushi.orders")
+    orders_kind = {
+        **orders.kind.dict(),
         "auto_restatement_cron": "@hourly",
     }
-    kwargs = {
-        **model.dict(),
-        "kind": kind,
+    orders_kwargs = {
+        **orders.dict(),
+        "kind": orders_kind,
     }
-    context.upsert_model(PythonModel.parse_obj(kwargs))
-    plan = context.plan_builder(skip_tests=True).build()
-    context.apply(plan)
+    context.upsert_model(PythonModel.parse_obj(orders_kwargs))
 
-    # Mock run_merged_intervals to capture triggers arg
-    scheduler = context.scheduler()
-    run_merged_intervals_mock = mocker.patch.object(
-        scheduler, "run_merged_intervals", return_value=([], [])
-    )
+    # spy = mocker.spy(sqlmesh.core.scheduler, "run_merged_intervals")
+
+    context.plan(auto_apply=True, no_prompts=True, categorizer_config=CategorizerConfig.all_full())
+
+    # assert spy.call_args.kwargs["snapshot_evaluation_triggers"]
 
     # User selects top_waiters and waiter_revenue_by_day, others added as auto-upstream
     selected_models = {"top_waiters", "waiter_revenue_by_day"}
@@ -1814,6 +1833,12 @@ def test_snapshot_triggers(init_and_plan_context: t.Callable, mocker: MockerFixt
         f'"memory"."sushi"."{model}"' for model in selected_models
     }
 
+    # Mock run_merged_intervals to capture triggers arg
+    scheduler = context.scheduler()
+    run_merged_intervals_mock = mocker.patch.object(
+        scheduler, "run_merged_intervals", return_value=([], [])
+    )
+
     with time_machine.travel("2023-01-09 00:00:01 UTC"):
         scheduler.run(
             environment=c.PROD,