From 6161f36a85ad241f12e628933c61486a5ccff1ea Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Tue, 29 Jul 2025 09:21:57 -0700 Subject: [PATCH 01/27] Feat: Add support for the non-virtual prod mode --- sqlmesh/core/config/common.py | 29 +++++++++++++ sqlmesh/core/config/root.py | 12 ++++-- sqlmesh/core/context.py | 3 +- sqlmesh/core/plan/stages.py | 12 +++++- sqlmesh/core/snapshot/definition.py | 66 ++++++++++++++++++++--------- sqlmesh/core/snapshot/evaluator.py | 6 ++- tests/core/test_snapshot.py | 1 + 7 files changed, 101 insertions(+), 28 deletions(-) diff --git a/sqlmesh/core/config/common.py b/sqlmesh/core/config/common.py index 770c1f5daf..09371f75ce 100644 --- a/sqlmesh/core/config/common.py +++ b/sqlmesh/core/config/common.py @@ -49,6 +49,35 @@ def __repr__(self) -> str: return str(self) +class VirtualEnvironmentMode(str, Enum): + """Mode for virtual environment behavior. + + FULL: Use full virtual environment functionality with hashed table names and virtual layer updates. + DEV_ONLY: Bypass virtual environments in production, using simple table names without hashes. 
+ """ + + FULL = "full" + DEV_ONLY = "dev_only" + + @property + def is_full(self) -> bool: + return self == VirtualEnvironmentMode.FULL + + @property + def is_dev_only(self) -> bool: + return self == VirtualEnvironmentMode.DEV_ONLY + + @classproperty + def default(cls) -> VirtualEnvironmentMode: + return VirtualEnvironmentMode.FULL + + def __str__(self) -> str: + return self.name + + def __repr__(self) -> str: + return str(self) + + class TableNamingConvention(str, Enum): # Causes table names at the physical layer to follow the convention: # ____ diff --git a/sqlmesh/core/config/root.py b/sqlmesh/core/config/root.py index df8e2637da..d20463a506 100644 --- a/sqlmesh/core/config/root.py +++ b/sqlmesh/core/config/root.py @@ -14,7 +14,11 @@ from sqlmesh.cicd.config import CICDBotConfig from sqlmesh.core import constants as c from sqlmesh.core.console import get_console -from sqlmesh.core.config import EnvironmentSuffixTarget, TableNamingConvention +from sqlmesh.core.config.common import ( + EnvironmentSuffixTarget, + TableNamingConvention, + VirtualEnvironmentMode, +) from sqlmesh.core.config.base import BaseConfig, UpdateStrategy from sqlmesh.core.config.common import variables_validator, compile_regex_mapping from sqlmesh.core.config.connection import ( @@ -110,6 +114,7 @@ class Config(BaseConfig): physical_schema_mapping: A mapping from regular expressions to names of schemas in which physical tables for corresponding models will be placed. environment_suffix_target: Indicates whether to append the environment name to the schema or table name. physical_table_naming_convention: Indicates how tables should be named at the physical layer + virtual_environment_mode: Indicates how environments should be handled. gateway_managed_virtual_layer: Whether the models' views in the virtual layer are created by the model-specific gateway rather than the default gateway. 
infer_python_dependencies: Whether to statically analyze Python code to automatically infer Python package requirements. environment_catalog_mapping: A mapping from regular expressions to catalog names. The catalog name is used to determine the target catalog for a given environment. @@ -151,9 +156,8 @@ class Config(BaseConfig): environment_suffix_target: EnvironmentSuffixTarget = Field( default=EnvironmentSuffixTarget.default ) - physical_table_naming_convention: TableNamingConvention = Field( - default=TableNamingConvention.default - ) + physical_table_naming_convention: TableNamingConvention = TableNamingConvention.default + virtual_environment_mode: VirtualEnvironmentMode = VirtualEnvironmentMode.default gateway_managed_virtual_layer: bool = False infer_python_dependencies: bool = True environment_catalog_mapping: RegexKeyDict = {} diff --git a/sqlmesh/core/context.py b/sqlmesh/core/context.py index c0d9b21ff8..220db9faf7 100644 --- a/sqlmesh/core/context.py +++ b/sqlmesh/core/context.py @@ -2909,6 +2909,7 @@ def _nodes_to_snapshots(self, nodes: t.Dict[str, Node]) -> t.Dict[str, Snapshot] config = self.config_for_node(node) kwargs["ttl"] = config.snapshot_ttl kwargs["table_naming_convention"] = config.physical_table_naming_convention + kwargs["virtual_environment_mode"] = config.virtual_environment_mode snapshot = Snapshot.from_node( node, @@ -2936,7 +2937,7 @@ def _node_or_snapshot_to_fqn(self, node_or_snapshot: NodeOrSnapshot) -> str: def _plan_preview_enabled(self) -> bool: if self.config.plan.enable_preview is not None: return self.config.plan.enable_preview - # It is dangerous to enable preview by default for dbt projects that rely on engines that don’t support cloning. + # It is dangerous to enable preview by default for dbt projects that rely on engines that don't support cloning. # Enabling previews in such cases can result in unintended full refreshes because dbt incremental models rely on # the maximum timestamp value in the target table. 
return self._project_type == c.NATIVE or self.engine_adapter.SUPPORTS_CLONING diff --git a/sqlmesh/core/plan/stages.py b/sqlmesh/core/plan/stages.py index 144e12c887..1f1895d877 100644 --- a/sqlmesh/core/plan/stages.py +++ b/sqlmesh/core/plan/stages.py @@ -358,6 +358,7 @@ def build(self, plan: EvaluatablePlan) -> t.List[PlanStage]: demoted_environment_naming_info, snapshots | full_demoted_snapshots, deployability_index, + plan.is_dev, ) if virtual_layer_update_stage: stages.append(virtual_layer_update_stage) @@ -437,11 +438,18 @@ def _get_virtual_layer_update_stage( demoted_environment_naming_info: t.Optional[EnvironmentNamingInfo], all_snapshots: t.Dict[SnapshotId, Snapshot], deployability_index: DeployabilityIndex, + is_dev: bool, ) -> t.Optional[VirtualLayerUpdateStage]: - promoted_snapshots = {s for s in promoted_snapshots if s.is_model and not s.is_symbolic} - demoted_snapshots = {s for s in demoted_snapshots if s.is_model and not s.is_symbolic} + def _should_update_virtual_layer(snapshot: SnapshotTableInfo) -> bool: + # Skip virtual layer update for snapshots with virtual environment support disabled + virtual_environment_enabled = is_dev or snapshot.virtual_environment_mode.is_full + return snapshot.is_model and not snapshot.is_symbolic and virtual_environment_enabled + + promoted_snapshots = {s for s in promoted_snapshots if _should_update_virtual_layer(s)} + demoted_snapshots = {s for s in demoted_snapshots if _should_update_virtual_layer(s)} if not promoted_snapshots and not demoted_snapshots: return None + return VirtualLayerUpdateStage( promoted_snapshots=promoted_snapshots, demoted_snapshots=demoted_snapshots, diff --git a/sqlmesh/core/snapshot/definition.py b/sqlmesh/core/snapshot/definition.py index 90cd963051..be3a849427 100644 --- a/sqlmesh/core/snapshot/definition.py +++ b/sqlmesh/core/snapshot/definition.py @@ -13,7 +13,7 @@ from sqlglot import exp from sqlglot.optimizer.normalize_identifiers import normalize_identifiers -from sqlmesh.core.config 
import TableNamingConvention +from sqlmesh.core.config.common import TableNamingConvention, VirtualEnvironmentMode from sqlmesh.core import constants as c from sqlmesh.core.audit import StandaloneAudit from sqlmesh.core.environment import EnvironmentSuffixTarget @@ -230,6 +230,7 @@ class SnapshotDataVersion(PydanticModel, frozen=True): physical_schema_: t.Optional[str] = Field(default=None, alias="physical_schema") dev_table_suffix: str table_naming_convention: TableNamingConvention = Field(default=TableNamingConvention.default) + virtual_environment_mode: VirtualEnvironmentMode = Field(default=VirtualEnvironmentMode.default) def snapshot_id(self, name: str) -> SnapshotId: return SnapshotId(name=name, identifier=self.fingerprint.to_identifier()) @@ -338,6 +339,7 @@ class SnapshotInfoMixin(ModelKindMixin): dev_table_suffix: str table_naming_convention: TableNamingConvention = Field(default=TableNamingConvention.default) forward_only: bool + virtual_environment_mode: VirtualEnvironmentMode @cached_property def identifier(self) -> str: @@ -443,6 +445,10 @@ def _table_name(self, version: str, is_deployable: bool) -> str: if self.is_external: return self.name + if is_deployable and self.virtual_environment_mode.is_dev_only: + # Use the model name as is if the target is deployable and the virtual environment mode is set to dev-only + return self.name + is_dev_table = not is_deployable if is_dev_table: version = self.dev_version @@ -459,6 +465,7 @@ def _table_name(self, version: str, is_deployable: bool) -> str: fqt = self.fully_qualified_table.copy() fqt.set("catalog", None) base_table_name = fqt.sql() + return table_name( self.physical_schema, base_table_name, @@ -499,6 +506,8 @@ class SnapshotTableInfo(PydanticModel, SnapshotInfoMixin, frozen=True): dev_table_suffix: str model_gateway: t.Optional[str] = None forward_only: bool = False + table_naming_convention: TableNamingConvention = Field(default=TableNamingConvention.default) + virtual_environment_mode: 
VirtualEnvironmentMode = Field(default=VirtualEnvironmentMode.default) def __lt__(self, other: SnapshotTableInfo) -> bool: return self.name < other.name @@ -540,6 +549,7 @@ def data_version(self) -> SnapshotDataVersion: physical_schema=self.physical_schema, dev_table_suffix=self.dev_table_suffix, table_naming_convention=self.table_naming_convention, + virtual_environment_mode=self.virtual_environment_mode, ) @property @@ -627,6 +637,7 @@ class Snapshot(PydanticModel, SnapshotInfoMixin): default=TableNamingConvention.default, alias="table_naming_convention" ) forward_only: bool = False + virtual_environment_mode: VirtualEnvironmentMode = Field(default=VirtualEnvironmentMode.default) @field_validator("ttl") @classmethod @@ -679,6 +690,7 @@ def from_node( version: t.Optional[str] = None, cache: t.Optional[t.Dict[str, SnapshotFingerprint]] = None, table_naming_convention: TableNamingConvention = TableNamingConvention.default, + virtual_environment_mode: VirtualEnvironmentMode = VirtualEnvironmentMode.default, ) -> Snapshot: """Creates a new snapshot for a node. @@ -690,6 +702,7 @@ def from_node( version: The version that a snapshot is associated with. Usually set during the planning phase. cache: Cache of node name to fingerprints. table_naming_convention: Convention to follow when generating the physical table name + virtual_environment_mode: Mode for handling virtual environments Returns: The newly created snapshot. @@ -722,6 +735,7 @@ def from_node( ttl=ttl, version=version, table_naming_convention=table_naming_convention, + virtual_environment_mode=virtual_environment_mode, ) def __eq__(self, other: t.Any) -> bool: @@ -876,16 +890,19 @@ def merge_intervals(self, other: t.Union[Snapshot, SnapshotIntervals]) -> None: Args: other: The target snapshot to inherit intervals from. 
""" - effective_from_ts = self.normalized_effective_from_ts or 0 - apply_effective_from = effective_from_ts > 0 and self.identifier != other.identifier - - for start, end in other.intervals: - # If the effective_from is set, then intervals that come after it must come from - # the current snapshost. - if apply_effective_from and start < effective_from_ts: - end = min(end, effective_from_ts) - if not apply_effective_from or end <= effective_from_ts: - self.add_interval(start, end) + if self.is_no_rebuild or self.virtual_environment_mode.is_full or not self.is_paused: + # If the virtual environment mode is not full we can only merge prod intervals if this snapshot + # is currently promoted in production or if it's forward-only / metadata / indirect non-breaking. + # Otherwise, we want to ignore any existing intervals and backfill this snapshot from scratch. + effective_from_ts = self.normalized_effective_from_ts or 0 + apply_effective_from = effective_from_ts > 0 and self.identifier != other.identifier + for start, end in other.intervals: + # If the effective_from is set, then intervals that come after it must come from + # the current snapshost. + if apply_effective_from and start < effective_from_ts: + end = min(end, effective_from_ts) + if not apply_effective_from or end <= effective_from_ts: + self.add_interval(start, end) if self.dev_version == other.dev_version: # Merge dev intervals if the dev versions match which would mean @@ -1035,7 +1052,10 @@ def categorize_as(self, category: SnapshotChangeCategory, forward_only: bool = F SnapshotChangeCategory.INDIRECT_NON_BREAKING, SnapshotChangeCategory.METADATA, ) - if self.is_model and self.model.physical_version: + if self.is_model and not self.virtual_environment_mode.is_full: + # Hardcode the version if the virtual environment is not fully enabled. + self.version = "novde" + elif self.is_model and self.model.physical_version: # If the model has a pinned version then use that. 
self.version = self.model.physical_version elif is_no_rebuild and self.previous_version: @@ -1239,6 +1259,7 @@ def table_info(self) -> SnapshotTableInfo: model_gateway=self.model_gateway, table_naming_convention=self.table_naming_convention, # type: ignore forward_only=self.forward_only, + virtual_environment_mode=self.virtual_environment_mode, ) @property @@ -1252,6 +1273,7 @@ def data_version(self) -> SnapshotDataVersion: physical_schema=self.physical_schema, dev_table_suffix=self.dev_table_suffix, table_naming_convention=self.table_naming_convention, + virtual_environment_mode=self.virtual_environment_mode, ) @property @@ -1535,14 +1557,20 @@ def create( for node in dag: if node not in snapshots: continue - # Make sure that the node is deployable according to all its parents - this_deployable = all( - children_deployability_mapping[p_id] - for p_id in snapshots[node].parents - if p_id in children_deployability_mapping - ) + snapshot = snapshots[node] + + if not snapshot.virtual_environment_mode.is_full: + # If the virtual environment is not fully enabled, then the snapshot can never be deployable + this_deployable = False + else: + # Make sure that the node is deployable according to all its parents + this_deployable = all( + children_deployability_mapping[p_id] + for p_id in snapshots[node].parents + if p_id in children_deployability_mapping + ) + if this_deployable: - snapshot = snapshots[node] is_forward_only_model = ( snapshot.is_model and snapshot.model.forward_only and not snapshot.is_metadata ) diff --git a/sqlmesh/core/snapshot/evaluator.py b/sqlmesh/core/snapshot/evaluator.py index e053e1e108..f77c318730 100644 --- a/sqlmesh/core/snapshot/evaluator.py +++ b/sqlmesh/core/snapshot/evaluator.py @@ -2274,8 +2274,10 @@ def _check_destructive_schema_change( alter_expressions: t.List[exp.Alter], allow_destructive_snapshots: t.Set[str], ) -> None: - if snapshot.needs_destructive_check(allow_destructive_snapshots) and has_drop_alteration( - alter_expressions + 
if ( + snapshot.is_no_rebuild + and snapshot.needs_destructive_check(allow_destructive_snapshots) + and has_drop_alteration(alter_expressions) ): snapshot_name = snapshot.name dropped_column_names = get_dropped_column_names(alter_expressions) diff --git a/tests/core/test_snapshot.py b/tests/core/test_snapshot.py index 0ba7180fb6..c3116ff4ca 100644 --- a/tests/core/test_snapshot.py +++ b/tests/core/test_snapshot.py @@ -167,6 +167,7 @@ def test_json(snapshot: Snapshot): "parents": [{"name": '"parent"."tbl"', "identifier": snapshot.parents[0].identifier}], "previous_versions": [], "table_naming_convention": "schema_and_table", + "virtual_environment_mode": "full", "updated_ts": 1663891973000, "version": snapshot.fingerprint.to_version(), "migrated": False, From b14b33bd0982547e3b835c8739f1410f4805f174 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Wed, 30 Jul 2025 08:54:41 -0700 Subject: [PATCH 02/27] add tests for the plan stage builder --- tests/core/test_plan_stages.py | 258 +++++++++++++++++++++++++++++++++ 1 file changed, 258 insertions(+) diff --git a/tests/core/test_plan_stages.py b/tests/core/test_plan_stages.py index f560b93251..7482426f5e 100644 --- a/tests/core/test_plan_stages.py +++ b/tests/core/test_plan_stages.py @@ -4,6 +4,7 @@ from pytest_mock.plugin import MockerFixture from sqlmesh.core.config import EnvironmentSuffixTarget +from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.model import SqlModel, ModelKindName from sqlmesh.core.plan.definition import EvaluatablePlan from sqlmesh.core.plan.stages import ( @@ -1300,3 +1301,260 @@ def test_build_plan_stages_indirect_non_breaking_view_migration( migrate_schemas_stage = stages[4] assert {s.snapshot_id for s in migrate_schemas_stage.snapshots} == {new_snapshot_c.snapshot_id} + + +def test_build_plan_stages_virtual_environment_mode_filtering( + make_snapshot, mocker: MockerFixture +) -> None: + # Create snapshots with different virtual environment modes + 
snapshot_full = make_snapshot( + SqlModel( + name="full_model", + query=parse_one("select 1, ds"), + kind=dict(name=ModelKindName.INCREMENTAL_BY_TIME_RANGE, time_column="ds"), + ) + ) + snapshot_full.virtual_environment_mode = VirtualEnvironmentMode.FULL + snapshot_full.categorize_as(SnapshotChangeCategory.BREAKING) + + snapshot_dev_only = make_snapshot( + SqlModel( + name="dev_only_model", + query=parse_one("select 2, ds"), + kind=dict(name=ModelKindName.INCREMENTAL_BY_TIME_RANGE, time_column="ds"), + ) + ) + snapshot_dev_only.virtual_environment_mode = VirtualEnvironmentMode.DEV_ONLY + snapshot_dev_only.categorize_as(SnapshotChangeCategory.BREAKING) + + # Mock state reader + state_reader = mocker.Mock(spec=StateReader) + state_reader.get_snapshots.return_value = {} + state_reader.get_environment.return_value = None + + # Test 1: Dev environment - both snapshots should be included + environment_dev = Environment( + name="dev", + snapshots=[snapshot_full.table_info, snapshot_dev_only.table_info], + start_at="2023-01-01", + end_at="2023-01-02", + plan_id="test_plan", + previous_plan_id=None, + promoted_snapshot_ids=[snapshot_full.snapshot_id, snapshot_dev_only.snapshot_id], + ) + + plan_dev = EvaluatablePlan( + start="2023-01-01", + end="2023-01-02", + new_snapshots=[snapshot_full, snapshot_dev_only], + environment=environment_dev, + no_gaps=False, + skip_backfill=False, + empty_backfill=False, + restatements={}, + is_dev=True, + allow_destructive_models=set(), + forward_only=False, + end_bounded=False, + ensure_finalized_snapshots=False, + directly_modified_snapshots=[snapshot_full.snapshot_id, snapshot_dev_only.snapshot_id], + indirectly_modified_snapshots={}, + metadata_updated_snapshots=[], + removed_snapshots=[], + requires_backfill=True, + models_to_backfill=None, + execution_time="2023-01-02", + disabled_restatement_models=set(), + environment_statements=None, + user_provided_flags=None, + ) + + stages_dev = build_plan_stages(plan_dev, state_reader, None) + + 
# Find VirtualLayerUpdateStage + virtual_stage_dev = next( + stage for stage in stages_dev if isinstance(stage, VirtualLayerUpdateStage) + ) + + # In dev environment, both snapshots should be promoted regardless of virtual_environment_mode + assert {s.name for s in virtual_stage_dev.promoted_snapshots} == { + '"full_model"', + '"dev_only_model"', + } + assert len(virtual_stage_dev.demoted_snapshots) == 0 + + # Test 2: Production environment - only FULL mode snapshots should be included + environment_prod = Environment( + name="prod", + snapshots=[snapshot_full.table_info, snapshot_dev_only.table_info], + start_at="2023-01-01", + end_at="2023-01-02", + plan_id="test_plan", + previous_plan_id=None, + promoted_snapshot_ids=[snapshot_full.snapshot_id, snapshot_dev_only.snapshot_id], + ) + + plan_prod = EvaluatablePlan( + start="2023-01-01", + end="2023-01-02", + new_snapshots=[snapshot_full, snapshot_dev_only], + environment=environment_prod, + no_gaps=False, + skip_backfill=False, + empty_backfill=False, + restatements={}, + is_dev=False, + allow_destructive_models=set(), + forward_only=False, + end_bounded=False, + ensure_finalized_snapshots=False, + directly_modified_snapshots=[snapshot_full.snapshot_id, snapshot_dev_only.snapshot_id], + indirectly_modified_snapshots={}, + metadata_updated_snapshots=[], + removed_snapshots=[], + requires_backfill=True, + models_to_backfill=None, + execution_time="2023-01-02", + disabled_restatement_models=set(), + environment_statements=None, + user_provided_flags=None, + ) + + stages_prod = build_plan_stages(plan_prod, state_reader, None) + + # Find VirtualLayerUpdateStage + virtual_stage_prod = next( + stage for stage in stages_prod if isinstance(stage, VirtualLayerUpdateStage) + ) + + # In production environment, only FULL mode snapshots should be promoted + assert {s.name for s in virtual_stage_prod.promoted_snapshots} == {'"full_model"'} + assert len(virtual_stage_prod.demoted_snapshots) == 0 + + # Test 3: Production 
environment with demoted snapshots + existing_environment = Environment( + name="prod", + snapshots=[snapshot_full.table_info, snapshot_dev_only.table_info], + start_at="2023-01-01", + end_at="2023-01-02", + plan_id="previous_plan", + previous_plan_id=None, + promoted_snapshot_ids=[snapshot_full.snapshot_id, snapshot_dev_only.snapshot_id], + finalized_ts=to_timestamp("2023-01-02"), + ) + state_reader.get_environment.return_value = existing_environment + + # Remove both snapshots from the new environment + environment_prod_demote = Environment( + name="prod", + snapshots=[], + start_at="2023-01-01", + end_at="2023-01-02", + plan_id="test_plan", + previous_plan_id="previous_plan", + promoted_snapshot_ids=[], + ) + + plan_prod_demote = EvaluatablePlan( + start="2023-01-01", + end="2023-01-02", + new_snapshots=[], + environment=environment_prod_demote, + no_gaps=False, + skip_backfill=False, + empty_backfill=False, + restatements={}, + is_dev=False, + allow_destructive_models=set(), + forward_only=False, + end_bounded=False, + ensure_finalized_snapshots=False, + directly_modified_snapshots=[], + indirectly_modified_snapshots={}, + metadata_updated_snapshots=[], + removed_snapshots=[snapshot_full.snapshot_id, snapshot_dev_only.snapshot_id], + requires_backfill=False, + models_to_backfill=None, + execution_time="2023-01-02", + disabled_restatement_models=set(), + environment_statements=None, + user_provided_flags=None, + ) + + stages_prod_demote = build_plan_stages(plan_prod_demote, state_reader, None) + + # Find VirtualLayerUpdateStage + virtual_stage_prod_demote = next( + stage for stage in stages_prod_demote if isinstance(stage, VirtualLayerUpdateStage) + ) + + # In production environment, only FULL mode snapshots should be demoted + assert len(virtual_stage_prod_demote.promoted_snapshots) == 0 + assert {s.name for s in virtual_stage_prod_demote.demoted_snapshots} == {'"full_model"'} + assert ( + virtual_stage_prod_demote.demoted_environment_naming_info + == 
existing_environment.naming_info + ) + + +def test_build_plan_stages_virtual_environment_mode_no_updates( + snapshot_a: Snapshot, make_snapshot, mocker: MockerFixture +) -> None: + # Create snapshot with DEV_ONLY mode + snapshot_dev_only = make_snapshot( + SqlModel( + name="dev_only_model", + query=parse_one("select 1, ds"), + kind=dict(name=ModelKindName.INCREMENTAL_BY_TIME_RANGE, time_column="ds"), + ) + ) + snapshot_dev_only.virtual_environment_mode = VirtualEnvironmentMode.DEV_ONLY + snapshot_dev_only.categorize_as(SnapshotChangeCategory.BREAKING) + + # Mock state reader + state_reader = mocker.Mock(spec=StateReader) + state_reader.get_snapshots.return_value = {} + state_reader.get_environment.return_value = None + + # Production environment with only DEV_ONLY snapshots + environment = Environment( + name="prod", + snapshots=[snapshot_dev_only.table_info], + start_at="2023-01-01", + end_at="2023-01-02", + plan_id="test_plan", + previous_plan_id=None, + promoted_snapshot_ids=[snapshot_dev_only.snapshot_id], + ) + + plan = EvaluatablePlan( + start="2023-01-01", + end="2023-01-02", + new_snapshots=[snapshot_dev_only], + environment=environment, + no_gaps=False, + skip_backfill=False, + empty_backfill=False, + restatements={}, + is_dev=False, + allow_destructive_models=set(), + forward_only=False, + end_bounded=False, + ensure_finalized_snapshots=False, + directly_modified_snapshots=[snapshot_dev_only.snapshot_id], + indirectly_modified_snapshots={}, + metadata_updated_snapshots=[], + removed_snapshots=[], + requires_backfill=True, + models_to_backfill=None, + execution_time="2023-01-02", + disabled_restatement_models=set(), + environment_statements=None, + user_provided_flags=None, + ) + + stages = build_plan_stages(plan, state_reader, None) + + # No VirtualLayerUpdateStage should be created since all snapshots are filtered out + virtual_stages = [stage for stage in stages if isinstance(stage, VirtualLayerUpdateStage)] + assert len(virtual_stages) == 0 From 
19b9a31b8e7879badac85ad2eef043ca55f4901b Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Wed, 30 Jul 2025 09:41:38 -0700 Subject: [PATCH 03/27] add tests for the snapshot definition --- tests/core/test_snapshot.py | 159 ++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) diff --git a/tests/core/test_snapshot.py b/tests/core/test_snapshot.py index c3116ff4ca..0b195899c9 100644 --- a/tests/core/test_snapshot.py +++ b/tests/core/test_snapshot.py @@ -64,6 +64,7 @@ table_name, TableNamingConvention, ) +from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.utils import AttributeDict from sqlmesh.utils.date import DatetimeRanges, to_date, to_datetime, to_timestamp from sqlmesh.utils.errors import SQLMeshError, SignalEvalError @@ -3341,3 +3342,161 @@ def test_partitioned_by_roundtrip(make_snapshot: t.Callable): assert isinstance(deserialized.node, SqlModel) assert deserialized.node.partitioned_by == snapshot.node.partitioned_by + + +def test_merge_intervals_virtual_environment_mode_full(make_snapshot): + model = SqlModel( + name="test_model", + kind=IncrementalByTimeRangeKind(time_column="ds"), + query=parse_one("SELECT 1, ds FROM parent_tbl"), + ) + + # Create source snapshot with intervals + source_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.FULL) + source_snapshot.add_interval("2020-01-01", "2020-01-03") + source_snapshot.add_interval("2020-01-05", "2020-01-07") + + # Create target snapshot with different fingerprint and virtual_environment_mode FULL + target_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.FULL) + target_snapshot.fingerprint = SnapshotFingerprint( + data_hash="different", metadata_hash="different", parent_data_hash="different" + ) + target_snapshot.categorize_as(SnapshotChangeCategory.BREAKING) + + # When virtual_environment_mode is FULL, intervals should be merged + target_snapshot.merge_intervals(source_snapshot) + + assert 
target_snapshot.intervals == [ + (to_timestamp("2020-01-01"), to_timestamp("2020-01-04")), + (to_timestamp("2020-01-05"), to_timestamp("2020-01-08")), + ] + + +def test_merge_intervals_virtual_environment_mode_dev_only_paused_breaking(make_snapshot): + model = SqlModel( + name="test_model", + kind=IncrementalByTimeRangeKind(time_column="ds"), + query=parse_one("SELECT 1, ds FROM parent_tbl"), + ) + + # Create source snapshot with intervals + source_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + source_snapshot.add_interval("2020-01-01", "2020-01-03") + source_snapshot.add_interval("2020-01-05", "2020-01-07") + + # Create target snapshot with different fingerprint and virtual_environment_mode DEV_ONLY + target_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + target_snapshot.fingerprint = SnapshotFingerprint( + data_hash="different", metadata_hash="different", parent_data_hash="different" + ) + target_snapshot.categorize_as(SnapshotChangeCategory.BREAKING) + + # Ensure snapshot is paused (unpaused_ts is None) + target_snapshot.unpaused_ts = None + + # When virtual_environment_mode is DEV_ONLY and snapshot is paused and breaking, intervals should NOT be merged + target_snapshot.merge_intervals(source_snapshot) + + assert target_snapshot.intervals == [] + + +def test_merge_intervals_virtual_environment_mode_dev_only_unpaused(make_snapshot): + model = SqlModel( + name="test_model", + kind=IncrementalByTimeRangeKind(time_column="ds"), + query=parse_one("SELECT 1, ds FROM parent_tbl"), + ) + + # Create source snapshot with intervals + source_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + source_snapshot.add_interval("2020-01-01", "2020-01-03") + source_snapshot.add_interval("2020-01-05", "2020-01-07") + + # Create target snapshot with different fingerprint and virtual_environment_mode DEV_ONLY + target_snapshot = make_snapshot(model, 
virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + target_snapshot.fingerprint = SnapshotFingerprint( + data_hash="different", metadata_hash="different", parent_data_hash="different" + ) + target_snapshot.categorize_as(SnapshotChangeCategory.BREAKING) + + # Ensure snapshot is unpaused + target_snapshot.unpaused_ts = to_timestamp("2020-01-01") + + # When snapshot is unpaused, intervals should be merged regardless of virtual_environment_mode + target_snapshot.merge_intervals(source_snapshot) + + assert target_snapshot.intervals == [ + (to_timestamp("2020-01-01"), to_timestamp("2020-01-04")), + (to_timestamp("2020-01-05"), to_timestamp("2020-01-08")), + ] + + +def test_merge_intervals_virtual_environment_mode_dev_only_no_rebuild(make_snapshot): + model = SqlModel( + name="test_model", + kind=IncrementalByTimeRangeKind(time_column="ds"), + query=parse_one("SELECT 1, ds FROM parent_tbl"), + ) + + # Create source snapshot with intervals + source_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + source_snapshot.add_interval("2020-01-01", "2020-01-03") + source_snapshot.add_interval("2020-01-05", "2020-01-07") + + # Create target snapshot with different fingerprint and virtual_environment_mode DEV_ONLY + target_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + target_snapshot.fingerprint = SnapshotFingerprint( + data_hash="different", metadata_hash="different", parent_data_hash="different" + ) + target_snapshot.categorize_as( + SnapshotChangeCategory.FORWARD_ONLY + ) # This is a no-rebuild category + + # Ensure snapshot is paused + target_snapshot.unpaused_ts = None + + # When change category is no-rebuild, intervals should be merged regardless of virtual_environment_mode + target_snapshot.merge_intervals(source_snapshot) + + assert target_snapshot.intervals == [ + (to_timestamp("2020-01-01"), to_timestamp("2020-01-04")), + (to_timestamp("2020-01-05"), to_timestamp("2020-01-08")), 
+ ] + + +@pytest.mark.parametrize( + "virtual_env_mode,is_deployable,expected_uses_name_as_is", + [ + (VirtualEnvironmentMode.DEV_ONLY, True, True), + (VirtualEnvironmentMode.DEV_ONLY, False, False), + (VirtualEnvironmentMode.FULL, True, False), + (VirtualEnvironmentMode.FULL, False, False), + ], +) +def test_table_name_virtual_environment_mode( + make_snapshot, + virtual_env_mode: VirtualEnvironmentMode, + is_deployable: bool, + expected_uses_name_as_is: bool, +): + model = SqlModel( + name="my_schema.my_model", + kind=IncrementalByTimeRangeKind(time_column="ds"), + query=parse_one("SELECT 1, ds"), + ) + + snapshot = make_snapshot(model, virtual_environment_mode=virtual_env_mode) + snapshot.categorize_as(SnapshotChangeCategory.BREAKING) + + table_name_result = snapshot.table_name(is_deployable=is_deployable) + + if expected_uses_name_as_is: + assert table_name_result == '"my_schema"."my_model"' + else: + # Should contain the versioned table name with schema prefix + assert "sqlmesh__my_schema" in table_name_result + assert "my_schema__my_model" in table_name_result + if is_deployable: + assert table_name_result.endswith(snapshot.version) + else: + assert table_name_result.endswith(f"{snapshot.dev_version}__dev") From 281e3bdc627f598eea52f9f0583cc59519f8ecd1 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Wed, 30 Jul 2025 14:13:04 -0700 Subject: [PATCH 04/27] add docs --- docs/guides/configuration.md | 36 +++++++++++++++++++++++++++++++++ docs/reference/configuration.md | 1 + sqlmesh/core/config/common.py | 4 ++-- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/docs/guides/configuration.md b/docs/guides/configuration.md index 24371f30d0..9d5cf25f8c 100644 --- a/docs/guides/configuration.md +++ b/docs/guides/configuration.md @@ -538,6 +538,42 @@ sqlmesh_md5__d3b07384d113edec49eaa6238ad5ff00__dev This has a downside that now it's much more difficult to determine which table corresponds to which model by just looking at the database with a SQL 
client. However, the table names have a predictable length so there are no longer any surprises with identfiers exceeding the max length at the physical layer. +#### Virtual Data Environment modes + +By default, Virtual Data Environments (VDE) are applied across both development and production environments. This allows SQLMesh to reuse physical tables when appropriate, even when promoting from development to production. + +However, users may sometimes prefer their production environment to be non-virtual. The non-exhaustive list of reasons may include: + +- Integration with third-party tools and platforms, such as data catalogs, may not work well with the virtual view layer that SQLMesh imposes by default +- A desire to rely on time travel features provided by cloud data warehouses such as BigQuery, Snowflake, and Databricks + +To mitigate this, SQLMesh offers an alternative 'dev-only' mode for using VDE. It can be enabled in the project configuration like so: + +=== "YAML" + + ```yaml linenums="1" + virtual_environment_mode: dev_only + ``` + +=== "Python" + + ```python linenums="1" + from sqlmesh.core.config import Config + + config = Config( + virtual_environment_mode="dev_only", + ) + ``` + +As the name suggests, 'dev-only' mode means that VDE is applied only in development environments, while in production, model tables and views are updated directly, bypassing the virtual layer. This also means that physical tables in production will be created using the original, unversioned model names. Users will still benefit from VDE and data reuse across development environments. + +Please note that enabling this mode means that all data inserted in development environments is used only for [preview](../concepts/plans.md#data-preview-for-forward-only-changes) and will **not** be reused in production. + + +!!! warning + Switching the mode for an existing project will result in a complete rebuild of all models in the project. 
Refer to the [Table Migration Guide](./table_migration.md) to migrate existing tables without rebuilding them from scratch. + + #### Environment view catalogs By default, SQLMesh creates an environment view in the same [catalog](../concepts/glossary.md#catalog) as the physical table the view points to. The physical table's catalog is determined by either the catalog specified in the model name or the default catalog defined in the connection. diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index df3fcf930d..676f9d7389 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -46,6 +46,7 @@ Configuration options for how SQLMesh manages environment creation and promotion | `environment_suffix_target` | Whether SQLMesh views should append their environment name to the `schema`, `table` or `catalog` - [additional details](../guides/configuration.md#view-schema-override). (Default: `schema`) | string | N | | `gateway_managed_virtual_layer` | Whether SQLMesh views of the virtual layer will be created by the default gateway or model specified gateways - [additional details](../guides/multi_engine.md#gateway-managed-virtual-layer). (Default: False) | boolean | N | | `environment_catalog_mapping` | A mapping from regular expressions to catalog names. The catalog name is used to determine the target catalog for a given environment. | dict[string, string] | N | +| `virtual_environment_mode` | Determines the Virtual Data Environment (VDE) mode. If set to `full`, VDE is used in both production and development environments. The `dev_only` option enables VDE only in development environments, while in production, no virtual layer is used and models are materialized directly using their original names (i.e., no versioned physical tables). 
(Default: `full`) | string | N | ### Models diff --git a/sqlmesh/core/config/common.py b/sqlmesh/core/config/common.py index 09371f75ce..2963632041 100644 --- a/sqlmesh/core/config/common.py +++ b/sqlmesh/core/config/common.py @@ -52,8 +52,8 @@ def __repr__(self) -> str: class VirtualEnvironmentMode(str, Enum): """Mode for virtual environment behavior. - FULL: Use full virtual environment functionality with hashed table names and virtual layer updates. - DEV_ONLY: Bypass virtual environments in production, using simple table names without hashes. + FULL: Use full virtual environment functionality with versioned table names and virtual layer updates. + DEV_ONLY: Bypass virtual environments in production, using original unversioned model names. """ FULL = "full" From 854b732ef15fd48a2bc0e6b4ad048fee5a7297de Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Thu, 31 Jul 2025 11:57:23 -0700 Subject: [PATCH 05/27] move virtual_environment_mode attribute into model meta --- sqlmesh/core/context.py | 1 - sqlmesh/core/loader.py | 2 ++ sqlmesh/core/model/decorator.py | 3 +++ sqlmesh/core/model/definition.py | 2 ++ sqlmesh/core/model/meta.py | 2 ++ sqlmesh/core/plan/builder.py | 7 ++++-- sqlmesh/core/snapshot/definition.py | 25 +++++++++++++++------ tests/core/test_model.py | 9 ++++---- tests/core/test_plan_stages.py | 6 ++--- tests/core/test_snapshot.py | 35 ++++++++++++++++------------- 10 files changed, 60 insertions(+), 32 deletions(-) diff --git a/sqlmesh/core/context.py b/sqlmesh/core/context.py index 220db9faf7..0031c1834c 100644 --- a/sqlmesh/core/context.py +++ b/sqlmesh/core/context.py @@ -2909,7 +2909,6 @@ def _nodes_to_snapshots(self, nodes: t.Dict[str, Node]) -> t.Dict[str, Snapshot] config = self.config_for_node(node) kwargs["ttl"] = config.snapshot_ttl kwargs["table_naming_convention"] = config.physical_table_naming_convention - kwargs["virtual_environment_mode"] = config.virtual_environment_mode snapshot = Snapshot.from_node( node, diff --git
a/sqlmesh/core/loader.py b/sqlmesh/core/loader.py index b593da1ad0..8126b39107 100644 --- a/sqlmesh/core/loader.py +++ b/sqlmesh/core/loader.py @@ -603,6 +603,7 @@ def _load_sql_models( infer_names=self.config.model_naming.infer_names, signal_definitions=signals, default_catalog_per_gateway=self.context.default_catalog_per_gateway, + virtual_environment_mode=self.config.virtual_environment_mode, **loading_default_kwargs or {}, ) @@ -683,6 +684,7 @@ def _load_python_models( audit_definitions=audits, signal_definitions=signals, default_catalog_per_gateway=self.context.default_catalog_per_gateway, + virtual_environment_mode=self.config.virtual_environment_mode, ): if model.enabled: models[model.fqn] = model diff --git a/sqlmesh/core/model/decorator.py b/sqlmesh/core/model/decorator.py index 3b78efc636..73452cc165 100644 --- a/sqlmesh/core/model/decorator.py +++ b/sqlmesh/core/model/decorator.py @@ -8,6 +8,7 @@ from sqlglot import exp from sqlglot.dialects.dialect import DialectType +from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.macros import MacroRegistry from sqlmesh.core.signal import SignalRegistry from sqlmesh.utils.jinja import JinjaMacroRegistry @@ -154,6 +155,7 @@ def model( variables: t.Optional[t.Dict[str, t.Any]] = None, infer_names: t.Optional[bool] = False, blueprint_variables: t.Optional[t.Dict[str, t.Any]] = None, + virtual_environment_mode: VirtualEnvironmentMode = VirtualEnvironmentMode.default, ) -> Model: """Get the model registered by this function.""" env: t.Dict[str, t.Tuple[t.Any, t.Optional[bool]]] = {} @@ -228,6 +230,7 @@ def model( "audit_definitions": audit_definitions, "signal_definitions": signal_definitions, "blueprint_variables": blueprint_variables, + "virtual_environment_mode": virtual_environment_mode, **rendered_fields, } diff --git a/sqlmesh/core/model/definition.py b/sqlmesh/core/model/definition.py index 559d67e960..72e56239fa 100644 --- a/sqlmesh/core/model/definition.py +++ 
b/sqlmesh/core/model/definition.py @@ -1062,6 +1062,7 @@ def _data_hash_values(self) -> t.List[str]: self.gateway, self.interval_unit.value if self.interval_unit is not None else None, str(self.optimize_query) if self.optimize_query is not None else None, + self.virtual_environment_mode.value, ] for column_name, column_type in (self.columns_to_types_ or {}).items(): @@ -2950,6 +2951,7 @@ def render_expression( ) ), "formatting": str, + "virtual_environment_mode": lambda value: exp.Literal.string(value.value), } diff --git a/sqlmesh/core/model/meta.py b/sqlmesh/core/model/meta.py index b5371ab811..2f24349a72 100644 --- a/sqlmesh/core/model/meta.py +++ b/sqlmesh/core/model/meta.py @@ -10,6 +10,7 @@ from sqlglot.optimizer.normalize_identifiers import normalize_identifiers from sqlmesh.core import dialect as d +from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.config.linter import LinterConfig from sqlmesh.core.dialect import normalize_model_name from sqlmesh.core.model.common import ( @@ -83,6 +84,7 @@ class ModelMeta(_Node): default=None, exclude=True, alias="ignored_rules" ) formatting: t.Optional[bool] = Field(default=None, exclude=True) + virtual_environment_mode: VirtualEnvironmentMode = VirtualEnvironmentMode.default _bool_validator = bool_validator _model_kind_validator = model_kind_validator diff --git a/sqlmesh/core/plan/builder.py b/sqlmesh/core/plan/builder.py index 178cd8d2e4..b3520ed603 100644 --- a/sqlmesh/core/plan/builder.py +++ b/sqlmesh/core/plan/builder.py @@ -773,7 +773,7 @@ def _is_forward_only_change(self, s_id: SnapshotId) -> bool: if snapshot.name in self._context_diff.modified_snapshots: _, old = self._context_diff.modified_snapshots[snapshot.name] # If the model kind has changed in a breaking way, then we can't consider this to be a forward-only change. 
- if snapshot.is_model and _is_breaking_kind_change(old, snapshot): + if snapshot.is_model and _should_force_breaking_change(old, snapshot): return False return ( snapshot.is_model and snapshot.model.forward_only and bool(snapshot.previous_versions) @@ -888,7 +888,10 @@ def _modified_and_added_snapshots(self) -> t.List[Snapshot]: ] -def _is_breaking_kind_change(old: Snapshot, new: Snapshot) -> bool: +def _should_force_breaking_change(old: Snapshot, new: Snapshot) -> bool: + if old.virtual_environment_mode != new.virtual_environment_mode: + # If the virtual environment mode has changed, then it's a breaking change + return True if old.model.kind.name == new.model.kind.name: # If the kind hasn't changed, then it's not a breaking change return False diff --git a/sqlmesh/core/snapshot/definition.py b/sqlmesh/core/snapshot/definition.py index be3a849427..83d923863d 100644 --- a/sqlmesh/core/snapshot/definition.py +++ b/sqlmesh/core/snapshot/definition.py @@ -339,7 +339,6 @@ class SnapshotInfoMixin(ModelKindMixin): dev_table_suffix: str table_naming_convention: TableNamingConvention = Field(default=TableNamingConvention.default) forward_only: bool - virtual_environment_mode: VirtualEnvironmentMode @cached_property def identifier(self) -> str: @@ -385,6 +384,10 @@ def is_new_version(self) -> bool: def fully_qualified_table(self) -> t.Optional[exp.Table]: raise NotImplementedError + @property + def virtual_environment_mode(self) -> VirtualEnvironmentMode: + raise NotImplementedError + @property def is_forward_only(self) -> bool: return self.forward_only or self.change_category == SnapshotChangeCategory.FORWARD_ONLY @@ -506,8 +509,10 @@ class SnapshotTableInfo(PydanticModel, SnapshotInfoMixin, frozen=True): dev_table_suffix: str model_gateway: t.Optional[str] = None forward_only: bool = False - table_naming_convention: TableNamingConvention = Field(default=TableNamingConvention.default) - virtual_environment_mode: VirtualEnvironmentMode = 
Field(default=VirtualEnvironmentMode.default) + table_naming_convention: TableNamingConvention = TableNamingConvention.default + virtual_environment_mode_: VirtualEnvironmentMode = Field( + default=VirtualEnvironmentMode.default, alias="virtual_environment_mode" + ) def __lt__(self, other: SnapshotTableInfo) -> bool: return self.name < other.name @@ -539,6 +544,10 @@ def table_info(self) -> SnapshotTableInfo: """Helper method to return self.""" return self + @property + def virtual_environment_mode(self) -> VirtualEnvironmentMode: + return self.virtual_environment_mode_ + @property def data_version(self) -> SnapshotDataVersion: return SnapshotDataVersion( @@ -637,7 +646,6 @@ class Snapshot(PydanticModel, SnapshotInfoMixin): default=TableNamingConvention.default, alias="table_naming_convention" ) forward_only: bool = False - virtual_environment_mode: VirtualEnvironmentMode = Field(default=VirtualEnvironmentMode.default) @field_validator("ttl") @classmethod @@ -690,7 +698,6 @@ def from_node( version: t.Optional[str] = None, cache: t.Optional[t.Dict[str, SnapshotFingerprint]] = None, table_naming_convention: TableNamingConvention = TableNamingConvention.default, - virtual_environment_mode: VirtualEnvironmentMode = VirtualEnvironmentMode.default, ) -> Snapshot: """Creates a new snapshot for a node. @@ -702,7 +709,6 @@ def from_node( version: The version that a snapshot is associated with. Usually set during the planning phase. cache: Cache of node name to fingerprints. table_naming_convention: Convention to follow when generating the physical table name - virtual_environment_mode: Mode for handling virtual environments Returns: The newly created snapshot. 
@@ -735,7 +741,6 @@ def from_node( ttl=ttl, version=version, table_naming_convention=table_naming_convention, - virtual_environment_mode=virtual_environment_mode, ) def __eq__(self, other: t.Any) -> bool: @@ -1418,6 +1423,12 @@ def custom_materialization(self) -> t.Optional[str]: return t.cast(CustomKind, self.model.kind).materialization return None + @property + def virtual_environment_mode(self) -> VirtualEnvironmentMode: + return ( + self.model.virtual_environment_mode if self.is_model else VirtualEnvironmentMode.default + ) + def _ensure_categorized(self) -> None: if not self.change_category: raise SQLMeshError(f"Snapshot {self.snapshot_id} has not been categorized yet.") diff --git a/tests/core/test_model.py b/tests/core/test_model.py index 0be1702fa1..c11657233f 100644 --- a/tests/core/test_model.py +++ b/tests/core/test_model.py @@ -1908,7 +1908,8 @@ def test_render_definition_with_defaults(): dialect spark, kind VIEW ( materialized FALSE - ) + ), + virtual_environment_mode 'full' ); {query} @@ -5731,7 +5732,7 @@ def test_default_catalog_sql(assert_exp_eq): The system is not designed to actually support having an engine that doesn't support default catalog to start supporting it or the reverse of that. If that did happen then bugs would occur. """ - HASH_WITH_CATALOG = "516937963" + HASH_WITH_CATALOG = "1269513823" # Test setting default catalog doesn't change hash if it matches existing logic expressions = d.parse( @@ -5897,7 +5898,7 @@ def test_default_catalog_sql(assert_exp_eq): def test_default_catalog_python(): - HASH_WITH_CATALOG = "770057346" + HASH_WITH_CATALOG = "2728996410" @model(name="db.table", kind="full", columns={'"COL"': "int"}) def my_model(context, **kwargs): @@ -5989,7 +5990,7 @@ def test_default_catalog_external_model(): Since external models fqns are the only thing affected by default catalog, and when they change new snapshots are made, the hash will be the same across different names. 
""" - EXPECTED_HASH = "3614876346" + EXPECTED_HASH = "763256265" model = create_external_model("db.table", columns={"a": "int", "limit": "int"}) assert model.default_catalog is None diff --git a/tests/core/test_plan_stages.py b/tests/core/test_plan_stages.py index 7482426f5e..4bdefd9e38 100644 --- a/tests/core/test_plan_stages.py +++ b/tests/core/test_plan_stages.py @@ -1312,9 +1312,9 @@ def test_build_plan_stages_virtual_environment_mode_filtering( name="full_model", query=parse_one("select 1, ds"), kind=dict(name=ModelKindName.INCREMENTAL_BY_TIME_RANGE, time_column="ds"), + virtual_environment_mode=VirtualEnvironmentMode.FULL, ) ) - snapshot_full.virtual_environment_mode = VirtualEnvironmentMode.FULL snapshot_full.categorize_as(SnapshotChangeCategory.BREAKING) snapshot_dev_only = make_snapshot( @@ -1322,9 +1322,9 @@ def test_build_plan_stages_virtual_environment_mode_filtering( name="dev_only_model", query=parse_one("select 2, ds"), kind=dict(name=ModelKindName.INCREMENTAL_BY_TIME_RANGE, time_column="ds"), + virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY, ) ) - snapshot_dev_only.virtual_environment_mode = VirtualEnvironmentMode.DEV_ONLY snapshot_dev_only.categorize_as(SnapshotChangeCategory.BREAKING) # Mock state reader @@ -1506,9 +1506,9 @@ def test_build_plan_stages_virtual_environment_mode_no_updates( name="dev_only_model", query=parse_one("select 1, ds"), kind=dict(name=ModelKindName.INCREMENTAL_BY_TIME_RANGE, time_column="ds"), + virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY, ) ) - snapshot_dev_only.virtual_environment_mode = VirtualEnvironmentMode.DEV_ONLY snapshot_dev_only.categorize_as(SnapshotChangeCategory.BREAKING) # Mock state reader diff --git a/tests/core/test_snapshot.py b/tests/core/test_snapshot.py index 0b195899c9..afaa2b209d 100644 --- a/tests/core/test_snapshot.py +++ b/tests/core/test_snapshot.py @@ -163,12 +163,12 @@ def test_json(snapshot: Snapshot): "signals": [], "enabled": True, "extract_dependencies_from_query": 
True, + "virtual_environment_mode": "full", }, "name": '"name"', "parents": [{"name": '"parent"."tbl"', "identifier": snapshot.parents[0].identifier}], "previous_versions": [], "table_naming_convention": "schema_and_table", - "virtual_environment_mode": "full", "updated_ts": 1663891973000, "version": snapshot.fingerprint.to_version(), "migrated": False, @@ -912,7 +912,7 @@ def test_fingerprint(model: Model, parent_model: Model): fingerprint = fingerprint_from_node(model, nodes={}) original_fingerprint = SnapshotFingerprint( - data_hash="1312415267", + data_hash="3301649319", metadata_hash="1125608408", ) @@ -973,7 +973,7 @@ def test_fingerprint_seed_model(): ) expected_fingerprint = SnapshotFingerprint( - data_hash="1909791099", + data_hash="1586624913", metadata_hash="2315134974", ) @@ -1012,7 +1012,7 @@ def test_fingerprint_jinja_macros(model: Model): } ) original_fingerprint = SnapshotFingerprint( - data_hash="923305614", + data_hash="2908339239", metadata_hash="1125608408", ) @@ -1314,7 +1314,7 @@ def test_table_naming_convention_change_reuse_previous_version(make_snapshot): original_snapshot.categorize_as(SnapshotChangeCategory.BREAKING) assert original_snapshot.table_naming_convention == TableNamingConvention.SCHEMA_AND_TABLE - assert original_snapshot.table_name() == "sqlmesh__default.a__4145234055" + assert original_snapshot.table_name() == f"sqlmesh__default.a__{original_snapshot.version}" changed_snapshot: Snapshot = make_snapshot( SqlModel(name="a", query=parse_one("select 1, 'forward_only' as a, ds")), @@ -1332,7 +1332,7 @@ def test_table_naming_convention_change_reuse_previous_version(make_snapshot): changed_snapshot.previous_version.table_naming_convention == TableNamingConvention.SCHEMA_AND_TABLE ) - assert changed_snapshot.table_name() == "sqlmesh__default.a__4145234055" + assert changed_snapshot.table_name() == f"sqlmesh__default.a__{changed_snapshot.version}" def test_categorize_change_sql(make_snapshot): @@ -3349,15 +3349,16 @@ def 
test_merge_intervals_virtual_environment_mode_full(make_snapshot): name="test_model", kind=IncrementalByTimeRangeKind(time_column="ds"), query=parse_one("SELECT 1, ds FROM parent_tbl"), + virtual_environment_mode=VirtualEnvironmentMode.FULL, ) # Create source snapshot with intervals - source_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.FULL) + source_snapshot = make_snapshot(model) source_snapshot.add_interval("2020-01-01", "2020-01-03") source_snapshot.add_interval("2020-01-05", "2020-01-07") # Create target snapshot with different fingerprint and virtual_environment_mode FULL - target_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.FULL) + target_snapshot = make_snapshot(model) target_snapshot.fingerprint = SnapshotFingerprint( data_hash="different", metadata_hash="different", parent_data_hash="different" ) @@ -3377,15 +3378,16 @@ def test_merge_intervals_virtual_environment_mode_dev_only_paused_breaking(make_ name="test_model", kind=IncrementalByTimeRangeKind(time_column="ds"), query=parse_one("SELECT 1, ds FROM parent_tbl"), + virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY, ) # Create source snapshot with intervals - source_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + source_snapshot = make_snapshot(model) source_snapshot.add_interval("2020-01-01", "2020-01-03") source_snapshot.add_interval("2020-01-05", "2020-01-07") # Create target snapshot with different fingerprint and virtual_environment_mode DEV_ONLY - target_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + target_snapshot = make_snapshot(model) target_snapshot.fingerprint = SnapshotFingerprint( data_hash="different", metadata_hash="different", parent_data_hash="different" ) @@ -3405,15 +3407,16 @@ def test_merge_intervals_virtual_environment_mode_dev_only_unpaused(make_snapsho name="test_model", 
kind=IncrementalByTimeRangeKind(time_column="ds"), query=parse_one("SELECT 1, ds FROM parent_tbl"), + virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY, ) # Create source snapshot with intervals - source_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + source_snapshot = make_snapshot(model) source_snapshot.add_interval("2020-01-01", "2020-01-03") source_snapshot.add_interval("2020-01-05", "2020-01-07") # Create target snapshot with different fingerprint and virtual_environment_mode DEV_ONLY - target_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + target_snapshot = make_snapshot(model) target_snapshot.fingerprint = SnapshotFingerprint( data_hash="different", metadata_hash="different", parent_data_hash="different" ) @@ -3436,15 +3439,16 @@ def test_merge_intervals_virtual_environment_mode_dev_only_no_rebuild(make_snaps name="test_model", kind=IncrementalByTimeRangeKind(time_column="ds"), query=parse_one("SELECT 1, ds FROM parent_tbl"), + virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY, ) # Create source snapshot with intervals - source_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + source_snapshot = make_snapshot(model) source_snapshot.add_interval("2020-01-01", "2020-01-03") source_snapshot.add_interval("2020-01-05", "2020-01-07") # Create target snapshot with different fingerprint and virtual_environment_mode DEV_ONLY - target_snapshot = make_snapshot(model, virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY) + target_snapshot = make_snapshot(model) target_snapshot.fingerprint = SnapshotFingerprint( data_hash="different", metadata_hash="different", parent_data_hash="different" ) @@ -3483,9 +3487,10 @@ def test_table_name_virtual_environment_mode( name="my_schema.my_model", kind=IncrementalByTimeRangeKind(time_column="ds"), query=parse_one("SELECT 1, ds"), + virtual_environment_mode=virtual_env_mode, ) - snapshot = 
make_snapshot(model, virtual_environment_mode=virtual_env_mode) + snapshot = make_snapshot(model) snapshot.categorize_as(SnapshotChangeCategory.BREAKING) table_name_result = snapshot.table_name(is_deployable=is_deployable) From 81447624b06591a0d8a4efdf94ae46b6965d5f9a Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Thu, 31 Jul 2025 12:28:39 -0700 Subject: [PATCH 06/27] update docs --- docs/guides/configuration.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/guides/configuration.md b/docs/guides/configuration.md index 9d5cf25f8c..801aa8bf68 100644 --- a/docs/guides/configuration.md +++ b/docs/guides/configuration.md @@ -567,8 +567,10 @@ To mitigate this, SQLMesh offers an alternative 'dev-only' mode for using VDE. I As the name suggests, 'dev-only' mode means that VDE is applied only in development environments, while in production, model tables and views are updated directly, bypassing the virtual layer. This also means that physical tables in production will be created using the original, unversioned model names. Users will still benefit from VDE and data reuse across development environments. -Please note that enabling this mode means that all data inserted in development environments is used only for [preview](../concepts/plans.md#data-preview-for-forward-only-changes) and will **not** be reused in production. +Please note the following tradeoffs when enabling this mode: +- All data inserted in development environments is used only for [preview](../concepts/plans.md#data-preview-for-forward-only-changes) and will **not** be reused in production +- Reverting a model to a previous version will trigger a full rebuild (unless the model is forward-only), as there is no versioned physical table to revert to !!! warning Switching the mode for an existing project will result in a complete rebuild of all models in the project. 
Refer to the [Table Migration Guide](./table_migration.md) to migrate existing tables without rebuilding them from scratch. From 2d836a01d2a7d8610bd4116f824281e9476c0af9 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Thu, 31 Jul 2025 13:08:56 -0700 Subject: [PATCH 07/27] compute preview in dev --- sqlmesh/core/plan/builder.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/sqlmesh/core/plan/builder.py b/sqlmesh/core/plan/builder.py index b3520ed603..6d65f4c5b0 100644 --- a/sqlmesh/core/plan/builder.py +++ b/sqlmesh/core/plan/builder.py @@ -162,7 +162,7 @@ def __init__( self._start = start if not self._start and ( - self._forward_only_preview_needed or self._auto_restatement_preview_needed + self._forward_only_preview_needed or self._non_forward_only_preview_needed ): self._start = default_start or yesterday_ds() @@ -871,12 +871,18 @@ def _forward_only_preview_needed(self) -> bool: ) @cached_property - def _auto_restatement_preview_needed(self) -> bool: - return self._is_dev and any( - snapshot.model.auto_restatement_cron is not None - for snapshot in self._modified_and_added_snapshots - if snapshot.is_model - ) + def _non_forward_only_preview_needed(self) -> bool: + if not self._is_dev: + return False + for snapshot in self._modified_and_added_snapshots: + if not snapshot.is_model: + continue + if ( + not snapshot.virtual_environment_mode.is_full + or snapshot.model.auto_restatement_cron is not None + ): + return True + return False @cached_property def _modified_and_added_snapshots(self) -> t.List[Snapshot]: From 8be9fd5a20ab904c17ede00ab86348103d2f5240 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Thu, 31 Jul 2025 14:18:58 -0700 Subject: [PATCH 08/27] add an integration test --- examples/sushi/config.py | 11 +++ sqlmesh/dbt/basemodel.py | 6 +- sqlmesh/dbt/model.py | 7 +- sqlmesh/dbt/seed.py | 7 +- tests/core/test_integration.py | 125 +++++++++++++++++++++++++++++++++ 5 files changed, 153 insertions(+), 3 
deletions(-) diff --git a/examples/sushi/config.py b/examples/sushi/config.py index 2c124421dd..0bf15d2767 100644 --- a/examples/sushi/config.py +++ b/examples/sushi/config.py @@ -1,5 +1,6 @@ import os +from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.config import ( AutoCategorizationMode, BigQueryConnectionConfig, @@ -76,6 +77,16 @@ model_defaults=model_defaults, ) +# A configuration used for SQLMesh tests with virtual environment mode set to DEV_ONLY. +test_config_virtual_environment_mode_dev_only = test_config.copy( + update={ + "virtual_environment_mode": VirtualEnvironmentMode.DEV_ONLY, + "plan": PlanConfig( + auto_categorize_changes=CategorizerConfig.all_full(), + ), + } +) + # A DuckDB config with a physical schema map. map_config = Config( default_connection=DuckDBConnectionConfig(), diff --git a/sqlmesh/dbt/basemodel.py b/sqlmesh/dbt/basemodel.py index 74b90b8441..f2d99b52c3 100644 --- a/sqlmesh/dbt/basemodel.py +++ b/sqlmesh/dbt/basemodel.py @@ -30,6 +30,7 @@ from sqlmesh.utils.pydantic import field_validator if t.TYPE_CHECKING: + from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.audit.definition import ModelAudit from sqlmesh.dbt.context import DbtContext @@ -345,6 +346,9 @@ def sqlmesh_model_kwargs( @abstractmethod def to_sqlmesh( - self, context: DbtContext, audit_definitions: t.Optional[t.Dict[str, ModelAudit]] = None + self, + context: DbtContext, + audit_definitions: t.Optional[t.Dict[str, ModelAudit]] = None, + virtual_environment_mode: VirtualEnvironmentMode = VirtualEnvironmentMode.default, ) -> Model: """Convert DBT model into sqlmesh Model""" diff --git a/sqlmesh/dbt/model.py b/sqlmesh/dbt/model.py index 4cbca09aee..695315a7ac 100644 --- a/sqlmesh/dbt/model.py +++ b/sqlmesh/dbt/model.py @@ -29,6 +29,7 @@ from sqlmesh.utils.pydantic import field_validator if t.TYPE_CHECKING: + from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.audit.definition import 
ModelAudit from sqlmesh.dbt.context import DbtContext @@ -421,7 +422,10 @@ def sqlmesh_config_fields(self) -> t.Set[str]: } def to_sqlmesh( - self, context: DbtContext, audit_definitions: t.Optional[t.Dict[str, ModelAudit]] = None + self, + context: DbtContext, + audit_definitions: t.Optional[t.Dict[str, ModelAudit]] = None, + virtual_environment_mode: VirtualEnvironmentMode = VirtualEnvironmentMode.default, ) -> Model: """Converts the dbt model into a SQLMesh model.""" model_dialect = self.dialect(context) @@ -573,6 +577,7 @@ def to_sqlmesh( # Note: any table dependencies that are not referenced using the `ref` macro will not be included. extract_dependencies_from_query=False, allow_partials=allow_partials, + virtual_environment_mode=virtual_environment_mode, **optional_kwargs, **model_kwargs, ) diff --git a/sqlmesh/dbt/seed.py b/sqlmesh/dbt/seed.py index 78f24255dc..d46497ae20 100644 --- a/sqlmesh/dbt/seed.py +++ b/sqlmesh/dbt/seed.py @@ -20,6 +20,7 @@ from sqlmesh.dbt.column import ColumnConfig if t.TYPE_CHECKING: + from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.audit.definition import ModelAudit from sqlmesh.dbt.context import DbtContext @@ -38,7 +39,10 @@ class SeedConfig(BaseModelConfig): quote_columns: t.Optional[bool] = False def to_sqlmesh( - self, context: DbtContext, audit_definitions: t.Optional[t.Dict[str, ModelAudit]] = None + self, + context: DbtContext, + audit_definitions: t.Optional[t.Dict[str, ModelAudit]] = None, + virtual_environment_mode: VirtualEnvironmentMode = VirtualEnvironmentMode.default, ) -> Model: """Converts the dbt seed into a SQLMesh model.""" seed_path = self.path.absolute().as_posix() @@ -83,6 +87,7 @@ def to_sqlmesh( SeedKind(path=seed_path), dialect=self.dialect(context), audit_definitions=audit_definitions, + virtual_environment_mode=virtual_environment_mode, **kwargs, ) diff --git a/tests/core/test_integration.py b/tests/core/test_integration.py index d15e097875..ff125f6185 100644 --- 
a/tests/core/test_integration.py +++ b/tests/core/test_integration.py @@ -2454,6 +2454,131 @@ def test_unaligned_start_snapshot_with_non_deployable_downstream(init_and_plan_c assert snapshot_interval.intervals[0][0] == to_timestamp("2023-01-07") +@time_machine.travel("2023-01-08 15:00:00 UTC") +def test_virtual_environment_mode_dev_only(init_and_plan_context: t.Callable): + context, _ = init_and_plan_context( + "examples/sushi", config="test_config_virtual_environment_mode_dev_only" + ) + + assert all( + s.virtual_environment_mode.is_dev_only or not s.is_model or s.is_symbolic + for s in context.snapshots.values() + ) + + # Init prod + context.plan("prod", auto_apply=True, no_prompts=True) + + # Make a change in dev + original_model = context.get_model("sushi.waiter_revenue_by_day") + original_fingerprint = context.get_snapshot(original_model.name).fingerprint + model = original_model.copy(update={"query": original_model.query.order_by("waiter_id")}) + model = add_projection_to_model(t.cast(SqlModel, model)) + context.upsert_model(model) + + plan_dev = context.plan_builder("dev").build() + assert to_timestamp(plan_dev.start) == to_timestamp("2023-01-07") + assert plan_dev.requires_backfill + assert plan_dev.missing_intervals == [ + SnapshotIntervals( + snapshot_id=context.get_snapshot("sushi.top_waiters").snapshot_id, + intervals=[(to_timestamp("2023-01-07"), to_timestamp("2023-01-08"))], + ), + SnapshotIntervals( + snapshot_id=context.get_snapshot("sushi.waiter_revenue_by_day").snapshot_id, + intervals=[(to_timestamp("2023-01-07"), to_timestamp("2023-01-08"))], + ), + ] + context.apply(plan_dev) + + # Make sure the waiter_revenue_by_day model is a table in prod and a view in dev + table_types_df = context.engine_adapter.fetchdf( + "SELECT table_schema, table_type FROM INFORMATION_SCHEMA.TABLES WHERE table_name = 'waiter_revenue_by_day'" + ) + assert table_types_df.to_dict("records") == [ + {"table_schema": "sushi", "table_type": "BASE TABLE"}, + {"table_schema": 
"sushi__dev", "table_type": "VIEW"}, + ] + + # Check that the specified dates were backfilled + min_event_date = context.engine_adapter.fetchone( + "SELECT MIN(event_date) FROM sushi__dev.waiter_revenue_by_day" + )[0] + assert min_event_date == to_date("2023-01-07") + + # Make sure the changed models are fully rebuilt when deploying to prod + plan_prod = context.plan_builder("prod").build() + assert plan_prod.requires_backfill + assert plan_prod.missing_intervals == [ + SnapshotIntervals( + snapshot_id=context.get_snapshot("sushi.top_waiters").snapshot_id, + intervals=[ + (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")), + (to_timestamp("2023-01-02"), to_timestamp("2023-01-03")), + (to_timestamp("2023-01-03"), to_timestamp("2023-01-04")), + (to_timestamp("2023-01-04"), to_timestamp("2023-01-05")), + (to_timestamp("2023-01-05"), to_timestamp("2023-01-06")), + (to_timestamp("2023-01-06"), to_timestamp("2023-01-07")), + (to_timestamp("2023-01-07"), to_timestamp("2023-01-08")), + ], + ), + SnapshotIntervals( + snapshot_id=context.get_snapshot("sushi.waiter_revenue_by_day").snapshot_id, + intervals=[ + (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")), + (to_timestamp("2023-01-02"), to_timestamp("2023-01-03")), + (to_timestamp("2023-01-03"), to_timestamp("2023-01-04")), + (to_timestamp("2023-01-04"), to_timestamp("2023-01-05")), + (to_timestamp("2023-01-05"), to_timestamp("2023-01-06")), + (to_timestamp("2023-01-06"), to_timestamp("2023-01-07")), + (to_timestamp("2023-01-07"), to_timestamp("2023-01-08")), + ], + ), + ] + context.apply(plan_prod) + assert "one" in context.engine_adapter.columns("sushi.waiter_revenue_by_day") + assert ( + context.engine_adapter.fetchone( + "SELECT COUNT(*) FROM sushi.waiter_revenue_by_day WHERE one is NULL" + )[0] + == 0 + ) + + # Make sure the revert of a breaking changes results in a full rebuild + context.upsert_model(original_model) + assert context.get_snapshot(original_model.name).fingerprint == original_fingerprint + 
+ plan_prod = context.plan_builder("prod").build() + assert plan_prod.requires_backfill + assert plan_prod.missing_intervals == [ + SnapshotIntervals( + snapshot_id=context.get_snapshot("sushi.top_waiters").snapshot_id, + intervals=[ + (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")), + (to_timestamp("2023-01-02"), to_timestamp("2023-01-03")), + (to_timestamp("2023-01-03"), to_timestamp("2023-01-04")), + (to_timestamp("2023-01-04"), to_timestamp("2023-01-05")), + (to_timestamp("2023-01-05"), to_timestamp("2023-01-06")), + (to_timestamp("2023-01-06"), to_timestamp("2023-01-07")), + (to_timestamp("2023-01-07"), to_timestamp("2023-01-08")), + ], + ), + SnapshotIntervals( + snapshot_id=context.get_snapshot("sushi.waiter_revenue_by_day").snapshot_id, + intervals=[ + (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")), + (to_timestamp("2023-01-02"), to_timestamp("2023-01-03")), + (to_timestamp("2023-01-03"), to_timestamp("2023-01-04")), + (to_timestamp("2023-01-04"), to_timestamp("2023-01-05")), + (to_timestamp("2023-01-05"), to_timestamp("2023-01-06")), + (to_timestamp("2023-01-06"), to_timestamp("2023-01-07")), + (to_timestamp("2023-01-07"), to_timestamp("2023-01-08")), + ], + ), + ] + context.apply(plan_prod) + assert "one" not in context.engine_adapter.columns("sushi.waiter_revenue_by_day") + + @time_machine.travel("2023-01-08 15:00:00 UTC") def test_restatement_plan_ignores_changes(init_and_plan_context: t.Callable): context, plan = init_and_plan_context("examples/sushi") From 01e68767c321bd731f03e5e9cb9f59c1a4f91953 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Thu, 31 Jul 2025 14:29:04 -0700 Subject: [PATCH 09/27] fix dbt support --- sqlmesh/dbt/basemodel.py | 2 +- sqlmesh/dbt/loader.py | 6 +++++- sqlmesh/dbt/model.py | 2 +- sqlmesh/dbt/seed.py | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sqlmesh/dbt/basemodel.py b/sqlmesh/dbt/basemodel.py index f2d99b52c3..d226325dbc 100644 --- a/sqlmesh/dbt/basemodel.py +++ 
b/sqlmesh/dbt/basemodel.py @@ -10,6 +10,7 @@ from sqlmesh.core import dialect as d from sqlmesh.core.config.base import UpdateStrategy +from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.model import Model from sqlmesh.dbt.column import ( ColumnConfig, @@ -30,7 +31,6 @@ from sqlmesh.utils.pydantic import field_validator if t.TYPE_CHECKING: - from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.audit.definition import ModelAudit from sqlmesh.dbt.context import DbtContext diff --git a/sqlmesh/dbt/loader.py b/sqlmesh/dbt/loader.py index 23d34afa31..3d63219004 100644 --- a/sqlmesh/dbt/loader.py +++ b/sqlmesh/dbt/loader.py @@ -117,7 +117,11 @@ def _load_models( def _to_sqlmesh(config: BMC, context: DbtContext) -> Model: logger.debug("Converting '%s' to sqlmesh format", config.canonical_name(context)) - return config.to_sqlmesh(context, audit_definitions=audits) + return config.to_sqlmesh( + context, + audit_definitions=audits, + virtual_environment_mode=self.config.virtual_environment_mode, + ) for project in self._load_projects(): context = project.context.copy() diff --git a/sqlmesh/dbt/model.py b/sqlmesh/dbt/model.py index 695315a7ac..8563d20d22 100644 --- a/sqlmesh/dbt/model.py +++ b/sqlmesh/dbt/model.py @@ -8,6 +8,7 @@ from sqlmesh.core import dialect as d from sqlmesh.core.config.base import UpdateStrategy +from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.console import get_console from sqlmesh.core.model import ( EmbeddedKind, @@ -29,7 +30,6 @@ from sqlmesh.utils.pydantic import field_validator if t.TYPE_CHECKING: - from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.audit.definition import ModelAudit from sqlmesh.dbt.context import DbtContext diff --git a/sqlmesh/dbt/seed.py b/sqlmesh/dbt/seed.py index d46497ae20..fde5c7e569 100644 --- a/sqlmesh/dbt/seed.py +++ b/sqlmesh/dbt/seed.py @@ -15,12 +15,12 @@ SUPPORTS_DELIMITER = False from sqlglot import 
exp +from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.model import Model, SeedKind, create_seed_model from sqlmesh.dbt.basemodel import BaseModelConfig from sqlmesh.dbt.column import ColumnConfig if t.TYPE_CHECKING: - from sqlmesh.core.config.common import VirtualEnvironmentMode from sqlmesh.core.audit.definition import ModelAudit from sqlmesh.dbt.context import DbtContext From 273cb34532ed760ecb1c058397a7e7c71890375e Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Thu, 31 Jul 2025 16:33:03 -0700 Subject: [PATCH 10/27] use_finalized_state can't be used with non-full vde --- sqlmesh/core/config/root.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sqlmesh/core/config/root.py b/sqlmesh/core/config/root.py index d20463a506..8cbae53183 100644 --- a/sqlmesh/core/config/root.py +++ b/sqlmesh/core/config/root.py @@ -264,6 +264,11 @@ def _normalize_identifiers(key: str) -> None: "Please specify one or the other" ) + if self.plan.use_finalized_state and not self.virtual_environment_mode.is_full: + raise ConfigError( + "Using the finalized state is only supported when `virtual_environment_mode` is set to `full`." 
+ ) + if self.environment_catalog_mapping: _normalize_identifiers("environment_catalog_mapping") if self.physical_schema_mapping: From 2c25c9d4ec634480d9a4ffac5ab093deff9bd063 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Thu, 31 Jul 2025 16:46:56 -0700 Subject: [PATCH 11/27] minor root config fix --- sqlmesh/core/config/root.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sqlmesh/core/config/root.py b/sqlmesh/core/config/root.py index 8cbae53183..ec8fa9988f 100644 --- a/sqlmesh/core/config/root.py +++ b/sqlmesh/core/config/root.py @@ -153,9 +153,7 @@ class Config(BaseConfig): env_vars: t.Dict[str, str] = {} username: str = "" physical_schema_mapping: RegexKeyDict = {} - environment_suffix_target: EnvironmentSuffixTarget = Field( - default=EnvironmentSuffixTarget.default - ) + environment_suffix_target: EnvironmentSuffixTarget = EnvironmentSuffixTarget.default physical_table_naming_convention: TableNamingConvention = TableNamingConvention.default virtual_environment_mode: VirtualEnvironmentMode = VirtualEnvironmentMode.default gateway_managed_virtual_layer: bool = False From 52eb4763b97e1ad725b8c9ae192e0cbef28dd187 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Thu, 31 Jul 2025 16:51:58 -0700 Subject: [PATCH 12/27] add a migration script --- sqlmesh/migrations/v0088_add_virtual_environment_mode.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 sqlmesh/migrations/v0088_add_virtual_environment_mode.py diff --git a/sqlmesh/migrations/v0088_add_virtual_environment_mode.py b/sqlmesh/migrations/v0088_add_virtual_environment_mode.py new file mode 100644 index 0000000000..024ff03a0e --- /dev/null +++ b/sqlmesh/migrations/v0088_add_virtual_environment_mode.py @@ -0,0 +1,5 @@ +"""Add virtual_environment_mode to the model definition.""" + + +def migrate(state_sync, **kwargs): # type: ignore + pass From 99c8b0b05a8ec0cf12b1adc42c204ef2ac1f2e30 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Thu, 31 Jul 2025 
17:13:16 -0700 Subject: [PATCH 13/27] fix build --- sqlmesh/core/snapshot/definition.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sqlmesh/core/snapshot/definition.py b/sqlmesh/core/snapshot/definition.py index 83d923863d..59f997cdab 100644 --- a/sqlmesh/core/snapshot/definition.py +++ b/sqlmesh/core/snapshot/definition.py @@ -642,9 +642,7 @@ class Snapshot(PydanticModel, SnapshotInfoMixin): base_table_name_override: t.Optional[str] = None next_auto_restatement_ts: t.Optional[int] = None dev_table_suffix: str = "dev" - table_naming_convention_: TableNamingConvention = Field( - default=TableNamingConvention.default, alias="table_naming_convention" - ) + table_naming_convention: TableNamingConvention = TableNamingConvention.default forward_only: bool = False @field_validator("ttl") From 325bf76b224284d36777bb43c03a7989c23dd9ae Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Fri, 8 Aug 2025 15:30:28 -0700 Subject: [PATCH 14/27] fixes after rebase --- docs/guides/configuration.md | 2 +- sqlmesh/core/context.py | 9 ++-- sqlmesh/core/plan/builder.py | 10 ++--- sqlmesh/core/snapshot/definition.py | 1 + tests/core/test_integration.py | 70 ++++------------------------- 5 files changed, 21 insertions(+), 71 deletions(-) diff --git a/docs/guides/configuration.md b/docs/guides/configuration.md index 801aa8bf68..993ac2e5e2 100644 --- a/docs/guides/configuration.md +++ b/docs/guides/configuration.md @@ -570,7 +570,7 @@ As the name suggests, 'dev-only' mode means that VDE is applied only in developm Please note the following tradeoffs when enabling this mode: - All data inserted in development environments is used only for [preview](../concepts/plans.md#data-preview-for-forward-only-changes) and will **not** be reused in production -- Reverting a model to a previous version will trigger a full rebuild (unless the model is forward-only), as there is no versioned physical table to revert to +- Reverting a model to a previous version will be applied 
going forward and may require an explicit data restatement !!! warning Switching the mode for an existing project will result in a complete rebuild of all models in the project. Refer to the [Table Migration Guide](./table_migration.md) to migrate existing tables without rebuilding them from scratch. diff --git a/sqlmesh/core/context.py b/sqlmesh/core/context.py index 0031c1834c..7d27092f0e 100644 --- a/sqlmesh/core/context.py +++ b/sqlmesh/core/context.py @@ -1616,6 +1616,11 @@ def plan_builder( max_interval_end_per_model, ) + if not self.config.virtual_environment_mode.is_full: + forward_only = True + elif forward_only is None: + forward_only = self.config.plan.forward_only + return self.PLAN_BUILDER_TYPE( context_diff=context_diff, start=start, @@ -1628,9 +1633,7 @@ def plan_builder( skip_backfill=skip_backfill, empty_backfill=empty_backfill, is_dev=is_dev, - forward_only=( - forward_only if forward_only is not None else self.config.plan.forward_only - ), + forward_only=forward_only, allow_destructive_models=expanded_destructive_models, environment_ttl=environment_ttl, environment_suffix_target=self.config.environment_suffix_target, diff --git a/sqlmesh/core/plan/builder.py b/sqlmesh/core/plan/builder.py index 6d65f4c5b0..7440413f82 100644 --- a/sqlmesh/core/plan/builder.py +++ b/sqlmesh/core/plan/builder.py @@ -609,13 +609,13 @@ def _categorize_snapshot( if self._context_diff.directly_modified(s_id.name): new, old = self._context_diff.modified_snapshots[s_id.name] - is_breaking_kind_change = _is_breaking_kind_change(old, new) - if is_breaking_kind_change or snapshot.is_seed: + should_force_rebuild = _should_force_rebuild(old, new) + if should_force_rebuild or snapshot.is_seed: # Breaking kind changes and seed changes can't be forward-only. 
forward_only = False if self._auto_categorization_enabled: - if is_breaking_kind_change: + if should_force_rebuild: snapshot.categorize_as(SnapshotChangeCategory.BREAKING, forward_only) return @@ -773,7 +773,7 @@ def _is_forward_only_change(self, s_id: SnapshotId) -> bool: if snapshot.name in self._context_diff.modified_snapshots: _, old = self._context_diff.modified_snapshots[snapshot.name] # If the model kind has changed in a breaking way, then we can't consider this to be a forward-only change. - if snapshot.is_model and _should_force_breaking_change(old, snapshot): + if snapshot.is_model and _should_force_rebuild(old, snapshot): return False return ( snapshot.is_model and snapshot.model.forward_only and bool(snapshot.previous_versions) @@ -894,7 +894,7 @@ def _modified_and_added_snapshots(self) -> t.List[Snapshot]: ] -def _should_force_breaking_change(old: Snapshot, new: Snapshot) -> bool: +def _should_force_rebuild(old: Snapshot, new: Snapshot) -> bool: if old.virtual_environment_mode != new.virtual_environment_mode: # If the virtual environment mode has changed, then it's a breaking change return True diff --git a/sqlmesh/core/snapshot/definition.py b/sqlmesh/core/snapshot/definition.py index 59f997cdab..db9b814430 100644 --- a/sqlmesh/core/snapshot/definition.py +++ b/sqlmesh/core/snapshot/definition.py @@ -1408,6 +1408,7 @@ def requires_schema_migration_in_prod(self) -> bool: or self.model.forward_only or bool(self.model.physical_version) or self.is_view + or not self.virtual_environment_mode.is_full ) ) diff --git a/tests/core/test_integration.py b/tests/core/test_integration.py index ff125f6185..7dfa190074 100644 --- a/tests/core/test_integration.py +++ b/tests/core/test_integration.py @@ -2505,76 +2505,22 @@ def test_virtual_environment_mode_dev_only(init_and_plan_context: t.Callable): )[0] assert min_event_date == to_date("2023-01-07") - # Make sure the changed models are fully rebuilt when deploying to prod + # Make sure the changes are applied without 
backfill in prod plan_prod = context.plan_builder("prod").build() - assert plan_prod.requires_backfill - assert plan_prod.missing_intervals == [ - SnapshotIntervals( - snapshot_id=context.get_snapshot("sushi.top_waiters").snapshot_id, - intervals=[ - (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")), - (to_timestamp("2023-01-02"), to_timestamp("2023-01-03")), - (to_timestamp("2023-01-03"), to_timestamp("2023-01-04")), - (to_timestamp("2023-01-04"), to_timestamp("2023-01-05")), - (to_timestamp("2023-01-05"), to_timestamp("2023-01-06")), - (to_timestamp("2023-01-06"), to_timestamp("2023-01-07")), - (to_timestamp("2023-01-07"), to_timestamp("2023-01-08")), - ], - ), - SnapshotIntervals( - snapshot_id=context.get_snapshot("sushi.waiter_revenue_by_day").snapshot_id, - intervals=[ - (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")), - (to_timestamp("2023-01-02"), to_timestamp("2023-01-03")), - (to_timestamp("2023-01-03"), to_timestamp("2023-01-04")), - (to_timestamp("2023-01-04"), to_timestamp("2023-01-05")), - (to_timestamp("2023-01-05"), to_timestamp("2023-01-06")), - (to_timestamp("2023-01-06"), to_timestamp("2023-01-07")), - (to_timestamp("2023-01-07"), to_timestamp("2023-01-08")), - ], - ), - ] + assert not plan_prod.requires_backfill + assert not plan_prod.missing_intervals context.apply(plan_prod) assert "one" in context.engine_adapter.columns("sushi.waiter_revenue_by_day") - assert ( - context.engine_adapter.fetchone( - "SELECT COUNT(*) FROM sushi.waiter_revenue_by_day WHERE one is NULL" - )[0] - == 0 - ) # Make sure the revert of a breaking changes results in a full rebuild context.upsert_model(original_model) assert context.get_snapshot(original_model.name).fingerprint == original_fingerprint - plan_prod = context.plan_builder("prod").build() - assert plan_prod.requires_backfill - assert plan_prod.missing_intervals == [ - SnapshotIntervals( - snapshot_id=context.get_snapshot("sushi.top_waiters").snapshot_id, - intervals=[ - 
(to_timestamp("2023-01-01"), to_timestamp("2023-01-02")), - (to_timestamp("2023-01-02"), to_timestamp("2023-01-03")), - (to_timestamp("2023-01-03"), to_timestamp("2023-01-04")), - (to_timestamp("2023-01-04"), to_timestamp("2023-01-05")), - (to_timestamp("2023-01-05"), to_timestamp("2023-01-06")), - (to_timestamp("2023-01-06"), to_timestamp("2023-01-07")), - (to_timestamp("2023-01-07"), to_timestamp("2023-01-08")), - ], - ), - SnapshotIntervals( - snapshot_id=context.get_snapshot("sushi.waiter_revenue_by_day").snapshot_id, - intervals=[ - (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")), - (to_timestamp("2023-01-02"), to_timestamp("2023-01-03")), - (to_timestamp("2023-01-03"), to_timestamp("2023-01-04")), - (to_timestamp("2023-01-04"), to_timestamp("2023-01-05")), - (to_timestamp("2023-01-05"), to_timestamp("2023-01-06")), - (to_timestamp("2023-01-06"), to_timestamp("2023-01-07")), - (to_timestamp("2023-01-07"), to_timestamp("2023-01-08")), - ], - ), - ] + plan_prod = context.plan_builder( + "prod", allow_destructive_models=["sushi.waiter_revenue_by_day"] + ).build() + assert not plan_prod.requires_backfill + assert not plan_prod.missing_intervals context.apply(plan_prod) assert "one" not in context.engine_adapter.columns("sushi.waiter_revenue_by_day") From 096f13978278f6c2853b12a15ef1b76a2ee81450 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Fri, 8 Aug 2025 17:48:15 -0700 Subject: [PATCH 15/27] drop data objects of different types --- sqlmesh/core/engine_adapter/base.py | 63 ++++++++++++++++++++++++- sqlmesh/core/engine_adapter/redshift.py | 4 +- sqlmesh/core/engine_adapter/shared.py | 3 ++ sqlmesh/core/plan/builder.py | 8 ++-- tests/core/test_integration.py | 46 ++++++++++++++++++ 5 files changed, 118 insertions(+), 6 deletions(-) diff --git a/sqlmesh/core/engine_adapter/base.py b/sqlmesh/core/engine_adapter/base.py index c615a3029d..cbc2acbdb9 100644 --- a/sqlmesh/core/engine_adapter/base.py +++ b/sqlmesh/core/engine_adapter/base.py @@ -32,6 
+32,7 @@ CommentCreationTable, CommentCreationView, DataObject, + DataObjectType, EngineRunMode, InsertOverwriteStrategy, SourceQuery, @@ -369,6 +370,9 @@ def replace_query( kwargs: Optional create table properties. """ target_table = exp.to_table(table_name) + + table_exists = self._drop_data_object_on_type_mismatch(target_table, DataObjectType.TABLE) + source_queries, columns_to_types = self._get_source_queries_and_columns_to_types( query_or_df, columns_to_types, target_table=target_table ) @@ -390,7 +394,7 @@ def replace_query( ) # All engines support `CREATE TABLE AS` so we use that if the table doesn't already exist and we # use `CREATE OR REPLACE TABLE AS` if the engine supports it - if self.SUPPORTS_REPLACE_TABLE or not self.table_exists(target_table): + if self.SUPPORTS_REPLACE_TABLE or not table_exists: return self._create_table_from_source_queries( target_table, source_queries, @@ -930,6 +934,28 @@ def clone_table( ) ) + def drop_data_object(self, data_object: DataObject, ignore_if_not_exists: bool = True) -> None: + """Drops a data object of arbitrary type. + + Args: + data_object: The data object to drop. + ignore_if_not_exists: If True, no error will be raised if the data object does not exist. + """ + if data_object.type.is_view: + self.drop_view(data_object.to_table(), ignore_if_not_exists=ignore_if_not_exists) + elif data_object.type.is_materialized_view: + self.drop_view( + data_object.to_table(), ignore_if_not_exists=ignore_if_not_exists, materialized=True + ) + elif data_object.type.is_table: + self.drop_table(data_object.to_table(), exists=ignore_if_not_exists) + elif data_object.type.is_managed_table: + self.drop_managed_table(data_object.to_table(), exists=ignore_if_not_exists) + else: + raise SQLMeshError( + f"Can't drop data object '{data_object.to_table().sql(dialect=self.dialect)}' of type '{data_object.type.value}'" + ) + def drop_table(self, table_name: TableName, exists: bool = True) -> None: """Drops a table. 
@@ -1118,6 +1144,12 @@ def create_view( if properties.expressions: create_kwargs["properties"] = properties + if replace: + self._drop_data_object_on_type_mismatch( + view_name, + DataObjectType.VIEW if not materialized else DataObjectType.MATERIALIZED_VIEW, + ) + with source_queries[0] as query: self.execute( exp.Create( @@ -2483,6 +2515,35 @@ def _truncate_table(self, table_name: TableName) -> None: table = exp.to_table(table_name) self.execute(f"TRUNCATE TABLE {table.sql(dialect=self.dialect, identify=True)}") + def _drop_data_object_on_type_mismatch( + self, target_name: TableName, expected_type: DataObjectType + ) -> bool: + """Drops a data object if it exists and is not of the expected type. + + Args: + target_name: The name of the data object to check. + expected_type: The expected type of the data object. + + Returns: + True if the data object exists and is of the expected type, False otherwise. + """ + target_table = exp.to_table(target_name) + existing_data_objects = self.get_data_objects( + schema_(target_table.db, target_table.catalog), {target_table.name} + ) + if existing_data_objects: + if existing_data_objects[0].type == expected_type: + return True + + logger.warning( + "Target data object '%s' is a %s and not a %s, dropping it", + target_table.sql(dialect=self.dialect), + existing_data_objects[0].type.value, + expected_type.value, + ) + self.drop_data_object(existing_data_objects[0]) + return False + def _replace_by_key( self, target_table: TableName, diff --git a/sqlmesh/core/engine_adapter/redshift.py b/sqlmesh/core/engine_adapter/redshift.py index 906c52445f..8aa0916b27 100644 --- a/sqlmesh/core/engine_adapter/redshift.py +++ b/sqlmesh/core/engine_adapter/redshift.py @@ -262,7 +262,9 @@ def replace_query( """ import pandas as pd - if not isinstance(query_or_df, pd.DataFrame) or not self.table_exists(table_name): + table_exists = self._drop_data_object_on_type_mismatch(table_name, DataObjectType.TABLE) + + if not isinstance(query_or_df, 
pd.DataFrame) or not table_exists: return super().replace_query( table_name, query_or_df, diff --git a/sqlmesh/core/engine_adapter/shared.py b/sqlmesh/core/engine_adapter/shared.py index 1d882de02f..55f04a995e 100644 --- a/sqlmesh/core/engine_adapter/shared.py +++ b/sqlmesh/core/engine_adapter/shared.py @@ -171,6 +171,9 @@ class DataObject(PydanticModel): def is_clustered(self) -> bool: return bool(self.clustering_key) + def to_table(self) -> exp.Table: + return exp.table_(self.name, db=self.schema_name, catalog=self.catalog, quoted=True) + class CatalogSupport(Enum): # The engine has no concept of catalogs diff --git a/sqlmesh/core/plan/builder.py b/sqlmesh/core/plan/builder.py index 7440413f82..c3437bb814 100644 --- a/sqlmesh/core/plan/builder.py +++ b/sqlmesh/core/plan/builder.py @@ -896,15 +896,15 @@ def _modified_and_added_snapshots(self) -> t.List[Snapshot]: def _should_force_rebuild(old: Snapshot, new: Snapshot) -> bool: if old.virtual_environment_mode != new.virtual_environment_mode: - # If the virtual environment mode has changed, then it's a breaking change + # If the virtual environment mode has changed, then we need to rebuild return True if old.model.kind.name == new.model.kind.name: - # If the kind hasn't changed, then it's not a breaking change + # If the kind hasn't changed, then we don't need to rebuild return False if not old.is_incremental or not new.is_incremental: - # If either is not incremental, then it's a breaking change + # If either is not incremental, then we need to rebuild return True if old.model.partitioned_by == new.model.partitioned_by: - # If the partitioning hasn't changed, then it's not a breaking change + # If the partitioning hasn't changed, then we don't need to rebuild return False return True diff --git a/tests/core/test_integration.py b/tests/core/test_integration.py index 7dfa190074..e73db27a27 100644 --- a/tests/core/test_integration.py +++ b/tests/core/test_integration.py @@ -2525,6 +2525,52 @@ def 
test_virtual_environment_mode_dev_only(init_and_plan_context: t.Callable): assert "one" not in context.engine_adapter.columns("sushi.waiter_revenue_by_day") +@time_machine.travel("2023-01-08 15:00:00 UTC") +def test_virtual_environment_mode_dev_only_model_kind_change(init_and_plan_context: t.Callable): + context, plan = init_and_plan_context( + "examples/sushi", config="test_config_virtual_environment_mode_dev_only" + ) + context.apply(plan) + + # Change to full kind + model = context.get_model("sushi.top_waiters") + model = model.copy(update={"kind": FullKind()}) + context.upsert_model(model) + prod_plan = context.plan_builder("prod", skip_tests=True).build() + assert prod_plan.missing_intervals + assert prod_plan.requires_backfill + context.apply(prod_plan) + data_objects = context.engine_adapter.get_data_objects("sushi", {"top_waiters"}) + assert len(data_objects) == 1 + assert data_objects[0].type == "table" + + context.state_sync.clear_cache() + + # Change back to view + model = context.get_model("sushi.top_waiters") + model = model.copy(update={"kind": ViewKind()}) + context.upsert_model(model) + prod_plan = context.plan_builder("prod", skip_tests=True).build() + assert prod_plan.requires_backfill + assert prod_plan.missing_intervals + context.apply(prod_plan) + data_objects = context.engine_adapter.get_data_objects("sushi", {"top_waiters"}) + assert len(data_objects) == 1 + assert data_objects[0].type == "view" + + # Change to incremental + model = context.get_model("sushi.top_waiters") + model = model.copy(update={"kind": IncrementalUnmanagedKind()}) + context.upsert_model(model) + prod_plan = context.plan_builder("prod", skip_tests=True).build() + assert prod_plan.requires_backfill + assert prod_plan.missing_intervals + context.apply(prod_plan) + data_objects = context.engine_adapter.get_data_objects("sushi", {"top_waiters"}) + assert len(data_objects) == 1 + assert data_objects[0].type == "table" + + @time_machine.travel("2023-01-08 15:00:00 UTC") def 
test_restatement_plan_ignores_changes(init_and_plan_context: t.Callable): context, plan = init_and_plan_context("examples/sushi") From 224cff5dbb608f5244db99a77412ec206a9dce63 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Mon, 11 Aug 2025 08:50:23 -0700 Subject: [PATCH 16/27] extend unrestorable criteria --- sqlmesh/core/state_sync/db/snapshot.py | 7 ++-- tests/core/state_sync/test_state_sync.py | 41 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/sqlmesh/core/state_sync/db/snapshot.py b/sqlmesh/core/state_sync/db/snapshot.py index 3be4fb1b45..6064993087 100644 --- a/sqlmesh/core/state_sync/db/snapshot.py +++ b/sqlmesh/core/state_sync/db/snapshot.py @@ -173,10 +173,9 @@ def unpause_snapshots( snapshot.set_unpaused_ts(None) paused_snapshots.append(snapshot.snapshot_id) - if ( - not snapshot.is_forward_only - and target_snapshot.is_forward_only - and not snapshot.unrestorable + if not snapshot.unrestorable and ( + (target_snapshot.is_forward_only and not snapshot.is_forward_only) + or (snapshot.is_forward_only and not target_snapshot.is_forward_only) ): logger.info("Marking snapshot %s as unrestorable", snapshot.snapshot_id) snapshot.unrestorable = True diff --git a/tests/core/state_sync/test_state_sync.py b/tests/core/state_sync/test_state_sync.py index a5a6969e38..d8e96a1f35 100644 --- a/tests/core/state_sync/test_state_sync.py +++ b/tests/core/state_sync/test_state_sync.py @@ -1996,6 +1996,47 @@ def test_unrestorable_snapshot(state_sync: EngineAdapterStateSync, make_snapshot assert not actual_snapshots[new_forward_only_snapshot.snapshot_id].unrestorable +def test_unrestorable_snapshot_target_not_forward_only( + state_sync: EngineAdapterStateSync, make_snapshot: t.Callable +): + snapshot = make_snapshot( + SqlModel( + name="test_snapshot", + query=parse_one("select 1, ds"), + cron="@daily", + ), + ) + snapshot.categorize_as(SnapshotChangeCategory.BREAKING, forward_only=True) + snapshot.version = "a" + + assert not 
snapshot.unpaused_ts + state_sync.push_snapshots([snapshot]) + + unpaused_dt = "2022-01-01" + state_sync.unpause_snapshots([snapshot], unpaused_dt) + + actual_snapshot = state_sync.get_snapshots([snapshot])[snapshot.snapshot_id] + assert actual_snapshot.unpaused_ts + assert actual_snapshot.unpaused_ts == to_timestamp(unpaused_dt) + + updated_snapshot = make_snapshot( + SqlModel(name="test_snapshot", query=parse_one("select 2, ds"), cron="@daily") + ) + updated_snapshot.categorize_as(SnapshotChangeCategory.BREAKING, forward_only=False) + updated_snapshot.version = "a" + + assert not updated_snapshot.unpaused_ts + state_sync.push_snapshots([updated_snapshot]) + state_sync.unpause_snapshots([updated_snapshot], unpaused_dt) + + actual_snapshots = state_sync.get_snapshots([snapshot, updated_snapshot]) + assert not actual_snapshots[snapshot.snapshot_id].unpaused_ts + assert actual_snapshots[updated_snapshot.snapshot_id].unpaused_ts == to_timestamp(unpaused_dt) + + assert actual_snapshots[snapshot.snapshot_id].unrestorable + assert not actual_snapshots[updated_snapshot.snapshot_id].unrestorable + + def test_unpause_snapshots_remove_intervals( state_sync: EngineAdapterStateSync, make_snapshot: t.Callable ): From 1d7c7ded0899ea4f4b892b759e5b22e5a8262e67 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Mon, 11 Aug 2025 11:09:47 -0700 Subject: [PATCH 17/27] fix tests --- tests/conftest.py | 3 ++ tests/core/engine_adapter/test_athena.py | 7 ++++ tests/core/engine_adapter/test_clickhouse.py | 18 +++++++-- tests/core/engine_adapter/test_databricks.py | 11 +++++ tests/core/engine_adapter/test_mssql.py | 22 ++++++++-- tests/core/engine_adapter/test_redshift.py | 6 +++ tests/core/engine_adapter/test_spark.py | 37 ++++++++++++++--- tests/core/test_snapshot.py | 5 ++- tests/core/test_snapshot_evaluator.py | 8 ++-- tests/core/test_test.py | 42 -------------------- 10 files changed, 98 insertions(+), 61 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 
1bfa7a9f36..ad09deff6f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -470,6 +470,7 @@ def _make_function( dialect: t.Optional[str] = None, register_comments: bool = True, default_catalog: t.Optional[str] = None, + patch_get_data_objects: bool = True, **kwargs: t.Any, ) -> T: connection_mock = mocker.NonCallableMock() @@ -493,6 +494,8 @@ def _make_function( "sqlmesh.core.engine_adapter.mssql.MSSQLEngineAdapter.catalog_support", new_callable=PropertyMock(return_value=CatalogSupport.REQUIRES_SET_CATALOG), ) + if patch_get_data_objects: + mocker.patch.object(adapter, "_get_data_objects", return_value=[]) return adapter return _make_function diff --git a/tests/core/engine_adapter/test_athena.py b/tests/core/engine_adapter/test_athena.py index 6a5f30998b..5ee07f52d5 100644 --- a/tests/core/engine_adapter/test_athena.py +++ b/tests/core/engine_adapter/test_athena.py @@ -7,6 +7,7 @@ from sqlglot import exp, parse_one import sqlmesh.core.dialect as d from sqlmesh.core.engine_adapter import AthenaEngineAdapter +from sqlmesh.core.engine_adapter.shared import DataObject from sqlmesh.core.model import load_sql_based_model from sqlmesh.core.model.definition import SqlModel from sqlmesh.utils.errors import SQLMeshError @@ -288,6 +289,11 @@ def test_replace_query(adapter: AthenaEngineAdapter, mocker: MockerFixture): "sqlmesh.core.engine_adapter.athena.AthenaEngineAdapter._query_table_type", return_value="iceberg", ) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="", name="test", type="table")], + ) adapter.replace_query( table_name="test", @@ -304,6 +310,7 @@ def test_replace_query(adapter: AthenaEngineAdapter, mocker: MockerFixture): mocker.patch( "sqlmesh.core.engine_adapter.athena.AthenaEngineAdapter.table_exists", return_value=False ) + mocker.patch.object(adapter, "_get_data_objects", return_value=[]) adapter.cursor.execute.reset_mock() adapter.s3_warehouse_location = "s3://foo" diff --git 
a/tests/core/engine_adapter/test_clickhouse.py b/tests/core/engine_adapter/test_clickhouse.py index 1665239e36..a0cd33af70 100644 --- a/tests/core/engine_adapter/test_clickhouse.py +++ b/tests/core/engine_adapter/test_clickhouse.py @@ -2,7 +2,7 @@ from sqlmesh.core.engine_adapter import ClickhouseEngineAdapter from sqlmesh.core.model.definition import load_sql_based_model from sqlmesh.core.model.kind import ModelKindName -from sqlmesh.core.engine_adapter.shared import EngineRunMode +from sqlmesh.core.engine_adapter.shared import EngineRunMode, DataObject from tests.core.engine_adapter import to_sql_calls from sqlmesh.core.dialect import parse from sqlglot import exp, parse_one @@ -573,6 +573,12 @@ def test_scd_type_2_by_time( make_temp_table_name(table_name, "abcd"), ] + mocker.patch.object( + adapter, + "get_data_objects", + return_value=[DataObject(schema="", name=table_name, type="table")], + ) + fetchone_mock = mocker.patch("sqlmesh.core.engine_adapter.ClickhouseEngineAdapter.fetchone") fetchone_mock.return_value = None @@ -610,7 +616,7 @@ def test_scd_type_2_by_time( truncate=True, ) - assert to_sql_calls(adapter)[4] == parse_one( + assert to_sql_calls(adapter)[3] == parse_one( """ INSERT INTO "__temp_target_abcd" ("id", "name", "price", "test_UPDATED_at", "test_valid_from", "test_valid_to") WITH "source" AS ( @@ -787,6 +793,12 @@ def test_scd_type_2_by_column( make_temp_table_name(table_name, "abcd"), ] + mocker.patch.object( + adapter, + "get_data_objects", + return_value=[DataObject(schema="", name=table_name, type="table")], + ) + fetchone_mock = mocker.patch("sqlmesh.core.engine_adapter.ClickhouseEngineAdapter.fetchone") fetchone_mock.return_value = None @@ -817,7 +829,7 @@ def test_scd_type_2_by_column( truncate=True, ) - assert to_sql_calls(adapter)[4] == parse_one( + assert to_sql_calls(adapter)[3] == parse_one( """ INSERT INTO "__temp_target_abcd" ("id", "name", "price", "test_VALID_from", "test_valid_to") WITH "source" AS ( diff --git 
a/tests/core/engine_adapter/test_databricks.py b/tests/core/engine_adapter/test_databricks.py index 25698875a5..5991f5b2b9 100644 --- a/tests/core/engine_adapter/test_databricks.py +++ b/tests/core/engine_adapter/test_databricks.py @@ -8,6 +8,7 @@ from sqlmesh.core import dialect as d from sqlmesh.core.engine_adapter import DatabricksEngineAdapter +from sqlmesh.core.engine_adapter.shared import DataObject from sqlmesh.core.node import IntervalUnit from tests.core.engine_adapter import to_sql_calls @@ -41,6 +42,11 @@ def test_replace_query_exists(mocker: MockFixture, make_mocked_engine_adapter: t "sqlmesh.core.engine_adapter.databricks.DatabricksEngineAdapter.set_current_catalog" ) adapter = make_mocked_engine_adapter(DatabricksEngineAdapter, default_catalog="test_catalog") + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="", name="test_table", type="table")], + ) adapter.replace_query("test_table", parse_one("SELECT a FROM tbl"), {"a": "int"}) assert to_sql_calls(adapter) == [ @@ -78,6 +84,11 @@ def test_replace_query_pandas_exists(mocker: MockFixture, make_mocked_engine_ada "sqlmesh.core.engine_adapter.databricks.DatabricksEngineAdapter.set_current_catalog" ) adapter = make_mocked_engine_adapter(DatabricksEngineAdapter, default_catalog="test_catalog") + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="", name="test_table", type="table")], + ) df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) adapter.replace_query( "test_table", df, {"a": exp.DataType.build("int"), "b": exp.DataType.build("int")} diff --git a/tests/core/engine_adapter/test_mssql.py b/tests/core/engine_adapter/test_mssql.py index 65f3231163..d8e5214be5 100644 --- a/tests/core/engine_adapter/test_mssql.py +++ b/tests/core/engine_adapter/test_mssql.py @@ -582,13 +582,16 @@ def test_merge_exists( ] -def test_replace_query(make_mocked_engine_adapter: t.Callable): +def test_replace_query(make_mocked_engine_adapter: 
t.Callable, mocker: MockerFixture): adapter = make_mocked_engine_adapter(MSSQLEngineAdapter) - adapter.cursor.fetchone.return_value = (1,) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="", name="test_table", type="table")], + ) adapter.replace_query("test_table", parse_one("SELECT a FROM tbl"), {"a": "int"}) assert to_sql_calls(adapter) == [ - """SELECT 1 FROM [INFORMATION_SCHEMA].[TABLES] WHERE [TABLE_NAME] = 'test_table';""", "TRUNCATE TABLE [test_table];", "INSERT INTO [test_table] ([a]) SELECT [a] FROM [tbl];", ] @@ -605,6 +608,11 @@ def test_replace_query_pandas( ) adapter = make_mocked_engine_adapter(MSSQLEngineAdapter) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="", name="test_table", type="table")], + ) adapter.cursor.fetchone.return_value = (1,) temp_table_mock = mocker.patch("sqlmesh.core.engine_adapter.EngineAdapter._get_temp_table") @@ -682,7 +690,7 @@ def test_drop_schema_with_catalog(make_mocked_engine_adapter: t.Callable, mocker def test_get_data_objects_catalog(make_mocked_engine_adapter: t.Callable, mocker: MockerFixture): - adapter = make_mocked_engine_adapter(MSSQLEngineAdapter) + adapter = make_mocked_engine_adapter(MSSQLEngineAdapter, patch_get_data_objects=False) original_set_current_catalog = adapter.set_current_catalog local_state = {} @@ -912,6 +920,12 @@ def test_replace_query_strategy(adapter: MSSQLEngineAdapter, mocker: MockerFixtu exists_mock.return_value = True assert adapter.table_exists("test_table") + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="", name="test_table", type="table")], + ) + adapter.replace_query( "test_table", model.render_query_or_raise(), diff --git a/tests/core/engine_adapter/test_redshift.py b/tests/core/engine_adapter/test_redshift.py index ef1e204ce5..0db8e8d055 100644 --- a/tests/core/engine_adapter/test_redshift.py +++ b/tests/core/engine_adapter/test_redshift.py @@ -9,6 +9,7 
@@ from sqlglot import parse_one from sqlmesh.core.engine_adapter import RedshiftEngineAdapter +from sqlmesh.core.engine_adapter.shared import DataObject from sqlmesh.utils.errors import SQLMeshError from tests.core.engine_adapter import to_sql_calls @@ -262,6 +263,11 @@ def mock_table(*args, **kwargs): mock_temp_table = mocker.MagicMock(side_effect=mock_table) mocker.patch("sqlmesh.core.engine_adapter.EngineAdapter._get_temp_table", mock_temp_table) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="", name="test_table", type="table")], + ) adapter.replace_query( table_name="test_table", diff --git a/tests/core/engine_adapter/test_spark.py b/tests/core/engine_adapter/test_spark.py index 8a455c47a3..2ef70a6929 100644 --- a/tests/core/engine_adapter/test_spark.py +++ b/tests/core/engine_adapter/test_spark.py @@ -10,6 +10,7 @@ from sqlglot import parse_one from sqlmesh.core.engine_adapter import SparkEngineAdapter +from sqlmesh.core.engine_adapter.shared import DataObject from sqlmesh.utils.errors import SQLMeshError from tests.core.engine_adapter import to_sql_calls import sqlmesh.core.dialect as d @@ -102,6 +103,11 @@ def test_replace_query_table_properties_exists( return_value=True, ) adapter = make_mocked_engine_adapter(SparkEngineAdapter) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="", name="test_table", type="table")], + ) columns_to_types = { "cola": exp.DataType.build("INT"), @@ -194,6 +200,11 @@ def test_replace_query_exists(mocker: MockerFixture, make_mocked_engine_adapter: return_value=True, ) adapter = make_mocked_engine_adapter(SparkEngineAdapter) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="", name="test_table", type="table")], + ) adapter.replace_query("test_table", parse_one("SELECT a FROM tbl"), {"a": "int"}) assert to_sql_calls(adapter) == [ @@ -239,6 +250,12 @@ def check_table_exists(table_name: exp.Table) -> bool: 
side_effect=check_table_exists, ) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="db", name="table", type="table")], + ) + adapter.replace_query(table_name, parse_one(f"SELECT col + 1 AS col FROM {table_name}")) assert to_sql_calls(adapter) == [ @@ -268,6 +285,11 @@ def test_replace_query_self_ref_exists( adapter = make_mocked_engine_adapter(SparkEngineAdapter) adapter.cursor.fetchone.return_value = (1,) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="db", name="table", type="table")], + ) table_name = "db.table" temp_table_id = "abcdefgh" @@ -525,11 +547,6 @@ def test_spark_struct_complex_to_col_to_types(type_name, spark_type): def test_scd_type_2_by_time( make_mocked_engine_adapter: t.Callable, make_temp_table_name: t.Callable, mocker: MockerFixture ): - mocker.patch( - "sqlmesh.core.engine_adapter.spark.SparkEngineAdapter.table_exists", - return_value=False, - ) - adapter = make_mocked_engine_adapter(SparkEngineAdapter) adapter._default_catalog = "spark_catalog" adapter.spark.catalog.currentCatalog.return_value = "spark_catalog" @@ -550,6 +567,11 @@ def check_table_exists(table_name: exp.Table) -> bool: "sqlmesh.core.engine_adapter.spark.SparkEngineAdapter.table_exists", side_effect=check_table_exists, ) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="db", name="target", type="table")], + ) adapter.scd_type_2_by_time( target_table="db.target", @@ -981,6 +1003,11 @@ def test_replace_query_with_wap_self_reference( ) adapter = make_mocked_engine_adapter(SparkEngineAdapter) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="schema", name="table", type="table")], + ) adapter.replace_query( "catalog.schema.table.branch_wap_12345", diff --git a/tests/core/test_snapshot.py b/tests/core/test_snapshot.py index afaa2b209d..194a5b9487 100644 --- a/tests/core/test_snapshot.py +++ 
b/tests/core/test_snapshot.py @@ -3453,8 +3453,9 @@ def test_merge_intervals_virtual_environment_mode_dev_only_no_rebuild(make_snaps data_hash="different", metadata_hash="different", parent_data_hash="different" ) target_snapshot.categorize_as( - SnapshotChangeCategory.FORWARD_ONLY - ) # This is a no-rebuild category + SnapshotChangeCategory.BREAKING, + forward_only=True, + ) # This is a no-rebuild categorization # Ensure snapshot is paused target_snapshot.unpaused_ts = None diff --git a/tests/core/test_snapshot_evaluator.py b/tests/core/test_snapshot_evaluator.py index 4b028e148b..b0e7c9cb05 100644 --- a/tests/core/test_snapshot_evaluator.py +++ b/tests/core/test_snapshot_evaluator.py @@ -1262,13 +1262,11 @@ def test_migrate_missing_table(mocker: MockerFixture, make_snapshot): def test_migrate_view( mocker: MockerFixture, make_snapshot, + make_mocked_engine_adapter, change_category: SnapshotChangeCategory, forward_only: bool, ): - connection_mock = mocker.NonCallableMock() - cursor_mock = mocker.Mock() - connection_mock.cursor.return_value = cursor_mock - adapter = EngineAdapter(lambda: connection_mock, "") + adapter = make_mocked_engine_adapter(EngineAdapter) evaluator = SnapshotEvaluator(adapter) @@ -1284,7 +1282,7 @@ def test_migrate_view( evaluator.migrate([snapshot], {}, deployability_index=DeployabilityIndex.none_deployable()) - cursor_mock.execute.assert_has_calls( + adapter.cursor.execute.assert_has_calls( [ call( 'CREATE OR REPLACE VIEW "sqlmesh__test_schema"."test_schema__test_model__1" ("c", "a") AS SELECT "c" AS "c", "a" AS "a" FROM "tbl" AS "tbl"' diff --git a/tests/core/test_test.py b/tests/core/test_test.py index 521773d1ca..9c3c3aba4b 100644 --- a/tests/core/test_test.py +++ b/tests/core/test_test.py @@ -2153,44 +2153,6 @@ def test_test_generation_with_timestamp(tmp_path: Path) -> None: } -def test_test_generation_with_decimal(tmp_path: Path, mocker: MockerFixture) -> None: - from decimal import Decimal - - init_example_project(tmp_path, 
engine_type="duckdb") - - config = Config( - default_connection=DuckDBConnectionConfig(), - model_defaults=ModelDefaultsConfig(dialect="duckdb"), - ) - foo_sql_file = tmp_path / "models" / "foo.sql" - foo_sql_file.write_text( - "MODEL (name sqlmesh_example.foo); SELECT dec_col FROM sqlmesh_example.bar;" - ) - bar_sql_file = tmp_path / "models" / "bar.sql" - bar_sql_file.write_text("MODEL (name sqlmesh_example.bar); SELECT dec_col FROM external_table;") - - context = Context(paths=tmp_path, config=config) - input_queries = { - '"memory"."sqlmesh_example"."bar"': "SELECT CAST(1.23 AS DECIMAL(10,2)) AS dec_col" - } - - # DuckDB actually returns a numpy.float64, even though the value is cast into a DECIMAL, - # but other engines don't behave the same. E.g. BigQuery returns a proper Decimal value. - mocker.patch( - "sqlmesh.core.engine_adapter.base.EngineAdapter.fetchdf", - return_value=pd.DataFrame({"dec_col": [Decimal("1.23")]}), - ) - - context.create_test("sqlmesh_example.foo", input_queries=input_queries, overwrite=True) - - test = load_yaml(context.path / c.TESTS / "test_foo.yaml") - - assert len(test) == 1 - assert "test_foo" in test - assert test["test_foo"]["inputs"] == {'"memory"."sqlmesh_example"."bar"': [{"dec_col": "1.23"}]} - assert test["test_foo"]["outputs"] == {"query": [{"dec_col": "1.23"}]} - - def test_test_generation_with_recursive_ctes(tmp_path: Path) -> None: init_example_project(tmp_path, engine_type="duckdb") @@ -2247,10 +2209,6 @@ def test_test_with_gateway_specific_model(tmp_path: Path, mocker: MockerFixture) context = Context(paths=tmp_path, config=config) input_queries = {'"memory"."sqlmesh_example"."input_model"': "SELECT 5 AS c"} - mocker.patch( - "sqlmesh.core.engine_adapter.base.EngineAdapter.fetchdf", - return_value=pd.DataFrame({"c": [5]}), - ) assert context.engine_adapter == context.engine_adapters["main"] with pytest.raises( From 2ac596921fdb2ed9c79930d33aec2c1e2f84e067 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Mon, 11 
Aug 2025 11:52:55 -0700 Subject: [PATCH 18/27] improve tests --- tests/core/engine_adapter/test_base.py | 40 +++++++++++++++++++++++++- tests/core/test_integration.py | 34 ++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/tests/core/engine_adapter/test_base.py b/tests/core/engine_adapter/test_base.py index 6c9d2ee132..f1dbe7d33a 100644 --- a/tests/core/engine_adapter/test_base.py +++ b/tests/core/engine_adapter/test_base.py @@ -14,7 +14,7 @@ from sqlmesh.core.dialect import normalize_model_name from sqlmesh.core.engine_adapter import EngineAdapter, EngineAdapterWithIndexSupport from sqlmesh.core.engine_adapter.mixins import InsertOverwriteWithMergeMixin -from sqlmesh.core.engine_adapter.shared import InsertOverwriteStrategy +from sqlmesh.core.engine_adapter.shared import InsertOverwriteStrategy, DataObject from sqlmesh.core.schema_diff import SchemaDiffer, TableAlterOperation from sqlmesh.utils import columns_to_types_to_struct from sqlmesh.utils.date import to_ds @@ -43,6 +43,23 @@ def test_create_view(make_mocked_engine_adapter: t.Callable): ] +def test_create_view_existing_data_object_type_mismatch( + make_mocked_engine_adapter: t.Callable, mocker: MockerFixture +): + adapter = make_mocked_engine_adapter(EngineAdapter) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="", name="test_view", type="table")], + ) + adapter.create_view("test_view", parse_one("SELECT a FROM tbl")) + + assert to_sql_calls(adapter) == [ + 'DROP TABLE IF EXISTS "test_view"', + 'CREATE OR REPLACE VIEW "test_view" AS SELECT "a" FROM "tbl"', + ] + + def test_create_view_pandas(make_mocked_engine_adapter: t.Callable): adapter = make_mocked_engine_adapter(EngineAdapter) adapter.create_view("test_view", pd.DataFrame({"a": [1, 2, 3]}), replace=False) @@ -2713,6 +2730,27 @@ def test_replace_query(make_mocked_engine_adapter: t.Callable, mocker: MockerFix ] +def test_replace_query_data_object_type_mismatch( + 
make_mocked_engine_adapter: t.Callable, mocker: MockerFixture +): + adapter = make_mocked_engine_adapter(EngineAdapter) + mocker.patch.object( + adapter, + "_get_data_objects", + return_value=[DataObject(schema="", name="test_table", type="view")], + ) + + adapter.replace_query( + "test_table", parse_one("SELECT a FROM tbl"), {"a": exp.DataType.build("INT")} + ) + + # TODO: Shouldn't we enforce that `a` is casted to an int? + assert to_sql_calls(adapter) == [ + 'DROP VIEW IF EXISTS "test_table"', + 'CREATE OR REPLACE TABLE "test_table" AS SELECT CAST("a" AS INT) AS "a" FROM (SELECT "a" FROM "tbl") AS "_subquery"', + ] + + def test_replace_query_pandas(make_mocked_engine_adapter: t.Callable): adapter = make_mocked_engine_adapter(EngineAdapter) adapter.DEFAULT_BATCH_SIZE = 1 diff --git a/tests/core/test_integration.py b/tests/core/test_integration.py index e73db27a27..025b2f45e6 100644 --- a/tests/core/test_integration.py +++ b/tests/core/test_integration.py @@ -2488,6 +2488,16 @@ def test_virtual_environment_mode_dev_only(init_and_plan_context: t.Callable): intervals=[(to_timestamp("2023-01-07"), to_timestamp("2023-01-08"))], ), ] + assert plan_dev.context_diff.snapshots[context.get_snapshot(model.name).snapshot_id].intervals + assert plan_dev.context_diff.snapshots[ + context.get_snapshot("sushi.top_waiters").snapshot_id + ].intervals + assert plan_dev.context_diff.snapshots[ + context.get_snapshot(model.name).snapshot_id + ].dev_intervals + assert plan_dev.context_diff.snapshots[ + context.get_snapshot("sushi.top_waiters").snapshot_id + ].dev_intervals context.apply(plan_dev) # Make sure the waiter_revenue_by_day model is a table in prod and a view in dev @@ -2539,6 +2549,9 @@ def test_virtual_environment_mode_dev_only_model_kind_change(init_and_plan_conte prod_plan = context.plan_builder("prod", skip_tests=True).build() assert prod_plan.missing_intervals assert prod_plan.requires_backfill + assert not prod_plan.context_diff.snapshots[ + 
context.get_snapshot(model.name).snapshot_id + ].intervals context.apply(prod_plan) data_objects = context.engine_adapter.get_data_objects("sushi", {"top_waiters"}) assert len(data_objects) == 1 @@ -2553,6 +2566,9 @@ def test_virtual_environment_mode_dev_only_model_kind_change(init_and_plan_conte prod_plan = context.plan_builder("prod", skip_tests=True).build() assert prod_plan.requires_backfill assert prod_plan.missing_intervals + assert not prod_plan.context_diff.snapshots[ + context.get_snapshot(model.name).snapshot_id + ].intervals context.apply(prod_plan) data_objects = context.engine_adapter.get_data_objects("sushi", {"top_waiters"}) assert len(data_objects) == 1 @@ -2565,6 +2581,24 @@ def test_virtual_environment_mode_dev_only_model_kind_change(init_and_plan_conte prod_plan = context.plan_builder("prod", skip_tests=True).build() assert prod_plan.requires_backfill assert prod_plan.missing_intervals + assert not prod_plan.context_diff.snapshots[ + context.get_snapshot(model.name).snapshot_id + ].intervals + context.apply(prod_plan) + data_objects = context.engine_adapter.get_data_objects("sushi", {"top_waiters"}) + assert len(data_objects) == 1 + assert data_objects[0].type == "table" + + # Change back to full + model = context.get_model("sushi.top_waiters") + model = model.copy(update={"kind": FullKind()}) + context.upsert_model(model) + prod_plan = context.plan_builder("prod", skip_tests=True).build() + assert prod_plan.requires_backfill + assert prod_plan.missing_intervals + assert not prod_plan.context_diff.snapshots[ + context.get_snapshot(model.name).snapshot_id + ].intervals context.apply(prod_plan) data_objects = context.engine_adapter.get_data_objects("sushi", {"top_waiters"}) assert len(data_objects) == 1 From 24e10dd0ebeed876ed942a9d7c279dc83ffae640 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Mon, 11 Aug 2025 12:02:32 -0700 Subject: [PATCH 19/27] fix warning --- sqlmesh/core/snapshot/definition.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/sqlmesh/core/snapshot/definition.py b/sqlmesh/core/snapshot/definition.py index db9b814430..d85def6e32 100644 --- a/sqlmesh/core/snapshot/definition.py +++ b/sqlmesh/core/snapshot/definition.py @@ -337,7 +337,7 @@ class SnapshotInfoMixin(ModelKindMixin): # This can be removed from this model once Pydantic 1 support is dropped (must remain in `Snapshot` though) base_table_name_override: t.Optional[str] dev_table_suffix: str - table_naming_convention: TableNamingConvention = Field(default=TableNamingConvention.default) + table_naming_convention: TableNamingConvention forward_only: bool @cached_property From 9e36f951cf40fabc7587181bf22a3e1c3be95719 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Mon, 11 Aug 2025 12:04:12 -0700 Subject: [PATCH 20/27] fix typo --- sqlmesh/core/snapshot/definition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlmesh/core/snapshot/definition.py b/sqlmesh/core/snapshot/definition.py index d85def6e32..f00573403c 100644 --- a/sqlmesh/core/snapshot/definition.py +++ b/sqlmesh/core/snapshot/definition.py @@ -901,7 +901,7 @@ def merge_intervals(self, other: t.Union[Snapshot, SnapshotIntervals]) -> None: apply_effective_from = effective_from_ts > 0 and self.identifier != other.identifier for start, end in other.intervals: # If the effective_from is set, then intervals that come after it must come from - # the current snapshost. + # the current snapshots. 
if apply_effective_from and start < effective_from_ts: end = min(end, effective_from_ts) if not apply_effective_from or end <= effective_from_ts: From dfa0e353f9f2a70de410e04fd24c5a8ba60d3deb Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Mon, 11 Aug 2025 12:43:46 -0700 Subject: [PATCH 21/27] remove obsolete plan checks --- sqlmesh/core/plan/builder.py | 22 ----------------- tests/core/test_plan.py | 48 ------------------------------------ 2 files changed, 70 deletions(-) diff --git a/sqlmesh/core/plan/builder.py b/sqlmesh/core/plan/builder.py index c3437bb814..94003481c7 100644 --- a/sqlmesh/core/plan/builder.py +++ b/sqlmesh/core/plan/builder.py @@ -267,7 +267,6 @@ def build(self) -> Plan: self._ensure_no_new_snapshots_with_restatements() self._ensure_new_env_with_changes() self._ensure_valid_date_range() - self._ensure_no_forward_only_revert() self._ensure_no_broken_references() self._apply_effective_from() @@ -801,27 +800,6 @@ def _ensure_valid_date_range(self) -> None: f"Plan end date: '{time_like_to_str(end)}' cannot be in the future (execution time: '{time_like_to_str(self.execution_time)}')" ) - def _ensure_no_forward_only_revert(self) -> None: - """Ensures that a previously superseded breaking / non-breaking snapshot is not being - used again to replace an existing forward-only snapshot with the same version. - - In other words there is no going back to the original non-forward-only snapshot with - the same version once a forward-only change for that version has been introduced. - """ - for name, (candidate, promoted) in self._context_diff.modified_snapshots.items(): - if ( - candidate.snapshot_id not in self._context_diff.new_snapshots - and candidate.is_model - and not candidate.model.forward_only - and promoted.is_forward_only - and not promoted.is_paused - and not candidate.is_no_rebuild - and promoted.version == candidate.version - ): - raise PlanError( - f"Attempted to revert to an unrevertable version of model '{name}'. 
Run `sqlmesh plan` again to mitigate the issue." - ) - def _ensure_no_broken_references(self) -> None: for snapshot in self._context_diff.snapshots.values(): broken_references = { diff --git a/tests/core/test_plan.py b/tests/core/test_plan.py index 35c3628cff..66018d4be4 100644 --- a/tests/core/test_plan.py +++ b/tests/core/test_plan.py @@ -37,7 +37,6 @@ from sqlmesh.utils.dag import DAG from sqlmesh.utils.date import ( now, - now_timestamp, to_date, to_datetime, to_timestamp, @@ -1097,53 +1096,6 @@ def test_end_validation(make_snapshot, mocker: MockerFixture): assert restatement_prod_plan_builder.build().end == "2022-01-04" -def test_forward_only_revert_not_allowed(make_snapshot, mocker: MockerFixture): - snapshot = make_snapshot(SqlModel(name="a", query=parse_one("select 1, ds"))) - snapshot.categorize_as(SnapshotChangeCategory.BREAKING) - assert not snapshot.is_forward_only - - forward_only_snapshot = make_snapshot(SqlModel(name="a", query=parse_one("select 2, ds"))) - forward_only_snapshot.categorize_as(SnapshotChangeCategory.BREAKING, forward_only=True) - forward_only_snapshot.version = snapshot.version - forward_only_snapshot.unpaused_ts = now_timestamp() - assert forward_only_snapshot.is_forward_only - - context_diff = ContextDiff( - environment="test_environment", - is_new_environment=True, - is_unfinalized_environment=False, - normalize_environment_name=True, - create_from="prod", - create_from_env_exists=True, - added=set(), - removed_snapshots={}, - modified_snapshots={snapshot.name: (snapshot, forward_only_snapshot)}, - snapshots={snapshot.snapshot_id: snapshot}, - new_snapshots={}, - previous_plan_id=None, - previously_promoted_snapshot_ids=set(), - previous_finalized_snapshots=None, - previous_gateway_managed_virtual_layer=False, - gateway_managed_virtual_layer=False, - environment_statements=[], - ) - - with pytest.raises( - PlanError, - match=r"Attempted to revert to an unrevertable version of model.*", - ): - PlanBuilder(context_diff, 
forward_only=True).build() - - # Make sure the plan can be created if a new snapshot version was enforced. - new_version_snapshot = make_snapshot( - SqlModel(name="a", query=parse_one("select 1, ds"), stamp="test_stamp") - ) - snapshot.categorize_as(SnapshotChangeCategory.BREAKING) - context_diff.modified_snapshots = {snapshot.name: (new_version_snapshot, forward_only_snapshot)} - context_diff.new_snapshots = {new_version_snapshot.snapshot_id: new_version_snapshot} - PlanBuilder(context_diff, forward_only=True).build() - - def test_forward_only_plan_seed_models(make_snapshot, mocker: MockerFixture): snapshot_a = make_snapshot( SeedModel( From 06e25410255b8ad6751ccec1eeecc8269de43e77 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Mon, 11 Aug 2025 14:45:06 -0700 Subject: [PATCH 22/27] fix manual categorization for the model kind change --- sqlmesh/core/plan/builder.py | 18 +++++++-------- tests/core/test_integration.py | 42 ++++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/sqlmesh/core/plan/builder.py b/sqlmesh/core/plan/builder.py index 94003481c7..553fd05124 100644 --- a/sqlmesh/core/plan/builder.py +++ b/sqlmesh/core/plan/builder.py @@ -586,7 +586,12 @@ def _categorize_snapshots( if not snapshot or not self._is_new_snapshot(snapshot): continue - forward_only = self._is_forward_only_change(s_id) or self._forward_only + forward_only = self._forward_only or self._is_forward_only_change(s_id) + if forward_only and s_id.name in self._context_diff.modified_snapshots: + new, old = self._context_diff.modified_snapshots[s_id.name] + if _should_force_rebuild(old, new) or snapshot.is_seed: + # Breaking kind changes and seed changes can't be forward-only. 
+ forward_only = False if s_id in self._choices: snapshot.categorize_as(self._choices[s_id], forward_only) @@ -607,15 +612,10 @@ def _categorize_snapshot( s_id = snapshot.snapshot_id if self._context_diff.directly_modified(s_id.name): - new, old = self._context_diff.modified_snapshots[s_id.name] - should_force_rebuild = _should_force_rebuild(old, new) - if should_force_rebuild or snapshot.is_seed: - # Breaking kind changes and seed changes can't be forward-only. - forward_only = False - if self._auto_categorization_enabled: - if should_force_rebuild: - snapshot.categorize_as(SnapshotChangeCategory.BREAKING, forward_only) + new, old = self._context_diff.modified_snapshots[s_id.name] + if _should_force_rebuild(old, new): + snapshot.categorize_as(SnapshotChangeCategory.BREAKING, False) return s_id_with_missing_columns: t.Optional[SnapshotId] = None diff --git a/tests/core/test_integration.py b/tests/core/test_integration.py index 025b2f45e6..0f978a6c79 100644 --- a/tests/core/test_integration.py +++ b/tests/core/test_integration.py @@ -2557,8 +2557,6 @@ def test_virtual_environment_mode_dev_only_model_kind_change(init_and_plan_conte assert len(data_objects) == 1 assert data_objects[0].type == "table" - context.state_sync.clear_cache() - # Change back to view model = context.get_model("sushi.top_waiters") model = model.copy(update={"kind": ViewKind()}) @@ -2605,6 +2603,46 @@ def test_virtual_environment_mode_dev_only_model_kind_change(init_and_plan_conte assert data_objects[0].type == "table" +@time_machine.travel("2023-01-08 15:00:00 UTC") +def test_virtual_environment_mode_dev_only_model_kind_change_manual_categorization( + init_and_plan_context: t.Callable, +): + context, plan = init_and_plan_context( + "examples/sushi", config="test_config_virtual_environment_mode_dev_only" + ) + context.apply(plan) + + model = context.get_model("sushi.top_waiters") + model = model.copy(update={"kind": FullKind()}) + context.upsert_model(model) + dev_plan_builder = 
context.plan_builder("dev", skip_tests=True, no_auto_categorization=True) + dev_plan_builder.set_choice( + dev_plan_builder._context_diff.snapshots[context.get_snapshot(model.name).snapshot_id], + SnapshotChangeCategory.NON_BREAKING, + ) + dev_plan = dev_plan_builder.build() + assert dev_plan.requires_backfill + assert len(dev_plan.missing_intervals) == 1 + context.apply(dev_plan) + + prod_plan = context.plan_builder("prod", skip_tests=True).build() + assert prod_plan.requires_backfill + assert prod_plan.missing_intervals == [ + SnapshotIntervals( + snapshot_id=context.get_snapshot("sushi.top_waiters").snapshot_id, + intervals=[ + (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")), + (to_timestamp("2023-01-02"), to_timestamp("2023-01-03")), + (to_timestamp("2023-01-03"), to_timestamp("2023-01-04")), + (to_timestamp("2023-01-04"), to_timestamp("2023-01-05")), + (to_timestamp("2023-01-05"), to_timestamp("2023-01-06")), + (to_timestamp("2023-01-06"), to_timestamp("2023-01-07")), + (to_timestamp("2023-01-07"), to_timestamp("2023-01-08")), + ], + ), + ] + + @time_machine.travel("2023-01-08 15:00:00 UTC") def test_restatement_plan_ignores_changes(init_and_plan_context: t.Callable): context, plan = init_and_plan_context("examples/sushi") From 8fe9b65b5b4e5a3d451205667e132ef4913e627c Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Mon, 11 Aug 2025 15:29:26 -0700 Subject: [PATCH 23/27] cosmetic --- tests/core/engine_adapter/test_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/core/engine_adapter/test_base.py b/tests/core/engine_adapter/test_base.py index f1dbe7d33a..b760a4f4a1 100644 --- a/tests/core/engine_adapter/test_base.py +++ b/tests/core/engine_adapter/test_base.py @@ -2744,7 +2744,6 @@ def test_replace_query_data_object_type_mismatch( "test_table", parse_one("SELECT a FROM tbl"), {"a": exp.DataType.build("INT")} ) - # TODO: Shouldn't we enforce that `a` is casted to an int? 
assert to_sql_calls(adapter) == [ 'DROP VIEW IF EXISTS "test_table"', 'CREATE OR REPLACE TABLE "test_table" AS SELECT CAST("a" AS INT) AS "a" FROM (SELECT "a" FROM "tbl") AS "_subquery"', From a90932d2d1ea3cf50076576e2ae025f71ac3d4f2 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Mon, 11 Aug 2025 17:12:11 -0700 Subject: [PATCH 24/27] address comments --- sqlmesh/core/engine_adapter/base.py | 56 +++++++++++++------------ sqlmesh/core/engine_adapter/redshift.py | 4 +- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/sqlmesh/core/engine_adapter/base.py b/sqlmesh/core/engine_adapter/base.py index cbc2acbdb9..bb1ac9d13a 100644 --- a/sqlmesh/core/engine_adapter/base.py +++ b/sqlmesh/core/engine_adapter/base.py @@ -40,13 +40,13 @@ ) from sqlmesh.core.model.kind import TimeColumn from sqlmesh.core.schema_diff import SchemaDiffer -from sqlmesh.utils import columns_to_types_all_known, random_id, CorrelationId -from sqlmesh.utils.connection_pool import create_connection_pool, ConnectionPool +from sqlmesh.utils import CorrelationId, columns_to_types_all_known, random_id +from sqlmesh.utils.connection_pool import ConnectionPool, create_connection_pool from sqlmesh.utils.date import TimeLike, make_inclusive, to_time_column from sqlmesh.utils.errors import ( + MissingDefaultCatalogError, SQLMeshError, UnsupportedCatalogOperationError, - MissingDefaultCatalogError, ) from sqlmesh.utils.pandas import columns_to_types_from_df @@ -55,8 +55,8 @@ from sqlmesh.core._typing import SchemaName, SessionProperties, TableName from sqlmesh.core.engine_adapter._typing import ( - BigframeSession, DF, + BigframeSession, PySparkDataFrame, PySparkSession, Query, @@ -371,7 +371,9 @@ def replace_query( """ target_table = exp.to_table(table_name) - table_exists = self._drop_data_object_on_type_mismatch(target_table, DataObjectType.TABLE) + target_data_object = self._get_data_object(target_table) + table_exists = target_data_object is not None + 
self._drop_data_object_on_type_mismatch(target_data_object, DataObjectType.TABLE) source_queries, columns_to_types = self._get_source_queries_and_columns_to_types( query_or_df, columns_to_types, target_table=target_table @@ -1146,7 +1148,7 @@ def create_view( if replace: self._drop_data_object_on_type_mismatch( - view_name, + self._get_data_object(view_name), DataObjectType.VIEW if not materialized else DataObjectType.MATERIALIZED_VIEW, ) @@ -2515,34 +2517,34 @@ def _truncate_table(self, table_name: TableName) -> None: table = exp.to_table(table_name) self.execute(f"TRUNCATE TABLE {table.sql(dialect=self.dialect, identify=True)}") + def _get_data_object(self, target_name: TableName) -> t.Optional[DataObject]: + target_table = exp.to_table(target_name) + existing_data_objects = self.get_data_objects( + schema_(target_table.db, target_table.catalog), {target_table.name} + ) + if existing_data_objects: + return existing_data_objects[0] + return None + def _drop_data_object_on_type_mismatch( - self, target_name: TableName, expected_type: DataObjectType - ) -> bool: + self, data_object: t.Optional[DataObject], expected_type: DataObjectType + ) -> None: """Drops a data object if it exists and is not of the expected type. Args: - target_name: The name of the data object to check. + data_object: The data object to check. expected_type: The expected type of the data object. - - Returns: - True if the data object exists and is of the expected type, False otherwise. 
""" - target_table = exp.to_table(target_name) - existing_data_objects = self.get_data_objects( - schema_(target_table.db, target_table.catalog), {target_table.name} - ) - if existing_data_objects: - if existing_data_objects[0].type == expected_type: - return True + if data_object is None or data_object.type == expected_type: + return - logger.warning( - "Target data object '%s' is a %s and not a %s, dropping it", - target_table.sql(dialect=self.dialect), - existing_data_objects[0].type.value, - expected_type.value, - ) - self.drop_data_object(existing_data_objects[0]) - return False + logger.warning( + "Target data object '%s' is a %s and not a %s, dropping it", + data_object.to_table().sql(dialect=self.dialect), + data_object.type.value, + expected_type.value, + ) + self.drop_data_object(data_object) def _replace_by_key( self, diff --git a/sqlmesh/core/engine_adapter/redshift.py b/sqlmesh/core/engine_adapter/redshift.py index 8aa0916b27..511c27ad98 100644 --- a/sqlmesh/core/engine_adapter/redshift.py +++ b/sqlmesh/core/engine_adapter/redshift.py @@ -262,7 +262,9 @@ def replace_query( """ import pandas as pd - table_exists = self._drop_data_object_on_type_mismatch(table_name, DataObjectType.TABLE) + target_data_object = self._get_data_object(table_name) + table_exists = target_data_object is not None + self._drop_data_object_on_type_mismatch(target_data_object, DataObjectType.TABLE) if not isinstance(query_or_df, pd.DataFrame) or not table_exists: return super().replace_query( From 08d80bf0ebe2b72d32a4655167d0dbb69a80a28f Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Mon, 11 Aug 2025 17:28:39 -0700 Subject: [PATCH 25/27] address doc comments --- docs/guides/configuration.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/guides/configuration.md b/docs/guides/configuration.md index 993ac2e5e2..b137546d84 100644 --- a/docs/guides/configuration.md +++ b/docs/guides/configuration.md @@ -538,11 +538,11 @@ 
sqlmesh_md5__d3b07384d113edec49eaa6238ad5ff00__dev This has a downside that now it's much more difficult to determine which table corresponds to which model by just looking at the database with a SQL client. However, the table names have a predictable length so there are no longer any surprises with identfiers exceeding the max length at the physical layer. -#### Virtual Data Environment modes +#### Virtual Data Environment Modes By default, Virtual Data Environments (VDE) are applied across both development and production environments. This allows SQLMesh to reuse physical tables when appropriate, even when promoting from development to production. -However, users may sometimes prefer their production environment to be non-virtual. The non-exhaustive list of reasons may include: +However, users may prefer their production environment to be non-virtual. The non-exhaustive list of reasons may include: - Integration with third-party tools and platforms, such as data catalogs, may not work well with the virtual view layer that SQLMesh imposes by default - A desire to rely on time travel features provided by cloud data warehouses such as BigQuery, Snowflake, and Databricks @@ -565,7 +565,7 @@ To mitigate this, SQLMesh offers an alternative 'dev-only' mode for using VDE. I ) ``` -As the name suggests, 'dev-only' mode means that VDE is applied only in development environments, while in production, model tables and views are updated directly, bypassing the virtual layer. This also means that physical tables in production will be created using the original, unversioned model names. Users will still benefit from VDE and data reuse across development environments. +'dev-only' mode means that VDE is applied only in development environments. While in production, model tables and views are updated directly and bypass the virtual layer. This also means that physical tables in production will be created using the original, **unversioned** model names. 
Users will still benefit from VDE and data reuse across development environments. Please note the following tradeoffs when enabling this mode: @@ -573,7 +573,7 @@ Please note the following tradeoffs when enabling this mode: - Reverting a model to a previous version will be applied going forward and may require an explicit data restatement !!! warning - Switching the mode for an existing project will result in a complete rebuild of all models in the project. Refer to the [Table Migration Guide](./table_migration.md) to migrate existing tables without rebuilding them from scratch. + Switching the mode for an existing project will result in a **complete rebuild** of all models in the project. Refer to the [Table Migration Guide](./table_migration.md) to migrate existing tables without rebuilding them from scratch. #### Environment view catalogs From 267011913bf3fdfa3c0afc27ecef47be65467284 Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Mon, 11 Aug 2025 22:39:48 -0700 Subject: [PATCH 26/27] adjust intervals based on force rebuild at runtime --- sqlmesh/core/plan/builder.py | 54 +++---- sqlmesh/core/plan/common.py | 19 +++ sqlmesh/core/plan/stages.py | 43 +++++- sqlmesh/core/snapshot/definition.py | 22 ++- tests/core/test_integration.py | 6 +- tests/core/test_plan_stages.py | 225 ++++++++++++++++++++++++++++ tests/core/test_snapshot.py | 125 ---------------- 7 files changed, 315 insertions(+), 179 deletions(-) create mode 100644 sqlmesh/core/plan/common.py diff --git a/sqlmesh/core/plan/builder.py b/sqlmesh/core/plan/builder.py index 553fd05124..3dd74755d3 100644 --- a/sqlmesh/core/plan/builder.py +++ b/sqlmesh/core/plan/builder.py @@ -16,6 +16,7 @@ ) from sqlmesh.core.context_diff import ContextDiff from sqlmesh.core.environment import EnvironmentNamingInfo +from sqlmesh.core.plan.common import should_force_rebuild from sqlmesh.core.plan.definition import ( Plan, SnapshotMapping, @@ -276,7 +277,7 @@ def build(self) -> Plan: 
self._check_destructive_changes(directly_modified) self._categorize_snapshots(dag, indirectly_modified) - self._adjust_new_snapshot_intervals() + self._adjust_snapshot_intervals() deployability_index = ( DeployabilityIndex.create( @@ -508,21 +509,22 @@ def _build_models_to_backfill( ).sorted } - def _adjust_new_snapshot_intervals(self) -> None: - old_snapshots = { - (old.name, old.version_get_or_generate()): old - for _, old in self._context_diff.modified_snapshots.values() - } - - for new in self._context_diff.new_snapshots.values(): - new.intervals = [] - new.dev_intervals = [] - old = old_snapshots.get((new.name, new.version_get_or_generate())) - if not old: + def _adjust_snapshot_intervals(self) -> None: + for new, old in self._context_diff.modified_snapshots.values(): + if not new.is_model or not old.is_model: continue - new.merge_intervals(old) - if new.is_forward_only: - new.dev_intervals = new.intervals.copy() + is_same_version = old.version_get_or_generate() == new.version_get_or_generate() + if is_same_version and should_force_rebuild(old, new): + # If the difference between 2 snapshots requires a full rebuild, + # then clear the intervals for the new snapshot. 
+ self._context_diff.snapshots[new.snapshot_id].intervals = [] + elif new.snapshot_id in self._context_diff.new_snapshots: + new.intervals = [] + new.dev_intervals = [] + if is_same_version: + new.merge_intervals(old) + if new.is_forward_only: + new.dev_intervals = new.intervals.copy() def _check_destructive_changes(self, directly_modified: t.Set[SnapshotId]) -> None: for s_id in sorted(directly_modified): @@ -589,7 +591,7 @@ def _categorize_snapshots( forward_only = self._forward_only or self._is_forward_only_change(s_id) if forward_only and s_id.name in self._context_diff.modified_snapshots: new, old = self._context_diff.modified_snapshots[s_id.name] - if _should_force_rebuild(old, new) or snapshot.is_seed: + if should_force_rebuild(old, new) or snapshot.is_seed: # Breaking kind changes and seed changes can't be forward-only. forward_only = False @@ -614,7 +616,7 @@ def _categorize_snapshot( if self._context_diff.directly_modified(s_id.name): if self._auto_categorization_enabled: new, old = self._context_diff.modified_snapshots[s_id.name] - if _should_force_rebuild(old, new): + if should_force_rebuild(old, new): snapshot.categorize_as(SnapshotChangeCategory.BREAKING, False) return @@ -772,7 +774,7 @@ def _is_forward_only_change(self, s_id: SnapshotId) -> bool: if snapshot.name in self._context_diff.modified_snapshots: _, old = self._context_diff.modified_snapshots[snapshot.name] # If the model kind has changed in a breaking way, then we can't consider this to be a forward-only change. 
- if snapshot.is_model and _should_force_rebuild(old, snapshot): + if snapshot.is_model and should_force_rebuild(old, snapshot): return False return ( snapshot.is_model and snapshot.model.forward_only and bool(snapshot.previous_versions) @@ -870,19 +872,3 @@ def _modified_and_added_snapshots(self) -> t.List[Snapshot]: if snapshot.name in self._context_diff.modified_snapshots or snapshot.snapshot_id in self._context_diff.added ] - - -def _should_force_rebuild(old: Snapshot, new: Snapshot) -> bool: - if old.virtual_environment_mode != new.virtual_environment_mode: - # If the virtual environment mode has changed, then we need to rebuild - return True - if old.model.kind.name == new.model.kind.name: - # If the kind hasn't changed, then we don't need to rebuild - return False - if not old.is_incremental or not new.is_incremental: - # If either is not incremental, then we need to rebuild - return True - if old.model.partitioned_by == new.model.partitioned_by: - # If the partitioning hasn't changed, then we don't need to rebuild - return False - return True diff --git a/sqlmesh/core/plan/common.py b/sqlmesh/core/plan/common.py new file mode 100644 index 0000000000..e6b7a4d10c --- /dev/null +++ b/sqlmesh/core/plan/common.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +from sqlmesh.core.snapshot import Snapshot + + +def should_force_rebuild(old: Snapshot, new: Snapshot) -> bool: + if old.virtual_environment_mode != new.virtual_environment_mode: + # If the virtual environment mode has changed, then we need to rebuild + return True + if old.model.kind.name == new.model.kind.name: + # If the kind hasn't changed, then we don't need to rebuild + return False + if not old.is_incremental or not new.is_incremental: + # If either is not incremental, then we need to rebuild + return True + if old.model.partitioned_by == new.model.partitioned_by: + # If the partitioning hasn't changed, then we don't need to rebuild + return False + return True diff --git 
a/sqlmesh/core/plan/stages.py b/sqlmesh/core/plan/stages.py index 1f1895d877..7ef9fcb7ef 100644 --- a/sqlmesh/core/plan/stages.py +++ b/sqlmesh/core/plan/stages.py @@ -1,7 +1,8 @@ import typing as t from dataclasses import dataclass -from sqlmesh.core.environment import EnvironmentStatements, EnvironmentNamingInfo +from sqlmesh.core.environment import EnvironmentStatements, EnvironmentNamingInfo, Environment +from sqlmesh.core.plan.common import should_force_rebuild from sqlmesh.core.plan.definition import EvaluatablePlan from sqlmesh.core.state_sync import StateReader from sqlmesh.core.scheduler import merged_missing_intervals, SnapshotToIntervals @@ -230,8 +231,9 @@ def build(self, plan: EvaluatablePlan) -> t.List[PlanStage]: all_selected_for_backfill_snapshots = { s.snapshot_id for s in snapshots.values() if plan.is_selected_for_backfill(s.name) } + existing_environment = self.state_reader.get_environment(plan.environment.name) - self._adjust_intervals(snapshots_by_name, plan) + self._adjust_intervals(snapshots_by_name, plan, existing_environment) deployability_index = DeployabilityIndex.create(snapshots, start=plan.start) deployability_index_for_creation = deployability_index @@ -269,7 +271,7 @@ def build(self, plan: EvaluatablePlan) -> t.List[PlanStage]: missing_intervals_after_promote[snapshot] = intervals promoted_snapshots, demoted_snapshots, demoted_environment_naming_info = ( - self._get_promoted_demoted_snapshots(plan) + self._get_promoted_demoted_snapshots(plan, existing_environment) ) stages: t.List[PlanStage] = [] @@ -459,11 +461,10 @@ def _should_update_virtual_layer(snapshot: SnapshotTableInfo) -> bool: ) def _get_promoted_demoted_snapshots( - self, plan: EvaluatablePlan + self, plan: EvaluatablePlan, existing_environment: t.Optional[Environment] ) -> t.Tuple[ t.Set[SnapshotTableInfo], t.Set[SnapshotTableInfo], t.Optional[EnvironmentNamingInfo] ]: - existing_environment = self.state_reader.get_environment(plan.environment.name) if 
existing_environment: new_table_infos = { table_info.name: table_info for table_info in plan.environment.promoted_snapshots @@ -579,10 +580,40 @@ def _should_create(s: Snapshot) -> bool: return [s for s in snapshots.values() if _should_create(s)] def _adjust_intervals( - self, snapshots_by_name: t.Dict[str, Snapshot], plan: EvaluatablePlan + self, + snapshots_by_name: t.Dict[str, Snapshot], + plan: EvaluatablePlan, + existing_environment: t.Optional[Environment], ) -> None: # Make sure the intervals are up to date and restatements are reflected self.state_reader.refresh_snapshot_intervals(snapshots_by_name.values()) + + if existing_environment: + new_snapshot_ids = set() + new_snapshot_versions = set() + for s in snapshots_by_name.values(): + if s.is_model: + new_snapshot_ids.add(s.snapshot_id) + new_snapshot_versions.add(s.name_version) + # Only compare to old snapshots that share the same version as the new snapshots + old_snapshot_ids = { + s.snapshot_id + for s in existing_environment.snapshots + if s.is_model + and s.name_version in new_snapshot_versions + and s.snapshot_id not in new_snapshot_ids + } + if old_snapshot_ids: + old_snapshots = self.state_reader.get_snapshots(old_snapshot_ids) + for old in old_snapshots.values(): + new = snapshots_by_name.get(old.name) + if not new or old.version != new.version: + continue + if should_force_rebuild(old, new): + # If the difference between 2 snapshots requires a full rebuild, + # then clear the intervals for the new snapshot. 
+ new.intervals = [] + for new_snapshot in plan.new_snapshots: if new_snapshot.is_forward_only: # Forward-only snapshots inherit intervals in dev because of cloning diff --git a/sqlmesh/core/snapshot/definition.py b/sqlmesh/core/snapshot/definition.py index f00573403c..076c1efa78 100644 --- a/sqlmesh/core/snapshot/definition.py +++ b/sqlmesh/core/snapshot/definition.py @@ -893,19 +893,15 @@ def merge_intervals(self, other: t.Union[Snapshot, SnapshotIntervals]) -> None: Args: other: The target snapshot to inherit intervals from. """ - if self.is_no_rebuild or self.virtual_environment_mode.is_full or not self.is_paused: - # If the virtual environment mode is not full we can only merge prod intervals if this snapshot - # is currently promoted in production or if it's forward-only / metadata / indirect non-breaking. - # Otherwise, we want to ignore any existing intervals and backfill this snapshot from scratch. - effective_from_ts = self.normalized_effective_from_ts or 0 - apply_effective_from = effective_from_ts > 0 and self.identifier != other.identifier - for start, end in other.intervals: - # If the effective_from is set, then intervals that come after it must come from - # the current snapshots. - if apply_effective_from and start < effective_from_ts: - end = min(end, effective_from_ts) - if not apply_effective_from or end <= effective_from_ts: - self.add_interval(start, end) + effective_from_ts = self.normalized_effective_from_ts or 0 + apply_effective_from = effective_from_ts > 0 and self.identifier != other.identifier + for start, end in other.intervals: + # If the effective_from is set, then intervals that come after it must come from + # the current snapshots. 
+ if apply_effective_from and start < effective_from_ts: + end = min(end, effective_from_ts) + if not apply_effective_from or end <= effective_from_ts: + self.add_interval(start, end) if self.dev_version == other.dev_version: # Merge dev intervals if the dev versions match which would mean diff --git a/tests/core/test_integration.py b/tests/core/test_integration.py index 0f978a6c79..54a3003b41 100644 --- a/tests/core/test_integration.py +++ b/tests/core/test_integration.py @@ -2694,7 +2694,11 @@ def test_restatement_plan_across_environments_snapshot_with_shared_version( assert isinstance(previous_kind, IncrementalByTimeRangeKind) model = model.copy( - update={"kind": IncrementalUnmanagedKind(), "physical_version": "pinned_version_12345"} + update={ + "kind": IncrementalUnmanagedKind(), + "physical_version": "pinned_version_12345", + "partitioned_by_": [exp.column("event_date")], + } ) context.upsert_model(model) context.plan("prod", auto_apply=True, no_prompts=True) diff --git a/tests/core/test_plan_stages.py b/tests/core/test_plan_stages.py index 4bdefd9e38..1b660e1a87 100644 --- a/tests/core/test_plan_stages.py +++ b/tests/core/test_plan_stages.py @@ -1558,3 +1558,228 @@ def test_build_plan_stages_virtual_environment_mode_no_updates( # No VirtualLayerUpdateStage should be created since all snapshots are filtered out virtual_stages = [stage for stage in stages if isinstance(stage, VirtualLayerUpdateStage)] assert len(virtual_stages) == 0 + + +def test_adjust_intervals_new_forward_only_dev_intervals( + make_snapshot, mocker: MockerFixture +) -> None: + forward_only_snapshot = make_snapshot( + SqlModel( + name="forward_only_model", + query=parse_one("select 1, ds"), + kind=dict(name=ModelKindName.INCREMENTAL_BY_TIME_RANGE, time_column="ds"), + ) + ) + forward_only_snapshot.categorize_as(SnapshotChangeCategory.BREAKING, forward_only=True) + forward_only_snapshot.intervals = [(to_timestamp("2023-01-01"), to_timestamp("2023-01-02"))] + + 
forward_only_snapshot.dev_intervals = [] + + state_reader = mocker.Mock(spec=StateReader) + state_reader.refresh_snapshot_intervals = mocker.Mock() + state_reader.get_snapshots.return_value = {} + state_reader.get_environment.return_value = None + + environment = Environment( + snapshots=[forward_only_snapshot.table_info], + start_at="2023-01-01", + end_at="2023-01-02", + plan_id="test_plan", + previous_plan_id=None, + promoted_snapshot_ids=[forward_only_snapshot.snapshot_id], + ) + + plan = EvaluatablePlan( + start="2023-01-01", + end="2023-01-02", + new_snapshots=[forward_only_snapshot], # This snapshot should have dev_intervals set + environment=environment, + no_gaps=False, + skip_backfill=False, + empty_backfill=False, + restatements={}, + is_dev=True, # Dev environment + allow_destructive_models=set(), + forward_only=False, + end_bounded=False, + ensure_finalized_snapshots=False, + directly_modified_snapshots=[], + indirectly_modified_snapshots={}, + metadata_updated_snapshots=[], + removed_snapshots=[], + requires_backfill=True, + models_to_backfill=None, + execution_time="2023-01-02", + disabled_restatement_models=set(), + environment_statements=None, + user_provided_flags=None, + ) + + assert forward_only_snapshot.dev_intervals == [] + + build_plan_stages(plan, state_reader, None) + + assert forward_only_snapshot.dev_intervals == [ + (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")) + ] + assert forward_only_snapshot.dev_intervals is not forward_only_snapshot.intervals + + state_reader.refresh_snapshot_intervals.assert_called_once() + + +def test_adjust_intervals_restatement_removal( + snapshot_a: Snapshot, snapshot_b: Snapshot, mocker: MockerFixture +) -> None: + snapshot_a.intervals = [(to_timestamp("2023-01-01"), to_timestamp("2023-01-04"))] + snapshot_b.intervals = [(to_timestamp("2023-01-01"), to_timestamp("2023-01-02"))] + + original_a_intervals = snapshot_a.intervals.copy() + original_b_intervals = snapshot_b.intervals.copy() + + state_reader 
= mocker.Mock(spec=StateReader) + state_reader.refresh_snapshot_intervals = mocker.Mock() + state_reader.get_snapshots.return_value = {} + state_reader.get_environment.return_value = None + + environment = Environment( + snapshots=[snapshot_a.table_info, snapshot_b.table_info], + start_at="2023-01-01", + end_at="2023-01-02", + plan_id="test_plan", + previous_plan_id=None, + promoted_snapshot_ids=[snapshot_a.snapshot_id, snapshot_b.snapshot_id], + ) + + restatements = { + snapshot_a.name: (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")), + snapshot_b.name: (to_timestamp("2023-01-01"), to_timestamp("2023-01-02")), + } + + plan = EvaluatablePlan( + start="2023-01-01", + end="2023-01-02", + new_snapshots=[snapshot_a, snapshot_b], + environment=environment, + no_gaps=False, + skip_backfill=False, + empty_backfill=False, + restatements=restatements, + is_dev=False, + allow_destructive_models=set(), + forward_only=False, + end_bounded=False, + ensure_finalized_snapshots=False, + directly_modified_snapshots=[], + indirectly_modified_snapshots={}, + metadata_updated_snapshots=[], + removed_snapshots=[], + requires_backfill=True, + models_to_backfill=None, + execution_time="2023-01-02", + disabled_restatement_models=set(), + environment_statements=None, + user_provided_flags=None, + ) + + stages = build_plan_stages(plan, state_reader, None) + + assert snapshot_a.intervals != original_a_intervals + assert snapshot_b.intervals != original_b_intervals + + state_reader.refresh_snapshot_intervals.assert_called_once() + + restatement_stages = [stage for stage in stages if isinstance(stage, RestatementStage)] + assert len(restatement_stages) == 1 + restatement_stage = restatement_stages[0] + assert len(restatement_stage.snapshot_intervals) == 2 + + backfill_stages = [stage for stage in stages if isinstance(stage, BackfillStage)] + assert len(backfill_stages) == 1 + (snapshot, intervals) = next(iter(backfill_stages[0].snapshot_to_intervals.items())) + assert 
snapshot.intervals == [(to_timestamp("2023-01-02"), to_timestamp("2023-01-04"))] + assert intervals == [(to_timestamp("2023-01-01"), to_timestamp("2023-01-02"))] + + +def test_adjust_intervals_should_force_rebuild(make_snapshot, mocker: MockerFixture) -> None: + old_snapshot = make_snapshot( + SqlModel( + name="test_model", + query=parse_one("select 1, ds"), + kind=dict(name=ModelKindName.INCREMENTAL_BY_TIME_RANGE, time_column="ds"), + ) + ) + old_snapshot.categorize_as(SnapshotChangeCategory.BREAKING) + old_snapshot.intervals = [(to_timestamp("2023-01-01"), to_timestamp("2023-01-02"))] + + new_snapshot = make_snapshot( + SqlModel( + name="test_model", + query=parse_one("select 1, ds"), + kind=dict(name=ModelKindName.FULL), + ) + ) + new_snapshot.categorize_as(SnapshotChangeCategory.BREAKING) + new_snapshot.version = old_snapshot.version + new_snapshot.intervals = [(to_timestamp("2023-01-01"), to_timestamp("2023-01-02"))] + + state_reader = mocker.Mock(spec=StateReader) + state_reader.refresh_snapshot_intervals = mocker.Mock() + state_reader.get_snapshots.side_effect = [{}, {old_snapshot.snapshot_id: old_snapshot}, {}, {}] + + existing_environment = Environment( + name="prod", + snapshots=[old_snapshot.table_info], + start_at="2023-01-01", + end_at="2023-01-02", + plan_id="previous_plan", + promoted_snapshot_ids=[old_snapshot.snapshot_id], + finalized_ts=to_timestamp("2023-01-02"), + ) + state_reader.get_environment.return_value = existing_environment + + environment = Environment( + snapshots=[new_snapshot.table_info], + start_at="2023-01-01", + end_at="2023-01-02", + plan_id="test_plan", + previous_plan_id="previous_plan", + promoted_snapshot_ids=[new_snapshot.snapshot_id], + ) + + plan = EvaluatablePlan( + start="2023-01-01", + end="2023-01-02", + new_snapshots=[new_snapshot], + environment=environment, + no_gaps=False, + skip_backfill=False, + empty_backfill=False, + restatements={}, + is_dev=False, + allow_destructive_models=set(), + forward_only=False, + 
end_bounded=False, + ensure_finalized_snapshots=False, + directly_modified_snapshots=[new_snapshot.snapshot_id], + indirectly_modified_snapshots={}, + metadata_updated_snapshots=[], + removed_snapshots=[], + requires_backfill=True, + models_to_backfill=None, + execution_time="2023-01-02", + disabled_restatement_models=set(), + environment_statements=None, + user_provided_flags=None, + ) + + stages = build_plan_stages(plan, state_reader, None) + + state_reader.refresh_snapshot_intervals.assert_called_once() + state_reader.get_environment.assert_called() + + assert not new_snapshot.intervals + backfill_stages = [stage for stage in stages if isinstance(stage, BackfillStage)] + assert len(backfill_stages) == 1 + (snapshot, intervals) = next(iter(backfill_stages[0].snapshot_to_intervals.items())) + assert not snapshot.intervals + assert intervals == [(to_timestamp("2023-01-01"), to_timestamp("2023-01-02"))] diff --git a/tests/core/test_snapshot.py b/tests/core/test_snapshot.py index 194a5b9487..bcb704ba48 100644 --- a/tests/core/test_snapshot.py +++ b/tests/core/test_snapshot.py @@ -3344,131 +3344,6 @@ def test_partitioned_by_roundtrip(make_snapshot: t.Callable): assert deserialized.node.partitioned_by == snapshot.node.partitioned_by -def test_merge_intervals_virtual_environment_mode_full(make_snapshot): - model = SqlModel( - name="test_model", - kind=IncrementalByTimeRangeKind(time_column="ds"), - query=parse_one("SELECT 1, ds FROM parent_tbl"), - virtual_environment_mode=VirtualEnvironmentMode.FULL, - ) - - # Create source snapshot with intervals - source_snapshot = make_snapshot(model) - source_snapshot.add_interval("2020-01-01", "2020-01-03") - source_snapshot.add_interval("2020-01-05", "2020-01-07") - - # Create target snapshot with different fingerprint and virtual_environment_mode FULL - target_snapshot = make_snapshot(model) - target_snapshot.fingerprint = SnapshotFingerprint( - data_hash="different", metadata_hash="different", parent_data_hash="different" - ) - 
target_snapshot.categorize_as(SnapshotChangeCategory.BREAKING) - - # When virtual_environment_mode is FULL, intervals should be merged - target_snapshot.merge_intervals(source_snapshot) - - assert target_snapshot.intervals == [ - (to_timestamp("2020-01-01"), to_timestamp("2020-01-04")), - (to_timestamp("2020-01-05"), to_timestamp("2020-01-08")), - ] - - -def test_merge_intervals_virtual_environment_mode_dev_only_paused_breaking(make_snapshot): - model = SqlModel( - name="test_model", - kind=IncrementalByTimeRangeKind(time_column="ds"), - query=parse_one("SELECT 1, ds FROM parent_tbl"), - virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY, - ) - - # Create source snapshot with intervals - source_snapshot = make_snapshot(model) - source_snapshot.add_interval("2020-01-01", "2020-01-03") - source_snapshot.add_interval("2020-01-05", "2020-01-07") - - # Create target snapshot with different fingerprint and virtual_environment_mode DEV_ONLY - target_snapshot = make_snapshot(model) - target_snapshot.fingerprint = SnapshotFingerprint( - data_hash="different", metadata_hash="different", parent_data_hash="different" - ) - target_snapshot.categorize_as(SnapshotChangeCategory.BREAKING) - - # Ensure snapshot is paused (unpaused_ts is None) - target_snapshot.unpaused_ts = None - - # When virtual_environment_mode is DEV_ONLY and snapshot is paused and breaking, intervals should NOT be merged - target_snapshot.merge_intervals(source_snapshot) - - assert target_snapshot.intervals == [] - - -def test_merge_intervals_virtual_environment_mode_dev_only_unpaused(make_snapshot): - model = SqlModel( - name="test_model", - kind=IncrementalByTimeRangeKind(time_column="ds"), - query=parse_one("SELECT 1, ds FROM parent_tbl"), - virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY, - ) - - # Create source snapshot with intervals - source_snapshot = make_snapshot(model) - source_snapshot.add_interval("2020-01-01", "2020-01-03") - source_snapshot.add_interval("2020-01-05", "2020-01-07") 
- - # Create target snapshot with different fingerprint and virtual_environment_mode DEV_ONLY - target_snapshot = make_snapshot(model) - target_snapshot.fingerprint = SnapshotFingerprint( - data_hash="different", metadata_hash="different", parent_data_hash="different" - ) - target_snapshot.categorize_as(SnapshotChangeCategory.BREAKING) - - # Ensure snapshot is unpaused - target_snapshot.unpaused_ts = to_timestamp("2020-01-01") - - # When snapshot is unpaused, intervals should be merged regardless of virtual_environment_mode - target_snapshot.merge_intervals(source_snapshot) - - assert target_snapshot.intervals == [ - (to_timestamp("2020-01-01"), to_timestamp("2020-01-04")), - (to_timestamp("2020-01-05"), to_timestamp("2020-01-08")), - ] - - -def test_merge_intervals_virtual_environment_mode_dev_only_no_rebuild(make_snapshot): - model = SqlModel( - name="test_model", - kind=IncrementalByTimeRangeKind(time_column="ds"), - query=parse_one("SELECT 1, ds FROM parent_tbl"), - virtual_environment_mode=VirtualEnvironmentMode.DEV_ONLY, - ) - - # Create source snapshot with intervals - source_snapshot = make_snapshot(model) - source_snapshot.add_interval("2020-01-01", "2020-01-03") - source_snapshot.add_interval("2020-01-05", "2020-01-07") - - # Create target snapshot with different fingerprint and virtual_environment_mode DEV_ONLY - target_snapshot = make_snapshot(model) - target_snapshot.fingerprint = SnapshotFingerprint( - data_hash="different", metadata_hash="different", parent_data_hash="different" - ) - target_snapshot.categorize_as( - SnapshotChangeCategory.BREAKING, - forward_only=True, - ) # This is a no-rebuild categorization - - # Ensure snapshot is paused - target_snapshot.unpaused_ts = None - - # When change category is no-rebuild, intervals should be merged regardless of virtual_environment_mode - target_snapshot.merge_intervals(source_snapshot) - - assert target_snapshot.intervals == [ - (to_timestamp("2020-01-01"), to_timestamp("2020-01-04")), - 
(to_timestamp("2020-01-05"), to_timestamp("2020-01-08")), - ] - - @pytest.mark.parametrize( "virtual_env_mode,is_deployable,expected_uses_name_as_is", [ From 282cfe316624ac980513cecd37538d69c9fb0cbb Mon Sep 17 00:00:00 2001 From: Iaroslav Zeigerman Date: Tue, 12 Aug 2025 09:32:28 -0700 Subject: [PATCH 27/27] fix model kind change edge case --- sqlmesh/core/engine_adapter/base.py | 37 ++++---- sqlmesh/core/engine_adapter/redshift.py | 5 +- sqlmesh/core/snapshot/evaluator.py | 23 ++++- tests/core/test_integration.py | 42 +++++++++ tests/core/test_snapshot_evaluator.py | 118 ++++++++++++++++++------ 5 files changed, 179 insertions(+), 46 deletions(-) diff --git a/sqlmesh/core/engine_adapter/base.py b/sqlmesh/core/engine_adapter/base.py index bb1ac9d13a..d401f0e705 100644 --- a/sqlmesh/core/engine_adapter/base.py +++ b/sqlmesh/core/engine_adapter/base.py @@ -371,9 +371,10 @@ def replace_query( """ target_table = exp.to_table(table_name) - target_data_object = self._get_data_object(target_table) + target_data_object = self.get_data_object(target_table) table_exists = target_data_object is not None - self._drop_data_object_on_type_mismatch(target_data_object, DataObjectType.TABLE) + if self.drop_data_object_on_type_mismatch(target_data_object, DataObjectType.TABLE): + table_exists = False source_queries, columns_to_types = self._get_source_queries_and_columns_to_types( query_or_df, columns_to_types, target_table=target_table @@ -1147,8 +1148,8 @@ def create_view( create_kwargs["properties"] = properties if replace: - self._drop_data_object_on_type_mismatch( - self._get_data_object(view_name), + self.drop_data_object_on_type_mismatch( + self.get_data_object(view_name), DataObjectType.VIEW if not materialized else DataObjectType.MATERIALIZED_VIEW, ) @@ -2056,6 +2057,15 @@ def rename_table( ) self._rename_table(old_table_name, new_table_name) + def get_data_object(self, target_name: TableName) -> t.Optional[DataObject]: + target_table = exp.to_table(target_name) + 
existing_data_objects = self.get_data_objects( + schema_(target_table.db, target_table.catalog), {target_table.name} + ) + if existing_data_objects: + return existing_data_objects[0] + return None + def get_data_objects( self, schema_name: SchemaName, object_names: t.Optional[t.Set[str]] = None ) -> t.List[DataObject]: @@ -2517,26 +2527,20 @@ def _truncate_table(self, table_name: TableName) -> None: table = exp.to_table(table_name) self.execute(f"TRUNCATE TABLE {table.sql(dialect=self.dialect, identify=True)}") - def _get_data_object(self, target_name: TableName) -> t.Optional[DataObject]: - target_table = exp.to_table(target_name) - existing_data_objects = self.get_data_objects( - schema_(target_table.db, target_table.catalog), {target_table.name} - ) - if existing_data_objects: - return existing_data_objects[0] - return None - - def _drop_data_object_on_type_mismatch( + def drop_data_object_on_type_mismatch( self, data_object: t.Optional[DataObject], expected_type: DataObjectType - ) -> None: + ) -> bool: """Drops a data object if it exists and is not of the expected type. Args: data_object: The data object to check. expected_type: The expected type of the data object. + + Returns: + True if the data object was dropped, False otherwise. 
""" if data_object is None or data_object.type == expected_type: - return + return False logger.warning( "Target data object '%s' is a %s and not a %s, dropping it", @@ -2545,6 +2549,7 @@ def _drop_data_object_on_type_mismatch( expected_type.value, ) self.drop_data_object(data_object) + return True def _replace_by_key( self, diff --git a/sqlmesh/core/engine_adapter/redshift.py b/sqlmesh/core/engine_adapter/redshift.py index 511c27ad98..829cdf3686 100644 --- a/sqlmesh/core/engine_adapter/redshift.py +++ b/sqlmesh/core/engine_adapter/redshift.py @@ -262,9 +262,10 @@ def replace_query( """ import pandas as pd - target_data_object = self._get_data_object(table_name) + target_data_object = self.get_data_object(table_name) table_exists = target_data_object is not None - self._drop_data_object_on_type_mismatch(target_data_object, DataObjectType.TABLE) + if self.drop_data_object_on_type_mismatch(target_data_object, DataObjectType.TABLE): + table_exists = False if not isinstance(query_or_df, pd.DataFrame) or not table_exists: return super().replace_query( diff --git a/sqlmesh/core/snapshot/evaluator.py b/sqlmesh/core/snapshot/evaluator.py index f77c318730..a2ec242e37 100644 --- a/sqlmesh/core/snapshot/evaluator.py +++ b/sqlmesh/core/snapshot/evaluator.py @@ -38,7 +38,7 @@ from sqlmesh.core.audit import Audit, StandaloneAudit from sqlmesh.core.dialect import schema_ from sqlmesh.core.engine_adapter import EngineAdapter -from sqlmesh.core.engine_adapter.shared import InsertOverwriteStrategy +from sqlmesh.core.engine_adapter.shared import InsertOverwriteStrategy, DataObjectType from sqlmesh.core.macros import RuntimeStage from sqlmesh.core.model import ( AuditResult, @@ -934,7 +934,14 @@ def _migrate_snapshot( adapter.transaction(), adapter.session(snapshot.model.render_session_properties(**render_kwargs)), ): - if adapter.table_exists(target_table_name): + target_data_object = adapter.get_data_object(target_table_name) + table_exists = target_data_object is not None + if 
adapter.drop_data_object_on_type_mismatch( + target_data_object, _snapshot_to_data_object_type(snapshot) + ): + table_exists = False + + if table_exists: evaluation_strategy = _evaluation_strategy(snapshot, adapter) tmp_table_name = snapshot.table_name(is_deployable=False) logger.info( @@ -2307,3 +2314,15 @@ def _check_table_db_is_physical_schema(table_name: str, physical_schema: str) -> raise SQLMeshError( f"Table '{table_name}' is not a part of the physical schema '{physical_schema}' and so can't be dropped." ) + + +def _snapshot_to_data_object_type(snapshot: Snapshot) -> DataObjectType: + if snapshot.is_managed: + return DataObjectType.MANAGED_TABLE + if snapshot.is_materialized_view: + return DataObjectType.MATERIALIZED_VIEW + if snapshot.is_view: + return DataObjectType.VIEW + if snapshot.is_materialized: + return DataObjectType.TABLE + return DataObjectType.UNKNOWN diff --git a/tests/core/test_integration.py b/tests/core/test_integration.py index 54a3003b41..5a0e7bdf48 100644 --- a/tests/core/test_integration.py +++ b/tests/core/test_integration.py @@ -2603,6 +2603,48 @@ def test_virtual_environment_mode_dev_only_model_kind_change(init_and_plan_conte assert data_objects[0].type == "table" +@time_machine.travel("2023-01-08 15:00:00 UTC") +def test_virtual_environment_mode_dev_only_model_kind_change_with_follow_up_changes_in_dev( + init_and_plan_context: t.Callable, +): + context, plan = init_and_plan_context( + "examples/sushi", config="test_config_virtual_environment_mode_dev_only" + ) + context.apply(plan) + + # Make sure the initial state is a view + data_objects = context.engine_adapter.get_data_objects("sushi", {"top_waiters"}) + assert len(data_objects) == 1 + assert data_objects[0].type == "view" + + # Change to incremental unmanaged kind + model = context.get_model("sushi.top_waiters") + model = model.copy(update={"kind": IncrementalUnmanagedKind()}) + context.upsert_model(model) + dev_plan = context.plan_builder("dev", skip_tests=True).build() + 
assert dev_plan.missing_intervals + assert dev_plan.requires_backfill + context.apply(dev_plan) + + # Make a follow-up forward-only change + model = add_projection_to_model(t.cast(SqlModel, model)) + context.upsert_model(model) + dev_plan = context.plan_builder("dev", skip_tests=True, forward_only=True).build() + context.apply(dev_plan) + + # Deploy to prod + prod_plan = context.plan_builder("prod", skip_tests=True).build() + assert prod_plan.requires_backfill + assert prod_plan.missing_intervals + assert not prod_plan.context_diff.snapshots[ + context.get_snapshot(model.name).snapshot_id + ].intervals + context.apply(prod_plan) + data_objects = context.engine_adapter.get_data_objects("sushi", {"top_waiters"}) + assert len(data_objects) == 1 + assert data_objects[0].type == "table" + + @time_machine.travel("2023-01-08 15:00:00 UTC") def test_virtual_environment_mode_dev_only_model_kind_change_manual_categorization( init_and_plan_context: t.Callable, diff --git a/tests/core/test_snapshot_evaluator.py b/tests/core/test_snapshot_evaluator.py index b0e7c9cb05..a3c7837711 100644 --- a/tests/core/test_snapshot_evaluator.py +++ b/tests/core/test_snapshot_evaluator.py @@ -1162,11 +1162,8 @@ def test_promote_deployable(mocker: MockerFixture, make_snapshot): ) -def test_migrate(mocker: MockerFixture, make_snapshot): - connection_mock = mocker.NonCallableMock() - cursor_mock = mocker.Mock() - connection_mock.cursor.return_value = cursor_mock - adapter = EngineAdapter(lambda: connection_mock, "") +def test_migrate(mocker: MockerFixture, make_snapshot, make_mocked_engine_adapter): + adapter = make_mocked_engine_adapter(EngineAdapter) session_spy = mocker.spy(adapter, "session") current_table = "sqlmesh__test_schema.test_schema__test_model__1" @@ -1184,6 +1181,11 @@ def columns(table_name): adapter.columns = columns # type: ignore adapter.table_exists = lambda _: True # type: ignore + mocker.patch.object( + adapter, + "get_data_object", + 
return_value=DataObject(schema="test_schema", name="test_model", type="table"), + ) evaluator = SnapshotEvaluator(adapter) @@ -1202,7 +1204,7 @@ def columns(table_name): evaluator.migrate([snapshot], {}, deployability_index=DeployabilityIndex.none_deployable()) - cursor_mock.execute.assert_has_calls( + adapter.cursor.execute.assert_has_calls( [ call('ALTER TABLE "sqlmesh__test_schema"."test_schema__test_model__1" DROP COLUMN "b"'), call( @@ -1214,13 +1216,10 @@ def columns(table_name): session_spy.assert_called_once() -def test_migrate_missing_table(mocker: MockerFixture, make_snapshot): - connection_mock = mocker.NonCallableMock() - cursor_mock = mocker.Mock() - connection_mock.cursor.return_value = cursor_mock - adapter = EngineAdapter(lambda: connection_mock, "") - +def test_migrate_missing_table(mocker: MockerFixture, make_snapshot, make_mocked_engine_adapter): + adapter = make_mocked_engine_adapter(EngineAdapter) adapter.table_exists = lambda _: False # type: ignore + mocker.patch.object(adapter, "get_data_object", return_value=None) evaluator = SnapshotEvaluator(adapter) @@ -1241,7 +1240,7 @@ def test_migrate_missing_table(mocker: MockerFixture, make_snapshot): evaluator.migrate([snapshot], {}, deployability_index=DeployabilityIndex.none_deployable()) - cursor_mock.execute.assert_has_calls( + adapter.cursor.execute.assert_has_calls( [ call('CREATE TABLE "pre" ("a" INT)'), call( @@ -1267,6 +1266,11 @@ def test_migrate_view( forward_only: bool, ): adapter = make_mocked_engine_adapter(EngineAdapter) + mocker.patch.object( + adapter, + "get_data_object", + return_value=DataObject(schema="test_schema", name="test_model", type="view"), + ) evaluator = SnapshotEvaluator(adapter) @@ -1291,6 +1295,45 @@ def test_migrate_view( ) +def test_migrate_snapshot_data_object_type_mismatch( + mocker: MockerFixture, + make_snapshot, + make_mocked_engine_adapter, +): + adapter = make_mocked_engine_adapter(EngineAdapter) + mocker.patch.object( + adapter, + "get_data_object", + 
return_value=DataObject( + schema="sqlmesh__test_schema", name="test_schema__test_model__1", type="table" + ), + ) + mocker.patch.object(adapter, "table_exists", return_value=False) + + evaluator = SnapshotEvaluator(adapter) + + model = SqlModel( + name="test_schema.test_model", + kind=ViewKind(), + storage_format="parquet", + query=parse_one("SELECT c, a FROM tbl"), + ) + snapshot = make_snapshot(model, version="1") + snapshot.change_category = SnapshotChangeCategory.BREAKING + snapshot.forward_only = True + + evaluator.migrate([snapshot], {}, deployability_index=DeployabilityIndex.none_deployable()) + + adapter.cursor.execute.assert_has_calls( + [ + call('DROP TABLE IF EXISTS "sqlmesh__test_schema"."test_schema__test_model__1"'), + call( + 'CREATE VIEW "sqlmesh__test_schema"."test_schema__test_model__1" AS SELECT "c" AS "c", "a" AS "a" FROM "tbl" AS "tbl"' + ), + ] + ) + + def test_evaluate_creation_duckdb( snapshot: Snapshot, duck_conn, @@ -1709,11 +1752,9 @@ def test_create_clone_in_dev_self_referencing( def test_on_destructive_change_runtime_check( mocker: MockerFixture, make_snapshot, + make_mocked_engine_adapter, ): - connection_mock = mocker.NonCallableMock() - cursor_mock = mocker.Mock() - connection_mock.cursor.return_value = cursor_mock - adapter = EngineAdapter(lambda: connection_mock, "") + adapter = make_mocked_engine_adapter(EngineAdapter) current_table = "sqlmesh__test_schema.test_schema__test_model__1" @@ -1729,6 +1770,11 @@ def columns(table_name): } adapter.columns = columns # type: ignore + mocker.patch.object( + adapter, + "get_data_object", + return_value=DataObject(schema="test_schema", name="test_model", type=DataObjectType.TABLE), + ) evaluator = SnapshotEvaluator(adapter) @@ -3702,6 +3748,11 @@ def test_migrate_snapshot(snapshot: Snapshot, mocker: MockerFixture, adapter_moc assert new_snapshot.table_name() == snapshot.table_name() + adapter_mock.get_data_object.return_value = DataObject( + schema="test_schema", name="test_model", 
type=DataObjectType.TABLE + ) + adapter_mock.drop_data_object_on_type_mismatch.return_value = False + evaluator.create([new_snapshot], {}) evaluator.migrate([new_snapshot], {}, deployability_index=DeployabilityIndex.none_deployable()) @@ -3770,6 +3821,11 @@ def test_migrate_managed(adapter_mock, make_snapshot, mocker: MockerFixture): snapshot.categorize_as(SnapshotChangeCategory.BREAKING, forward_only=True) snapshot.previous_versions = snapshot.all_versions + adapter_mock.get_data_object.return_value = DataObject( + schema="test_schema", name="test_model", type=DataObjectType.MANAGED_TABLE + ) + adapter_mock.drop_data_object_on_type_mismatch.return_value = False + # no schema changes - no-op adapter_mock.get_alter_expressions.return_value = [] evaluator.migrate( @@ -3936,12 +3992,12 @@ def columns(table_name): ) -def test_multiple_engine_migration(mocker: MockerFixture, adapter_mock, make_snapshot): - connection_mock = mocker.NonCallableMock() - cursor_mock = mocker.Mock() - connection_mock.cursor.return_value = cursor_mock - adapter = EngineAdapter(lambda: connection_mock, "") - engine_adapters = {"one": adapter, "two": adapter_mock} +def test_multiple_engine_migration( + mocker: MockerFixture, adapter_mock, make_snapshot, make_mocked_engine_adapter +): + adapter_one = make_mocked_engine_adapter(EngineAdapter) + adapter_two = adapter_mock + engine_adapters = {"one": adapter_one, "two": adapter_two} current_table = "sqlmesh__test_schema.test_schema__test_model__1" @@ -3956,8 +4012,18 @@ def columns(table_name): "a": exp.DataType.build("int"), } - adapter.columns = columns # type: ignore - adapter_mock.columns = columns # type: ignore + adapter_two.columns.side_effect = columns + adapter_two.get_data_object.return_value = DataObject( + schema="test_schema", name="test_model_2", type=DataObjectType.TABLE + ) + adapter_two.drop_data_object_on_type_mismatch.return_value = False + + mocker.patch.object(adapter_one, "columns", side_effect=columns) + mocker.patch.object( 
+ adapter_one, + "get_data_object", + return_value=DataObject(schema="test_schema", name="test_model", type=DataObjectType.TABLE), + ) evaluator = SnapshotEvaluator(engine_adapters) @@ -3988,7 +4054,7 @@ def columns(table_name): [snapshot_1, snapshot_2], {}, deployability_index=DeployabilityIndex.none_deployable() ) - cursor_mock.execute.assert_has_calls( + adapter_one.cursor.execute.assert_has_calls( [ call('ALTER TABLE "sqlmesh__test_schema"."test_schema__test_model__1" DROP COLUMN "b"'), call(